### Explore series lengths

To choose sensible target lengths for preprocessing, this notebook explores the lengths of the recorded time series.

In [None]:
# Imports

import os
import sys

# Add project root to Python path.
# The root is taken to be the parent of the current working directory, so this
# depends on where the kernel was started. It is inserted at position 0 so it
# takes precedence over other entries, and only if it is not already present.
# NOTE(review): presumably this is what makes the local `schema` and `utils`
# imports below resolvable — confirm.
project_root = os.path.abspath(os.path.join(os.getcwd(), '..'))
if project_root not in sys.path:
    sys.path.insert(0, project_root)


import json
from typing import List, Optional

import matplotlib.pyplot as plt
import pandas as pd

from schema import ExperimentData
from utils import (get_data_path, get_screw_driving_static_data,
                   get_screw_driving_serial_data)

### Screw driving: 

In [None]:

def get_screw_driving_lengths(pos: Optional[str] = None) -> List[int]:
    """Return the number of recorded samples of each screw driving run.

    The static table is read from the path given by
    `get_screw_driving_static_data()`. For every row, the JSON file named in
    its `file_name` column is opened and the lengths of the "angle values"
    lists of all its "tightening steps" are summed.

    Args:
        pos: Value the `workpiece_location` column must equal for a run to be
            included (the notebook uses "left" and "right"). `None` (the
            default) applies no filter and returns the lengths of all runs.

    Returns:
        One length per selected row, in the order of the static table.

    Raises:
        AssertionError: If a run holds a different number of angle and
            torque values.
    """
    # Load the static table of the screw driving recordings
    static_data = pd.read_csv(get_screw_driving_static_data(), sep=";")

    # Filter by workpiece location. Comparing the column against None matches
    # no row at all, so the default has to skip the filter instead of
    # silently producing an empty result.
    if pos is not None:
        static_data = static_data[static_data.workpiece_location == pos]

    step_counts = []

    # Iterate all file names and check lengths
    for file_name in static_data.file_name.tolist():

        # Load the json file as dict
        with open(get_screw_driving_serial_data(file_name)) as file:
            screw_dict = json.load(file)

        # A run's length is the sum over all of its tightening steps
        graphs = [step["graph"] for step in screw_dict["tightening steps"]]
        step_count_angle = sum(len(graph["angle values"]) for graph in graphs)
        step_count_torque = sum(len(graph["torque values"]) for graph in graphs)

        # Double check: both signals must have the same number of samples
        assert step_count_angle == step_count_torque, (
            f"{file_name}: {step_count_angle} angle values vs. "
            f"{step_count_torque} torque values"
        )
        step_counts.append(step_count_angle)

    return step_counts

def plot_hist(pos: str, bins: int = 35, x_max: int = 2200) -> None:
    """Show a histogram of the screw driving run lengths for one location.

    Args:
        pos: Workpiece location passed on to `get_screw_driving_lengths`;
            it also appears in the plot title.
        bins: Number of histogram bins.
        x_max: Upper limit of the x-axis. The axis is fixed to `[0, x_max]`
            so plots for different locations share the same scale; runs
            longer than `x_max` are not visible.
    """
    plt.hist(get_screw_driving_lengths(pos), bins=bins)
    plt.title(f"Lengths of {pos} screw runs")
    # Same axis labels as the injection molding histogram
    plt.xlabel("Time series length")
    plt.ylabel("Count")
    plt.xlim(0, x_max)
    plt.show()

- LEFT

In [None]:
# Length histogram of the screw runs whose workpiece_location is "left"
plot_hist("left")

- RIGHT

In [None]:
# Length histogram of the screw runs whose workpiece_location is "right"
plot_hist("right")

### Injection molding:

In [None]:
def _count_upper_rows(file_path) -> int:
    """Return the number of data rows of an upper-workpiece CSV recording."""
    return len(pd.read_csv(file_path, index_col=0))


def _count_lower_rows(file_path) -> Optional[int]:
    """Return the number of non-empty lines after the "-start data-" marker.

    Returns None when the file contains no such marker.
    """
    with open(file_path, "r") as file:
        for line in file:
            if "-start data-" in line:
                # Everything after the first marker line is data; the same
                # file iterator continues right behind the marker.
                return sum(1 for data_line in file if data_line.strip())
    return None


def get_injection_molding_lengths(workpiece_type: str) -> List[int]:
    """Return the length of every injection molding recording of one type.

    For "upper", every `*.csv` file in the upper workpiece serial data
    directory is read with pandas and its row count is used. For "lower",
    every `*.txt` file in the lower workpiece serial data directory is
    scanned and the non-empty lines following the "-start data-" marker are
    counted.

    Files that cannot be processed, and lower-workpiece files without a data
    marker, are reported with a printed warning and left out of the result.

    Args:
        workpiece_type: Either "upper" or "lower".

    Returns:
        One length per processed file, ordered by file path.

    Raises:
        ValueError: If `workpiece_type` is neither "upper" nor "lower".
    """
    if workpiece_type == "upper":
        data_dir = get_data_path("injection_molding", "upper_workpiece", "serial_data")
        file_pattern = "*.csv"
        count_rows = _count_upper_rows
    elif workpiece_type == "lower":
        data_dir = get_data_path("injection_molding", "lower_workpiece", "serial_data")
        file_pattern = "*.txt"
        count_rows = _count_lower_rows
    else:
        raise ValueError(f"workpiece_type must be 'upper' or 'lower', got: {workpiece_type}")

    lengths = []

    # glob yields paths in no guaranteed order; sorting keeps the result
    # reproducible between runs and machines.
    for file_path in sorted(data_dir.glob(file_pattern)):
        try:
            length = count_rows(file_path)
        except (FileNotFoundError, pd.errors.EmptyDataError, ValueError) as e:
            print(f"Warning: Could not process {file_path.name}: {e}")
            continue

        if length is None:
            # Report the skip instead of dropping the file silently
            print(f"Warning: Skipping {file_path.name}: no '-start data-' marker found")
            continue

        lengths.append(length)

    return lengths



def plot_injection_molding_hist(workpiece_type: str) -> None:
    """Show a 35-bin histogram of the recording lengths for one workpiece type.

    Args:
        workpiece_type: Passed on to `get_injection_molding_lengths`; it also
            appears in the plot title.
    """
    heading = f"Lengths of {workpiece_type} injection molding recordings"

    plt.hist(get_injection_molding_lengths(workpiece_type), bins=35)
    plt.title(heading)
    plt.xlabel("Time series length")
    plt.ylabel("Count")
    plt.show()

- UPPER: 

In [None]:
# Length histogram of the upper-workpiece recordings (CSV files)
plot_injection_molding_hist("upper")

- LOWER:

In [None]:
# Length histogram of the lower-workpiece recordings (TXT files)
plot_injection_molding_hist("lower")

### Comparison

Box plots compare the distribution of series lengths across all four recording types.


In [None]:
def _print_length_summary(data, labels) -> None:
    """Print count, min, max, median and mean for every non-empty length list."""
    print("\nSummary Statistics:")
    print("-" * 50)
    for lengths, label in zip(data, labels):
        if not lengths:
            continue
        series = pd.Series(lengths)
        print(f"{label}:")
        print(f"  Count: {len(lengths)}")
        print(f"  Min: {series.min()}")
        print(f"  Max: {series.max()}")
        print(f"  Median: {series.median():.0f}")
        print(f"  Mean: {series.mean():.0f}")
        print()


def plot_all_lengths_comparison() -> None:
    """Create a box plot comparing time series lengths across all recording types.

    The lengths of the upper and lower injection molding recordings and of the
    left and right screw driving runs are drawn as four colored boxes. Each
    non-empty group is annotated with its median, and summary statistics for
    each non-empty group are printed afterwards.
    """
    # Get lengths for all four types
    data = [
        get_injection_molding_lengths("upper"),
        get_injection_molding_lengths("lower"),
        get_screw_driving_lengths("left"),
        get_screw_driving_lengths("right"),
    ]
    labels = ['Upper Injection', 'Lower Injection', 'Screw Left', 'Screw Right']
    colors = ['lightblue', 'lightgreen', 'orange', 'pink']

    # Create box plot; patch_artist=True makes the boxes fillable patches
    plt.figure(figsize=(10, 6))
    box_plot = plt.boxplot(data, patch_artist=True)

    # Color the boxes for better distinction
    for patch, color in zip(box_plot['boxes'], colors):
        patch.set_facecolor(color)

    plt.title('Time Series Length Comparison Across Recording Types')
    plt.ylabel('Number of Data Points')
    plt.xlabel('Recording Type')
    plt.grid(True, alpha=0.3)
    # Boxes sit at x = 1..N by default. Labelling them through xticks avoids
    # the boxplot keyword for tick labels, whose name differs between
    # matplotlib versions.
    plt.xticks(list(range(1, len(labels) + 1)), labels, rotation=45)

    # Annotate each box with its median, formatted exactly like the printed
    # summary so the two never disagree.
    for position, lengths in enumerate(data, start=1):
        if lengths:  # Only if we have data
            median_val = pd.Series(lengths).median()
            plt.text(position, median_val, f'{median_val:.0f}',
                     ha='center', va='bottom', fontweight='bold')

    plt.tight_layout()
    plt.show()

    # Print summary statistics
    _print_length_summary(data, labels)

# Draw the comparison box plot and print the summary statistics
plot_all_lengths_comparison()