In [10]:
import pandas as pd
import numpy as np
from pathlib import Path

In [11]:
# Location of the trace CSVs and the list of files to merge
base_path = r"C:\Users\wanglab\Desktop\Club Like Endings\102725_1\102725_1_shortened"

# Build every input path from a single Path object rather than re-wrapping base_path
data_dir = Path(base_path)
input_files = [
    data_dir / csv_name
    for csv_name in (
        "mask_lines.csv",
        "mask_lines+media_mod.csv",
        "lines+media_mod.csv",
        "lines.csv",
    )
]

# Report, per file, whether it is present on disk (missing files show their full path)
for candidate in input_files:
    status = f"✓ Found: {candidate.name}" if candidate.exists() else f"✗ Missing: {candidate}"
    print(status)

✓ Found: mask_lines.csv
✓ Found: mask_lines+media_mod.csv
✓ Found: lines+media_mod.csv
✓ Found: lines.csv


In [12]:
# Load each input CSV, report its structure, and tag rows with their origin
dataframes = []
for source_idx, csv_path in enumerate(input_files):
    print(f"\nReading {csv_path.name}...")
    raw = pd.read_csv(csv_path)

    # Shape/columns are reported for the file as read, before the tracking columns exist
    print(f"  - Shape: {raw.shape}")
    print(f"  - Columns: {list(raw.columns)}")

    # Tracking columns record which file (name and list position) each line came from
    tagged = raw.assign(source_file=csv_path.name, source_index=source_idx)
    dataframes.append(tagged)

    # Preview the first 3 rows, tracking columns included
    display(tagged.head(3))


Reading mask_lines.csv...
  - Shape: (315763, 3)
  - Columns: ['Frame', 'X', 'Y']
  - Shape: (315763, 3)
  - Columns: ['Frame', 'X', 'Y']


Unnamed: 0,Frame,X,Y,source_file,source_index
0,0,"571.8,558.2,451.0,338.0,239.9,180.0,147.0","216.4,210.5,173.1,140.9,121.4,113.4,111.4",mask_lines.csv,0
1,0,"623.9,604.5,593.7","137.1,106.2,78.1",mask_lines.csv,0
2,0,"615.0,599.8,571.1,555.7,543.6","145.0,125.1,81.0,54.2,27.1",mask_lines.csv,0



Reading mask_lines+media_mod.csv...
  - Shape: (308306, 3)
  - Columns: ['Frame', 'X', 'Y']
  - Shape: (308306, 3)
  - Columns: ['Frame', 'X', 'Y']


Unnamed: 0,Frame,X,Y,source_file,source_index
0,0,"571.9,512.0,440.1,380.1,308.0,248.1,179.0,158....","216.4,193.9,169.5,151.8,134.0,122.7,113.3,111....",mask_lines+media_mod.csv,1
1,0,"603.2,468.0","161.9,5.0",mask_lines+media_mod.csv,1
2,0,"613.0,582.6,556.7,542.3,537.0","143.0,99.3,56.2,23.9,3.0",mask_lines+media_mod.csv,1



Reading lines+media_mod.csv...
  - Shape: (390104, 3)
  - Columns: ['Frame', 'X', 'Y']
  - Shape: (390104, 3)
  - Columns: ['Frame', 'X', 'Y']


Unnamed: 0,Frame,X,Y,source_file,source_index
0,0,"437.0,429.9,421.5,399.1,394.3,389.6","423.0,422.4,419.3,397.9,388.9,368.1",lines+media_mod.csv,2
1,0,"571.8,512.0,440.1,381.0,306.1,247.1,179.0,158....","216.4,193.9,169.5,152.0,133.6,122.5,113.3,111....",lines+media_mod.csv,2
2,0,"603.2,468.0","161.8,5.0",lines+media_mod.csv,2



Reading lines.csv...
  - Shape: (294469, 3)
  - Columns: ['Frame', 'X', 'Y']
  - Shape: (294469, 3)
  - Columns: ['Frame', 'X', 'Y']


Unnamed: 0,Frame,X,Y,source_file,source_index
0,0,"571.9,463.9,384.9,338.0,240.0,180.0,147.0","216.4,177.2,153.2,140.9,121.4,113.4,111.4",lines.csv,3
1,0,"623.9,611.8,603.5,593.7","137.1,119.5,104.2,78.1",lines.csv,3
2,0,"615.0,604.7,575.8,556.1,543.7","145.0,132.2,89.5,54.9,27.1",lines.csv,3


In [None]:
# Combine all dataframes - this stacks every line from every input file into one table
combined_df = pd.concat(dataframes, ignore_index=True)

# Sanity check: concatenation must neither drop nor duplicate rows
assert len(combined_df) == sum(len(part) for part in dataframes), \
    "Row count changed while concatenating the input files"

# Detect the frame column name case-insensitively ('frame', 'Frame', ...).
# str() guards against non-string column labels, which have no .lower() method.
frame_col = next(
    (col for col in combined_df.columns if str(col).lower() == 'frame'),
    None,
)

# Sort by frame first (if a frame column exists), then by source file.
# A stable sort is requested so rows that tie on the sort keys keep the order they
# had in their source CSV. pandas' default kind ('quicksort') gives no such guarantee
# for single-key sorts. NOTE(review): the pandas docs say `kind` is only applied to
# single-key sorts, so on the two-key call it documents intent rather than changing
# the algorithm.
if frame_col is not None:
    combined_df = (
        combined_df
        .sort_values([frame_col, 'source_index'], kind='stable')
        .reset_index(drop=True)
    )
    print(f"Sorting by '{frame_col}' column")
else:
    print("Warning: No 'frame' column found. Data will be sorted by source only.")
    combined_df = (
        combined_df
        .sort_values('source_index', kind='stable')
        .reset_index(drop=True)
    )

print(f"Combined dataframe shape: {combined_df.shape}")
print(f"Total rows: {len(combined_df)}")

Sorting by 'Frame' column
Combined dataframe shape: (1308642, 5)
Total rows: 1308642

This means all lines from all 4 files are now combined for each frame.


In [14]:
# Summarize how the combined lines are distributed across frames
if not frame_col:
    # Without a frame column there is nothing to group on; just preview the table
    print("No frame column found - showing first 20 rows:")
    display(combined_df.head(20))
else:
    frame_values = combined_df[frame_col]
    unique_frames = frame_values.unique()
    counts_by_frame = combined_df.groupby(frame_col).size()

    print("\nSummary Statistics:")
    print(f"  - Total frames: {len(unique_frames)}")
    print(f"  - Lines per frame (mean): {counts_by_frame.mean():.1f}")
    print(f"  - Lines per frame (min): {counts_by_frame.min()}")
    print(f"  - Lines per frame (max): {counts_by_frame.max()}")
    print(f"  - Lines per frame (std): {counts_by_frame.std():.2f}")

    # How many frames have exactly N lines, ordered by N
    print("\nLines per frame distribution:")
    print(counts_by_frame.value_counts().sort_index())

    # Per-source line counts for the first frame value encountered in the table
    first_frame = unique_frames[0]
    print(f"\nExample - Frame {first_frame} breakdown by source:")
    first_frame_rows = combined_df[frame_values == first_frame]
    print(first_frame_rows.groupby('source_file').size())

    # Show every row belonging to the smallest frame value
    print("\nSample: All lines from first frame:")
    display(combined_df[frame_values == frame_values.min()])


Summary Statistics:
  - Total frames: 60000
  - Lines per frame (mean): 21.8
  - Lines per frame (min): 10
  - Lines per frame (max): 35
  - Lines per frame (std): 2.85

Lines per frame distribution:
10       2
11      13
12      40
13     111
14     265
15     604
16    1079
17    1762
18    2984
19    4808
20    6631
21    8618
22    9264
23    8321
24    6081
25    3930
26    2389
27    1554
28     802
29     430
30     207
31      64
32      30
33       7
34       2
35       2
Name: count, dtype: int64

Example - Frame 0 breakdown by source:
source_file
lines+media_mod.csv         6
lines.csv                   5
mask_lines+media_mod.csv    5
mask_lines.csv              5
dtype: int64

Sample: All lines from first frame:


Unnamed: 0,Frame,X,Y,source_file,source_index
0,0,"571.8,558.2,451.0,338.0,239.9,180.0,147.0","216.4,210.5,173.1,140.9,121.4,113.4,111.4",mask_lines.csv,0
1,0,"623.9,604.5,593.7","137.1,106.2,78.1",mask_lines.csv,0
2,0,"615.0,599.8,571.1,555.7,543.6","145.0,125.1,81.0,54.2,27.1",mask_lines.csv,0
3,0,"602.1,468.1","160.9,5.9",mask_lines.csv,0
4,0,"590.2,530.0,392.2","176.8,121.0,0.8",mask_lines.csv,0
5,0,"571.9,512.0,440.1,380.1,308.0,248.1,179.0,158....","216.4,193.9,169.5,151.8,134.0,122.7,113.3,111....",mask_lines+media_mod.csv,1
6,0,"603.2,468.0","161.9,5.0",mask_lines+media_mod.csv,1
7,0,"613.0,582.6,556.7,542.3,537.0","143.0,99.3,56.2,23.9,3.0",mask_lines+media_mod.csv,1
8,0,"623.0,603.0,596.0,591.5,591.7,599.3,605.2","136.0,103.0,86.0,69.1,40.0,15.1,2.1",mask_lines+media_mod.csv,1
9,0,"590.6,531.0,392.1","177.3,122.0,0.9",mask_lines+media_mod.csv,1


In [None]:
# Destination for the merged table (written next to the input files)
output_file = Path(base_path) / "combined_traces.csv"

# Create a clean version without the tracking columns added while loading
output_df = combined_df.drop(columns=['source_file', 'source_index'])

# Save combined data (index=False: the row index is not written as a column)
output_df.to_csv(output_file, index=False)
print(f"Saved combined traces to: {output_file}")
print(f"File size: {output_file.stat().st_size / 1024 / 1024:.2f} MB")
print(f"\nOutput contains {len(output_df)} rows")
if frame_col:
    print(f"Across {output_df[frame_col].nunique()} frames")

# Closing confirmation is printed on every run; it used to sit in the `else` branch
# and was skipped whenever a frame column existed. The source count comes from the
# tracking column instead of a hard-coded 4, so it stays correct if the input list
# changes.
n_sources = combined_df['source_index'].nunique()
print(f"All lines from all {n_sources} source files are combined!")

Saved combined traces to: C:\Users\wanglab\Desktop\Club Like Endings\102725_1\102725_1_shortened\combined_traces.csv
File size: 90.96 MB

Output contains 1308642 rows
Across 60000 frames
Each frame now has lines from all 4 source files combined!
