In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from pathlib import Path


In [2]:
# Print every sub-directory of the merged dataset to discover its layout.
root = Path("/kaggle/input/mabe-merged-data")
# `subdir` rather than `dir`: the loop variable stays bound at module level after
# the loop, and naming it `dir` would shadow the builtin for the rest of the session.
# Sorting gives a stable listing; rglob does not guarantee any particular order.
for subdir in sorted(root.rglob("*")):
    if subdir.is_dir():
        print(subdir)


/kaggle/input/mabe-merged-data/kaggle
/kaggle/input/mabe-merged-data/kaggle/working
/kaggle/input/mabe-merged-data/kaggle/working/tracking
/kaggle/input/mabe-merged-data/kaggle/working/annotated


In [3]:
# One sample file from each folder of the merged dataset ("annotated" and "tracking").
afp = Path("/kaggle/input/mabe-merged-data/kaggle/working/annotated/1009459450.parquet")
tfp = Path("/kaggle/input/mabe-merged-data/kaggle/working/tracking/1000217804.parquet")

# Load both samples in the same order as the paths above: annotated first, then tracking.
dfa, dft = (pd.read_parquet(sample_path) for sample_path in (afp, tfp))

In [4]:
# Preview the sample tracking file.
print(dft.head())
# DataFrame.info() writes its report to stdout itself and returns None, so it is
# called directly; wrapping it in print() appends a stray "None" line.
dft.info()

  lateral_left_y tail_middle_1_y  hindpaw_right_x lateral_left_x  \
0           None            None            380.0           None   
1           None            None            171.0           None   
2           None            None            172.0           None   
3           None            None            381.0           None   
4           None            None            171.0           None   

   body_center_x headpiece_bottombackleft_x spine_1_x hip_right_y  \
0          380.0                       None      None        None   
1          157.0                       None      None        None   
2          170.0                       None      None        None   
3          380.0                       None      None        None   
4          157.0                       None      None        None   

  tail_middle_2_x  ear_left_x  ... spine_2_y headpiece_bottomfrontleft_y  \
0            None       352.0  ...      None                        None   
1            None       

In [5]:
# Show which annotated sample is being inspected, then preview it.
print(afp.resolve())
print(dfa.head())
# DataFrame.info() writes its report to stdout itself and returns None, so it is
# called directly; wrapping it in print() appends a stray "None" line.
dfa.info()

/kaggle/input/mabe-merged-data/kaggle/working/annotated/1009459450.parquet
  lateral_left_y tail_middle_1_y hindpaw_right_x lateral_left_x body_center_x  \
0           None            None            None           None          None   
1           None            None            None           None          None   
2           None            None            None           None          None   
3           None            None            None           None          None   
4           None            None            None           None          None   

  headpiece_bottombackleft_x spine_1_x  hip_right_y tail_middle_2_x  \
0                       None      None   370.541107            None   
1                       None      None   369.221863            None   
2                       None      None   369.698364            None   
3                       None      None   363.496368            None   
4                       None      None   370.619690            None   

   ear_left

In [6]:
# Folder holding the competition's original annotation files
comp_afp = Path("/kaggle/input/MABe-mouse-behavior-detection/train_annotation")

# Search recursively for the annotation file of the sample video.
# Sorting makes the choice of files[0] deterministic: rglob order is unspecified.
files = sorted(comp_afp.rglob("1009459450.parquet"))

# Report how many matches were found
print(f"Found {len(files)} files")

# Load the first match, if any
if files:
    actual_file_data = pd.read_parquet(files[0])
    # info() prints its own report and returns None; print(info()) would add a "None" line
    actual_file_data.info()
    print(actual_file_data.head())
else:
    print("No parquet files found matching that name.")

Found 1 files
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 126 entries, 0 to 125
Data columns (total 5 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   agent_id     126 non-null    int8  
 1   target_id    126 non-null    int8  
 2   action       126 non-null    object
 3   start_frame  126 non-null    int16 
 4   stop_frame   126 non-null    int16 
dtypes: int16(2), int8(2), object(1)
memory usage: 1.9+ KB
None
   agent_id  target_id    action  start_frame  stop_frame
0         1          2  approach          964         991
1         1          2     sniff          992        1130
2         1          2     sniff         1155        1165
3         1          2     sniff         1229        1275
4         1          2     sniff         1287        1381


In [7]:
# check = Path("/kaggle/working/annotated_fixed/1009459450_fixed.parquet")
# check_df=pd.read_parquet(check)
# print(check_df.head())
# print(check_df.info())

In [8]:
from pathlib import Path
import pandas as pd
import numpy as np
from tqdm.notebook import tqdm  # Progress bar for notebooks

# Inputs: the merged files to repair and the original competition annotations.
broken_dir = Path("/kaggle/input/mabe-merged-data/kaggle/working/annotated")
anno_dir = Path("/kaggle/input/MABe-mouse-behavior-detection/train_annotation")

# Output: repaired files are written here; the folder is created on first run
# and re-running the cell is harmless if it already exists.
output_dir = Path("/kaggle/working/annotated_fixed")
output_dir.mkdir(parents=True, exist_ok=True)

def expand_annotations_vectorized(anno_df):
    """
    Expand start/stop frame intervals to per-frame rows.

    Each input row describes one action over the inclusive frame range
    [start_frame, stop_frame]; the output has one row per frame in that range.
    Keeps agent_id separate from mouse_id to avoid overwriting coordinates owner.

    Parameters
    ----------
    anno_df : pandas.DataFrame
        Must contain the columns 'agent_id', 'target_id', 'action',
        'start_frame' and 'stop_frame'. May be empty.

    Returns
    -------
    pandas.DataFrame
        Columns 'video_frame', 'agent_id', 'target_id', 'action', with rows in
        the same interval order as the input. An empty input yields an empty
        frame with those four columns.

    Raises
    ------
    ValueError
        If an interval has a negative length (stop_frame < start_frame - 1),
        because np.repeat rejects negative repeat counts.
    """
    # Work in int64 so `stop - start + 1` cannot overflow a narrow column dtype
    # such as int16.
    starts = anno_df['start_frame'].to_numpy(dtype=np.int64)
    stops = anno_df['stop_frame'].to_numpy(dtype=np.int64)
    lengths = stops - starts + 1

    # Position in the output at which each interval's first frame lands.
    first_offset = np.cumsum(lengths) - lengths

    # Repeating (start - offset) per interval and adding a running 0..total-1
    # counter yields start, start+1, ..., stop for every interval without a
    # Python loop. With zero rows every array is simply empty, so there is no
    # np.concatenate([]) failure on empty annotation tables.
    frames = np.repeat(starts - first_offset, lengths) + np.arange(lengths.sum())

    agent_ids = np.repeat(anno_df['agent_id'].values, lengths)
    target_ids = np.repeat(anno_df['target_id'].values, lengths)
    actions = np.repeat(anno_df['action'].values, lengths)

    return pd.DataFrame({
        'video_frame': frames,
        'agent_id': agent_ids,       # mouse performing the action
        'target_id': target_ids,     # target of the action
        'action': actions
    })

# Build a dictionary mapping video_id -> annotation file path (recursively)
anno_files = {f.stem: f for f in anno_dir.rglob("*.parquet")}

# Outcome tallies, so the final report reflects what actually happened
n_fixed = 0
n_already_fixed = 0
missing_annotations = []   # video ids with no annotation file to merge in
failed = []                # video ids whose processing raised

# Loop over all broken merged files with a progress bar (sorted for a stable order)
for broken_file in tqdm(sorted(broken_dir.glob("*.parquet")), desc="Fixing files"):
    vid_id = broken_file.stem

    # Skip if fixed file already exists
    out_path = output_dir / f"{vid_id}_fixed.parquet"
    if out_path.exists():
        n_already_fixed += 1
        continue

    # Skip if there is no annotation file for this video
    anno_file = anno_files.get(vid_id)
    if anno_file is None:
        missing_annotations.append(vid_id)
        continue

    try:
        # Load merged tracking data and annotation file
        merged_df = pd.read_parquet(broken_file)
        anno_df = pd.read_parquet(anno_file)

        # Expand annotations to per-frame rows
        anno_expanded = expand_annotations_vectorized(anno_df)

        # Merge annotations on video_frame only, preserving mouse_id
        fixed_df = merged_df.drop(columns=["action", "target_id", "agent_id"], errors="ignore").merge(
            anno_expanded, on="video_frame", how="left"
        )

        # Write to a temporary name and rename afterwards: an interrupted write
        # must not leave a partial file that the exists() check above would
        # treat as finished on the next run.
        tmp_path = out_path.with_suffix(".parquet.tmp")
        fixed_df.to_parquet(tmp_path, index=False)
        tmp_path.replace(out_path)
        n_fixed += 1

    except Exception as e:
        # Best effort: keep going with the remaining files, but remember the failure
        failed.append(vid_id)
        print(f"Error processing {vid_id}: {e}")

print(
    f"Fixed: {n_fixed} | already fixed: {n_already_fixed} | "
    f"no annotation file: {len(missing_annotations)} | failed: {len(failed)}"
)
print("🎉 All done! Check /kaggle/working/annotated_fixed for repaired files.")


Fixing files:   0%|          | 0/847 [00:00<?, ?it/s]

🎉 All done! Check /kaggle/working/annotated_fixed for repaired files.


In [9]:
# Spot-check one repaired file.
df = pd.read_parquet("/kaggle/input/fixed-merged-files/annotated_fixed/1009459450_fixed.parquet")
print(df.head())
# DataFrame.info() writes its report to stdout itself and returns None, so it is
# called directly; wrapping it in print() appends a stray "None" line.
df.info()


  lateral_left_y tail_middle_1_y hindpaw_right_x lateral_left_x body_center_x  \
0           None            None            None           None          None   
1           None            None            None           None          None   
2           None            None            None           None          None   
3           None            None            None           None          None   
4           None            None            None           None          None   

  headpiece_bottombackleft_x spine_1_x  hip_right_y tail_middle_2_x  \
0                       None      None   370.541107            None   
1                       None      None   369.221863            None   
2                       None      None   369.698364            None   
3                       None      None   363.496368            None   
4                       None      None   370.619690            None   

   ear_left_x  ... headpiece_bottomfrontright_y mouse_id  forepaw_right_x  \
0  285.13

In [10]:
from pathlib import Path
import pandas as pd
from tqdm.notebook import tqdm

# Directories
data_dir = Path("/kaggle/working/annotated_fixed")  # input fixed merged files
output_dir = Path("/kaggle/working/annotated_core")  # output cleaned files
output_dir.mkdir(exist_ok=True, parents=True)

files = sorted(data_dir.glob("*.parquet"))

# Columns that identify a row rather than hold body-part coordinates
id_columns = ['video_frame', 'mouse_id', 'agent_id', 'target_id', 'action']

# Step 1: Identify core body parts
# A body part is "core" when it has at least one non-null value in EVERY file.
# Every file is scanned, including ones already cleaned on a previous run:
# Step 2 rewrites all files, so an intersection taken over only part of the
# inputs would select the wrong columns for the rest.
core_columns = None

for f in tqdm(files, desc="Finding core body parts"):
    df = pd.read_parquet(f)
    tracked_cols = {c for c in df.columns if c not in id_columns and df[c].notna().any()}

    if core_columns is None:
        core_columns = tracked_cols
    else:
        core_columns &= tracked_cols

# With no input files core_columns is still None; sorted(None) would raise TypeError
core_columns = sorted(core_columns) if core_columns is not None else []
print(f"✅ Core tracked body parts ({len(core_columns)}): {core_columns}")

# Step 2: Save updated files with only core features
for f in tqdm(files, desc="Saving cleaned parquet files"):
    df = pd.read_parquet(f)
    df_core = df[core_columns + ['mouse_id', 'video_frame', 'agent_id', 'target_id', 'action']]

    out_path = output_dir / f.name
    df_core.to_parquet(out_path, index=False)

print(f"🎉 All done! Cleaned files are in: {output_dir}")


Finding core body parts:   0%|          | 0/847 [00:00<?, ?it/s]

✅ Core tracked body parts (6): ['ear_left_x', 'ear_left_y', 'ear_right_x', 'ear_right_y', 'tail_base_x', 'tail_base_y']


Saving cleaned parquet files:   0%|          | 0/847 [00:00<?, ?it/s]

🎉 All done! Cleaned files are in: /kaggle/working/annotated_core


In [11]:
from pathlib import Path
import pandas as pd

core_dir = Path("/kaggle/working/annotated_core")
files = list(core_dir.glob("*.parquet"))

# Record each file's column order as a tuple (hashable, so it can go in a set)
all_columns = [tuple(pd.read_parquet(path).columns) for path in files]

# Files with identical schemas collapse to a single entry
unique_column_sets = set(all_columns)

if len(unique_column_sets) != 1:
    # Schemas differ (or there were no files): list every distinct layout seen
    print(f"⚠️ Found {len(unique_column_sets)} different column sets across files.")
    for layout in unique_column_sets:
        print(layout)
else:
    print("✅ All files have consistent columns.")
    print("Columns:", list(unique_column_sets.pop()))


✅ All files have consistent columns.
Columns: ['ear_left_x', 'ear_left_y', 'ear_right_x', 'ear_right_y', 'tail_base_x', 'tail_base_y', 'mouse_id', 'video_frame', 'agent_id', 'target_id', 'action']


In [12]:
from pathlib import Path
import pandas as pd
from tqdm.notebook import tqdm

# Paths
tracking_dir = Path("/kaggle/input/mabe-merged-data/kaggle/working/tracking")
output_tracking_dir = Path("/kaggle/working/tracking_core")
output_tracking_dir.mkdir(exist_ok=True, parents=True)

tracking_files = sorted(tracking_dir.glob("*.parquet"))

# ID/annotation columns: never treated as body parts, kept in the output if present
exclude_cols = ['video_frame', 'mouse_id', 'agent_id', 'target_id', 'action']

# Step 1: Identify core body parts in tracking files
# A body part is "core" when it has at least one non-null value in EVERY file.
# Every file is scanned, including ones already cleaned on a previous run:
# Step 2 rewrites all files, so an intersection taken over only part of the
# inputs would select the wrong columns for the rest.
core_columns = None

for f in tqdm(tracking_files, desc="Finding core tracking body parts"):
    df = pd.read_parquet(f)

    tracked_cols = {c for c in df.columns if c not in exclude_cols and df[c].notna().any()}

    if core_columns is None:
        core_columns = tracked_cols
    else:
        core_columns &= tracked_cols  # intersection

# With no input files core_columns is still None; sorted(None) would raise TypeError
core_columns = sorted(core_columns) if core_columns is not None else []
print(f"✅ Core tracked body parts ({len(core_columns)}): {core_columns}")

# Step 2: Save cleaned tracking files with only core features
for f in tqdm(tracking_files, desc="Saving cleaned tracking files"):
    df = pd.read_parquet(f)

    # Keep only core body parts + essential IDs if present
    present_id_cols = [col for col in ['mouse_id', 'video_frame', 'agent_id', 'target_id', 'action']
                       if col in df.columns]
    keep_cols = core_columns + present_id_cols

    df_core = df[keep_cols]

    out_path = output_tracking_dir / f.name
    df_core.to_parquet(out_path, index=False)

print(f"🎉 All done! Cleaned tracking files are in: {output_tracking_dir}")


Finding core tracking body parts:   0%|          | 0/7942 [00:00<?, ?it/s]

✅ Core tracked body parts (10): ['body_center_x', 'body_center_y', 'ear_left_x', 'ear_left_y', 'ear_right_x', 'ear_right_y', 'nose_x', 'nose_y', 'tail_base_x', 'tail_base_y']


Saving cleaned tracking files:   0%|          | 0/7942 [00:00<?, ?it/s]

🎉 All done! Cleaned tracking files are in: /kaggle/working/tracking_core


In [13]:
from pathlib import Path
import pandas as pd

tracking_core_dir = Path("/kaggle/working/tracking_core")
files = sorted(tracking_core_dir.glob("*.parquet"))

# Columns that identify a row rather than hold body-part coordinates
id_columns = ['video_frame', 'mouse_id', 'agent_id', 'target_id', 'action']

# Single pass over the files: record each schema and count NaNs in that file's
# own coordinate columns, so every file is read only once.
all_columns = []
nan_counts = {}
for f in files:
    df = pd.read_parquet(f)
    all_columns.append(tuple(df.columns))
    coord_cols = [c for c in df.columns if c not in id_columns]
    nan_counts[f.name] = int(df[coord_cols].isna().sum().sum())

unique_column_sets = set(all_columns)

# Always bound, so the code below cannot hit a NameError when the schemas
# differ or the folder is empty. When schemas differ these reflect the first file.
column_list = list(all_columns[0]) if all_columns else []
core_columns = [c for c in column_list if c not in id_columns]

if not files:
    print(f"⚠️ No parquet files found in {tracking_core_dir}")
else:
    # Check column consistency
    if len(unique_column_sets) == 1:
        print("✅ All files have consistent columns.")
        print("Columns:", column_list)
    else:
        print(f"⚠️ Found {len(unique_column_sets)} different column sets across files.")
        for cols in unique_column_sets:
            print(cols)

    # Check for NaNs in core body parts
    files_with_nans = {k: v for k, v in nan_counts.items() if v > 0}

    if len(files_with_nans) == 0:
        print("✅ No NaNs found in core body parts across all files.")
    else:
        print(f"⚠️ Found NaNs in the following files:")
        for fname, count in files_with_nans.items():
            print(f"{fname}: {count} NaNs")


✅ All files have consistent columns.
Columns: ['body_center_x', 'body_center_y', 'ear_left_x', 'ear_left_y', 'ear_right_x', 'ear_right_y', 'nose_x', 'nose_y', 'tail_base_x', 'tail_base_y', 'mouse_id', 'video_frame', 'agent_id', 'target_id', 'action']
⚠️ Found NaNs in the following files:
1375833299.parquet: 15268 NaNs
