## Add new behavior sessions to metadata sheet from data folders

* Notebook to automatically add basepath, basename and video name to metadata.csv for SNLab behavior sessions


In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import glob
from typing import List

### Functions

In [5]:
def _find_session_folders(dataset_roots, ignore_tag):
    """Return the sorted, de-duplicated, normalized session folders under the given roots."""
    matches = []
    for root in dataset_roots:
        # <root>/*/**/* matches every entry located two or more levels below the
        # root. Joining with os.path.join keeps the pattern portable and stops
        # sibling folders that merely share the root's name prefix from matching;
        # glob.escape protects roots whose names contain glob characters.
        pattern = os.path.join(glob.escape(os.fspath(root)), "*", "**", "*")
        matches.extend(glob.glob(pattern, recursive=True))

    folders = [os.path.normpath(path) for path in matches if os.path.isdir(path)]
    if ignore_tag:
        # An empty/None tag disables the filter instead of discarding everything.
        folders = [path for path in folders if ignore_tag not in path]
    return sorted(set(folders))


def _sort_key_as_text(column):
    """Sort key that compares values by their string form and leaves missing values as-is."""
    return column.map(lambda value: value if pd.isna(value) else str(value))


def populate_metadata_from_datafolders(
    dataset_path: List[str],
    metadata_path: str,
    ignore_tag: str = "to_split",
    save_metadata: bool = True,
    video_extensions: List[str] = [".avi", ".mp4", ".mov"]
) -> pd.DataFrame:
    """
    Populates or creates a metadata CSV following the specified template structure.

    Every directory located two or more levels below a root in ``dataset_path``
    is treated as a session folder. Folders whose normalized path is already
    listed in the ``basepath`` column are skipped. Each remaining folder
    contributes one row per video file found directly inside it, or a single
    row with ``vidname == "MISSING"`` when it holds no video.

    Parameters:
    -----------
    dataset_path : List[str]
        List of root paths to search for data folders. A single path given as
        a plain string (or path-like object) is also accepted.
    metadata_path : str
        Path to metadata CSV (will be created if doesn't exist)
    ignore_tag : str, optional
        Folders whose path contains this string will be ignored. An empty
        string or None disables the filter.
    save_metadata : bool, optional
        Whether to save the updated metadata
    video_extensions : List[str], optional
        Video file extensions to search for (a single extension string is
        also accepted). The default list is only read, never mutated.

    Returns:
    --------
    pd.DataFrame
        Updated metadata dataframe following template structure. New rows only
        carry subid, basepath, basename and vidname; all other columns are
        left missing.
    """

    # Define the complete template columns
    template_columns = [
        "subid", "basepath", "genotype", "age", "session_date", "dob",
        "vidname", "basename", "pixel_distance",
        "pixel_dist_reference", "maze_length_cm", "maze_width_cm",
        "maze_type", "maze_color", "cue_position", "task_exposure",
        "task_name", "condition", "treatment", "treatment_day",
        "objects", "paradigm", "trials_ID", "trial_start_1",
        "trial_stop_1", "trial_start_2", "trial_stop_2",
        "trial_start_3", "trial_stop_3", "trial_start_4",
        "trial_stop_4", "notes"
    ]

    # A bare string would otherwise be iterated character by character.
    if isinstance(dataset_path, (str, os.PathLike)):
        dataset_path = [dataset_path]
    if isinstance(video_extensions, str):
        video_extensions = [video_extensions]

    # Initialize metadata dataframe if file doesn't exist
    if not os.path.exists(metadata_path):
        print(f"Initializing new metadata file at {metadata_path}")
        metadata_df = pd.DataFrame(columns=template_columns)
    else:
        metadata_df = pd.read_csv(metadata_path)
        # Ensure all template columns exist
        for col in template_columns:
            if col not in metadata_df.columns:
                metadata_df[col] = np.nan

    # Normalize stored paths so they compare equal to the normalized folder
    # paths found on disk; non-string (missing) values are left untouched.
    metadata_df["basepath"] = metadata_df["basepath"].map(
        lambda path: os.path.normpath(path) if isinstance(path, str) else path
    )
    known_basepaths = set(metadata_df["basepath"].dropna())

    # Build one record per video for every folder not yet listed
    new_entries = []
    for basepath in _find_session_folders(dataset_path, ignore_tag):
        if basepath in known_basepaths:
            continue

        basename = os.path.basename(basepath)
        subid = basename.split("_")[0]

        # Videos located directly inside the session folder
        vid_files = []
        for ext in video_extensions:
            vid_files.extend(glob.glob(os.path.join(glob.escape(basepath), f"*{ext}")))
        vidnames = [
            os.path.splitext(os.path.basename(vid))[0]
            for vid in sorted(set(vid_files))
        ]

        if not vidnames:
            vidnames = ["MISSING"]
            print(f"No video files found in {basepath}")

        new_entries.extend(
            {"subid": subid, "basepath": basepath, "basename": basename, "vidname": vidname}
            for vidname in vidnames
        )

    # Add new entries to metadata
    if new_entries:
        # Only the populated columns are built here: concatenating frames padded
        # with all-None placeholder columns triggers pandas' FutureWarning about
        # empty or all-NA entries. The remaining columns come out missing anyway.
        new_df = pd.DataFrame(new_entries)
        if metadata_df.empty:
            metadata_df = new_df.reindex(columns=metadata_df.columns)
        else:
            metadata_df = pd.concat([metadata_df, new_df], ignore_index=True)

        # Clean up duplicates and sort. IDs read back from CSV may be numeric
        # while new IDs are strings, so sorting compares their string forms.
        metadata_df = metadata_df.drop_duplicates(
            subset=["basepath", "vidname"],
            keep="last"
        ).sort_values(by=["subid", "basepath"], key=_sort_key_as_text)

    # Save if requested
    if save_metadata:
        parent_dir = os.path.dirname(metadata_path)
        if parent_dir:
            # os.makedirs("") raises, so skip it for a bare file name
            os.makedirs(parent_dir, exist_ok=True)
        metadata_df.to_csv(metadata_path, index=False)
        print(f"Metadata saved to {metadata_path} with {len(metadata_df)} entries")

    return metadata_df

### Set paths to dataset and Metadata 


In [14]:
# Root folder(s) to scan for session folders, and the metadata CSV to create or update.
dataset_path = [r"Y:\laura_berkowitz\behavior_validation\appps1_cheeseboard\data\cohort_2"]
metadata_path = r"Y:\laura_berkowitz\behavior_validation\appps1_cheeseboard\metadata3.csv"

In [7]:
# Echo the configured dataset roots as a sanity check.
dataset_path

['Y:\\laura_berkowitz\\behavior_validation\\appps1_cheeseboard\\data\\cohort_2']

In [15]:
# Scan dataset_path for session folders and write the updated table to metadata_path.
metadata_df = populate_metadata_from_datafolders(dataset_path, metadata_path, save_metadata=True)

Metadata saved to Y:\laura_berkowitz\behavior_validation\appps1_cheeseboard\metadata3.csv with 96 entries


  metadata_df = pd.concat([metadata_df, new_df], ignore_index=True)


In [12]:
# Display the metadata table.
metadata_df

Unnamed: 0,subid,basepath,sex,genotype,age,include_bool,exclude_reason,cohort,session_date,dob,...,trial_stop_4,trial_start_5,trial_stop_5,notes,cue_position,task_exposure,treatment_day,objects,paradigm,trials_ID
0,4441,Y:\laura_berkowitz\behavior_validation\appps1_...,F,WT,59,,,2.0,,6/28/2024,...,,,,,,,,,,
1,4441,Y:\laura_berkowitz\behavior_validation\appps1_...,F,WT,59,,,2.0,,6/28/2024,...,,,,,,,,,,
2,4441,Y:\laura_berkowitz\behavior_validation\appps1_...,F,WT,59,,,2.0,8/11/2025,6/28/2024,...,,,,,,,,,,
3,4441,Y:\laura_berkowitz\behavior_validation\appps1_...,F,WT,59,,,2.0,8/12/2025,6/28/2024,...,,,,,,,,,,
4,4441,Y:\laura_berkowitz\behavior_validation\appps1_...,F,WT,59,,,2.0,8/13/2025,6/28/2024,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
91,4487,Y:\laura_berkowitz\behavior_validation\appps1_...,,,,,,,,,...,,,,,,,,,,
92,4489,Y:\laura_berkowitz\behavior_validation\appps1_...,,,,,,,,,...,,,,,,,,,,
93,4489,Y:\laura_berkowitz\behavior_validation\appps1_...,,,,,,,,,...,,,,,,,,,,
94,4489,Y:\laura_berkowitz\behavior_validation\appps1_...,,,,,,,,,...,,,,,,,,,,


### appps1_ephys / alz_stim

In [41]:
# Two dataset roots are scanned together and share a single metadata CSV.
data_dir = [
    r"Y:\laura_berkowitz\app_ps1_ephys\data",
    r"Y:\laura_berkowitz\alz_stim\data",
]

metadata_path = r"Y:\laura_berkowitz\behavior_metadata.csv"

In [46]:
# Dry run: save_metadata=False, so this call does not write the CSV.
# NOTE(review): the returned DataFrame is bound to `metadata_path`, replacing the
# CSV path string set in the previous cell. The save cell that follows calls
# `metadata_path.to_csv(...)` and depends on this rebinding; re-running this cell
# without re-running the path cell first passes a DataFrame as the path argument.
metadata_path = populate_metadata_from_datafolders(data_dir, metadata_path, save_metadata=False)

No video files found in Y:\laura_berkowitz\app_ps1_ephys\data\hpc08\hpc08_day01_220424_120935
No video files found in Y:\laura_berkowitz\app_ps1_ephys\data\hpc13\hpc13_day47_240904_093125
No video files found in Y:\laura_berkowitz\app_ps1_ephys\data\hpc13\hpc13_day56_241016_144140
No video files found in Y:\laura_berkowitz\app_ps1_ephys\data\hpc17\hpc17_241022_100026
No video files found in Y:\laura_berkowitz\alz_stim\data\beta\beta_day01_230626_094432
No video files found in Y:\laura_berkowitz\alz_stim\data\dyno\dyno_test_231106_112841
No video files found in Y:\laura_berkowitz\alz_stim\data\hpstim01\hpstim01_day00_250217_110019
No video files found in Y:\laura_berkowitz\alz_stim\data\hpstim01\hpstim01_day01_250218_122253
No video files found in Y:\laura_berkowitz\alz_stim\data\hpstim01\hpstim01_day02_250219_095818
No video files found in Y:\laura_berkowitz\alz_stim\data\hpstim01\hpstim01_day03_250220_130550
No video files found in Y:\laura_berkowitz\alz_stim\data\hpstim01\hpstim01_da

In [48]:
# NOTE(review): `metadata_path` is the DataFrame returned by the previous cell here,
# not a path string. The table is written to a separate test file, without the index.
metadata_path.to_csv(r"Y:\laura_berkowitz\metadata_test.csv", index=False)

### Testing below 

In [None]:
# dataset_path is a single string here (not a list): the cells below build the
# glob pattern directly from it.
dataset_path = r"Y:\Becka Irwin\IVD\2025 Practice Open Field APP KI Mice\data"
metadata_path = r"Y:\Becka Irwin\IVD\2025 Practice Open Field APP KI Mice\metadata.csv"

In [4]:
metadata_df = pd.read_csv(metadata_path)

# Normalize the stored basepaths so they compare equal to the normalized folder
# paths; missing values are left untouched because os.path.normpath rejects NaN.
metadata_df['basepath'] = metadata_df['basepath'].map(
    lambda path: os.path.normpath(path) if isinstance(path, str) else path
)

# Every entry located two or more levels below the data root. os.path.join keeps
# the pattern free of invalid backslash escapes and stops sibling folders that
# merely share the root's name prefix (e.g. "data_old") from being scanned.
datafolders = glob.glob(
    os.path.join(glob.escape(dataset_path), '*', '**', '*'),
    recursive=True,
)

In [5]:
# remove paths that are not folders
datafolders = [x for x in datafolders if os.path.isdir(x)]

# remove folders within to_split folder 
datafolders = [os.path.normpath(x) for x in datafolders if 'to_split' not in x]
datafolders

['Y:\\Becka Irwin\\IVD\\2025 Practice Open Field APP KI Mice\\data\\164N\\164N_open_field_day01',
 'Y:\\Becka Irwin\\IVD\\2025 Practice Open Field APP KI Mice\\data\\164N\\164N_open_field_day02',
 'Y:\\Becka Irwin\\IVD\\2025 Practice Open Field APP KI Mice\\data\\164N\\164N_open_field_day03',
 'Y:\\Becka Irwin\\IVD\\2025 Practice Open Field APP KI Mice\\data\\165L\\165L_open_field_day01',
 'Y:\\Becka Irwin\\IVD\\2025 Practice Open Field APP KI Mice\\data\\165L\\165L_open_field_day02',
 'Y:\\Becka Irwin\\IVD\\2025 Practice Open Field APP KI Mice\\data\\165L\\165L_open_field_day03',
 'Y:\\Becka Irwin\\IVD\\2025 Practice Open Field APP KI Mice\\data\\166R\\166R_open_field_day01',
 'Y:\\Becka Irwin\\IVD\\2025 Practice Open Field APP KI Mice\\data\\166R\\166R_open_field_day02',
 'Y:\\Becka Irwin\\IVD\\2025 Practice Open Field APP KI Mice\\data\\166R\\166R_open_field_day03',
 'Y:\\Becka Irwin\\IVD\\2025 Practice Open Field APP KI Mice\\data\\172N\\172N_open_field_day01',
 'Y:\\Becka Irwin\\I

In [None]:
# Same values as the path cell at the start of this testing section.
dataset_path = r"Y:\Becka Irwin\IVD\2025 Practice Open Field APP KI Mice\data"
metadata_path = r"Y:\Becka Irwin\IVD\2025 Practice Open Field APP KI Mice\metadata.csv"

#### Iterate through each folder. If a video exists, add each video to the metadata dataframe, including basepath, subid, basename and video name.

In [None]:
# Register every session folder in metadata_df using the .avi videos it contains:
#   - folder not yet listed, has videos -> append one row per video
#   - folder already listed, has videos -> overwrite vidname on its existing rows
#   - folder not yet listed, no videos  -> append one row with vidname 'MISSING'
#   - folder already listed, no videos  -> leave untouched
known_basepaths = set(metadata_df['basepath'].dropna())
new_rows = []

for basepath in dict.fromkeys(datafolders):  # drop repeated folders, keep order
    basename = os.path.basename(basepath)
    subid = basename.split('_')[0]

    # Video names without extension; splitext keeps dots inside the file name.
    vid_files = sorted(glob.glob(os.path.join(glob.escape(basepath), '*.avi')))
    vidnames = [os.path.splitext(os.path.basename(vid))[0] for vid in vid_files]
    already_listed = basepath in known_basepaths

    if vidnames and not already_listed:
        new_rows.extend(
            {'subid': subid, 'vidname': vidname, 'basepath': basepath, 'basename': basename}
            for vidname in vidnames
        )
    elif vidnames:
        # Assigning each video in turn to the same rows leaves only the last
        # name, so it is assigned once.
        metadata_df.loc[metadata_df['basepath'] == basepath, 'vidname'] = vidnames[-1]
    elif not already_listed:
        # Exactly one placeholder row per folder without videos.
        new_rows.append(
            {'subid': subid, 'vidname': 'MISSING', 'basepath': basepath, 'basename': basename}
        )
        print(f'No video files found in {basepath}')

# Append all new rows in one concat instead of growing the frame inside the loop.
if new_rows:
    metadata_df = pd.concat([metadata_df, pd.DataFrame(new_rows)], ignore_index=True)
        
        

In [9]:
# Display the metadata table after the loop above.
metadata_df

Unnamed: 0,subid,basepath,genotype,age,session_date,dob,vidname,basename,exposure,pixel_distance,...,trials_ID,trial_start_1,trial_stop_1,trial_start_2,trial_stop_2,trial_start_3,trial_stop_3,trial_start_4,trial_stop_4,notes
0,164N,Y:\Becka Irwin\IVD\2025 Practice Open Field AP...,APP-KI,,3/25/2025,6/20/2024,Y:\Becka Irwin\IVD\2025 Practice Open Field AP...,164N_open_field_day01,,,...,open_field,474.0,5107.0,,,,,,,
1,164N,Y:\Becka Irwin\IVD\2025 Practice Open Field AP...,APP-KI,,,6/20/2024,MISSING,164N_open_field_day02,,,...,open_field,,,,,,,,,
2,164N,Y:\Becka Irwin\IVD\2025 Practice Open Field AP...,APP-KI,,,6/20/2024,MISSING,164N_open_field_day03,,,...,open_field,,,,,,,,,
3,165L,Y:\Becka Irwin\IVD\2025 Practice Open Field AP...,APP-KI,,3/25/2025,6/20/2024,Y:\Becka Irwin\IVD\2025 Practice Open Field AP...,165L_open_field_day01,,,...,open_field,824.0,5582.0,,,,,,,
4,165L,Y:\Becka Irwin\IVD\2025 Practice Open Field AP...,APP-KI,,,6/20/2024,MISSING,165L_open_field_day02,,,...,open_field,,,,,,,,,
5,165L,Y:\Becka Irwin\IVD\2025 Practice Open Field AP...,APP-KI,,,6/20/2024,MISSING,165L_open_field_day03,,,...,open_field,,,,,,,,,
6,166R,Y:\Becka Irwin\IVD\2025 Practice Open Field AP...,APP-KI,,3/25/2025,6/20/2024,Y:\Becka Irwin\IVD\2025 Practice Open Field AP...,166R_open_field_day01,,,...,open_field,649.0,5268.0,,,,,,,
7,166R,Y:\Becka Irwin\IVD\2025 Practice Open Field AP...,APP-KI,,,6/20/2024,MISSING,166R_open_field_day02,,,...,open_field,,,,,,,,,
8,166R,Y:\Becka Irwin\IVD\2025 Practice Open Field AP...,APP-KI,,,6/20/2024,MISSING,166R_open_field_day03,,,...,open_field,,,,,,,,,
9,172N,Y:\Becka Irwin\IVD\2025 Practice Open Field AP...,APP-KI,,3/25/2025,7/1/2024,Y:\Becka Irwin\IVD\2025 Practice Open Field AP...,172N_open_field_day01,,,...,open_field,679.0,5469.0,,,,,,,


#### Save the updated metadata dataframe back to folder

In [10]:
# Write the updated table back to metadata_path (the CSV it was read from);
# index=False keeps the row index out of the file.
metadata_df.to_csv(metadata_path, index=False)