## Add new behavior sessions to metadata sheet from data folders

* Notebook to automatically add basepath, basename and video name to metadata.csv for SNLab behavior sessions


In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import glob

### Functions

In [14]:

### Find all folders in dataset_path. Add all basepaths to dataframe, add video details if available
def populate_metadata_from_datafolders(
    dataset_path: list, metadata_path: str, ignore_tag: str = "to_split", save_metadata: bool = False,
    video_pattern: str = "*.avi",
    ):
    """Add session folders found on disk (and their videos) to a metadata table.

    Session folders are expected two levels below each dataset root
    (``<root>/<subject>/<session>``). For every session folder:

    * folder not yet in the metadata: one row is appended per video file, or a
      single row with ``vidname == "MISSING"`` when no video is present;
    * folder already in the metadata: video names that are already recorded are
      left alone, rows whose ``vidname`` does not match any video on disk
      (NaN, ``"MISSING"``, stale values) are filled with the unrecorded video
      names, and any remaining videos are appended as new rows. This keeps the
      function idempotent when it is re-run on the same folders.

    Parameters
    ----------
    dataset_path : list of str (a single str / path-like is also accepted)
        Dataset root folder(s) to scan.
    metadata_path : str
        CSV file holding the metadata; it must contain a ``basepath`` column.
    ignore_tag : str, default "to_split"
        Folders whose path contains this substring are skipped. An empty
        string disables the filter.
    save_metadata : bool, default False
        When True, the updated table is written back to ``metadata_path``.
    video_pattern : str, default "*.avi"
        Glob pattern (relative to the session folder) identifying video files.

    Returns
    -------
    pandas.DataFrame
        The updated metadata table.
    """
    # A bare string would otherwise be iterated character by character.
    if isinstance(dataset_path, (str, os.PathLike)):
        dataset_path = [dataset_path]

    metadata_df = pd.read_csv(metadata_path)

    # Normalize the stored basepaths so they compare equal to the normalized
    # folder paths found on disk; non-string cells (NaN) are left untouched.
    metadata_df["basepath"] = [
        os.path.normpath(p) if isinstance(p, str) else p
        for p in metadata_df["basepath"].values
    ]

    # Make sure video names can be written as strings regardless of the dtype
    # pandas inferred for the column (e.g. float64 when it is entirely empty).
    if "vidname" in metadata_df.columns:
        metadata_df["vidname"] = metadata_df["vidname"].astype(object)
    else:
        metadata_df["vidname"] = pd.Series(index=metadata_df.index, dtype=object)

    # Collect <root>/<subject>/<session> entries. os.path.join keeps the pattern
    # portable and glob.escape protects roots containing glob metacharacters.
    datafolders = []
    for datapath in dataset_path:
        pattern = os.path.join(glob.escape(os.fspath(datapath)), "*", "*")
        datafolders.extend(sorted(glob.glob(pattern)))

    # Keep directories only, drop ignored folders, normalize, and de-duplicate
    # (overlapping roots could otherwise list the same folder twice).
    datafolders = [
        os.path.normpath(p)
        for p in datafolders
        if os.path.isdir(p) and not (ignore_tag and ignore_tag in p)
    ]
    datafolders = list(dict.fromkeys(datafolders))

    known_basepaths = set(metadata_df["basepath"].dropna())
    new_rows = []  # appended in one concat at the end instead of one per row

    for basepath in datafolders:
        basename = os.path.basename(basepath)
        subid = basename.split("_")[0]

        # Video names without extension, sorted so the result is reproducible.
        vid_names = sorted(
            os.path.splitext(os.path.basename(vid))[0]
            for vid in glob.glob(os.path.join(glob.escape(basepath), video_pattern))
        )

        if basepath not in known_basepaths:
            if vid_names:
                new_rows.extend(
                    _new_metadata_row(subid, name, basepath, basename)
                    for name in vid_names
                )
            else:
                # still register the folder so the missing video is visible
                new_rows.append(
                    _new_metadata_row(subid, "MISSING", basepath, basename)
                )
                print(f"No video files found in {basepath}")
            continue

        if not vid_names:
            continue

        # Folder already listed: reconcile its rows with the videos on disk.
        current = metadata_df.loc[metadata_df["basepath"] == basepath, "vidname"]
        on_disk = set(vid_names)
        recorded = {value for value in current if value in on_disk}
        pending = [name for name in vid_names if name not in recorded]
        stale_rows = [i for i, value in current.items() if value not in on_disk]

        for row_label, name in zip(stale_rows, pending):
            metadata_df.at[row_label, "vidname"] = name
        for name in pending[len(stale_rows):]:
            new_rows.append(_new_metadata_row(subid, name, basepath, basename))

    if new_rows:
        metadata_df = pd.concat(
            [metadata_df, pd.DataFrame(new_rows)], ignore_index=True
        )

    if save_metadata:
        metadata_df.to_csv(metadata_path, index=False)

    return metadata_df


def _new_metadata_row(subid, vidname, basepath, basename):
    """Return the column values recorded for a newly added metadata row."""
    return {
        "subid": subid,
        "vidname": vidname,
        "basepath": basepath,
        "basename": basename,
    }

        

### Set paths to dataset and Metadata 


In [15]:
# Dataset root(s) to scan, given as a list because the function iterates over it,
# and the metadata CSV that is read (and optionally overwritten) by the function.
dataset_path = [r"Y:\Becka Irwin\IVD\2025 Practice Open Field APP KI Mice\data"]
metadata_path = r"Y:\Becka Irwin\IVD\2025 Practice Open Field APP KI Mice\metadata.csv"

In [16]:
# Bind the returned DataFrame to its own name so `metadata_path` stays the CSV
# path string and this cell can be re-run without re-defining the paths.
metadata_df = populate_metadata_from_datafolders(dataset_path, metadata_path, save_metadata=False)

### app_ps1_ephys / alz_stim

In [41]:
# Dataset roots for the two ephys projects; both are scanned in one call.
data_dir = [
    r"Y:\laura_berkowitz\app_ps1_ephys\data",
    r"Y:\laura_berkowitz\alz_stim\data",
]

# Metadata CSV shared by both projects.
metadata_path = r"Y:\laura_berkowitz\behavior_metadata.csv"

In [46]:
# NOTE(review): this rebinds `metadata_path` from the CSV path string to the
# returned DataFrame. Re-running this cell without first re-defining
# `metadata_path` would pass a DataFrame where a file path is expected.
metadata_path = populate_metadata_from_datafolders(data_dir, metadata_path, save_metadata=False)

No video files found in Y:\laura_berkowitz\app_ps1_ephys\data\hpc08\hpc08_day01_220424_120935
No video files found in Y:\laura_berkowitz\app_ps1_ephys\data\hpc13\hpc13_day47_240904_093125
No video files found in Y:\laura_berkowitz\app_ps1_ephys\data\hpc13\hpc13_day56_241016_144140
No video files found in Y:\laura_berkowitz\app_ps1_ephys\data\hpc17\hpc17_241022_100026
No video files found in Y:\laura_berkowitz\alz_stim\data\beta\beta_day01_230626_094432
No video files found in Y:\laura_berkowitz\alz_stim\data\dyno\dyno_test_231106_112841
No video files found in Y:\laura_berkowitz\alz_stim\data\hpstim01\hpstim01_day00_250217_110019
No video files found in Y:\laura_berkowitz\alz_stim\data\hpstim01\hpstim01_day01_250218_122253
No video files found in Y:\laura_berkowitz\alz_stim\data\hpstim01\hpstim01_day02_250219_095818
No video files found in Y:\laura_berkowitz\alz_stim\data\hpstim01\hpstim01_day03_250220_130550
No video files found in Y:\laura_berkowitz\alz_stim\data\hpstim01\hpstim01_da

In [48]:
# `metadata_path` holds the DataFrame returned by populate_metadata_from_datafolders
# at this point (not a path); write it to a separate test file, without the index.
metadata_path.to_csv(r"Y:\laura_berkowitz\metadata_test.csv", index=False)

### Testing below 

In [None]:
# Scratch setup: here `dataset_path` is a single path string (not a list),
# because the cells below concatenate a glob pattern onto it directly.
dataset_path = r"Y:\Becka Irwin\IVD\2025 Practice Open Field APP KI Mice\data"
metadata_path = r"Y:\Becka Irwin\IVD\2025 Practice Open Field APP KI Mice\metadata.csv"

In [4]:
metadata_df = pd.read_csv(metadata_path)

# normalize the basepath in metadata_df so it compares equal to the
# normalized folder paths collected below
metadata_df['basepath'] = [os.path.normpath(x) for x in metadata_df['basepath'].values]

# Raw string: '\*' is not a valid escape sequence, so the non-raw literal
# triggers a warning on recent Python versions. The pattern text is unchanged.
datafolders = glob.glob(dataset_path + r'**\*\**\*', recursive=True)

In [5]:
# Keep only real directories that are not inside a to_split folder,
# normalizing each surviving path in the same pass.
datafolders = [
    os.path.normpath(path)
    for path in datafolders
    if os.path.isdir(path) and 'to_split' not in path
]
datafolders

['Y:\\Becka Irwin\\IVD\\2025 Practice Open Field APP KI Mice\\data\\164N\\164N_open_field_day01',
 'Y:\\Becka Irwin\\IVD\\2025 Practice Open Field APP KI Mice\\data\\164N\\164N_open_field_day02',
 'Y:\\Becka Irwin\\IVD\\2025 Practice Open Field APP KI Mice\\data\\164N\\164N_open_field_day03',
 'Y:\\Becka Irwin\\IVD\\2025 Practice Open Field APP KI Mice\\data\\165L\\165L_open_field_day01',
 'Y:\\Becka Irwin\\IVD\\2025 Practice Open Field APP KI Mice\\data\\165L\\165L_open_field_day02',
 'Y:\\Becka Irwin\\IVD\\2025 Practice Open Field APP KI Mice\\data\\165L\\165L_open_field_day03',
 'Y:\\Becka Irwin\\IVD\\2025 Practice Open Field APP KI Mice\\data\\166R\\166R_open_field_day01',
 'Y:\\Becka Irwin\\IVD\\2025 Practice Open Field APP KI Mice\\data\\166R\\166R_open_field_day02',
 'Y:\\Becka Irwin\\IVD\\2025 Practice Open Field APP KI Mice\\data\\166R\\166R_open_field_day03',
 'Y:\\Becka Irwin\\IVD\\2025 Practice Open Field APP KI Mice\\data\\172N\\172N_open_field_day01',
 'Y:\\Becka Irwin\\I

In [None]:
# Same scratch paths as above, repeated so this part of the notebook can be run on its own.
dataset_path = r"Y:\Becka Irwin\IVD\2025 Practice Open Field APP KI Mice\data"
metadata_path = r"Y:\Becka Irwin\IVD\2025 Practice Open Field APP KI Mice\metadata.csv"

#### Iterate through each folder. If a video exists, add each video to the metadata dataframe, including basepath, subid, basename and video name.

In [None]:
for basepath in datafolders:
    # check if the folder contains a video file
    vid_files = glob.glob(os.path.join(basepath, '*.avi'))
    basename = os.path.basename(basepath)
    subid = basename.split('_')[0]

    # whether this folder is already listed in the metadata dataframe
    basepath_known = basepath in metadata_df['basepath'].values

    # the folder contains video files & the basepath is not yet in the metadata dataframe
    if vid_files and not basepath_known:
        # add one row per video file
        for vid in vid_files:
            metadata_df = pd.concat(
                [
                    metadata_df,
                    pd.DataFrame(
                        {
                            'subid': subid,
                            'vidname': os.path.basename(vid).split('.')[0],
                            'basepath': basepath,
                            'basename': basename,
                        },
                        index=[0],
                    ),
                ],
                ignore_index=True,
            )

    # the folder contains video files & the basepath is already in the metadata dataframe
    elif vid_files and basepath_known:
        # rows that belong to this basepath
        idx = metadata_df[metadata_df['basepath'] == basepath].index
        # NOTE: every iteration overwrites the same rows, so with several
        # videos only the last file name is kept.
        for vid in vid_files:
            metadata_df.loc[idx, "vidname"] = os.path.basename(vid).split('.')[0]

    # no video file: still register the folder, flagged as MISSING
    elif not basepath_known:
        # index=[0] gives exactly one new row. The previous `index=idx` relied on
        # a variable that is undefined or left over from an earlier iteration.
        metadata_df = pd.concat(
            [
                metadata_df,
                pd.DataFrame(
                    {
                        'subid': subid,
                        'vidname': 'MISSING',
                        'basepath': basepath,
                        'basename': basename,
                    },
                    index=[0],
                ),
            ],
            ignore_index=True,
        )
        print(f'No video files found in {basepath}')
        
        

In [9]:
# display the metadata table to inspect the current rows
metadata_df

Unnamed: 0,subid,basepath,genotype,age,session_date,dob,vidname,basename,exposure,pixel_distance,...,trials_ID,trial_start_1,trial_stop_1,trial_start_2,trial_stop_2,trial_start_3,trial_stop_3,trial_start_4,trial_stop_4,notes
0,164N,Y:\Becka Irwin\IVD\2025 Practice Open Field AP...,APP-KI,,3/25/2025,6/20/2024,Y:\Becka Irwin\IVD\2025 Practice Open Field AP...,164N_open_field_day01,,,...,open_field,474.0,5107.0,,,,,,,
1,164N,Y:\Becka Irwin\IVD\2025 Practice Open Field AP...,APP-KI,,,6/20/2024,MISSING,164N_open_field_day02,,,...,open_field,,,,,,,,,
2,164N,Y:\Becka Irwin\IVD\2025 Practice Open Field AP...,APP-KI,,,6/20/2024,MISSING,164N_open_field_day03,,,...,open_field,,,,,,,,,
3,165L,Y:\Becka Irwin\IVD\2025 Practice Open Field AP...,APP-KI,,3/25/2025,6/20/2024,Y:\Becka Irwin\IVD\2025 Practice Open Field AP...,165L_open_field_day01,,,...,open_field,824.0,5582.0,,,,,,,
4,165L,Y:\Becka Irwin\IVD\2025 Practice Open Field AP...,APP-KI,,,6/20/2024,MISSING,165L_open_field_day02,,,...,open_field,,,,,,,,,
5,165L,Y:\Becka Irwin\IVD\2025 Practice Open Field AP...,APP-KI,,,6/20/2024,MISSING,165L_open_field_day03,,,...,open_field,,,,,,,,,
6,166R,Y:\Becka Irwin\IVD\2025 Practice Open Field AP...,APP-KI,,3/25/2025,6/20/2024,Y:\Becka Irwin\IVD\2025 Practice Open Field AP...,166R_open_field_day01,,,...,open_field,649.0,5268.0,,,,,,,
7,166R,Y:\Becka Irwin\IVD\2025 Practice Open Field AP...,APP-KI,,,6/20/2024,MISSING,166R_open_field_day02,,,...,open_field,,,,,,,,,
8,166R,Y:\Becka Irwin\IVD\2025 Practice Open Field AP...,APP-KI,,,6/20/2024,MISSING,166R_open_field_day03,,,...,open_field,,,,,,,,,
9,172N,Y:\Becka Irwin\IVD\2025 Practice Open Field AP...,APP-KI,,3/25/2025,7/1/2024,Y:\Becka Irwin\IVD\2025 Practice Open Field AP...,172N_open_field_day01,,,...,open_field,679.0,5469.0,,,,,,,


#### Save the updated metadata dataframe back to folder

In [10]:
## Save the metadata to a csv file and don't save the index.
## This writes to `metadata_path`, i.e. it overwrites the CSV that was loaded above.
metadata_df.to_csv(metadata_path, index=False)