In [1]:
import os
import pandas as pd
import numpy as np
import pickle as pkl
from pathlib import Path
import re

# Directory holding the per-experiment event spreadsheets (.xlsx).
# NOTE: the path must be absolute; an earlier relative variant
# ("blue_npadillacoreano/npadillacoreano/share/...") did not resolve.
spreadsheets_dir = Path("/blue/npadillacoreano/share/reward_comp_extention/Phy_rce2_rce3/phy_curation/eventSpreadsheets")

# Directory holding the per-subject merged.rec recordings.
m_recs = Path("/blue/npadillacoreano/share/reward_comp_extention/Phy_rce2_rce3/phy_curation/megadataset")

# Recordings that are always included in the search list in addition to
# whatever the glob below finds, expressed as paths inside m_recs.
extra_recs = [
    m_recs / rec_name
    for rec_name in (
        "20240322_160946_alone_comp_subj_4-4_t5b5_merged.rec",
        "20240323_122227_alone_comp_subj_5-3_t5b5_merged.rec",
        "20240323_165815_alone_comp_subj_4-4_t6b6_merged.rec",
        "20240318_170933_long_comp_subj_4-4_t5b5_merged.rec",
        "20240319_160457_long_comp_subj_4-4_t6b6_merged.rec",
        "20240320_114629_long_comp_subj_5-4_t5b5_merged.rec",
        "20240321_114851_long_comp_subj_5-2_t6b6_merged.rec",
    )
]

# Full search list: every *.rec found on disk, followed by the hand-added extras.
merged_recs = [*m_recs.glob("*.rec"), *extra_recs]

# Condition substrings that are dropped for the rce2 "standard_comp_to_omission" recordings.
rce2_exclude_conditions = ['temp', 'rewarded', 'omission']


In [2]:
# Sanity check: show every merged.rec path that matchRec will search
# (the files globbed from m_recs followed by the hand-added extra_recs).
print(merged_recs)

[PosixPath('/blue/npadillacoreano/share/reward_comp_extention/Phy_rce2_rce3/phy_curation/megadataset/20230621_111240_standard_comp_to_omission_D5_subj_1-4_t3b3L_box1_merged.rec'), PosixPath('/blue/npadillacoreano/share/reward_comp_extention/Phy_rce2_rce3/phy_curation/megadataset/20230614_114041_standard_comp_to_training_D3_subj_1-2_t2b2L_box2_merged.rec'), PosixPath('/blue/npadillacoreano/share/reward_comp_extention/Phy_rce2_rce3/phy_curation/megadataset/20240320_114629_long_comp_subj_5-3_t6b6_merged.rec'), PosixPath('/blue/npadillacoreano/share/reward_comp_extention/Phy_rce2_rce3/phy_curation/megadataset/20240317_172017_long_comp_subj_4-2_t6b6_merged.rec'), PosixPath('/blue/npadillacoreano/share/reward_comp_extention/Phy_rce2_rce3/phy_curation/megadataset/20230620_114347_standard_comp_to_omission_D4_subj_1-1_t1b2L_box_2_merged.rec'), PosixPath('/blue/npadillacoreano/share/reward_comp_extention/Phy_rce2_rce3/phy_curation/megadataset/20230618_100636_standard_comp_to_omission_D2_subj_1-1

### This won't work with novel recordings

### Matching subject to merged.rec recording

#### TODO: make this work when a subject has no matching merged.rec — take the subject name from the recording name and leave its merged.rec entry empty

In [3]:
def matchRec(rec, recs=None):
    '''
    Pair every subject named in a session directory with its merged.rec file.

    rec: filename string from session_dir (e.g. "20240317_151922_long_comp_subj_3-1_and_3-3")
    recs: optional iterable of merged.rec paths (any objects with a ``.name`` attribute) to search;
          when omitted, the notebook-level ``merged_recs`` list is used
    Returns: list of (formatted_subj, matching_merged_file_name or None) tuples in the order the
             subjects appear in ``rec``, e.g., [('3.1', file1), ('3.3', None)].
             A merged file matches when its name contains the same date-time prefix as ``rec`` and
             its own ``subj_<id>`` block equals the subject id; the first such file wins.
             Returns [] (after printing a message) when ``rec`` has no leading
             ``YYYYMMDD_HHMMSS`` prefix or no ``subj_`` block.
    '''
    candidates = merged_recs if recs is None else recs

    # Extract datetime prefix from vid rec filename
    rec_match = re.match(r"(\d{8}_\d{6})", rec)
    if not rec_match:
        print(f"{rec} not able to regex date time")
        return []
    rec_date_time = rec_match.group(1)

    # Extract subjects from the video recording name
    subj_block = re.search(r"subj_([\w\-\.\d]+)", rec)
    if not subj_block:
        print(f"{rec} does not contain valid subject block")
        return []

    # Index the merged files of this session by subject id once, instead of
    # re-parsing every file name for every subject. setdefault keeps the first
    # file seen for a subject, like a first-match search would.
    merged_by_subj = {}
    for file in candidates:
        if rec_date_time not in file.name:
            continue
        subj_match = re.search(r"subj_([\d\-]+)", file.name)
        if subj_match:
            merged_by_subj.setdefault(subj_match.group(1), file.name)

    matched_subjs = []
    for token in re.split(r"vs|_and_|-and-|and", subj_block.group(1)):
        # The bare "vs"/"and" alternatives leave the surrounding "_" or "-"
        # attached (e.g. "3-1_vs_3-3" -> "3-1_", "_3-3"); trim them so the id
        # can still be matched and formatted correctly.
        subj = token.strip().strip("_-")
        if not subj:
            continue
        matched_subjs.append((subj.replace("-", "."), merged_by_subj.get(subj)))

    return matched_subjs


In [4]:
def eventCondCreate(rec, df, exclude_conditions=()):
    '''
    Build the per-subject event dictionaries for one two-subject recording.

    rec: session_dir name of the recording (currently unused: the subjects are parsed
         from each row's own 'session_dir' value)
    df: trial rows with columns 'condition', 'session_dir', 'tone_start_timestamp',
        'tone_stop_timestamp' and, optionally, 'competition_closeness'
    exclude_conditions: iterable of substrings; a row is skipped when its lower-cased
        condition contains any of them
    Returns: {merged_rec_subj1: {label: [[start, stop], ...]},
              merged_rec_subj2: {label: [[start, stop], ...]}}
             where the keys are the merged.rec names reported by matchRec (None when a
             subject has no merged file; if both are None only the second subject's dict
             survives under the single None key). Every label in ``labels`` is present,
             possibly with an empty list.
             Returns None when no row yields two subjects (empty frame, every row
             skipped/excluded, or a session name with fewer than two subjects).
    '''
    print("eventCondCreate ran")
    exclude_conditions = exclude_conditions or ()

    # Define all possible labels you want to guarantee exist
    labels = ['alone_rewarded', 'high_comp_win', 'high_comp_lose', 'low_comp_lose', 'low_comp_win', 'high_comp_tie']

    condition_dict_subj1 = {}
    condition_dict_subj2 = {}
    # Last valid (subj1, subj2) match seen; stays None if no row ever provides one,
    # so the return statement never touches an undefined or too-short list.
    session_subjs = None

    for _, row in df.iterrows():
        cond = str(row['condition']).lower() if pd.notna(row['condition']) else ""

        # Skip excluded or empty conditions
        if not cond or any(ex_cond in cond for ex_cond in exclude_conditions):
            continue

        matched_subjs = matchRec(str(row['session_dir']))
        if not matched_subjs or len(matched_subjs) < 2:
            print("\n\nERROR: no matched_subjs\n\n")
            continue
        session_subjs = matched_subjs

        subj1, subj2 = matched_subjs[0][0], matched_subjs[1][0]

        # Determine result from condition string: the condition names the winner
        if subj1 in cond:
            subj1_result, subj2_result = "win", "lose"
        elif subj2 in cond:
            subj1_result, subj2_result = "lose", "win"
        elif "tie" in cond:
            subj1_result = subj2_result = "tie"
        elif "rewarded" in cond:
            subj1_result = subj2_result = "alone_rewarded"
        else:
            continue

        # Determine label based on competition closeness
        if subj1_result == "alone_rewarded":
            subj1_label = subj2_label = "alone_rewarded"
        else:
            close_str = str(row.get('competition_closeness', ''))
            comp_level = "low_comp" if "Only" in close_str else "high_comp"
            subj1_label = f"{comp_level}_{subj1_result}"
            subj2_label = f"{comp_level}_{subj2_result}"

        start = int(row['tone_start_timestamp'])
        stop = int(row['tone_stop_timestamp'])

        # Separate list objects per subject so editing one subject's events
        # later cannot silently change the other's.
        condition_dict_subj1.setdefault(subj1_label, []).append([start, stop])
        condition_dict_subj2.setdefault(subj2_label, []).append([start, stop])

    if session_subjs is None:
        return None

    # Ensure all expected labels exist in the output dict
    for label in labels:
        condition_dict_subj1.setdefault(label, [])
        condition_dict_subj2.setdefault(label, [])

    return {
        session_subjs[0][1]: condition_dict_subj1,
        session_subjs[1][1]: condition_dict_subj2
    }

### Original parsing logic with multiple dicts per experiment

In [5]:
# Superseded implementation, kept for reference only: wrapping it in a string
# literal keeps it from running. It sorted results into one dict per experiment
# type; the single-dict build_experiment_dict() is the version actually called.
# Because the string is the cell's last expression, Jupyter echoes it as output.
'''
# Main parsing logic
def build_experiment_dicts():
    print("build_experiment_dicts ran")
    experiment_dicts = {
        "long_comp": {},
        "alone_comp": {},
        "standard_comp_training": {},
        "standard_comp_omission": {}
    }

    for file in spreadsheets_dir.glob("*.xlsx"):
        # print(f"running {file}")
        df = pd.read_excel(file, engine='openpyxl')
        filename = file.name.lower()
        
        df.columns = df.columns.str.strip() # stupid col names have spaces breaking stuff

        for recording, group in df.groupby("session_dir"):
            if "rce_pilot_3" in filename:
                print("rce3 found")
                if "long_comp" in filename:
                    result = eventCondCreate(recording, group)
                    if result is not None:
                        experiment_dicts["long_comp"][recording] = result
                elif "alone_comp" in filename:
                    result = eventCondCreate(recording, group)
                    if result is not None:
                        experiment_dicts["alone_comp"][recording] = result

            elif "rce_pilot_2" in filename:
                print("rce2 found")
                if "standard_comp_to_training" in recording:
                    result = eventCondCreate(recording, group)
                    if result is not None:
                        experiment_dicts["standard_comp_training"][recording] = result
                elif "standard_comp_to_omission" in recording:
                    result = eventCondCreate(recording, group, exclude_conditions=rce2_exclude_conditions)
                    if result is not None:
                        experiment_dicts["standard_comp_omission"][recording] = result


    return experiment_dicts
    
'''


'\n# Main parsing logic\ndef build_experiment_dicts():\n    print("build_experiment_dicts ran")\n    experiment_dicts = {\n        "long_comp": {},\n        "alone_comp": {},\n        "standard_comp_training": {},\n        "standard_comp_omission": {}\n    }\n\n    for file in spreadsheets_dir.glob("*.xlsx"):\n        # print(f"running {file}")\n        df = pd.read_excel(file, engine=\'openpyxl\')\n        filename = file.name.lower()\n        \n        df.columns = df.columns.str.strip() # stupid col names have spaces breaking stuff\n\n        for recording, group in df.groupby("session_dir"):\n            if "rce_pilot_3" in filename:\n                print("rce3 found")\n                if "long_comp" in filename:\n                    result = eventCondCreate(recording, group)\n                    if result is not None:\n                        experiment_dicts["long_comp"][recording] = result\n                elif "alone_comp" in filename:\n                    result = eventCondCr

### Updated parsing logic: all experiments in one dict

In [6]:
def build_experiment_dict():
    '''
    Parse every event spreadsheet into one dict keyed by merged.rec file name.

    Reads each ``*.xlsx`` in the notebook-level ``spreadsheets_dir``, groups its rows by
    'session_dir' and hands each group to eventCondCreate:
      - spreadsheet name contains "rce_pilot_3": processed when the spreadsheet name also
        contains "long_comp" or "alone_comp"
      - spreadsheet name contains "rce_pilot_2": processed when the recording name contains
        "standard_comp_to_training", or "standard_comp_to_omission" (the latter with
        ``rce2_exclude_conditions`` applied)
    Anything else is ignored.

    Returns: {merged_rec_name: {label: [[start, stop], ...]}} for all recordings.
             Subjects without a merged.rec (None key) are dropped. If the same merged.rec
             name is produced more than once, the later one replaces the earlier one.
    '''
    print("build_experiment_dict ran")
    all_data = {}

    # Sorted so the result (and which entry wins on a duplicate key) does not
    # depend on the order in which the filesystem happens to list the files.
    for file in sorted(spreadsheets_dir.glob("*.xlsx")):
        if file.name.startswith("~$"):
            # Owner/lock file Excel leaves next to an open workbook; it matches
            # *.xlsx but is not a readable spreadsheet.
            continue

        df = pd.read_excel(file, engine='openpyxl')
        filename = file.name.lower()

        df.columns = df.columns.str.strip()  # remove column name spacing issues

        for recording, group in df.groupby("session_dir"):
            # each call returns {merged_rec_subj1: {label: [[timestamps]]}, merged_rec_subj2: {label: [[timestamps]]}}
            result = None

            if "rce_pilot_3" in filename:
                print("rce3 found")
                if "long_comp" in filename or "alone_comp" in filename:
                    result = eventCondCreate(recording, group)

            elif "rce_pilot_2" in filename:
                print("rce2 found")
                if "standard_comp_to_training" in recording:
                    result = eventCondCreate(recording, group)
                elif "standard_comp_to_omission" in recording:
                    result = eventCondCreate(recording, group, exclude_conditions=rce2_exclude_conditions)

            if result is None:
                continue

            # Remove only the None-keyed item (subject with no merged.rec)
            all_data.update({k: v for k, v in result.items() if k is not None})

    return all_data


In [7]:
# Build the merged.rec -> {label: [[start, stop], ...]} mapping from all spreadsheets.
rce_experiment_dict = build_experiment_dict()

print(f"\nTotal number of names in dictionary: {len(rce_experiment_dict)}\n")

# Dump every entry for manual inspection (the raw dicts can get long).
for merged_rec_name in rce_experiment_dict:
    print(f"\n\nname: {merged_rec_name}\n\n")
    print(rce_experiment_dict[merged_rec_name])

build_experiment_dict ran
rce2 found
eventCondCreate ran
rce2 found
eventCondCreate ran
rce2 found
eventCondCreate ran
rce2 found
eventCondCreate ran
rce2 found
eventCondCreate ran
rce2 found
eventCondCreate ran
rce2 found
eventCondCreate ran
rce2 found
eventCondCreate ran
rce2 found
eventCondCreate ran
rce2 found
eventCondCreate ran
rce2 found
rce2 found
rce2 found
rce2 found
rce2 found
rce2 found
rce3 found
eventCondCreate ran
rce3 found
eventCondCreate ran
rce3 found
eventCondCreate ran
rce3 found
eventCondCreate ran
rce3 found
eventCondCreate ran
rce3 found
eventCondCreate ran
rce3 found
eventCondCreate ran
rce3 found
eventCondCreate ran
rce3 found
eventCondCreate ran
rce3 found
eventCondCreate ran
rce3 found
eventCondCreate ran
rce3 found
eventCondCreate ran
rce3 found
eventCondCreate ran
rce3 found
eventCondCreate ran
rce3 found
eventCondCreate ran

Total number of names in dictionary: 46



name: 20230612_101430_standard_comp_to_training_D1_subj_1-4_t4b2L_box1_merged.rec


{'low

### Pickling

In [8]:
# Destination directory and file for the combined event dictionary.
output_path = Path("/blue/npadillacoreano/share/reward_comp_extention")
output_file = output_path / "event_dict.pkl"

# Save everything as a single .pkl file
with output_file.open("wb") as pkl_fh:
    pkl.dump(rce_experiment_dict, pkl_fh)

print(f"✅ Pickle saved to: {output_file}")

✅ Pickle saved to: /blue/npadillacoreano/share/reward_comp_extention/event_dict.pkl
