In [1]:
import os
from boxsdk import OAuth2, Client
import re
import warnings

from datetime import datetime, date
from uuid import uuid4
from dateutil.tz import tzlocal
from ast import literal_eval

import numpy as np
import pandas as pd
import warnings
import librosa
import pickle

from pynwb import NWBHDF5IO, NWBFile, TimeSeries
from pynwb.file import Subject
from pynwb.epoch import TimeIntervals
from pynwb.image import ImageSeries
from ndx_manoli_meta import AssayMetadata

from nwb_utils import get_date_from_block
from box_utils import *
from behavior_error_checks import *

In [2]:
# ---- Get login credentials from environment variables -----

# - note this needs to be adjusted for each authorized user... ask Nerissa about getting and setting this information
# - developer access tokens expire quickly, so this probably needs to be set at startup and perhaps even while working
# - the access token is read from the environment like the other two values; never paste a token
#   into the notebook, because it would be saved (and shared) together with the notebook
client_id = os.environ.get('box_client_id')
client_secret = os.environ.get('box_client_secret')
access_token = os.environ.get('box_access_token')

# fail here with a clear message rather than on the first request to Box
if not access_token:
    raise RuntimeError(
        'No Box access token found: set the "box_access_token" environment variable '
        'to a current developer token before running this cell.'
    )


# ----- Open a connection to the Box server -----

auth = OAuth2(
    client_id=client_id,
    client_secret=client_secret,
    access_token=access_token,
)
client = Client(auth)

# ----- Work on directory structure -----

# get the list of items in the whole Scn2a folder
items = client.folder(folder_id='196168550606').get_items()

# list contents
for item in items:
    print(f'{item.type.capitalize()} {item.id} is named "{item.name}"')

Folder 238576018931 is named "Aggregated_Events"
Folder 202955487033 is named "April2023_ShortCoHab_Females"
Folder 226890392440 is named "BorisFiles"
Folder 196172227913 is named "Female Intros"
Folder 196173015061 is named "Female PPTs"
Folder 196170422974 is named "Female RI"
Folder 196169230497 is named "Female SepReunion"
Folder 196171174766 is named "Female TMs"
Folder 214178119262 is named "June2023_ControlBehaviors"
Folder 196166676922 is named "Male Intros"
Folder 196172624162 is named "Male PPTs"
Folder 196171675763 is named "Male RI"
Folder 196172896956 is named "Male SepReunion"
Folder 196172739371 is named "Male TMs"
Folder 248394520156 is named "Naive_Choice"
Folder 261514259471 is named "NWB"
Folder 229587753245 is named "Oct2023_JuvenileBehavior"
File 1177298175085 is named "Scn2aX1Nov22_Key.xlsx"


In [3]:
# ----- User parameters for this assay -----

# keyword that identifies the assay's directories by name
directory_keyword = 'RI'
# keyword that identifies the assay's BORIS aggregated-events files by name
boris_keyword = 'RI'
# pattern passed to find_in_list when matching video file names
assayregex = 'R|r'

# CSV where the initial metadata is stored
metadata_file = 'metadata_aggressions.csv'
# folder holding the scored aggregated-events CSVs
score_path = os.path.join('Scn2a_X1Behavior_Nov22', 'Aggregated_Events')

# colors for plotting: fill and line colors are paired by position
fillcols = [
    [160, 146, 95],
    [245, 201, 39],
    [89, 91, 125],
    [63, 78, 245],
]
linecols = [
    [96, 87, 57],
    [147, 120, 23],
    [53, 54, 75],
    [33, 41, 131],
]

In [4]:
# ----- Figure out which directories have the assay of interest

projectDir = '196168550606'
rootpath = ['Scn2a_X1Behavior_Nov22']

# keep the ids of the top-level entries whose name contains the assay keyword
items = client.folder(folder_id=projectDir).get_items() # top level folder
usedirs = [entry.id for entry in items if directory_keyword in entry.name]

print(usedirs)

# --- collect all the paths and filenames

allpaths = []

for folder_id in usedirs:

    # the search starts from: project root + this folder's name
    folder_name = client.folder(folder_id=folder_id).get(fields=['name']).name
    start_path = rootpath + [folder_name]

    # search for files and consolidate the results
    allpaths.extend(visit_all_dirs_files(folder_id, [], start_path, client))

print(allpaths)

# ----- Get a list of just file names to hunt through -----
justfiles = [os.path.basename(full_path) for full_path in allpaths]
print(justfiles)

['196170422974', '196171675763']
['Scn2a_X1Behavior_Nov22\\Female RI\\01112023\\DoNotScore_Nov22_Pair13_RI_2023-01-11 11-59-53.mp4', 'Scn2a_X1Behavior_Nov22\\Female RI\\01112023\\DoNotScore_Nov22_Pair14_RI_2023-01-11 12-00-45.mp4', 'Scn2a_X1Behavior_Nov22\\Female RI\\01112023\\Nov22_Pair15_RI_2023-01-11 12-01-50.mp4', 'Scn2a_X1Behavior_Nov22\\Female RI\\01112023\\Nov22_Pair16_RI_WIN_20230111_12_03_01_Pro.mp4', 'Scn2a_X1Behavior_Nov22\\Female RI\\02012023\\Nov22_Pair21_RI_2023-02-01 13-34-24.mp4', 'Scn2a_X1Behavior_Nov22\\Female RI\\02012023\\Nov22_Pair22_RI_2023-02-01 13-34-27.mp4', 'Scn2a_X1Behavior_Nov22\\Female RI\\02012023\\Nov22_Pair23_RI_2023-02-01 13-34-27 (2).mp4', 'Scn2a_X1Behavior_Nov22\\Female RI\\02152023\\Nov22_Pair28_RI_2023-02-15 12-19-19.mp4', 'Scn2a_X1Behavior_Nov22\\Female RI\\02152023\\Nov22_Pair29_RI_2023-02-15 12-20-10.mp4', 'Scn2a_X1Behavior_Nov22\\Female RI\\02152023\\Nov22_Pair30_RI_2023-02-15 12-20-56.mp4', 'Scn2a_X1Behavior_Nov22\\Female RI\\02152023\\Nov22_Pa

In [5]:
# ----- Use metadata table to find expected files and add those columns to the table -----

# load up metadata
meta = pd.read_csv(metadata_file)

# Look up the video file for every pair tag.
# Exactly one entry is recorded per tag (None when nothing matched) so that
# fileIndex stays aligned with the rows of `meta`; skipping a tag would pair
# every later tag with the wrong file.
fileIndex = []
for tag in meta.PairTag:
    match = find_in_list(justfiles,tag,assayregex)
    if len(match)==0:
        print(f'No match for: {tag}')
        fileIndex.append(None)
        continue
    if len(match)>1:
        print(f'Found extra matches for: {tag}')
        for idx in match:
            print(justfiles[idx])
    # with several candidates, the first one is kept
    fileIndex.append(match[0])

# use fileIndex to look up file names and paths (left empty for unmatched tags)
metafiles = []
metapaths = []
for idx in fileIndex:
    if idx is None:
        metafiles.append(None)
        metapaths.append(None)
    else:
        metafiles.append(justfiles[idx])
        metapaths.append('\\'+os.path.dirname(allpaths[idx]))

# add files and paths to the metadata table
# (bracket assignment also creates the column when the CSV does not have it yet;
#  attribute assignment would only set a Python attribute in that case)
meta['VideoFile'] = metafiles
meta['VideoPath'] = metapaths

In [6]:
# ----- Get BORIS scored aggregated events file names -----
items = client.folder(folder_id='238576018931').get_items()

# keep the names of the entries that contain the BORIS keyword
aggfiles = [entry.name for entry in items if boris_keyword in entry.name]

print(aggfiles)
print(len(aggfiles))

['Nov22_Pair15_RI.csv', 'Nov22_Pair16_RI.csv', 'Nov22_Pair17_RI.csv', 'Nov22_Pair18_RI.csv', 'Nov22_Pair19_RI.csv', 'Nov22_Pair1_RI.csv', 'Nov22_Pair21_RI.csv', 'Nov22_Pair22_RI.csv', 'Nov22_Pair23_RI.csv', 'Nov22_Pair24_RI.csv', 'Nov22_Pair25_RI.csv', 'Nov22_Pair26_RI.csv', 'Nov22_Pair27_RI.csv', 'Nov22_Pair28_RI.csv', 'Nov22_Pair29_RI.csv', 'Nov22_Pair2_RI.csv', 'Nov22_Pair30_RI.csv', 'Nov22_Pair31_RI.csv', 'Nov22_Pair32_RI.csv', 'Nov22_Pair33_RI.csv', 'Nov22_Pair34_RI.csv', 'Nov22_Pair35_RI.csv', 'Nov22_Pair36_RI.csv', 'Nov22_Pair37_RI.csv', 'Nov22_Pair38_RI.csv', 'Nov22_Pair39_RI.csv', 'Nov22_Pair3_RI.csv', 'Nov22_Pair40_RI.csv', 'Nov22_Pair41_RI.csv', 'Nov22_Pair43_RI.csv', 'Nov22_Pair44_RI.csv', 'Nov22_Pair46_RI.csv', 'Nov22_Pair47_RI.csv', 'Nov22_Pair48_RI.csv', 'Nov22_Pair49_RI.csv', 'Nov22_Pair4_RI.csv', 'Nov22_Pair50_RI.csv', 'Nov22_Pair51_RI.csv', 'Nov22_Pair52_RI.csv', 'Nov22_Pair56_RI.csv', 'Nov22_Pair57_RI.csv', 'Nov22_Pair58_RI.csv', 'Nov22_Pair5_RI.csv', 'Nov22_Pair61_R

In [9]:
# ----- Add Boris csvs and colors to metadata table

# match pair tags
# Exactly one score file is recorded per metadata row (None when nothing matched)
# so the list always lines up with `meta`, even if a csv is missing or duplicated.
ordered_agg = []
for tag in meta.PairTag:
    candidates = [fname for fname in aggfiles if fname.startswith(tag+'_')]
    if len(candidates)==0:
        warnings.warn(f'No aggregated events file found for: {tag}')
        ordered_agg.append(None)
        continue
    if len(candidates)>1:
        warnings.warn(f'Found extra aggregated events files for {tag}: {candidates}; using {candidates[0]}')
    ordered_agg.append(candidates[0])

# bracket assignment also creates the column when it does not exist yet
meta['ScoreFile'] = ordered_agg

# add path: one entry per metadata row (not per aggregated file, whose count can differ)
meta['ScorePath'] = [score_path]*len(meta)

# ----- Add colors to meta file so they go in the NWB metadata -----

# Position in fillcols/linecols used for each focal animal:
#   0: sex 'F', genotype 'WT'      1: sex 'F', any other genotype
#   2: other sex, genotype 'WT'    3: other sex, any other genotype
allcols = []
for sex, genotype in zip(meta.FocalSex, meta.FocalGT):
    color_row = (0 if sex=='F' else 2) + (0 if genotype=='WT' else 1)
    allcols.append([fillcols[color_row],linecols[color_row]])

meta['FocalColor'] = allcols

# ----- Fix bad date formatting -----
# (disabled one-off conversion of RecDate from M/D/YYYY to YYYYMMDD)
# dx = meta.pop('Unnamed: 0')

# newrecdates = []
# for i, olddate in enumerate(meta.RecDate):
#     dsplit = olddate.split('/')
#     yr = dsplit[2]
#     mn = dsplit[0]
#     dy = dsplit[1]
#     if len(mn)==1:
#         mn = '0'+mn
#     if len(dy)==1:
#         dy = '0'+dy
#     newrecdates.append(yr+mn+dy)
    
# meta.RecDate=newrecdates

# --- Check on metadata before writing it ---
meta.head()

Unnamed: 0,PairTag,AssayType,RecDate,RecTime,VideoFile,VideoPath,ScoreFile,ScorePath,FemaleID,FemaleGT,...,Timeline,Ethogram,RanBy,ScoredBy,FullTimeline,FocalColor,StrangerID,StrangerGT,PPTlane,PartnerChamber
0,Nov22_Pair1,aggression,20221122,14:33,Nov22_Pair1_RI_WIN_20221122_13_26_39_Pro.mp4,\Scn2a_X1Behavior_Nov22\Female RI\11222022,Nov22_Pair1_RI.csv,Scn2a_X1Behavior_Nov22\Aggregated_Events,B8002,Het,...,,,Gina Williams,Josh Steighner,,"[[245, 201, 39], [147, 120, 23]]",,,,
1,Nov22_Pair15,aggression,20230111,13:06,Nov22_Pair15_RI_2023-01-11 12-01-50.mp4,\Scn2a_X1Behavior_Nov22\Female RI\01112023,Nov22_Pair15_RI.csv,Scn2a_X1Behavior_Nov22\Aggregated_Events,B6614,Het,...,,,Gina Williams,Josh Steighner,,"[[245, 201, 39], [147, 120, 23]]",,,,
2,Nov22_Pair16,aggression,20230111,13:06,Nov22_Pair16_RI_WIN_20230111_12_03_01_Pro.mp4,\Scn2a_X1Behavior_Nov22\Female RI\01112023,Nov22_Pair16_RI.csv,Scn2a_X1Behavior_Nov22\Aggregated_Events,B6615,WT,...,,,Gina Williams,Josh Steighner,,"[[160, 146, 95], [96, 87, 57]]",,,,
3,Nov22_Pair17,aggression,20230113,13:49,Nov22_Pair17_RI_2023-01-13 12-45-42.mp4,\Scn2a_X1Behavior_Nov22\Male RI\01132023,Nov22_Pair17_RI.csv,Scn2a_X1Behavior_Nov22\Aggregated_Events,,,...,,,Gina Williams,Josh Steighner,,"[[63, 78, 245], [33, 41, 131]]",,,,
4,Nov22_Pair18,aggression,20230113,13:49,Nov22_Pair18_RI_2023-01-13 12-46-39.mp4,\Scn2a_X1Behavior_Nov22\Male RI\01132023,Nov22_Pair18_RI.csv,Scn2a_X1Behavior_Nov22\Aggregated_Events,,,...,,,Gina Williams,Josh Steighner,,"[[89, 91, 125], [53, 54, 75]]",,,,


In [10]:
# spot-check the video file recorded for a couple of pair tags
checkPairTags = ['Nov22_Pair37','Nov22_Pair38']
for tag in checkPairTags:
    tag_rows = meta[meta.PairTag==tag]
    print(tag)
    print(tag_rows.VideoFile)

Nov22_Pair37
24    Nov22_Pair37_RI_2023-03-10 13-42-26.mp4
Name: VideoFile, dtype: object
Nov22_Pair38
25    Nov22_Pair38_RI_2023-03-10 13-41-46.mp4
Name: VideoFile, dtype: object


In [11]:
# --- write metadata table
# (saved to the metadata CSV path without the row index)
meta.to_csv(path_or_buf=metadata_file, index=False)

In [13]:
# ----- Do data check on aggregated files before writing NWB files -----
# For every metadata row: load the scored csv, flag files that mix several media
# files, compare the total scored time with the assay duration, and collect the
# overlaps and gaps reported by check_for_event_interactions. The collected
# overlaps and gaps are turned into two tables and optionally written to csv.

meta = pd.read_csv(metadata_file)
writeToDisk=True

# total scored time may deviate from the assay duration by this much before warning
max_duration_ratio = 1.05
min_duration_ratio = 0.95

# column names of the scored csv -> short names used below
score_column_names = {
    'Start (s)':'start',
    'Stop (s)':'end',
    'Duration (s)':'duration',
    'Behavior type':'behavior_type',
}

# keep track of overloaded files
multifile_log = []

# keep track of annotation overlaps
ol_files = []
ol_behav = []
ol_start = []
ol_end = []

# keep track of annotation gaps
g_files = []
g_b1 = []
g_b2 = []
g_end = []
g_start = []

for i, ptag in enumerate(meta.PairTag):

    score_file = meta.ScoreFile[i]

    # rows without a score file cannot be checked; report them instead of crashing on the path
    if pd.isna(score_file) or pd.isna(meta.ScorePath[i]):
        warnings.warn(f'No scored csv listed for {ptag}; skipping annotation checks.')
        continue

    # get assay duration
    duration = float(meta.AssayDuration[i])

    scoretab = pd.read_csv(os.path.join('..',meta.ScorePath[i],score_file)) # load up csv of annotations
    scoretab = scoretab.rename(columns=score_column_names)

    # --- Check for problems in the annotation file

    # check for extraneous media files; such a file is only logged, not checked further
    media_files = np.unique(scoretab['Media file'])
    if len(media_files)>1:
        warnings.warn(f'Scored csv {score_file} contains events for multiple media files.')
        multifile_log.append(score_file)
        for fp in media_files:
            print(fp)
        continue

    # check for total event duration violations
    totdur = np.sum(scoretab.duration)
    if totdur>max_duration_ratio*duration:
        warnings.warn(f'Total events duration of {score_file} exceeds the assay duration.')
    elif totdur<min_duration_ratio*duration:
        warnings.warn(f'Total events duration of {score_file} does not meet the assay duration.')

    # check for overlapping events and check for unscored gaps between events
    # NOTE(review): the second argument looks like a verbose/print flag — confirm in behavior_error_checks
    ols,ole,olb,gs,ge,gb1,gb2 = check_for_event_interactions(scoretab,True)

    # log overlaps
    for o_start, o_end, o_behav in zip(ols, ole, olb):
        ol_files.append(score_file)
        ol_behav.append(o_behav)
        ol_start.append(o_start)
        ol_end.append(o_end)

    # log gaps
    for gap_start, gap_end, behav1, behav2 in zip(gs, ge, gb1, gb2):
        g_files.append(score_file)
        g_b1.append(behav1)
        g_b2.append(behav2)
        g_end.append(gap_end)
        g_start.append(gap_start)

# --- Write annotation violation logs for review
ol_log = {'file':ol_files,'behavior':ol_behav,'start_time':ol_start,'end_time':ol_end}
ol_tab = pd.DataFrame(data=ol_log)

# gap length = start of the next behavior minus end of the previous one
gap_durs = [gap_start-gap_end for gap_start, gap_end in zip(g_start, g_end)]
gap_log = {'file':g_files,'behavior1':g_b1,'behavior1_end':g_end,'behavior2':g_b2,'behavior2_start':g_start,
           'gap_duration':gap_durs}
g_tab = pd.DataFrame(data=gap_log)

if writeToDisk:
    # Name the logs after the assay type of the last metadata row. This is looked up
    # explicitly instead of reusing a loop index, which no longer refers to a metadata
    # row once the loops above have finished.
    assay_type = meta.AssayType.iloc[-1]
    ol_tab.to_csv(f'overlap_log_{assay_type}.csv',index=False)
    g_tab.to_csv(f'gap_log_{assay_type}.csv',index=False)

Gap violation for behavior No interaction ending at 4871.891 and next behavior Sniff starting at 4872.638.
Gap violation for behavior Sniff ending at 3906.787 and next behavior No interaction starting at 3906.793.
Gap violation for behavior No interaction ending at 3986.671 and next behavior Sniff starting at 3986.767.
Gap violation for behavior No interaction ending at 3868.535 and next behavior Sniff starting at 3868.537.
