In [1]:
import os
from boxsdk import OAuth2, Client
import re
import warnings

from datetime import datetime, date
from uuid import uuid4
from dateutil.tz import tzlocal
from ast import literal_eval

import numpy as np
import pandas as pd
import warnings
import librosa
import pickle

from pynwb import NWBHDF5IO, NWBFile, TimeSeries
from pynwb.file import Subject
from pynwb.epoch import TimeIntervals
from pynwb.image import ImageSeries
from ndx_manoli_meta import AssayMetadata

from nwb_utils import get_date_from_block
from box_utils import *
from behavior_error_checks import *

In [3]:
# ---- Get login credentials from environment variables -----

# - note this needs to be adjusted for each authorized user... ask Nerissa about getting and setting this information
# - developer access tokens expire quickly, so this probably needs to be set at startup and perhaps even while working
# - the access token is a secret: it must never be pasted into the notebook,
#   so it is read from the environment exactly like the client id and secret
client_id = os.environ.get('box_client_id')
client_secret = os.environ.get('box_client_secret')
access_token = os.environ.get('box_access_token')

# fail early with a readable message instead of an opaque authentication error later
if not access_token:
    raise RuntimeError(
        "Environment variable 'box_access_token' is not set. "
        "Generate a fresh Box developer token and export it before starting the kernel."
    )


# ----- Open a connection to the Box server -----

auth = OAuth2(
    client_id=client_id,
    client_secret=client_secret,
    access_token=access_token,
)
client = Client(auth)

# ----- Work on directory structure -----

# get the list of items in the whole Scn2a folder
items = client.folder(folder_id='196168550606').get_items()

# list contents
for item in items:
    print(f'{item.type.capitalize()} {item.id} is named "{item.name}"')

Folder 238576018931 is named "Aggregated_Events"
Folder 202955487033 is named "April2023_ShortCoHab_Females"
Folder 226890392440 is named "BorisFiles"
Folder 196172227913 is named "Female Intros"
Folder 196173015061 is named "Female PPTs"
Folder 196170422974 is named "Female RI"
Folder 196169230497 is named "Female SepReunion"
Folder 196171174766 is named "Female TMs"
Folder 214178119262 is named "June2023_ControlBehaviors"
Folder 196166676922 is named "Male Intros"
Folder 196172624162 is named "Male PPTs"
Folder 196171675763 is named "Male RI"
Folder 196172896956 is named "Male SepReunion"
Folder 196172739371 is named "Male TMs"
Folder 248394520156 is named "Naive_Choice"
Folder 261514259471 is named "NWB"
Folder 229587753245 is named "Oct2023_JuvenileBehavior"
File 1177298175085 is named "Scn2aX1Nov22_Key.xlsx"


In [13]:
# ----- User-adjustable parameters for this assay -----
directory_keyword = 'PPTs'  # substring searched for in top-level folder names
metadata_file = 'metadata_ppts_v2.csv'  # csv the metadata table is read from and written to
boris_keyword = 'PPT'  # substring searched for in aggregated-events file names
score_path = os.path.join('Scn2a_X1Behavior_Nov22', 'Aggregated_Events')
assayregex = 'P|p'  # handed to find_in_list together with each pair tag

# ----- Plot colours: one [R, G, B] triplet per group -----
# index 0: focal sex 'F' with genotype 'WT', 1: focal sex 'F' otherwise,
# index 2: other focal sex with genotype 'WT', 3: other focal sex otherwise
fillcols = [
    [160, 146, 95],
    [245, 201, 39],
    [89, 91, 125],
    [63, 78, 245],
]
linecols = [
    [96, 87, 57],
    [147, 120, 23],
    [53, 54, 75],
    [33, 41, 131],
]

In [4]:
# ----- Locate the top-level folders that belong to the assay of interest -----

projectDir = '196168550606'
rootpath = ['Scn2a_X1Behavior_Nov22']

items = client.folder(folder_id=projectDir).get_items() # top level folder

# keep the id of every item whose name contains the assay keyword
usedirs = [entry.id for entry in items if directory_keyword in entry.name]

print(usedirs)

# ----- Walk each selected folder and gather every file path below it -----

allpaths = []

for usedir in usedirs:

    # the folder's own name becomes the second component of the path
    udnm = client.folder(folder_id=usedir).get(fields=['name']).name

    # search for files and fold the result into the running list
    allpaths.extend(visit_all_dirs_files(usedir, [], rootpath + [udnm], client))

print(allpaths)

# ----- Bare file names (no directory part) to hunt through -----
justfiles = [os.path.basename(pth) for pth in allpaths]
print(justfiles)

['196173015061', '196172624162']
['Scn2a_X1Behavior_Nov22\\Female PPTs\\01062023\\DoNotScore_Nov22_Pair13_PPT_WIN_20230106_13_17_39_Pro.mp4', 'Scn2a_X1Behavior_Nov22\\Female PPTs\\01062023\\DoNotScore_Nov22_Pair14_PPT_2023-01-06 13-17-47.mp4', 'Scn2a_X1Behavior_Nov22\\Female PPTs\\01062023\\Nov22_Pair15_PPT.mov', 'Scn2a_X1Behavior_Nov22\\Female PPTs\\01062023\\Nov22_Pair16_PPT_2023-01-06 13-18-09.mp4', 'Scn2a_X1Behavior_Nov22\\Female PPTs\\01272023\\Nov22_Pair21_PPT_2023-01-27 13-11-20.mp4', 'Scn2a_X1Behavior_Nov22\\Female PPTs\\01272023\\Nov22_Pair22_PPT_2023-01-27 13-11-22.mp4', 'Scn2a_X1Behavior_Nov22\\Female PPTs\\01272023\\Nov22_Pair23_PPT_2023-01-27 13-11-23.mp4', 'Scn2a_X1Behavior_Nov22\\Female PPTs\\02102023\\Nov22_Pair28_PPT_2023-02-10 13-20-08.mp4', 'Scn2a_X1Behavior_Nov22\\Female PPTs\\02102023\\Nov22_Pair29_PPT_2023-02-10 13-20-06.mp4', 'Scn2a_X1Behavior_Nov22\\Female PPTs\\02102023\\Nov22_Pair30PPT_2023-02-10 13-20-26.mp4', 'Scn2a_X1Behavior_Nov22\\Female PPTs\\02102023\\N

In [5]:
# ----- Use metadata table to find expected files and add those columns to the table -----

# load up metadata
meta = pd.read_csv(metadata_file)

# For each pair tag, record the position (in justfiles / allpaths) of its video.
# Exactly one entry is stored per metadata row -- None when nothing matched --
# so that fileIndex stays aligned with the table rows.
fileIndex = []
for tag in meta.PairTag:
    match = find_in_list(justfiles,tag,assayregex)
    if len(match)==0:
        print(f'No match for: {tag}')
        fileIndex.append(None)
        continue
    if len(match)>1:
        # report every candidate, then fall back to the first one
        print(f'Found extra matches for: {tag}')
        for idx in match:
            print(justfiles[idx])
    fileIndex.append(match[0])

# use fileIndex to look up file names and paths (missing videos stay empty)
metafiles = [None if idx is None else justfiles[idx] for idx in fileIndex]
metapaths = [None if idx is None else '\\'+os.path.dirname(allpaths[idx]) for idx in fileIndex]

# add files and paths to the metadata table; bracket assignment creates the
# column when it does not exist yet, which attribute assignment does not
meta['VideoFile'] = metafiles
meta['VideoPath'] = metapaths

In [6]:
# ----- Collect the names of the BORIS aggregated-events files for this assay -----
items = client.folder(folder_id='238576018931').get_items()

# keep only the entries whose name contains the BORIS keyword
aggfiles = [entry.name for entry in items if boris_keyword in entry.name]

print(aggfiles)
print(len(aggfiles))

['Nov22_Pair15_PPT.csv', 'Nov22_Pair16_PPT.csv', 'Nov22_Pair17_PPT.csv', 'Nov22_Pair18_PPT.csv', 'Nov22_Pair19_PPT.csv', 'Nov22_Pair1_PPT.csv', 'Nov22_Pair21_PPT.csv', 'Nov22_Pair22_PPT.csv', 'Nov22_Pair23_PPT.csv', 'Nov22_Pair24_PPT.csv', 'Nov22_Pair25_PPT.csv', 'Nov22_Pair26_PPT.csv', 'Nov22_Pair27_PPT.csv', 'Nov22_Pair28_PPT.csv', 'Nov22_Pair29_PPT.csv', 'Nov22_Pair2_PPT.csv', 'Nov22_Pair30_PPT.csv', 'Nov22_Pair31_PPT.csv', 'Nov22_Pair32_PPT.csv', 'Nov22_Pair33_PPT.csv', 'Nov22_Pair34_PPT.csv', 'Nov22_Pair35_PPT.csv', 'Nov22_Pair36_PPT.csv', 'Nov22_Pair37_PPT.csv', 'Nov22_Pair38_PPT.csv', 'Nov22_Pair3_PPT.csv', 'Nov22_Pair40_PPT.csv', 'Nov22_Pair41_PPT.csv', 'Nov22_Pair42_PPT.csv', 'Nov22_Pair43_PPT.csv', 'Nov22_Pair44_PPT.csv', 'Nov22_Pair46_PPT.csv', 'Nov22_Pair47_PPT.csv', 'Nov22_Pair48_PPT.csv', 'Nov22_Pair49_PPT.csv', 'Nov22_Pair4_PPT.csv', 'Nov22_Pair50_PPT.csv', 'Nov22_Pair51_PPT.csv', 'Nov22_Pair52_PPT.csv', 'Nov22_Pair56_PPT.csv', 'Nov22_Pair57_PPT.csv', 'Nov22_Pair58_PPT.c

In [7]:
# ----- Add Boris csvs and colors to metadata table

# match pair tags: store exactly one entry per metadata row (None when no csv
# exists, the first candidate when several do) so the column length always
# equals the number of rows
ordered_agg = []
for tag in meta.PairTag:
    tag_matches = [fname for fname in aggfiles if fname.startswith(tag+'_')]
    if not tag_matches:
        warnings.warn(f'No aggregated events file found for {tag}.')
        ordered_agg.append(None)
        continue
    if len(tag_matches)>1:
        warnings.warn(f'Multiple aggregated events files found for {tag}; using {tag_matches[0]}.')
    ordered_agg.append(tag_matches[0])

meta['ScoreFile'] = ordered_agg

# add path: a scalar is broadcast to every row, so the length can never
# disagree with the table (it previously followed len(aggfiles))
meta['ScorePath'] = score_path

# ----- Add colors to meta file so they go in the NWB metadata -----

def _focal_color(sex, genotype):
    """Return the [fill, line] colour pair for one focal animal.

    Group index: 0 = sex 'F' and genotype 'WT', 1 = sex 'F' otherwise,
    2 = other sex and genotype 'WT', 3 = other sex otherwise.
    """
    group = (0 if sex=='F' else 2) + (0 if genotype=='WT' else 1)
    return [fillcols[group], linecols[group]]

meta['FocalColor'] = [_focal_color(sex, gt) for sex, gt in zip(meta.FocalSex, meta.FocalGT)]

# ----- Fix bad date formatting -----
# dx = meta.pop('Unnamed: 0')

def _normalize_recdate(value):
    """Turn 'M/D/YYYY' into 'YYYYMMDD'; values without '/' pass through as text.

    Passing through already-converted values keeps this cell re-runnable after
    the table has been written to csv and loaded again (where the date comes
    back as a number rather than a slash-separated string).
    """
    if pd.isna(value):
        return value
    if isinstance(value, float) and value.is_integer():
        value = int(value)  # e.g. 20221117.0 read back from csv
    text = str(value).strip()
    if '/' not in text:
        return text
    dsplit = text.split('/')
    return dsplit[2] + dsplit[0].zfill(2) + dsplit[1].zfill(2)

meta['RecDate'] = [_normalize_recdate(olddate) for olddate in meta.RecDate]

# --- Check on metadata before writing it ---
meta.head()

Unnamed: 0,PairTag,AssayType,RecDate,RecTime,VideoFile,VideoPath,ScoreFile,ScorePath,FemaleID,FemaleGT,...,Timeline,Ethogram,RanBy,ScoredBy,FullTimeline,FocalColor,StrangerID,StrangerGT,PPTlane,PartnerChamber
0,Nov22_Pair1,PPT,20221117,12:26,Nov22_Pair1_PPT_WIN_20221117_12_25_03_Pro.mp4,\Scn2a_X1Behavior_Nov22\Female PPTs\11172022,Nov22_Pair1_PPT.csv,Scn2a_X1Behavior_Nov22\Aggregated_Events,B8002,Het,...,,,Gina Williams,Gina Williams,,"[[245, 201, 39], [147, 120, 23]]",,WT,1,Right
1,Nov22_Pair15,PPT,20230106,13:19,Nov22_Pair15_PPT.mov,\Scn2a_X1Behavior_Nov22\Female PPTs\01062023,Nov22_Pair15_PPT.csv,Scn2a_X1Behavior_Nov22\Aggregated_Events,B6614,Het,...,,,Gina Williams,Gina Williams,,"[[245, 201, 39], [147, 120, 23]]",,WT,4,Right
2,Nov22_Pair16,PPT,20230106,13:19,Nov22_Pair16_PPT_2023-01-06 13-18-09.mp4,\Scn2a_X1Behavior_Nov22\Female PPTs\01062023,Nov22_Pair16_PPT.csv,Scn2a_X1Behavior_Nov22\Aggregated_Events,B6615,WT,...,,,Gina Williams,Gina Williams,,"[[160, 146, 95], [96, 87, 57]]",,WT,3,Left
3,Nov22_Pair17,PPT,20230109,13:11,Nov22_Pair17_PPT_WIN_20230109_13_10_33_Pro.mp4,\Scn2a_X1Behavior_Nov22\Male PPTs\01092023,Nov22_Pair17_PPT.csv,Scn2a_X1Behavior_Nov22\Aggregated_Events,,,...,,,Gina Williams,Gina Williams,,"[[63, 78, 245], [33, 41, 131]]",,WT,1,Left
4,Nov22_Pair18,PPT,20230109,13:11,Nov22_Pair18_PPT_2023-01-09 13-10-28.mp4,\Scn2a_X1Behavior_Nov22\Male PPTs\01092023,Nov22_Pair18_PPT.csv,Scn2a_X1Behavior_Nov22\Aggregated_Events,,,...,,,Gina Williams,Gina Williams,,"[[89, 91, 125], [53, 54, 75]]",,WT,2,Right


In [12]:
# --- Save the metadata table to metadata_file, leaving out the row index ---
meta.to_csv(path_or_buf=metadata_file, index=False)

In [14]:
# ----- Sanity-check every aggregated events file before writing NWB files -----

meta = pd.read_csv(metadata_file)

# column names as found in the score csv -> names used by the checks below
boris_column_names = {
    'Start (s)': 'start',
    'Stop (s)': 'end',
    'Duration (s)': 'duration',
    'Behavior type': 'behavior_type',
}

# score files whose events refer to more than one media file
multifile_log = []

# annotation overlaps
ol_files, ol_behav, ol_start, ol_end = [], [], [], []

# annotation gaps
g_files, g_b1, g_b2, g_end, g_start = [], [], [], [], []

# center violations
c_files, c_b1, c_b2, c_end, c_start = [], [], [], [], []

for i, ptag in enumerate(meta.PairTag):

    # expected length of the assay for this row
    duration = float(meta.AssayDuration[i])

    # load the annotation csv and switch to the short column names
    score_file = meta.ScoreFile[i]
    scoretab = pd.read_csv(os.path.join('..', meta.ScorePath[i], score_file))
    scoretab.rename(columns=boris_column_names, inplace=True)

    # --- a csv covering several media files is only logged, not checked further
    media_files = np.unique(scoretab['Media file'])
    if len(media_files) > 1:
        warnings.warn(f'Scored csv {score_file} contains events for multiple media files.')
        multifile_log.append(score_file)
        for fp in media_files:
            print(fp)
        continue

    # --- summed event duration must stay within 5% of the assay duration
    totdur = np.sum(scoretab.duration)
    if totdur > 1.05*duration:
        warnings.warn(f'Total events duration of {score_file} exceeds the assay duration.')
    elif totdur < 0.95*duration:
        warnings.warn(f'Total events duration of {score_file} does not meet the assay duration.')

    # --- overlapping events and unscored gaps between events
    ols, ole, olb, gs, ge, gb1, gb2 = check_for_event_interactions(scoretab, True)

    for j, ol in enumerate(ols):
        ol_files.append(score_file)
        ol_behav.append(olb[j])
        ol_start.append(ol)
        ol_end.append(ole[j])

    for j, _ in enumerate(gs):
        g_files.append(score_file)
        g_b1.append(gb1[j])
        g_b2.append(gb2[j])
        g_end.append(ge[j])
        g_start.append(gs[j])

    # --- center chamber violations in PPT
    cs, ce, cb1, cb2 = check_PPTs_for_center_violations(scoretab, True)

    for j, _ in enumerate(cs):
        c_files.append(score_file)
        c_b1.append(cb1[j])
        c_b2.append(cb2[j])
        c_end.append(ce[j])
        c_start.append(cs[j])

In [20]:
# --- Dump the collected annotation problems to csv for manual review ---

# (overlap log is currently switched off)
# ol_log = {'file':ol_files,'behavior':ol_behav,'start_time':ol_start,'end_time':ol_end}
# ol_tab = pd.DataFrame(data=ol_log)
# ol_tab.to_csv(f'overlap_log_{meta.AssayType[0]}.csv',index=False)

# gap length = start of the second behavior minus end of the first
gap_durs = [g_start[k] - g_end[k] for k in range(len(g_start))]

gap_log = {
    'file': g_files,
    'behavior1': g_b1,
    'behavior1_end': g_end,
    'behavior2': g_b2,
    'behavior2_start': g_start,
    'gap_duration': gap_durs,
}
g_tab = pd.DataFrame(data=gap_log)
g_tab.to_csv(f'gap_log_{meta.AssayType[0]}.csv', index=False)

ct_log = {
    'file': c_files,
    'behavior1': c_b1,
    'behavior1_start': c_start,
    'behavior1_end': c_end,
    'behavior2': c_b2,
}
c_tab = pd.DataFrame(data=ct_log)
c_tab.to_csv(f'centerViolation_log_{meta.AssayType[0]}.csv', index=False)

In [10]:
# --- Update new PPT data columns with details ---

meta = pd.read_csv(metadata_file)

nrPPTs = meta.shape[0]

# both chamber animals are recorded as 'WT' in every row
# NOTE(review): these lists were previously built but never stored; the
# 'LeftGT' column name is inferred from the existing 'RightGT' column -- confirm
leftGTs = ['WT']*nrPPTs
rightGTs = ['WT']*nrPPTs

# chamber animals: 'M' when the focal sex is 'F', otherwise 'F'
leftSex = ['M' if fsex=='F' else 'F' for fsex in meta.FocalSex]
rightSex = list(leftSex)

# the chamber named in PartnerChamber holds the partner, the other one the stranger
rightType = ['Partner' if pchamb=='Right' else 'Stranger' for pchamb in meta.PartnerChamber]
leftType = ['Stranger' if pchamb=='Right' else 'Partner' for pchamb in meta.PartnerChamber]

# bracket assignment so a column that is not in the csv yet is actually created
# (attribute assignment would only set a Python attribute on the frame)
meta['LeftGT'] = leftGTs
meta['RightGT'] = rightGTs
meta['LeftSex'] = leftSex
meta['RightSex'] = rightSex
meta['LeftType'] = leftType
meta['RightType'] = rightType

meta.head()

Unnamed: 0,PairTag,AssayType,RecDate,RecTime,VideoFile,VideoPath,ScoreFile,ScorePath,FemaleID,FemaleGT,...,RightGT,RightSex,RightDOB,RightType,StrangerID,PPTlane,PartnerChamber,CameraType,FocalGTConfirmed,Pregnant
0,Nov22_Pair1,PPT,20221117,12:26,Nov22_Pair1_PPT_WIN_20221117_12_25_03_Pro.mp4,\Scn2a_X1Behavior_Nov22\Female PPTs\11172022,Nov22_Pair1_PPT.csv,Scn2a_X1Behavior_Nov22\Aggregated_Events,B8002,Het,...,WT,M,,Partner,,1,Right,,,
1,Nov22_Pair15,PPT,20230106,13:19,Nov22_Pair15_PPT.mov,\Scn2a_X1Behavior_Nov22\Female PPTs\01062023,Nov22_Pair15_PPT.csv,Scn2a_X1Behavior_Nov22\Aggregated_Events,B6614,Het,...,WT,M,,Partner,,4,Right,,,
2,Nov22_Pair16,PPT,20230106,13:19,Nov22_Pair16_PPT_2023-01-06 13-18-09.mp4,\Scn2a_X1Behavior_Nov22\Female PPTs\01062023,Nov22_Pair16_PPT.csv,Scn2a_X1Behavior_Nov22\Aggregated_Events,B6615,WT,...,WT,M,,Stranger,,3,Left,,,
3,Nov22_Pair17,PPT,20230109,13:11,Nov22_Pair17_PPT_WIN_20230109_13_10_33_Pro.mp4,\Scn2a_X1Behavior_Nov22\Male PPTs\01092023,Nov22_Pair17_PPT.csv,Scn2a_X1Behavior_Nov22\Aggregated_Events,,,...,WT,F,,Stranger,,1,Left,,,
4,Nov22_Pair18,PPT,20230109,13:11,Nov22_Pair18_PPT_2023-01-09 13-10-28.mp4,\Scn2a_X1Behavior_Nov22\Male PPTs\01092023,Nov22_Pair18_PPT.csv,Scn2a_X1Behavior_Nov22\Aggregated_Events,,,...,WT,F,,Partner,,2,Right,,,


In [13]:
# --- Check PPTs for missing-center violations ---

def check_PPTs_for_center_violations(scoretab,verbose):
    """Run the Right/Left check on the STATE events of one score table.

    Parameters
    ----------
    scoretab : table exposing the columns ``start``, ``end``, ``Behavior``
        and ``behavior_type``.
    verbose : when truthy, one line is printed per reported violation.

    Returns
    -------
    The four lists produced by ``check_for_Right_Left_issues``: start times,
    end times, flagged behaviors and the behaviors that follow them.
    """
    # only rows typed as STATE take part in the check
    state_rows = scoretab.behavior_type=='STATE'

    center_s, center_e, center_b1, center_b2 = check_for_Right_Left_issues(
        scoretab.start[state_rows],
        scoretab.end[state_rows],
        scoretab.Behavior[state_rows],
    )

    if verbose:
        for ctrs, ctre, first, second in zip(center_s, center_e, center_b1, center_b2):
            print(f'Center violation for behavior {first} starting at {ctrs} ending at {ctre}, and following behavior {second}.')

    return center_s,center_e,center_b1,center_b2

def check_for_Right_Left_issues(starts,ends,behaviors):
    """Flag consecutive events that switch between 'Right' and 'Left'.

    An event is flagged when its behavior name contains 'Right' and the next
    one contains 'Left', or the other way round.

    Parameters
    ----------
    starts, ends, behaviors : pandas Series of equal length, accessed by
        position, holding the start time, end time and behavior name of
        each event in order.

    Returns
    -------
    (violation_s, violation_e, violation_b1, violation_b2) : four lists with
        the start time, end time and name of each flagged event and the name
        of the event that follows it.
    """
    violation_s = []
    violation_e = []
    violation_b1 = []
    violation_b2 = []

    # visit every adjacent pair (i, i+1); the last pair is included, which a
    # slice ending at -2 would leave out
    for i in range(len(behaviors) - 1):
        current = behaviors.iloc[i]
        following = behaviors.iloc[i+1]

        right_to_left = ('Right' in current) and ('Left' in following)
        left_to_right = ('Left' in current) and ('Right' in following)

        if right_to_left or left_to_right:
            violation_s.append(starts.iloc[i])
            violation_e.append(ends.iloc[i])
            violation_b1.append(current)
            violation_b2.append(following)

    return violation_s, violation_e, violation_b1, violation_b2

In [15]:
# list the events csv expected for each pair tag, then report the ones that
# are not among the aggregated files
allputativecsv = [f'{tag}_PPT.csv' for tag in meta.PairTag]

for fname in (name for name in allputativecsv if name not in aggfiles):
    print(fname)

Nov22_Pair39_PPT.csv
