In [2]:
import os
from boxsdk import OAuth2, Client
import re
import warnings

from datetime import datetime, date
from uuid import uuid4
from dateutil.tz import tzlocal
from ast import literal_eval

import numpy as np
import pandas as pd
import warnings
import librosa
import pickle

from pynwb import NWBHDF5IO, NWBFile, TimeSeries
from pynwb.file import Subject
from pynwb.epoch import TimeIntervals
from pynwb.image import ImageSeries
from ndx_manoli_meta import AssayMetadata

from nwb_utils import get_date_from_block
from box_utils import *
from behavior_error_checks import *

In [3]:
# ---- Get login credentials from environment variables -----

# - note this needs to be adjusted for each authorized user... ask Nerissa about getting and setting this information
# - developer access tokens expire quickly, so this probably needs to be set at startup and perhaps even while working
client_id = os.environ.get('box_client_id')
client_secret = os.environ.get('box_client_secret')
access_token = 'PARH7Uob8OpR6bxMkBE67wlW2jjUS0z7'


# ----- Open a connection to the Box server -----

auth = OAuth2(
    client_id=client_id, # put these in environmental variables
    client_secret=client_secret,
    access_token=access_token,
)
client = Client(auth)

# ----- Work on directory structure -----

# get the list of items in the whole Scn2a folder
items = client.folder(folder_id='196168550606').get_items()

# list contents
for item in items:
    print(f'{item.type.capitalize()} {item.id} is named "{item.name}"')

Folder 238576018931 is named "Aggregated_Events"
Folder 202955487033 is named "April2023_ShortCoHab_Females"
Folder 226890392440 is named "BorisFiles"
Folder 196172227913 is named "Female Intros"
Folder 196173015061 is named "Female PPTs"
Folder 196170422974 is named "Female RI"
Folder 196169230497 is named "Female SepReunion"
Folder 196171174766 is named "Female TMs"
Folder 214178119262 is named "June2023_ControlBehaviors"
Folder 196166676922 is named "Male Intros"
Folder 196172624162 is named "Male PPTs"
Folder 196171675763 is named "Male RI"
Folder 196172896956 is named "Male SepReunion"
Folder 196172739371 is named "Male TMs"
Folder 248394520156 is named "Naive_Choice"
Folder 229587753245 is named "Oct2023_JuvenileBehavior"
File 1503021327997 is named "NC11_2024-02-14 11-43-05.mp4"
File 1177298175085 is named "Scn2aX1Nov22_Key.xlsx"


In [4]:
# ----- Set up user parameters
directory_keyword = 'TMs' # how the assay directories are labeled
metadata_file = 'metadata_TMs.csv' # where the initial metadata is stored
boris_keyword = 'TM'
score_path = os.path.join('Scn2a_X1Behavior_Nov22','Aggregated_Events')
assayregex = 'T|t'
# colors for plotting
fillcols = [[160, 146, 95],[245, 201, 39],[89, 91, 125],[63, 78, 245]]
linecols = [[96, 87, 57],[147, 120, 23],[53, 54, 75],[33, 41, 131]]

In [5]:
# ----- Figure out which directories have the assay of interest

projectDir = '196168550606'
rootpath = ['Scn2a_X1Behavior_Nov22']

usedirs = []
items = client.folder(folder_id=projectDir).get_items() # top level folder

for item in items:
    iid = item.id
    inm = item.name
    if inm.find(directory_keyword)>-1:
        usedirs.append(iid)

print(usedirs)

# --- collect all the paths and filenames

allpaths = []

for usedir in usedirs: 
    
    # add starting folder to curr_path
    udnm = client.folder(folder_id=usedir).get(fields=['name']).name
    curr_path = rootpath + [udnm]
    
    # search for files
    thesepaths = visit_all_dirs_files(usedir,[],curr_path,client)

    # consolidate
    allpaths += thesepaths
    
print(allpaths)

['196171174766', '196172739371']
['Scn2a_X1Behavior_Nov22\\Female TMs\\01052023\\DoNotScore_Nov22_Pair13_TM_WIN_20230105_13_12_06_Pro.mp4', 'Scn2a_X1Behavior_Nov22\\Female TMs\\01052023\\DoNotScore_Nov22_Pair14_TM_WIN_20230105_13_10_51_Pro.mp4', 'Scn2a_X1Behavior_Nov22\\Female TMs\\01052023\\Nov22_Pair15_TM.mov', 'Scn2a_X1Behavior_Nov22\\Female TMs\\01052023\\Nov22_Pair16_TM_WIN_20230105_13_07_45_Pro.mp4', 'Scn2a_X1Behavior_Nov22\\Female TMs\\01262023\\Nov22_Pair21_TM_2023-01-26 14-19-32.mp4', 'Scn2a_X1Behavior_Nov22\\Female TMs\\01262023\\Nov22_Pair22_TM_2023-01-26 14-20-52.mp4', 'Scn2a_X1Behavior_Nov22\\Female TMs\\01262023\\Nov22_Pair23_TM_2023-01-26 14-21-39.mp4', 'Scn2a_X1Behavior_Nov22\\Female TMs\\02092023\\Nov22_Pair28_TM_2023-02-09 15-04-39.mp4', 'Scn2a_X1Behavior_Nov22\\Female TMs\\02092023\\Nov22_Pair29_TM_2023-02-09 15-05-59.mp4', 'Scn2a_X1Behavior_Nov22\\Female TMs\\02092023\\Nov22_Pair30_TM_2023-02-09 15-06-59.mp4', 'Scn2a_X1Behavior_Nov22\\Female TMs\\02092023\\Nov22_Pai

In [6]:
# ----- Get a list of just file names to hunt through -----
justfiles = []
for pth in allpaths:
    justfiles.append(os.path.basename(pth))
print(justfiles)

['DoNotScore_Nov22_Pair13_TM_WIN_20230105_13_12_06_Pro.mp4', 'DoNotScore_Nov22_Pair14_TM_WIN_20230105_13_10_51_Pro.mp4', 'Nov22_Pair15_TM.mov', 'Nov22_Pair16_TM_WIN_20230105_13_07_45_Pro.mp4', 'Nov22_Pair21_TM_2023-01-26 14-19-32.mp4', 'Nov22_Pair22_TM_2023-01-26 14-20-52.mp4', 'Nov22_Pair23_TM_2023-01-26 14-21-39.mp4', 'Nov22_Pair28_TM_2023-02-09 15-04-39.mp4', 'Nov22_Pair29_TM_2023-02-09 15-05-59.mp4', 'Nov22_Pair30_TM_2023-02-09 15-06-59.mp4', 'Nov22_Pair31_TM_WIN_20230209_15_07_53_Pro.mp4', 'Nov22_Pair35_TM_2023-03-02 13-53-46.mp4', 'Nov22_Pair36_TM_2023-03-02 13-53-47.mp4', 'Nov22_Pair56_TM_2023-08-10 12-46-53.mp4', 'Nov22_Pair59_TM_2023-09-07 13-06-48.mp4', 'Nov22_Pair60_TM_2023-09-07 13-07-50.mp4', 'Nov22_Pair61_TM_2023-11-09 13-52-22.mp4', 'Nov22_Pair62_TM_2023-11-09 13-51-04.mp4', 'Nov22_Pair63_TM_2023-11-09 13-50-47.mp4', 'Nov22_Pair64_TM.mov', 'Nov22_Pair1_TM_WIN_20221116_13_46_35_Pro.mp4', 'Nov22_Pair2TM_WIN_20221116_13_49_49_Pro.mp4', 'Nov22_Pair3_TM.mov', 'Nov22_Pair3_TM_

In [9]:
# ----- Use metadata table to find expected files and add those columns to the table -----

# load up metadata
meta = pd.read_csv(metadata_file)

# generate a regex of possible matches for filename
fileIndex = []
for tag in meta.PairTag:
    match = find_in_list(justfiles,tag,assayregex)
    if len(match)==1:
        fileIndex.append(match[0])
    elif len(match)==0:
        print(f'No match for: {tag}')
    else:
        print(f'Found extra matches for: {tag}')
        for idx in match:
            print(justfiles[idx])
        
# use fileIndex to look up file names and paths
metafiles = []
metapaths = []
for i,tag in enumerate(meta.PairTag):
    metafiles.append(justfiles[fileIndex[i]])
    metapaths.append('\\'+os.path.dirname(allpaths[fileIndex[i]]))

# add files and paths to the metadata table
# meta.VideoFile = metafiles
# meta.VideoPath = metapaths

Found extra matches for: Nov22_Pair3
Nov22_Pair3_TM.mov
Nov22_Pair3_TM_2.mov
No match for: Nov22_Pair58


IndexError: list index out of range

In [None]:
# ----- Get BORIS scored aggregated events file names -----
aggfiles = []
items = client.folder(folder_id='238576018931').get_items()

for item in items:
    inm = item.name
    if inm.find(boris_keyword)>-1:
        aggfiles.append(inm)
        
print(aggfiles)
print(len(aggfiles))

In [None]:
# ----- Add Boris csvs and colors to metadata table

# match pair tags
ordered_agg = []
for tag in meta.PairTag:
    for fname in aggfiles:
        if fname.startswith(tag+'_'):
            ordered_agg.append(fname)

meta.ScoreFile = ordered_agg

# add path
score_path_arr = [score_path]*len(aggfiles)
meta.ScorePath = score_path_arr

# ----- Add colors to meta file so they go in the NWB metadata -----

# meta.FocalColor
allcols = []
for i, ptag in enumerate(meta.PairTag):
    if meta.FocalSex[i]=='F':
        if meta.FocalGT[i]=='WT':
            tcol = [fillcols[0],linecols[0]]
        else:
            tcol = [fillcols[1],linecols[1]]
    else:
        if meta.FocalGT[i]=='WT':
            tcol = [fillcols[2],linecols[2]]
        else:
            tcol = [fillcols[3],linecols[3]]
    allcols.append(tcol)
    
meta.FocalColor=allcols

# ----- Fix bad date formatting -----
# dx = meta.pop('Unnamed: 0')

newrecdates = []
for i, olddate in enumerate(meta.RecDate):
    dsplit = olddate.split('/')
    yr = dsplit[2]
    mn = dsplit[0]
    dy = dsplit[1]
    if len(mn)==1:
        mn = '0'+mn
    if len(dy)==1:
        dy = '0'+dy
    newrecdates.append(yr+mn+dy)
    
meta.RecDate=newrecdates

# --- write metadata table
meta.to_csv(metadata_file,index=False)

In [None]:
# ----- NWB set up -----

# 1. Set up user parameters
# 2. Loop over metadata file
# 3. Get and set up metadata
# 4. Set up behavior table
# 5. Write nwb file

# -- set parameters
lab = "Manoli @ UCSF"
exclude_flag = False
# path to storage for NWB files
nwbfile_path = os.path.join('M:\\','scn2a-paper-GWJSNH','NWB-files')
# whether to write NWB files to disk yet
write_NWB_to_disk = False

# load metadata
meta = pd.read_csv(metadata_file)
meta.FocalColor = meta.FocalColor.apply(literal_eval) # convert the colors to real arrays

# keep track of overloaded files
multifile_log = []

# keep track of annotation overlaps
ol_files = []
ol_behav = []
ol_start = []
ol_end = []

# keep track of annotation gaps
g_files = []
g_b1 = []
g_b2 = []
g_end = []
g_start = []

# -- loop over metadata
for i, ptag in enumerate(meta.PairTag):
    assay_type = meta.AssayType[i]
    nwbfilename = f'{ptag}_{assay_type}.nwb'
    print(nwbfilename)
    
    # check if file already exists
    wfullpath = os.path.join(nwbfile_path,nwbfilename)    
    if not os.path.exists(wfullpath):    
        
        # get session specific metadata
        thisdate = str(meta.RecDate[i])

        # set up recording time... it would be nice to get actual video data for the times
        datepieces = get_date_from_block(thisdate)
        rtime = meta.RecTime
        timepieces = rtime.split(':')
        sess_start = datetime(datepieces[0],datepieces[1],datepieces[2],timepieces[0],timepieces[1],0,0,tzlocal())

        session_description = f'Behavioral annotations from pair {ptag} in a(n) {assay_type} assay.'

        # make NWB file
        nwbfile = NWBFile(
            session_description=session_description,
            identifier = str(uuid4()),
            session_start_time = sess_start,
            lab=lab,
            experimenter=meta.RanBy[i],
            session_id = nwbfilename[0:-4], # check this
        )

        # add subject info
        nwbfile.subject = Subject(
            subject_id = meta.FocalID[i],
            species = 'Microtus ochrogaster',
            sex = meta.FocalSex[i],
            genotype = meta.FocalGT[i]
        )

        # get assay duration
        duration = float(meta.AssayDuration[i])

        # figure out partner info
        if meta.FocalSex[i]=='F':
            pID = meta.MaleID[i]
            pGT = meta.MaleGT[i]
        elif meta.FocalSex[i]=='M':
            pID = meta.FemaleID[i]
            pGT = meta.MaleGT[i]
        else:
            print(f'Focal sex is neither F nor M; something is wrong with {ptag}.')

        # TODO convert these to date objects and make sure this works
        # figure out time since pairing
        days = meta.RecDate[i] - meta.PairDate[i]

        # add lab metadata
        metaObj = AssayMetadata(
                        assay_type=assay_type,
                        exclude_flag=exclude_flag,
                        duration=duration,
                        room=str(meta.AssayRoom[i]),
                        timeline=str(meta.Timeline[i]),
                        ethogram=str(meta.Ethogram[i]),
                        experimenter=str(meta.ScoredBy[i]),
                        timeline_complete=meta.FullTimeline[i],
                        colors=meta.FocalColor[i],
                        introduction=os.path.join(meta.ScorePath[i],meta.ScoreFile[i]),
                        introduction__partner_ID=str(pID),
                        introduction__partner_GT=str(pGT),
                        )

        # Add the test LabMetaDataExtensionExample to the NWBFile
        nwbfile.add_lab_meta_data(lab_meta_data=metaObj)

        # TODO Add video file
        video_ext_file = ImageSeries(
            name='behaviorVideo',
            description='Raw original video.',
            unit='n.a.',
            external_file=[os.path.join(meta.VideoPath[i],meta.VideoFile[i])],
            format='external',
            starting_time=0.0,
            rate=25.0,
        )

        nwbfile.add_acquisition(video_ext_file)

        # --- Add annotated behavior data

        # Make sure these files are downloaded with the same path configuration as listed in the metadata
        # Or adjust here as I did with telling the code to go up a level
        scoretab = pd.read_csv(os.path.join('..',meta.ScorePath[i],meta.ScoreFile[i])) # load up csv of annotations

        # change problematic column names
        scoretab.rename(columns={'Start (s)':'start'}, inplace=True)
        scoretab.rename(columns={'Stop (s)':'end'}, inplace=True)
        scoretab.rename(columns={'Duration (s)':'duration'}, inplace=True)
        scoretab.rename(columns={'Behavior type':'behavior_type'}, inplace=True)

        # --- Check for problems in the annotation file

        # check for extraneous media files
        if len(np.unique(scoretab['Media file']))>1:
            warnings.warn(f'Scored csv {meta.ScoreFile[i]} contains events for multiple media files.')
            multifile_log.append(meta.ScoreFile[i])
            for fp in np.unique(scoretab['Media file']):
                print(fp)

        else:       

            # check for total event duration violations
            totdur = np.sum(scoretab.duration)
            if totdur>1.05*duration:
                warnings.warn(f'Total events duration of {meta.ScoreFile[i]} exceeds the assay duration.')
            elif totdur<0.95*duration:
                warnings.warn(f'Total events duration of {meta.ScoreFile[i]} does not meet the assay duration.')

            # check for overlapping events and check for unscored gaps between events
            # keep track of annotation overlaps
            ols,ole,olb,gs,ge,gb1,gb2 = check_for_event_interactions(scoretab,True)
            
            # log overlaps
            for j,ol in enumerate(ols):
                ol_files.append(meta.ScoreFile[i])
                ol_behav.append(olb[j])
                ol_start.append(ol)
                ol_end.append(ole[j])

            # log gaps
            for j, gp in enumerate(gs):
                g_files.append(meta.ScoreFile[i])
                g_b1.append(gb1[j])
                g_b2.append(gb2[j])
                g_end.append(ge[j])
                g_start.append(gs[j])

        # make NWB object corresponding to the annotation table
        behavior_intervals = TimeIntervals(name="annotated_behavior",
            description="Intervals of scored behavior.")

        behavior_intervals.add_column(name="behavior", description="The annotation from the ethogram.")
        behavior_intervals.add_column(name="duration", description="Duration of the behavior.")
        behavior_intervals.add_column(name="atype", description="Point or state event.")

        # populate table
        for i, start in enumerate(scoretab.start):
            end = scoretab.end[i]
            behav = scoretab.Behavior[i]
            atype = scoretab.behavior_type[i]
            dur = scoretab.duration[i]
            behavior_intervals.add_row(start_time=start,stop_time=end,behavior=behav,atype=atype,duration=dur)

        # add to NWB file
        nwbfile.add_time_intervals(behavior_intervals)
        
        # write file to disk
        if write_NWB_to_disk:
            with NWBHDF5IO(wfullpath, "w") as io:
                io.write(nwbfile)

# --- Write annotation violation logs for review
ol_log = {'file':ol_files,'behavior':ol_behav,'start_time':ol_start,'end_time':ol_end}
ol_tab = pd.DataFrame(data=ol_log)
ol_tab.to_csv(f'overlap_log_{meta.AssayType[i]}.csv',index=False)

gap_durs = []
for i,st in enumerate(g_start):
    gap_durs.append(st-g_end[i])
gap_log = {'file':g_files,'behavior1':g_b1,'behavior1_end':g_end,'behavior2':g_b2,'behavior2_start':g_start,
           'gap_duration':gap_durs}
g_tab = pd.DataFrame(data=gap_log)
g_tab.to_csv(f'gap_log_{meta.AssayType[i]}.csv',index=False)