In [None]:
import numpy as np
from numpy import matlib as ml
import pandas as pd
import os
import warnings
import librosa
import matplotlib.pyplot as plt
import matplotlib.colors as mcolors
import seaborn as sns
from random import shuffle
import scipy.stats as st

from pynwb import NWBHDF5IO
from pynwb.epoch import TimeIntervals

from sklearn.decomposition import PCA

In [None]:
# ----- SET UP FILE HANDLING -----

# !! Josh -- you will need to edit this block with your local path to the NWB files
# You can/should just download the NWB folder from the Scn2a project Box, then you can just point to that directory here
# and everything should just work (haha famous last words)
# If we end up collaborating on files enough to make it stupid to switch back and forth between your computer(s) and mine,
# we can set up environmental variables that will take care of it, but for now this dumb kludge should be okay

# computer = 'Dorian'
computer = 'Ernest'
# computer = 'Josh'

match computer:
    case 'Dorian':
        nwbpath = 'M:\\scn2a-paper-GWJSNH\\NWB-files'
        
    case 'Ernest':
        nwbpath = 'C:\\Users\\nhogl\\Documents\\GitHub\\scn2a-paper-GWJSNH-code\\NWB'
        
    case 'Josh':
        nwbpath = '\your\path\here' # EDIT ME

allnwb = []
allnwb += [os.path.join(nwbpath,each) for each in os.listdir(nwbpath) if each.endswith('.nwb')]

printy = True # EDIT ME if you don't want the loop over each file to print the file name

In [None]:
# ----- COLLATE TIME SPENT IN EACH STATE BEHAVIOR AND COUNTS OF EACH POINT BEHAVIOR FOR EACH ASSAY -----

# This cell looks in each NWB file and creates the summary metrics.  The idea is to do this once at the beginning then
# reconfigure the resulting dataframe for all subsequent analyses.  I did not functionalize it because it should just
# happen once at the beginning, but it might bear refactoring if we want more flexibility on the metrics

perfile = [] # one long-format summary table per usable file; concatenated once after the loop

for fname in allnwb: # loop over files (allnwb already holds full paths, so no re-joining with nwbpath)
    # for sanity check, print out the name of the file being handled
    # splitext removes only the extension; str.strip('.nwb') would strip any leading/trailing '.', 'n', 'w', 'b' characters
    recstem = os.path.splitext(os.path.basename(fname))[0]
    if printy: # you can turn off filename printing by setting this to false
        print(recstem)

    # open file read-only; the context manager closes the HDF5 handle even if something below raises
    # (open question from the original: open in write mode to append snippets?)
    with NWBHDF5IO(fname, mode="r") as io:
        nwbfile = io.read()
        meta = nwbfile.lab_meta_data['vole_metadata']

        # filter out files that don't have complete timelines (original note: can I just get rid of this?)
        if not meta.timeline_complete:
            print('Incomplete timeline; skipping.')
            continue

        data = nwbfile.intervals['annotated_behavior'].to_dataframe() # convert annotation table to dataframe
        df = data[data.behavior!='Start assay']

        # get counts of all events and add a column to label as such
        counts = df.groupby('behavior')['start_time'].count().reset_index(name='number')
        counts['metric_label'] = 'count'

        # do median duration for state events and add a column to label as such
        medians = df[df.atype=='STATE'].groupby(['behavior'])['duration'].apply(np.median).reset_index(name='number')
        medians['metric_label'] = 'median'

        combined = pd.concat([counts,medians]) # combine table

        # PPT label conversion: rename chamber sides to Partner/Stranger
        if meta.assay_type=='PPT':
            partnerside = meta.assay_type__partner_chamber
            if partnerside=='Left':
                relabel = {'Left':'Partner', 'Right':'Stranger'}
            elif partnerside=='Right':
                relabel = {'Right':'Partner', 'Left':'Stranger'}
            else:
                relabel = None
                print('Invalid value for partner chamber.')
            if relabel is not None:
                # assign the result back; chained inplace replace on combined.behavior
                # does not reliably modify the frame (pandas Copy-on-Write)
                combined['behavior'] = combined['behavior'].replace(relabel, regex=True)

        # add metadata labels (scalars broadcast to every row)
        combined['ET'] = nwbfile.subject.subject_id
        combined['sex'] = nwbfile.subject.sex
        combined['GT'] = nwbfile.subject.genotype
        combined['assay'] = meta.assay_type

    perfile.append(combined)

# Building the list first means a skipped first file can no longer leave rundf undefined.
if not perfile:
    raise ValueError('No NWB files with complete timelines were found; nothing to collate.')
rundf = pd.concat(perfile)

# ----- ADJUST LABELS IN DATAFRAME AND CONVERT TO WIDE FORMAT FOR PCA -----

# add a column to cross reference behaviors to assays and metric
rundf['a-b-m'] = rundf['assay'].add('-').add(rundf['behavior']).add('-').add(rundf['metric_label'])

# rearrange table so that each individual has a row of data
wide = rundf.pivot(index=['ET','sex','GT'], columns='a-b-m', values='number')

# THIS STEP WILL BE A PROBLEM WITH ASSAYS THAT DON'T EXIST
# replace any missing values with zeros (as they did not occur)
filled = wide.fillna(0)

# pull metadata out of indices
filled = filled.reset_index(level=['sex', 'GT'])

# generate master list of observations; inventory of which behaviors appear in which assays
allcols = filled.columns

In [None]:
# ----- SET UP COLORS -----

# Hardcoded RGB triplets have proven the most practical approach; swap these values if the colors change.

acols = [
    [0.627451, 0.57254905, 0.37254903], # all colors, in the order female x2 then male x2
    [0.9607843, 0.7882353, 0.15294118],
    [0.34901962, 0.35686275, 0.49019608],
    [0.24705882, 0.30588236, 0.9607843],
]

# first two entries are the female colors, last two the male colors (WT first in each pair)
fcols, mcols = acols[:2], acols[2:]

# seaborn palettes for plotting: female, male, and all four colors
fpal, mpal, apal = (sns.color_palette(colorset) for colorset in (fcols, mcols, acols))

In [None]:
def check_column_labels(toUse,allCols):
    """Return the labels from ``toUse`` that are present in ``allCols``.

    The order (and any repeats) of ``toUse`` is preserved; labels that do not
    appear in ``allCols`` are dropped.
    """
    return [candidate for candidate in toUse if candidate in allCols]

def generate_prospective_columns(useAssays,useBehaviors,useMetrics):
    """Build every 'assay-behavior-metric' label for the given selections.

    Labels are ordered with the metric varying slowest, then the behavior,
    and the assay varying fastest. Returns a list of strings.
    """
    colsToUse = [assay+'-'+behav+'-'+metric for metric in useMetrics for behav in useBehaviors for assay in useAssays]
    return colsToUse

# TODO: an unfinished bare `def` stub was left here; it made the whole cell a SyntaxError,
# so it is removed until the next helper is actually written.

In [1]:
# Selections used to build the 'assay-behavior-metric' column labels.
useAssays = ['introduction','aggression']
useBehaviors = ['Huddle','Sniff','Investigate']
useMetrics = ['count','median']

In [2]:
# Same label construction as generate_prospective_columns: metric varies slowest, assay fastest.
colsToUse = [
    '-'.join((a_name, b_name, m_name))
    for m_name in useMetrics
    for b_name in useBehaviors
    for a_name in useAssays
]

In [3]:
# Bare expression: display the generated labels as the cell output.
colsToUse

['introduction-Huddle-count',
 'aggression-Huddle-count',
 'introduction-Sniff-count',
 'aggression-Sniff-count',
 'introduction-Investigate-count',
 'aggression-Investigate-count',
 'introduction-Huddle-median',
 'aggression-Huddle-median',
 'introduction-Sniff-median',
 'aggression-Sniff-median',
 'introduction-Investigate-median',
 'aggression-Investigate-median']