A notebook for generating initial statistics across the whole brain for the spontaneous events

In [1]:
%load_ext autoreload
%autoreload

In [2]:
import glob
import itertools
from pathlib import Path
import pickle

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from janelia_core.dataprocessing.dataset import ROIDataset
from janelia_core.stats.regression import grouped_linear_regression_acm_stats
from janelia_core.stats.regression import grouped_linear_regression_ols_estimator

from keller_zlatic_vnc.data_processing import count_unique_subjs_per_transition
from keller_zlatic_vnc.data_processing import generate_standard_id_for_full_annots
from keller_zlatic_vnc.data_processing import generate_standard_id_for_volume
from keller_zlatic_vnc.data_processing import get_basic_clean_annotations_from_full
from keller_zlatic_vnc.data_processing import read_full_annotations

## Parameters go here

In [3]:
# All analysis parameters are collected in `ps`, which is saved with the results
ps = dict()

# Folders containing a4 and a9 annotation data
ps['a4_annot_folder'] = r'\\dm11\bishoplab\projects\keller_vnc\data\full_annotations\behavior_csv_cl_A4'
ps['a9_annot_folder'] = r'\\dm11\bishoplab\projects\keller_vnc\data\full_annotations\behavior_csv_cl_A9'
ps['spont_only_annot_folder'] = r'\\dm11\bishoplab\projects\keller_vnc\data\full_annotations\spontaneous_only_annotations'

# File containing locations to registered volumes
ps['volume_loc_file'] = r'\\dm11\bishoplab\projects\keller_vnc\data\experiment_data_locations.xlsx'

# List subjects we do not want to include in the analysis
ps['exclude_subjs'] = set(['CW_17-11-06-L2'])

# Specify the threshold we use (in number of stacks) to determine when a quiet transition has occurred
ps['q_th'] = 4

# Subfolder containing the dataset for each subject
ps['dataset_folder'] = 'extracted'

# Base folder where datasets are stored
ps['dataset_base_folder'] =r'K:\\SV4'

# Data to calculate Delta F/F for in each dataset
ps['f_ts_str'] = 'f_1_5_5'
ps['bl_ts_str'] = 'bl_1_5_5_long'

# Parameters for calculating dff
ps['background'] = 100
ps['ep'] = 20

# Min number of subjects we must observe a transition in for us to analyze it.
# Stored in ps so the threshold is recorded with the saved results; the bare
# name is kept because a later cell refers to `min_n_subjs` directly.
ps['min_n_subjs'] = 10
min_n_subjs = ps['min_n_subjs']

# Alpha value for thresholding p-values when calculating stats
ps['alpha'] = .05

# Specify the window we pull dff from
ps['window_type'] = 'start_locked' #'whole_event' 'start_locked'

# If we are using a window locked to event start or stop, we give the relative offset and window length here
ps['window_offset'] = -18
ps['window_length'] = 15

# Specify if we only consider events where the extracted dff window is entirely contained within the event
ps['enforce_contained_events'] = False

# True if we want to pool preceding behaviors
ps['pool_preceeding_behaviors'] = False

# Specify where we save results
ps['save_folder'] = r'\\dm11\bishoplab\projects\keller_vnc\results\whole_brain_spont_stats'
ps['save_name'] = 'spont_1_5_5_long_bl_co_4_start_locked_neg18_15.pkl'

## Get list of all subjects we can analyze

These are the subjects for which we have both registered volumes and annotations, and which are not in the list of excluded subjects.

In [4]:
# Collect every annotation csv file (a4 folder first, then a9, then spontaneous-only)
annot_file_paths = []
for folder_key in ('a4_annot_folder', 'a9_annot_folder', 'spont_only_annot_folder'):
    annot_file_paths += glob.glob(str(Path(ps[folder_key]) / '*.csv'))

# Derive the standard subject id for each file from its file name
annot_file_names = [Path(p).name for p in annot_file_paths]
annot_subjs = [generate_standard_id_for_full_annots(fn) for fn in annot_file_names]

In [5]:
# Read in location of all registered volumes
def c_fcn(str):
    """Return the given string with every single-quote character removed.

    Used as a cell converter when the volume location spreadsheet is read.
    """
    # Mapping the apostrophe's code point to None makes translate() delete it
    return str.translate({ord("'"): None})
# Strip single quotes from cell values as they are read.
# NOTE(review): the converter keys are integer positions 0 and 1 while usecols
# selects columns 1 and 2 -- confirm whether pandas interprets these keys
# relative to the sheet or to the usecols selection, so both columns are cleaned.
converters = {0:c_fcn, 1:c_fcn}

# Read the two selected columns; later code looks them up as 'Main folder' and 'Subfolder'
volume_locs = pd.read_excel(ps['volume_loc_file'], header=1, usecols=[1, 2], converters=converters)
# Standard subject id for every row of the table, in row order
volume_subjs = [generate_standard_id_for_volume(volume_locs.loc[i,'Main folder'], 
                                                       volume_locs.loc[i,'Subfolder'])  for i in volume_locs.index]
# Index label of the table row behind each entry of volume_subjs
volume_inds = [i for i in volume_locs.index]

In [6]:
# One volume subject is named differently from its annotations; rename it so
# the two match (this is only needed for this one subject)
m_ind = np.flatnonzero(np.asarray(volume_subjs) == 'CW_17-11-03-L6')[0]
volume_subjs[m_ind] = 'CW_17-11-03-L6-2'

In [7]:
# Subjects with both a registered volume and annotations, minus the excluded ones
analyze_subjs = set(volume_subjs).intersection(annot_subjs).difference(ps['exclude_subjs'])
# Sort so subjects are always processed in the same order
analyze_subjs = list(np.sort(np.asarray(list(analyze_subjs))))

## For each subject we analyze, determine where its annotation and volume data are

In [8]:
# Convert the id lists to arrays once rather than on every loop iteration
volume_subjs_arr = np.asarray(volume_subjs)
annot_subjs_arr = np.asarray(annot_subjs)

# Map each subject to its volume folders and annotation file (first match in each list)
subject_dict = dict()
for s_id in analyze_subjs:
    volume_i = np.argwhere(volume_subjs_arr == s_id)[0][0]
    annot_i = np.argwhere(annot_subjs_arr == s_id)[0][0]
    volume_row = volume_inds[volume_i]
    subject_dict[s_id] = {'volume_main_folder': volume_locs.loc[volume_row, 'Main folder'],
                          'volume_sub_folder': volume_locs.loc[volume_row, 'Subfolder'],
                          'annot_file': annot_file_paths[annot_i]}

## Read in the annotation data for all subjects we analyze

We also generate cleaned and supplemented annotations here

In [9]:
# Read each subject's annotation file and tag the table with the subject id
annotations = []
for s_id in subject_dict:
    subj_tbl = read_full_annotations(subject_dict[s_id]['annot_file'])
    subj_tbl['subject_id'] = s_id
    annotations.append(subj_tbl)

In [10]:
# Replace each subject's full annotation table with its basic cleaned version
annotations = [get_basic_clean_annotations_from_full(annot) for annot in annotations]

In [11]:
# Merge the per-subject tables into one table with a fresh 0..n-1 index;
# from here on `annotations` is a single DataFrame rather than a list
annotations = pd.concat(annotations, ignore_index=True)

## Now threshold transitions to determine when events were preceded or succeeded by quiet

In [12]:
# Gap between each event and the behaviors immediately before and after it
gap_before = annotations['start'] - annotations['beh_before_end']
gap_after = annotations['beh_after_start'] - annotations['end']

# A gap longer than the threshold means the neighboring behavior is labeled (Q)uiet
annotations.loc[gap_before > ps['q_th'], 'beh_before'] = 'Q'
annotations.loc[gap_after > ps['q_th'], 'beh_after'] = 'Q'

# The timing columns of the neighboring behaviors are not needed after this point
annotations.drop(columns=['beh_before_start', 'beh_before_end', 'beh_after_start', 'beh_after_end'],
                 inplace=True)

## Pool preceding behaviors into one (G)rouped label if requested

In [13]:
# When pooling is requested, overwrite every preceding-behavior label with 'G'
if ps['pool_preceeding_behaviors']:
    annotations['beh_before'] = 'G'

## Now we read in the $\frac{\Delta F}{F}$ data for all subjects 

In [14]:
def calc_dff(f, b, background=None, ep=None):
    """Compute Delta F/F as (f - b) / (b - background + ep).

    Args:
        f: fluorescence values.
        b: baseline values; must be compatible with f for elementwise arithmetic.
        background: value subtracted from the baseline in the denominator.
            If None, ps['background'] is used.
        ep: offset added to the denominator. If None, ps['ep'] is used.

    Returns:
        The elementwise Delta F/F values.
    """
    # Look the defaults up when the function is called rather than when it is
    # defined, so changes to ps take effect without re-running this cell.
    if background is None:
        background = ps['background']
    if ep is None:
        ep = ps['ep']
    return (f - b) / (b - background + ep)

In [15]:
def calc_mean_dff(x, start, stop, window_type=None, window_offset=None, window_length=None):
    """Average x over a window of samples defined relative to an event.

    Args:
        x: array indexed as x[sample, :]; the selected samples are averaged along axis 0.
        start: index of the first sample of the event.
        stop: index one past the last sample of the event.
        window_type: 'whole_event' to average over [start, stop), or
            'start_locked' to average over a fixed-length window positioned
            relative to start. If None, ps['window_type'] is used.
        window_offset: for 'start_locked', offset of the window start relative
            to start. If None, ps['window_offset'] is used.
        window_length: for 'start_locked', number of samples in the window.
            If None, ps['window_length'] is used.

    Returns:
        mn_vls: mean of x over the window, or np.nan if the window extends
            outside the range of samples in x.
        starts_within_event: True if the window starts inside the event.
        stops_within_event: True if the window stops inside the event.

    Raises:
        ValueError: if the window type is not one of the supported values.
    """
    if window_type is None:
        window_type = ps['window_type']

    if window_type == 'whole_event':
        take_slice = slice(start, stop)
        starts_within_event = True
        stops_within_event = True
    elif window_type == 'start_locked':
        if window_offset is None:
            window_offset = ps['window_offset']
        if window_length is None:
            window_length = ps['window_length']

        start_offset = start + window_offset
        stop_offset = start_offset + window_length
        take_slice = slice(start_offset, stop_offset)
        # The window starts inside the event only if it begins at or after the
        # event start AND before the event has ended
        starts_within_event = (window_offset >= 0) and (start_offset < stop)
        stops_within_event = (stop >= stop_offset) and (start <= stop_offset)
    else:
        raise ValueError('The window_type is not recogonized.')

    # A window falling outside the recorded samples is flagged with nan so the
    # event can be discarded later
    if (take_slice.start < 0) or (take_slice.stop > x.shape[0]):
        mn_vls = np.nan
    else:
        mn_vls = np.mean(x[take_slice, :], axis=0)

    return mn_vls, starts_within_event, stops_within_event

In [16]:
# Maps the index of each event in `annotations` to the tuple returned by calc_mean_dff
extracted_dff = dict()
for s_id in analyze_subjs:
    print('Gathering neural data for subject ' + s_id)

    # Load the dataset for this subject
    data_main_folder = subject_dict[s_id]['volume_main_folder']
    data_sub_folder = subject_dict[s_id]['volume_sub_folder']

    dataset_path = (Path(ps['dataset_base_folder']) / data_main_folder / data_sub_folder /
                    Path(ps['dataset_folder']) / '*.pkl')
    dataset_files = glob.glob(str(dataset_path))
    if len(dataset_files) == 0:
        # Fail with the searched pattern instead of a bare IndexError
        raise FileNotFoundError('No dataset file found for subject ' + s_id +
                                ' matching ' + str(dataset_path))
    dataset_file = dataset_files[0]

    # NOTE: unpickling can execute arbitrary code; only load trusted dataset files
    with open(dataset_file, 'rb') as f:
        dataset = ROIDataset.from_dict(pickle.load(f))

    # Calculate dff
    f = dataset.ts_data[ps['f_ts_str']]['vls'][:]
    b = dataset.ts_data[ps['bl_ts_str']]['vls'][:]
    dff = calc_dff(f=f, b=b)

    # Get the dff for each event
    s_events = annotations[annotations['subject_id'] == s_id]
    for index in s_events.index:
        event_start = s_events.loc[index, 'start']
        event_stop = s_events.loc[index, 'end'] + 1 # +1 to account for inclusive indexing in table
        extracted_dff[index] = calc_mean_dff(dff, event_start, event_stop)

Gathering neural data for subject CW_17-08-23-L1
Gathering neural data for subject CW_17-08-23-L2
Gathering neural data for subject CW_17-08-23-L4
Gathering neural data for subject CW_17-08-24-L4
Gathering neural data for subject CW_17-08-24-L5
Gathering neural data for subject CW_17-08-26-L1
Gathering neural data for subject CW_17-08-26-L2
Gathering neural data for subject CW_17-08-26-L4
Gathering neural data for subject CW_17-08-26-L5
Gathering neural data for subject CW_17-08-26-L6
Gathering neural data for subject CW_17-08-27-L1
Gathering neural data for subject CW_17-08-27-L2
Gathering neural data for subject CW_17-08-27-L4
Gathering neural data for subject CW_17-08-27-L5
Gathering neural data for subject CW_17-08-28-L1
Gathering neural data for subject CW_17-08-28-L2
Gathering neural data for subject CW_17-08-29-L2
Gathering neural data for subject CW_17-08-31-L1
Gathering neural data for subject CW_17-09-01-L1
Gathering neural data for subject CW_17-09-01-L2
Gathering neural dat

## Remove any events where the $\Delta F /F$ window fell outside of the recorded data

In [17]:
# Events whose mean dff is entirely nan had a window outside the recorded data
bad_keys = [k for k in extracted_dff if np.isnan(extracted_dff[k][0]).all()]
for key in bad_keys:
    extracted_dff.pop(key)

# Remove the same events from the annotation table
annotations.drop(index=bad_keys, inplace=True)

## Put $\Delta F/F$ into annotations table

In [18]:
# Each extracted_dff value is a (dff, starts_within_event, stops_within_event) tuple;
# add one column per tuple position, aligned to the annotations by event index
for col_i, col_name in enumerate(['dff', 'starts_within_event', 'stops_within_event']):
    annotations[col_name] = pd.Series({i: vls[col_i] for i, vls in extracted_dff.items()})

## Enforce using only contained events if we need to

In [19]:
if ps['enforce_contained_events']:
    # Keep only events whose dff window both starts and stops inside the event
    keep_events = annotations['starts_within_event'].eq(True) & annotations['stops_within_event'].eq(True)
    annotations = annotations.loc[keep_events]

## Now see how many subjects we have for each transition

In [20]:
# Count unique subjects per transition, taking 'beh_before' as the preceding
# behavior and 'beh' as the behavior transitioned into
n_subjs_per_trans = count_unique_subjs_per_transition(annotations, before_str='beh_before', after_str='beh')

In [21]:
# Display the table of subject counts per transition
n_subjs_per_trans

Unnamed: 0,B,F,H,O,P,Q,TL,TR
B,6.0,0.0,12.0,1.0,3.0,0.0,29.0,32.0
F,0.0,48.0,10.0,0.0,37.0,0.0,15.0,22.0
H,9.0,5.0,6.0,2.0,9.0,0.0,32.0,31.0
O,1.0,17.0,2.0,1.0,7.0,0.0,1.0,5.0
P,7.0,35.0,17.0,3.0,3.0,0.0,22.0,22.0
Q,34.0,60.0,49.0,35.0,48.0,0.0,55.0,58.0
TL,25.0,37.0,12.0,2.0,14.0,0.0,4.0,39.0
TR,28.0,35.0,13.0,1.0,21.0,0.0,37.0,1.0


## Get list of transitions we observe in enough subjects to analyze

In [22]:
# Collect (before, after) behavior pairs seen in at least min_n_subjs subjects,
# walking the table row by row and, within a row, column by column
analyze_trans = []
for bb in n_subjs_per_trans.index:
    for ab in n_subjs_per_trans.columns:
        if n_subjs_per_trans.loc[bb, ab] >= min_n_subjs:
            analyze_trans.append((bb, ab))

## Down-select events in annotations to only those with transitions that we will analyze

In [23]:
# A transition code is the before label concatenated with the after label
keep_codes = [before + after for before, after in analyze_trans]
annot_trans_codes = [before + after
                     for before, after in zip(annotations['beh_before'], annotations['beh'])]
# Boolean mask marking the events whose transition we will analyze
keep_annots = np.asarray([code in keep_codes for code in annot_trans_codes])

In [24]:
# Keep only the rows selected by the boolean mask keep_annots
analyze_annotations = annotations[keep_annots]

## Generate our regressors and group indicator variables

In [25]:
# Number of events (rows of the regressor matrix)
n_events = len(analyze_annotations)
# Number of analyzed transitions (columns of the regressor matrix)
n_analyze_trans = len(analyze_trans)

In [26]:
# Assign every event the integer group number of its subject, numbering the
# subjects in order of first appearance
event_subjs = analyze_annotations['subject_id']
unique_ids = event_subjs.unique()
g = np.zeros(n_events)
for u_i, u_id in enumerate(unique_ids):
    g[(event_subjs == u_id).to_numpy()] = u_i

In [27]:
# One-hot regressors: row = event, column = that event's transition in keep_codes
keep_codes_arr = np.asarray(keep_codes)
event_labels = zip(analyze_annotations['beh_before'], analyze_annotations['beh'])

x = np.zeros([n_events, n_analyze_trans])
for row_i, (before, after) in enumerate(event_labels):
    event_trans_code = before + after
    event_trans_col = np.argwhere(keep_codes_arr == event_trans_code)[0][0]
    x[row_i, event_trans_col] = 1

## Now actually calculate our statistics

In [28]:
# Stack the per-event dff entries into one array with events along the first axis.
# This rebinds `dff`, which earlier held the dff array of the last subject loaded.
dff = np.stack(analyze_annotations['dff'].to_numpy())

In [29]:
# Define a function for calculating stats

def stats_f(x_i, y_i, g_i, alpha_i):
    """Fit a grouped linear regression and return its statistics.

    Args:
        x_i: regressors, passed to the estimator as x.
        y_i: values to predict, passed to the estimator as y.
        g_i: group indicators, passed to the estimator as g.
        alpha_i: alpha value passed to the statistics calculation.

    Returns:
        The statistics produced by grouped_linear_regression_acm_stats, with the
        estimator outputs added under the keys 'beta', 'acm' and 'n_grps'.
    """
    estimate = grouped_linear_regression_ols_estimator(x=x_i, y=y_i, g=g_i)
    beta, acm, n_grps = estimate
    stats = grouped_linear_regression_acm_stats(beta=beta, acm=acm, n_grps=n_grps, alpha=alpha_i)

    # Keep the raw estimator outputs together with the derived statistics
    for key, vl in (('beta', beta), ('acm', acm), ('n_grps', n_grps)):
        stats[key] = vl
    return stats

In [30]:
# Fit the regression separately for each ROI (one column of dff per ROI)
n_rois = dff.shape[1]
full_stats = []
for r_i in range(n_rois):
    full_stats.append(stats_f(x_i=x, y_i=dff[:, r_i], g_i=g, alpha_i=ps['alpha']))

## Now save our results

In [31]:
# Bundle the parameters, the per-ROI statistics and the analyzed transitions for saving
rs = {'ps': ps, 'full_stats': full_stats, 'beh_trans': analyze_trans}

In [32]:
# Pickle the results to <save_folder>/<save_name>
save_path = Path(ps['save_folder']).joinpath(ps['save_name'])
with save_path.open('wb') as f:
    pickle.dump(rs, f)

## Debug code here