A notebook for prototyping the single cell analysis code

In [1]:
# Load IPython's autoreload extension and enable mode 2 so that edits to imported
# project modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import copy
import glob
import itertools
from pathlib import Path
import re

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from janelia_core.dataprocessing.baseline import percentile_filter_1d
from janelia_core.stats.regression import grouped_linear_regression_ols_estimator
from janelia_core.stats.regression import grouped_linear_regression_acm_stats
from janelia_core.stats.regression import grouped_linear_regression_acm_linear_restriction_stats
from janelia_core.stats.regression import visualize_coefficient_stats

from keller_zlatic_vnc.data_processing import calc_dff
from keller_zlatic_vnc.data_processing import count_unique_subjs_per_transition
from keller_zlatic_vnc.data_processing import find_before_and_after_events
from keller_zlatic_vnc.data_processing import generate_standard_id_for_full_annots
from keller_zlatic_vnc.data_processing import read_full_annotations
from keller_zlatic_vnc.data_processing import read_trace_data
from keller_zlatic_vnc.data_processing import single_cell_extract_dff_with_anotations

from keller_zlatic_vnc.linear_modeling import one_hot_from_table



In [3]:
# Select the 'notebook' matplotlib backend for the figures produced below.
%matplotlib notebook

## Parameters go here 

In [7]:
# Every setting used by the analysis lives in this one dictionary.
ps = {
    # CSV file listing the subjects to include in the analysis
    'subject_file': r'/Volumes/bishoplab/projects/keller_vnc/data/single_cell/subjects.csv',

    # Location of the files provided by Chen containing the raw fluorescence traces for
    # the single cells; the per-cell-type folders are sub-folders of the base folder
    'trace_base_folder': r'/Volumes/bishoplab/projects/keller_vnc/data/single_cell/single_cell_traces',
    'a00c_trace_folder': 'A00c',
    'basin_trace_folder': 'Basin',
    'handle_trace_folder': 'Handle',

    # Location of the folders containing annotations
    'a4_annot_folder': r'/Volumes/bishoplab/projects/keller_vnc/data/full_annotations/behavior_csv_cl_A4',
    'a9_annot_folder': r'/Volumes/bishoplab/projects/keller_vnc/data/full_annotations/behavior_csv_cl_A9',

    # The type of neurons we analyze
    'cell_type': 'a00c',

    # The cell ids we analyze, as a list. If None, we analyze all cell ids
    'cell_ids': ['antL', 'antR'],

    # Keyword arguments forwarded to percentile_filter_1d when estimating the baseline
    'baseline_calc_params': {
        'window_length': 30001,
        'filter_start': -1500,
        'write_offset': 1500,
        'p': 0.1,
    },

    # Keyword arguments forwarded to calc_dff when calculating Delta F/F
    'dff_calc_params': {
        'background': 100,
        'ep': 20,
    },

    # The behaviors we are willing to include in the analysis. Because each behavior must
    # be seen in enough subjects (see the thresholds below), not all of them may end up in
    # the analysis; this is the full set we are willing to consider. If None, all
    # behaviors are considered.
    'behs': ['Q', 'TC', 'B', 'F', 'H'],

    # The particular behavior we treat as a reference
    'ref_beh': 'Q',

    # Minimum number of subjects a preceding behavior must be seen in to be analyzed
    'min_n_pre_subjs': 3,

    # Minimum number of subjects a succeeding behavior must be seen in to be analyzed
    'min_n_succ_subjs': 3,

    # The manipulation target of the subjects we analyze; None indicates both A4 and A9
    'man_tgt': None,

    # Whether we pool left and right turns for preceding / succeeding behaviors
    'pool_pre_turns': True,
    'pool_succ_turns': True,

    # Thresholds for declaring preceding and succeeding behaviors quiet: a gap to the
    # stimulus larger than the threshold relabels the behavior as 'Q'
    'pre_q_th': 50,
    'succ_q_th': 9,

    # The type of window we use
    'dff_window_type': 'start_aligned',

    # The reference we use for aligning windows
    'dff_window_ref': 'beh_before_start',

    # The offset we apply when placing DFF windows
    'dff_window_offset': 0,

    # The length of the window we calculate DFF in
    'dff_window_length': 3,

    # The event we align the end of the window to (if we are not using windows of fixed length)
    'dff_window_end_ref': 'end',

    # The offset when aligning the end of the window (if we are not using windows of fixed length)
    'dff_window_end_offset': 1,
}


## Read in the basic data for each subject.

In [5]:
# Read the subject file and collect the entries of its 'Subject' column as a plain list
subject_tbl = pd.read_csv(ps['subject_file'])
subjects = subject_tbl['Subject'].tolist()

In [6]:
# Read the traces for every subject.  Each cell type has its own sub-folder under the
# common base folder.  Note that `type_subjects` is bound to the same table as `data`.
trace_root = Path(ps['trace_base_folder'])
data = type_subjects = read_trace_data(
    subjects=subjects,
    a00c_trace_folder=trace_root / ps['a00c_trace_folder'],
    handle_trace_folder=trace_root / ps['handle_trace_folder'],
    basin_trace_folder=trace_root / ps['basin_trace_folder'],
)

No traces found for handle cells for subject CW_17-08-23-L1.
Done reading in data for subject 1 of 64.
Done reading in data for subject 2 of 64.
Done reading in data for subject 3 of 64.
Done reading in data for subject 4 of 64.
Done reading in data for subject 5 of 64.
Done reading in data for subject 6 of 64.
Done reading in data for subject 7 of 64.
Done reading in data for subject 8 of 64.
Done reading in data for subject 9 of 64.
Done reading in data for subject 10 of 64.
Done reading in data for subject 11 of 64.
Done reading in data for subject 12 of 64.
Done reading in data for subject 13 of 64.
Done reading in data for subject 14 of 64.
Done reading in data for subject 15 of 64.
No traces found for handle cells for subject CW_17-08-27-L3.
Done reading in data for subject 16 of 64.
Done reading in data for subject 17 of 64.
Done reading in data for subject 18 of 64.
Done reading in data for subject 19 of 64.
Done reading in data for subject 20 of 64.
Done reading in data for su

## Down select to only the cells of the type and id we want to analyze

In [8]:
# Down select by cell type.  We take a copy so that `data` is an independent table: a
# later cell adds a column to it, and assigning a column on a filtered slice triggers
# pandas' SettingWithCopyWarning.
data = data[data['cell_type'] == ps['cell_type']].copy()

# Down select by cell id (None means we keep every cell id)
if ps['cell_ids'] is not None:
    data = data[data['cell_id'].isin(ps['cell_ids'])].copy()

Here


## Calculate $\Delta F/F$ for each cell

In [None]:
def _dff_for_trace(f_trace):
    """Estimate a baseline for one raw trace with percentile_filter_1d and return calc_dff's result."""
    trace_baseline = percentile_filter_1d(f_trace, **ps['baseline_calc_params'])
    return calc_dff(f=f_trace, b=trace_baseline, **ps['dff_calc_params'])


# One Delta F/F entry per row of data, in row order; each trace is looked up by row label
dff = [_dff_for_trace(data['f'].loc[cell_idx]) for cell_idx in data.index]

data['dff'] = dff

## Find stimulus events for the subjects we are analyzing

In [None]:
# Get list of subjects we have annotations for.  The paths are sorted because glob
# returns files in arbitrary order, which would make the file order differ between runs.
a4_file_paths = sorted(glob.glob(str(Path(ps['a4_annot_folder']) / '*.csv')))
a9_file_paths = sorted(glob.glob(str(Path(ps['a9_annot_folder']) / '*.csv')))

# A4 files come first in the combined list, so the leading entries are flagged as A4.
# (The builtin bool is used as the dtype: the np.bool alias was removed in NumPy 1.24.)
n_annot_files = len(a4_file_paths) + len(a9_file_paths)
a4_files = np.zeros(n_annot_files, dtype=bool)
a4_files[:len(a4_file_paths)] = True

annot_file_paths = a4_file_paths + a9_file_paths

# Subject id for each annotation file, derived from the file name
annot_file_names = [Path(p).name for p in annot_file_paths]
annot_subjs = [generate_standard_id_for_full_annots(fn) for fn in annot_file_names]

In [None]:
# Get stimulus events for each subject we analyze
analysis_subjs = list(data['subject_id'].unique())

# One table per subject is collected here and concatenated once after the loop.
# (DataFrame.append was removed in pandas 2.0, and growing a frame inside a loop
# re-copies all previous rows on every iteration.)
subj_event_tbls = []

for subj in analysis_subjs:

    # Find the first annotation file listed for this subject
    try:
        ind = annot_subjs.index(subj)
    except ValueError:
        raise RuntimeError('Unable to find annotations for subject ' + subj + '.') from None

    # Load the annotations for this subject
    tbl = read_full_annotations(annot_file_paths[ind])

    # Pull out stimulus events (rows with behavior 'S') for this subject and label them
    # with the subject, a per-subject event number and the manipulation target
    stim_tbl = tbl[tbl['beh'] == 'S'].copy()
    stim_tbl.insert(0, 'subject_id', subj)
    stim_tbl.insert(1, 'event_id', range(stim_tbl.shape[0]))
    stim_tbl.insert(2, 'manipulation_tgt', 'A4' if a4_files[ind] else 'A9')

    # Note what comes before and after each stimulus event and put it alongside the event
    before_after_tbl = find_before_and_after_events(events=stim_tbl, all_events=tbl)
    subj_event_tbls.append(pd.concat([stim_tbl, before_after_tbl], axis=1))

# pd.concat raises on an empty list, so fall back to an empty table when there are no subjects
if subj_event_tbls:
    subj_events = pd.concat(subj_event_tbls, ignore_index=True)
else:
    subj_events = pd.DataFrame()


## Get rid of any events where we could not classify the type of preceding or succeeding behavior

In [None]:
# Remove every event (row) that has a missing value in any column
subj_events = subj_events.dropna(axis=0, how='any')

## Mark preceding and succeeding quiet events

In [None]:
# Gap between the end of the preceding behavior and the start of the event, and between
# the end of the event and the start of the succeeding behavior
pre_gap = subj_events['start'].sub(subj_events['beh_before_end'])
post_gap = subj_events['beh_after_start'].sub(subj_events['end'])

# Events whose gap exceeds the corresponding threshold
long_pre_gap = pre_gap.gt(ps['pre_q_th'])
long_post_gap = post_gap.gt(ps['succ_q_th'])

# Relabel the neighboring behavior of those events as quiet
subj_events.loc[long_pre_gap, 'beh_before'] = 'Q'
subj_events.loc[long_post_gap, 'beh_after'] = 'Q'

## Down select events based on manipulation target

In [None]:
# Keep only events for the requested manipulation target (None keeps both A4 and A9).
# The filtered table is copied because a later cell relabels behaviors in place with
# .loc, and doing that on a filtered slice triggers pandas' SettingWithCopyWarning.
if ps['man_tgt'] is not None:
    subj_events = subj_events[subj_events['manipulation_tgt'] == ps['man_tgt']].copy()

## Pool turns if we are supposed to

In [None]:
# For each side (preceding / succeeding) whose pooling flag is set, relabel both turn
# labels ('TL' and 'TR') as the single pooled label 'TC'
turn_labels = ['TL', 'TR']
for pool_flag, beh_col in (('pool_pre_turns', 'beh_before'), ('pool_succ_turns', 'beh_after')):
    if ps[pool_flag]:
        turn_rows = subj_events[beh_col].isin(turn_labels)
        subj_events.loc[turn_rows, beh_col] = 'TC'

## Down-select to only the types of behaviors we are willing to consider

In [None]:
# Keep only events whose preceding AND succeeding behaviors are both in the allowed list
# (None means every behavior is allowed).  A vectorized mask is used instead of a
# row-by-row label lookup, so the allowed set is built once and the filter does not
# depend on the index labels being unique.
if ps['behs'] is not None:
    allowed_behs = set(ps['behs'])
    keep_rows = subj_events['beh_before'].isin(allowed_behs) & subj_events['beh_after'].isin(allowed_behs)
    subj_events = subj_events[keep_rows]

## Drop any behaviors that do not appear in enough subjects

In [None]:
# Count subjects per transition, then total the counts along each axis: rows are treated
# as preceding behaviors and columns as succeeding behaviors.
# NOTE(review): if each entry is the number of unique subjects showing that one
# transition, these totals count a subject once per transition type and can exceed the
# number of distinct subjects showing the behavior - confirm this is the intended criterion.
subj_trans_counts = count_unique_subjs_per_transition(table=subj_events)
n_before_subjs = subj_trans_counts.sum(axis=1)
n_after_subjs = subj_trans_counts.sum(axis=0)

# Behaviors whose totals reach the configured minimums
before_an_behs = set(n_before_subjs[n_before_subjs >= ps['min_n_pre_subjs']].index)
after_an_behs = set(n_after_subjs[n_after_subjs >= ps['min_n_succ_subjs']].index)

# Keep events whose preceding and succeeding behaviors both passed.  A vectorized mask
# replaces the row-by-row label lookups, so the filter does not depend on the index
# labels being unique.
keep_rows = subj_events['beh_before'].isin(before_an_behs) & subj_events['beh_after'].isin(after_an_behs)
subj_events = subj_events[keep_rows]

In [None]:
# Display the table of events that remain after the filtering steps
subj_events

## Pull out $\Delta F /F$ for each event and cell along with all information we need for performing statistics

In [None]:
# Settings that place the Delta F/F window relative to each event, taken from the parameters
window_args = {
    'align_col': ps['dff_window_ref'],
    'ref_offset': ps['dff_window_offset'],
    'window_l': ps['dff_window_length'],
    'window_type': ps['dff_window_type'],
    'end_align_col': ps['dff_window_end_ref'],
    'end_ref_offset': ps['dff_window_end_offset'],
}

# Combine the activity of the cells with the events into the table used for the statistics
full_tbl = single_cell_extract_dff_with_anotations(activity_tbl=data, event_tbl=subj_events, **window_args)

## Find grouping of data by subject

In [None]:
# Number the subjects in order of first appearance, then record for every row of full_tbl
# the number of its subject.  The labels are stored as floats.
unique_ids = full_tbl['subject_id'].unique()
subj_to_grp = {subj_id: grp_i for grp_i, subj_id in enumerate(unique_ids)}
g = np.asarray([subj_to_grp[subj_id] for subj_id in full_tbl['subject_id']], dtype=float)

## Fit models and calculate stats

In [None]:
# Behaviors present in the data with the reference behavior removed.  The reference is
# wrapped in a list: set.difference() iterates its argument, so passing the bare string
# would remove its individual characters and only works by accident for a one-character
# reference.  The result is sorted so the regressor order is the same on every run
# (the iteration order of a set of strings is not).
before_behs = full_tbl['beh_before'].unique()
before_behs_ref = sorted(set(before_behs).difference([ps['ref_beh']]))

after_behs = full_tbl['beh_after'].unique()
after_behs_ref = sorted(set(after_behs).difference([ps['ref_beh']]))

# One-hot encode the non-reference behaviors
one_hot_data_ref, one_hot_vars_ref = one_hot_from_table(full_tbl, beh_before=before_behs_ref,
                                                        beh_after=after_behs_ref)

# Append a column of ones, labelled 'ref', as the last regressor
one_hot_data_ref = np.concatenate([one_hot_data_ref, np.ones([one_hot_data_ref.shape[0], 1])], axis=1)
one_hot_vars_ref = one_hot_vars_ref + ['ref']

# Refuse to fit if the smallest singular value of the design matrix is tiny.  Only the
# singular values are needed, so we skip computing the (n_rows x n_rows) U matrix.
sing_vls = np.linalg.svd(one_hot_data_ref, compute_uv=False)
if np.min(sing_vls) < .001:
    raise RuntimeError('regressors are nearly co-linear')

# Fit with rows grouped by subject (g), then compute statistics at alpha = .05
beta, acm, n_grps = grouped_linear_regression_ols_estimator(x=one_hot_data_ref, y=full_tbl['dff'].to_numpy(),
                                                            g=g)

mdl_stats = grouped_linear_regression_acm_stats(beta=beta, acm=acm, n_grps=n_grps, alpha=.05)

In [None]:
## Here we compare coefficients
# For every non-reference coefficient we test the linear restriction
#     mean(other coefficients of the same kind) - coefficient = 0
# where "kind" is preceding-behavior (beh_before...) or succeeding-behavior (beh_after...).
n_grps = len(np.unique(g))

# All variables but the last one (the 'ref' column)
cmp_vars = one_hot_vars_ref[0:-1]
cmp_p_vls = np.zeros(len(cmp_vars))

# Plain prefix tests: the previous pattern 'beh_before*' was a glob written as a regex
# (it matched 'beh_befor' followed by any number of 'e').
before_inds = np.asarray([var.startswith('beh_before') for var in one_hot_vars_ref], dtype=bool)
after_inds = np.asarray([var.startswith('beh_after') for var in one_hot_vars_ref], dtype=bool)

n_before_vars = np.sum(before_inds)
n_after_vars = np.sum(after_inds)

for v_i, var in enumerate(cmp_vars):
    # Coefficients of the same kind as this variable
    sel_inds = before_inds if before_inds[v_i] else after_inds
    cmp_beta = beta[sel_inds]
    cmp_acm = acm[np.ix_(sel_inds, sel_inds)]

    # Position of this variable within the selected coefficients: the number of selected
    # variables that come before it.  This does not assume that all "before" variables
    # precede all "after" variables.
    cmp_i = int(np.sum(sel_inds[:v_i]))

    # Weight 1/(n - 1) on the other coefficients and -1 on this one.  With a single
    # coefficient there are no others, so we skip the division (which would be by zero).
    n_cmp = len(cmp_beta)
    r = np.full(n_cmp, 1/(n_cmp - 1)) if n_cmp > 1 else np.zeros(n_cmp)
    r[cmp_i] = -1
    cmp_p_vls[v_i] = grouped_linear_regression_acm_linear_restriction_stats(beta=cmp_beta, acm=cmp_acm, r=r,
                                                                            q=np.asarray([0]), n_grps=n_grps)

In [None]:
# Plot the fitted coefficients with their confidence intervals and significance markers
visualize_coefficient_stats(var_strs=one_hot_vars_ref, theta=beta, c_ints=mdl_stats['c_ints'],
                            sig=mdl_stats['non_zero'],
                            x_axis_rot=90)
# Raw string: '\D' is not a valid escape sequence in a normal string literal
plt.ylabel(r'$\Delta F / F$')
plt.xlabel('Behavior')

# Set the final figure size first so that tight_layout fits the labels to that size
fig = plt.gcf()
fig.set_size_inches(8, 6)
plt.tight_layout()

# Print the p-value of the comparison for each non-reference variable
for var, p_vl in zip(cmp_vars, cmp_p_vls):
    print(var + ': ' + str(p_vl))

## Debug code goes here

In [None]:
# Debug: plot the Delta F/F entry 1755 of subject CW_17-11-03-L6-2 in a new figure
plt.figure()
debug_subj_rows = data[data['subject_id'] == 'CW_17-11-03-L6-2']
debug_trace = debug_subj_rows['dff'][1755]
plt.plot(debug_trace)

In [None]:
# Debug: show the events kept for subject CW_17-11-03-L6-2
is_debug_subj = subj_events['subject_id'] == 'CW_17-11-03-L6-2'
subj_events[is_debug_subj]

In [None]:
# Debug: median of (end - start) over the kept events
subj_events['end'].sub(subj_events['start']).median()