This is a notebook to see what specimens we have in different forms of data. 

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
from pathlib import Path
import glob
import itertools 

import numpy as np
import scipy.io

from keller_zlatic_vnc.data_processing import generate_standard_id_for_full_annots
from keller_zlatic_vnc.data_processing import generate_standard_id_for_trace_subject
from keller_zlatic_vnc.data_processing import generate_standard_id_from_matlab_id
from keller_zlatic_vnc.data_processing import read_raw_transitions_from_excel




## Parameters go here 

In [3]:
# NOTE(review): every path in this cell is an absolute, machine-specific location (a local /Users
# directory and a mounted /Volumes share); they must be edited before running on another machine.

# Location of file Chen originally provided with events for the single cell, closed-loop analysis
sc_cl_events_file = r'/Users/bishopw/Documents/Janelia_Research/Projects/keller_zlatic_vnc/data/extracted_dff_v2/transition_list.xlsx'

# Locations of files Chen provided with the original single cell, closed loop activity
sc_cl_activity_folder = r'/Users/bishopw/Documents/Janelia_Research/Projects/keller_zlatic_vnc/data/extracted_dff_v2/'

# Activity files for each cell type, given relative to sc_cl_activity_folder.  Order matters: the
# loading code below reads entry 0 as the A4 file and entry 1 as the A9 file.
a00c_cl_act_data_files = ['A00c_activity_A4.mat', 'A00c_activity_A9.mat']

basin_cl_act_data_files = ['Basin_activity_A4.mat', 'Basin_activity_A9.mat']

handle_cl_act_data_files = ['Handle_activity_A4.mat', 'Handle_activity_A9.mat']

# Folder containing annotations of all events (one folder for A4 and one for A9); the code below
# searches each folder for *.csv files
all_a4_events_annot_folder = r'/Volumes/bishoplab/projects/keller_vnc/data/full_annotations/behavior_csv_cl_A4'
all_a9_events_annot_folder = r'/Volumes/bishoplab/projects/keller_vnc/data/full_annotations/behavior_csv_cl_A9'

# Location of files provided by Chen containing the raw fluorescence traces for the single cells
trace_base_folder = r'/Volumes/bishoplab/projects/keller_vnc/data/single_cell/single_cell_traces'
# Sub-folders of trace_base_folder, one per cell type; each is searched for '*.traces' entries
a00c_trace_folder = 'A00c'
basin_trace_folder = 'Basin'
handle_trace_folder = 'Handle'

## Compare subjects with annotated events in the original single-cell closed loop data to those with activity.  Both forms of data were provided by Chen.   

We expect every subject with activity data to also have annotated events.  Some subjects may have annotated events but no activity; that is fine, because the original analysis used the subjects with activity as the set of analyzed subjects.

In [4]:
# Read the annotated events for the single cell, closed-loop analysis from the Excel file
# (presumably returned as a pandas DataFrame, given the column indexing below -- TODO confirm)
sc_cl_events = read_raw_transitions_from_excel(sc_cl_events_file)
# Each distinct value of the 'subject_id' column, i.e. every subject with at least one annotated event
sc_cl_subjects = sc_cl_events['subject_id'].unique()

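Make a small correction: relabel subject `CW_17-11-03-L6` as `CW_17-11-03-L6-2` in the list of subjects with annotated events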

In [5]:
# Relabel subject 'CW_17-11-03-L6' as 'CW_17-11-03-L6-2' in the array of subjects with annotated events.
#
# The relabeling is guarded so that this cell can be run more than once: after the first run the old
# label is no longer present, and indexing the (empty) search result directly would raise an IndexError.
# This mirrors how the same kind of correction is applied to the trace subjects later in the notebook.
old_label_matches = np.argwhere(sc_cl_subjects == 'CW_17-11-03-L6')
if len(old_label_matches) > 0:
    # Only the first match is replaced; sc_cl_subjects holds unique values, so there is at most one
    sc_cl_subjects[old_label_matches[0][0]] = 'CW_17-11-03-L6-2'

See which specimens Chen originally provided activity for in the single cell, closed-loop analysis

In [6]:
# Cell types, in the order used for every per-type list built in this notebook
type_strings = ['a00c', 'handle', 'basin']

# Activity file lists, ordered to match type_strings; each list holds the A4 file then the A9 file
act_files_by_type = [a00c_cl_act_data_files, handle_cl_act_data_files, basin_cl_act_data_files]

activity_folder = Path(sc_cl_activity_folder)

# cl_act_subjects[k] will be the sorted list of standardized subject ids with activity for type_strings[k]
cl_act_subjects = []
for cell_type, type_files in zip(type_strings, act_files_by_type):

    a4_data = scipy.io.loadmat(activity_folder / type_files[0], squeeze_me=True)
    a9_data = scipy.io.loadmat(activity_folder / type_files[1], squeeze_me=True)

    # Correct mistakes in labeling: for basin and handle cells, the *second* A4 entry labeled
    # '0824L2CL' is relabeled '0824L2-2CL'.
    # NOTE(review): this requires at least two matching entries (it raises an IndexError otherwise);
    # confirm that picking the second match is what is intended.
    if cell_type in ('basin', 'handle'):
        second_match = np.argwhere(a4_data['newTransitions'] == '0824L2CL')[1][0]
        a4_data['newTransitions'][second_match] = '0824L2-2CL'

    # Pool the A4 and A9 labels, drop duplicates, then convert each to a standardized subject id
    matlab_ids = set(list(a4_data['newTransitions']) + list(a9_data['newTransitions']))
    cl_act_subjects.append(sorted(generate_standard_id_from_matlab_id(matlab_id)
                                  for matlab_id in matlab_ids))

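Make a small correction to labels: relabel subject `CW_17-11-03-L6` as `CW_17-11-03-L6-2` in the list of subjects with activity for each cell type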

In [7]:
# Relabel subject 'CW_17-11-03-L6' as 'CW_17-11-03-L6-2' in the activity subject list of every cell type.
#
# The relabeling is guarded so that this cell can be run more than once and so that a cell type which
# does not include this subject does not stop the notebook: indexing an empty search result directly
# would raise an IndexError.  This mirrors how the same kind of correction is applied to the trace
# subjects later in the notebook.
for subjs in cl_act_subjects:
    old_label_matches = np.argwhere(np.asarray(subjs) == 'CW_17-11-03-L6')
    if len(old_label_matches) > 0:
        # Replace the first match in place, so the lists held in cl_act_subjects are updated
        subjs[old_label_matches[0][0]] = 'CW_17-11-03-L6-2'

For each cell type, compare the lists of subjects we have event annotations for to those we have activity for

In [8]:
# Subjects with annotated events; the same set is compared against every cell type
event_annot_subjects = set(sc_cl_subjects)

# Report, per cell type, the subjects found in only one of the two forms of data
for cell_type, type_act_subjects in zip(type_strings, cl_act_subjects):
    subjects_with_activity = set(type_act_subjects)

    print('***  For ' + cell_type + ' cell types ***')
    print('Subjects with activity but no annotations:')
    print(subjects_with_activity - event_annot_subjects)
    print('Subjects with annotations but no activity:')
    print(event_annot_subjects - subjects_with_activity)

***  For a00c cell types ***
Subjects with activity but no annotations:
set()
Subjects with annotations but no activity:
{'CW_17-11-07-L2'}
***  For handle cell types ***
Subjects with activity but no annotations:
set()
Subjects with annotations but no activity:
{'CW_17-11-28-L2', 'CW_17-11-03-L2', 'CW_17-09-01-L1', 'CW_17-08-23-L1', 'CW_17-08-27-L3', 'CW_17-11-30-L3', 'CW_17-11-07-L2', 'CW_17-11-02-L3', 'CW_17-08-31-L1'}
***  For basin cell types ***
Subjects with activity but no annotations:
set()
Subjects with annotations but no activity:
{'CW_17-11-07-L2'}


## Now we ask if there are full annotations (provided by Nadine) for every subject that we originally analyzed in the single cell, closed loop analysis.



See which specimens are included in the annotated events Nadine provided

In [9]:
# Every *.csv file in the A4 annotation folder, followed by every one in the A9 annotation folder
annot_folders = [all_a4_events_annot_folder, all_a9_events_annot_folder]
annot_file_paths = [csv_path
                    for annot_folder in annot_folders
                    for csv_path in glob.glob(str(Path(annot_folder) / '*.csv'))]

# The standardized subject id is generated from the file name alone (directory part removed)
annot_file_names = [Path(csv_path).name for csv_path in annot_file_paths]
annot_subjs = sorted(generate_standard_id_for_full_annots(file_name) for file_name in annot_file_names)

Do the comparison for each cell type here 

In [10]:
# Subjects with full annotations; the same set is compared against every cell type
full_annot_subjects = set(annot_subjs)

# Report, per cell type, the subjects found in only one of the two forms of data
for cell_type, type_act_subjects in zip(type_strings, cl_act_subjects):
    subjects_with_activity = set(type_act_subjects)

    print('***  For ' + cell_type + ' cell types ***')
    print('Subjects with activity but no annotations:')
    print(subjects_with_activity - full_annot_subjects)
    print('Subjects with annotations but no activity:')
    print(full_annot_subjects - subjects_with_activity)

***  For a00c cell types ***
Subjects with activity but no annotations:
set()
Subjects with annotations but no activity:
{'CW_17-11-03-L6-1', 'CW_17-09-01-L3'}
***  For handle cell types ***
Subjects with activity but no annotations:
set()
Subjects with annotations but no activity:
{'CW_17-11-28-L2', 'CW_17-11-03-L2', 'CW_17-09-01-L1', 'CW_17-11-03-L6-1', 'CW_17-08-23-L1', 'CW_17-08-27-L3', 'CW_17-11-30-L3', 'CW_17-09-01-L3', 'CW_17-11-02-L3', 'CW_17-08-31-L1'}
***  For basin cell types ***
Subjects with activity but no annotations:
set()
Subjects with annotations but no activity:
{'CW_17-11-03-L6-1', 'CW_17-09-01-L3'}


## Now we ask if there are raw fluorescence traces (provided by Chen) for every subject that we originally analyzed in the single cell, closed loop analysis

Get a list of all subjects we have traces for

In [23]:
# trace_subjects[k] will be the sorted list of standardized subject ids we have traces for, with the
# cell types in the same order as type_strings (a00c, handle, basin)
trace_subjects = []
for type_subfolder in [a00c_trace_folder, handle_trace_folder, basin_trace_folder]:

    # Each entry matching '*.traces' in this cell type's folder gives one subject id
    trace_entries = glob.glob(str(Path(trace_base_folder) / type_subfolder / '*.traces'))
    type_subjects = sorted(generate_standard_id_for_trace_subject(Path(entry).name)
                           for entry in trace_entries)

    # Apply correction: relabel 'CW_17-11-03-L6-Q' as 'CW_17-11-03-L6-2' when it is present
    old_label_matches = np.argwhere(np.asarray(type_subjects) == 'CW_17-11-03-L6-Q')
    if len(old_label_matches) > 0:
        type_subjects[old_label_matches[0][0]] = 'CW_17-11-03-L6-2'

    trace_subjects.append(type_subjects)

Do the comparison for each cell type here 

In [24]:
# Report, per cell type, the originally analyzed subjects that are missing from the trace data
for cell_type, analyzed_subjects, traced_subjects in zip(type_strings, cl_act_subjects, trace_subjects):
    missing_traces = set(analyzed_subjects) - set(traced_subjects)

    print('***  For ' + cell_type + ' cell types ***')
    print('Subjects which were included in original closed loop analysis but we have no traces for:')
    print(missing_traces)

***  For a00c cell types ***
Subjects which were included in original closed loop analysis but we have no traces for:
set()
***  For handle cell types ***
Subjects which were included in original closed loop analysis but we have no traces for:
set()
***  For basin cell types ***
Subjects which were included in original closed loop analysis but we have no traces for:
set()
