The purpose of this notebook is to simulate recording the same neuron across different subjects, where we specify the ground-truth encoding model of preceding and succeeding behavior for the neuron, and see if we can correctly detect the presence of an interaction term.

We seek to include potentially confounding factors in the simulations that may also be present in the real data.  In particular, we want to include:

    1) Offsets for each subject (random, independent of behavior)

    2) Correlations between preceding and succeeding behaviors (we do this through 1st-order Markov dependence)
    
    3) Recordings where we only record transitions starting with a given behavior for each subject
    
    4) Varying amounts of noise from subject to subject


In [1]:
# Load IPython's autoreload extension and use mode 2, so imported modules are
# reloaded before each cell runs and edits to project code are picked up
%load_ext autoreload
%autoreload 2

In [2]:
import numpy as np
import pandas as pd

from keller_zlatic_vnc.data_processing import count_unique_subjs_per_transition
from keller_zlatic_vnc.linear_modeling import one_hot_from_table
from keller_zlatic_vnc.linear_modeling import reference_one_hot_to_beh

from janelia_core.stats.regression import grouped_linear_regression_ols_estimator
from janelia_core.stats.regression import grouped_linear_regression_acm_stats



In [3]:
# Do not truncate long tables: show every row when a DataFrame is displayed
pd.options.display.max_rows = None

## Specifications for the simulation

In [4]:
# Behavior labels; the arrays and lists below are indexed in this order
beh_lbls = ['B', 'F', 'Q']

# Transition probabilities: entry [i, j] is the probability of going from
# behavior i (row) to behavior j (column)
beh_trans_probs = np.array([
    [0.4, 0.2, 0.4],
    [0.1, 0.8, 0.1],
    [0.1, 0.5, 0.4],
])

# Alternative: uniform transition probabilities
#beh_trans_probs = np.ones([3, 3])/3

# ---- Ground-truth encoding model ----
# Contribution of each preceding behavior
before_c = [1.2, 1.2, 1.2]
# Contribution of each succeeding behavior
after_c = [0.1, 0.1, 0.1]
# Interaction contribution, rows = preceding behavior, columns = succeeding behavior
# (non-zero only for B -> F and Q -> Q)
interact_c = np.array([
    [0.0, 1.0, 0.0],
    [0.0, 0.0, 0.0],
    [0.0, 0.0, 1.0],
])

# Each subject draws its own noise standard deviation uniformly from [low, high]
noise_std_range = [0.5, 1.5]

# Per-subject offsets are drawn from a zero-mean normal with this standard deviation
offset_std = 0.1

# Number of trials per subject is drawn from [min, max]
n_trials_range = [5, 25]

# How many subjects to simulate
n_subjs = 100

# ---- Terms included in the fitted model ----
# Preceding and succeeding behaviors to encode
beh_before = ['B', 'F', 'Q']
beh_after = ['B', 'F', 'Q']

# Candidate (preceding, succeeding) interactions to test
beh_interactions = [('B', 'F'), ('F', 'F'), ('Q', 'Q')]

# Behavior the other terms are referenced to
beh_ref = 'Q'

## Generate simulated data

In [5]:
def generate_subject_data(start_beh_ind, n_trials, sub_n):
    """Simulate the trials recorded from one subject.

    Every trial of the subject starts from the same behavior.  The succeeding
    behavior of each trial is drawn from the row of `beh_trans_probs` for that
    starting behavior, and the response is the sum of the ground-truth encoding
    terms (`before_c`, `after_c`, `interact_c`), a subject-wide offset and
    independent Gaussian noise.  All model parameters are read from module-level
    variables.

    Args:
        start_beh_ind: Index into `beh_lbls` of the behavior preceding every trial.
        n_trials: Number of trials to simulate.
        sub_n: Subject number; the subject id is 'subj_' followed by this value.

    Returns:
        A list `[data, offset, noise_std]`, where `data` is a DataFrame with
        columns 'subject_id', 'beh_before', 'beh_after' and 'dff' (one row per
        trial), `offset` is the offset drawn for this subject and `noise_std`
        is the noise standard deviation drawn for this subject.
    """
    # Subject-level random quantities (drawn once, shared by all trials)
    subj_offset = np.random.randn() * offset_std
    subj_noise_std = np.random.uniform(low=noise_std_range[0], high=noise_std_range[1])

    n_labels = len(beh_lbls)
    responses = np.zeros(n_trials)
    end_labels = []
    for trial_i in range(n_trials):
        # Succeeding behavior depends only on the starting behavior
        end_ind = np.random.choice(n_labels, p=beh_trans_probs[start_beh_ind, :])
        end_labels.append(beh_lbls[end_ind])

        # Noise-free response from the encoding model, then offset + trial noise
        mean_resp = before_c[start_beh_ind] + after_c[end_ind] + interact_c[start_beh_ind, end_ind]
        perturbation = subj_offset + np.random.randn() * subj_noise_std
        responses[trial_i] = mean_resp + perturbation

    # Scalar entries are broadcast to every trial row
    columns = {'subject_id': 'subj_' + str(sub_n),
               'beh_before': beh_lbls[start_beh_ind],
               'beh_after': end_labels,
               'dff': responses}
    data = pd.DataFrame(data=columns)

    return [data, subj_offset, subj_noise_std]
    

In [6]:
# Simulate every subject.  All trials of a subject start from one randomly chosen
# behavior, and the number of trials is drawn from n_trials_range (both ends inclusive).
# NOTE(review): no random seed is set in this notebook, so every run draws new data.
n_behs = len(beh_lbls)
subj_data = [None]*n_subjs
for s_i in range(n_subjs):
    n_trials_i = np.random.randint(low=n_trials_range[0], high=n_trials_range[1]+1)
    start_beh_i = np.random.randint(low=0, high=n_behs)
    # Pass the starting behavior drawn above (previously it was drawn but never used).
    # Only the trial table is kept; the subject's true offset and noise level are discarded.
    subj_data[s_i], _, _ = generate_subject_data(start_beh_ind=start_beh_i,
                                                 n_trials=n_trials_i, sub_n=s_i)

# Stack the per-subject tables into one table with a fresh 0..N-1 row index
data = pd.concat(subj_data, ignore_index=True)

## See how many subjects we recorded for each type of behavior transition

In [7]:
# Show how many distinct subjects contributed trials for each behavior transition
# (presumably rows are the preceding and columns the succeeding behavior - confirm
# against count_unique_subjs_per_transition)
count_unique_subjs_per_transition(data)

Unnamed: 0,B,F,Q
B,31.0,28.0,31.0
F,27.0,35.0,32.0
Q,24.0,34.0,33.0


## Pull out $\Delta F/F$

In [8]:
# Simulated response of every trial; used as the dependent variable of the regression below
dff = data['dff']

## Find grouping of data by subject

In [9]:
# Give every trial the integer index of its subject (subjects numbered in order of
# first appearance), stored as floats; g marks which trials belong to the same subject
unique_ids = data['subject_id'].unique()
id_to_group = {subj_id: grp_i for grp_i, subj_id in enumerate(unique_ids)}
g = data['subject_id'].map(id_to_group).to_numpy(dtype=float)

## Calculate stats

In [10]:
# Build the regressor matrix from the trial table.
# Presumably this yields one-hot columns for the preceding and succeeding behaviors plus one
# column per pair in beh_interactions, with no per-subject columns - confirm against
# one_hot_from_table.
one_hot_data, one_hot_vars = one_hot_from_table(data, beh_before=beh_before, beh_after=beh_after, 
                                         enc_subjects=False, enc_beh_interactions=False, 
                                         beh_interactions=beh_interactions)

# Re-express the regressors relative to the reference behavior beh_ref.
# NOTE(review): judging by the fitted variable names, this drops the before/after columns of
# beh_ref while keeping its interaction term (remove_interaction_term=False) - confirm.
one_hot_data_ref, one_hot_vars_ref = reference_one_hot_to_beh(one_hot_data=one_hot_data, 
                                                              one_hot_vars=one_hot_vars, 
                                                              beh=beh_ref, 
                                                              remove_interaction_term=False)

# Append a constant column of ones (intercept) and name it after the reference behavior
one_hot_data_ref = np.concatenate([one_hot_data_ref, np.ones([one_hot_data_ref.shape[0], 1])], axis=1)
one_hot_vars_ref.append('ref_'+ beh_ref)

In [11]:
# Guard against (near) co-linear regressors: a very small singular value means some
# combination of columns of the design matrix is almost zero.
# Only the singular values are needed, so skip computing the U and Vh factors
# (by default U would be a full n_rows x n_rows matrix).
v = np.linalg.svd(one_hot_data_ref, compute_uv=False)
print(v)
if np.min(v) < .001:
    raise RuntimeError('regressors are nearly co-linear')

[52.21416658 28.31779181 18.03123105 17.29154293 11.98023242  8.37623082
  7.87976198  2.20972051]


In [12]:
# Fit the linear model by OLS, passing g so trials are grouped by subject.
# NOTE(review): `n_gprs` looks like a typo of `n_grps` (the keyword used on the next line);
# `acm` is presumably the asymptotic covariance matrix of beta - confirm in janelia_core.
beta, acm, n_gprs = grouped_linear_regression_ols_estimator(x=one_hot_data_ref, y=dff, g=g)
# Per-coefficient statistics at alpha=.05; the 'non_zero_p' entry is used as the p-value below
stats = grouped_linear_regression_acm_stats(beta=beta, acm=acm, n_grps=n_gprs, alpha=.05)

## View results

In [13]:
# Results table: one row per regressor, with its fitted coefficient and stats['non_zero_p']
coef_and_p = np.column_stack([beta, stats['non_zero_p']])
rs = pd.DataFrame(data=coef_and_p, index=one_hot_vars_ref, columns=['beta', 'p'])

In [14]:
def color_small_p(val):
    """Return the CSS text-color rule for a table cell holding a p-value.

    Args:
        val: The cell value.

    Returns:
        'color: red' when `val` is below .05, otherwise 'color: black'.
    """
    if val < .05:
        return 'color: red'
    return 'color: black'

In [15]:
# Show the results table with p-values below .05 in red.
# Styler.applymap was renamed to Styler.map in pandas 2.1 (applymap is deprecated there),
# so use the new name when it exists and fall back to the old one on earlier versions.
rs_styler = rs.style
color_cells = getattr(rs_styler, 'map', None) or rs_styler.applymap
color_cells(color_small_p, subset=['p'])

Unnamed: 0,beta,p
beh_before_B,-0.192945,0.242065
beh_before_F,-0.315698,0.0778001
beh_after_B,-0.102607,0.250188
beh_after_F,-0.220581,0.243092
beh_interact_BF,1.25398,4.07494e-08
beh_interact_FF,0.384553,0.0635856
beh_interact_QQ,0.728576,0.000303389
ref_Q,1.51854,9.99076e-15
