This is a notebook to see how many bootstrap samples we need to run in the linear modeling analysis so that every p-value is larger than the smallest value possible.

We examine p-values for the coefficients of encoding models of dff after a perturbation, as these dff values are large. 

The user can supply options specifying the cell type to look at, the events to consider, and so on, as well as, importantly, the number of bootstrap samples to perform. The notebook will then do a full analysis, fitting linear models and calculating p-values for each coefficient in the model. The smallest p-value will be reported. We want this to be greater than 2/(# of bootstrap samples).

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import copy
import numpy as np
import pandas as pd
from pathlib import Path
import pickle
import time

from janelia_core.stats.regression import grouped_linear_regression_boot_strap
from janelia_core.stats.regression import grouped_linear_regression_boot_strap_stats

from keller_zlatic_vnc.linear_modeling import one_hot_from_table

## Parameters go here

In [3]:
# Folder and file name of the pickled activity table that is loaded below
data_dir = r'\\dm11\bishoplab\projects\keller_vnc\data\extracted_dff\A00c'
data_file = 'A00c_activity.pkl'

# Specify variables that we predict from
# (these lists are matched against the 'beh_before' and 'beh_after' columns of the table)
beh_before = ['Q', 'F', 'B']
beh_after = ['Q', 'F', 'B']
# Passed through to one_hot_from_table when the regressors are built
enc_beh_interactions = True
# Passed through to one_hot_from_table; when False a 'mean' term is fit instead
# (the regression is then called with include_mean=True)
enc_subjects = True
closure = True # True if the only events we consider must start with a beh_before
               # behavior and end with a beh_after behavior

# How many boot strap samples we use in each analysis
# (the recorded run with this value took roughly 25,000 seconds)
n_bs_smps = 10000000

# Determine what type of manipulation events we look at
# Any value other than 'A4' or 'A9' results in all manipulation events being analyzed
manip_type = 'both' # 'both', 'A4' or 'A9'

## Load the data

In [4]:
# Read the pickled activity records and hold them as a DataFrame.
# NOTE: unpickling can execute arbitrary code, so only point this at trusted files.
data_path = Path(data_dir).joinpath(data_file)
with data_path.open('rb') as f:
    data = pd.DataFrame(pickle.load(f))

## Down select to only the manipulation events we want to consider

In [5]:
# Restrict the table to a single manipulation target when one is requested;
# every other setting of manip_type keeps all events.
if manip_type in ('A4', 'A9'):
    print('Analyzing only ' + manip_type + ' manipulation events.')
    data = data[data['man_tgt'] == manip_type]
else:
    print('Analyzing all manipulation events.')

Analyzing all manipulation events.


## Enforce closure if needed

In [6]:
if closure:
    print('Enforcing closure.')
    # A row is kept only when its behavior before the event AND its behavior
    # after the event are both in the user-supplied lists.
    before_closure = data['beh_before'].isin(beh_before).to_numpy()
    after_closure = data['beh_after'].isin(beh_after).to_numpy()

    # Store the row mask under its own name: assigning it to `closure` would
    # replace the boolean parameter with an array, so re-running this cell
    # would fail on `if closure:` (ambiguous truth value of an array).
    closure_mask = np.logical_and(before_closure, after_closure)

    data = data[closure_mask]

Enforcing closure.


## Get rid of rows of data that have no behavior of interest

In [7]:
# A row is discarded only when NEITHER its before-behavior nor its
# after-behavior appears in the lists of behaviors of interest.
before_ignore = ~data['beh_before'].isin(beh_before).to_numpy()
after_ignore = ~data['beh_after'].isin(beh_after).to_numpy()

ignore_rows = before_ignore & after_ignore

data = data[~ignore_rows]

## Get groups of data (a group corresponds to each subject)

In [8]:
# Give every row an integer-valued group label: rows with the same subject_id
# share a label, numbered in order of first appearance of the subject.
subject_ids = data['subject_id']
unique_ids = subject_ids.unique()

g = np.zeros(len(data))
for group_ind, subject in enumerate(unique_ids):
    rows_of_subject = (subject_ids == subject).to_numpy()
    g[rows_of_subject] = group_ind

## Pull out $\Delta F / F$

In [9]:
# Values of the 'dff_after' column as a numpy array; this is the response (y)
# that the bootstrapped linear regression below is fit to.
dff = data['dff_after'].to_numpy()

## Now we fit linear models with user specified options

In [10]:
# Build the regressors and their variable names from the event table
one_hot_data, one_hot_vars = one_hot_from_table(
    data,
    beh_before=beh_before,
    beh_after=beh_after,
    enc_beh_interactions=enc_beh_interactions,
    enc_subjects=enc_subjects,
)

# Without per-subject terms the regression is run with include_mean=True,
# so add a matching name for that extra coefficient.
if not enc_subjects:
    one_hot_vars.append('mean')

In [11]:
# Time the whole bootstrap so the cost of n_bs_smps samples can be reported
t_start = time.time()
reg_rs = grouped_linear_regression_boot_strap(
    x=one_hot_data,
    y=dff,
    g=g,
    n_bs_smps=n_bs_smps,
    include_mean=not enc_subjects,  # fit a mean term only when subjects are not encoded
)
t_stop = time.time()

## Get statistics

In [12]:
# Summarize the bootstrap results; the 'non_zero_p' entry of the returned
# mapping is what is reported as p-values below.
# NOTE(review): reg_rs[0] is presumably the bootstrap coefficient samples - confirm
# against grouped_linear_regression_boot_strap.
stats = grouped_linear_regression_boot_strap_stats(reg_rs[0])

## Print results

In [13]:
# Smallest p-value over all coefficients, and wall-clock time of the bootstrap
smallest_p_vl = np.min(stats['non_zero_p'])
comp_time = t_stop - t_start

print('Performed ', n_bs_smps, ' bootstrap samples in ', comp_time, ' seconds.', sep='')
print('Smallest p-value: ', smallest_p_vl, sep='')

Performed 10000000 bootstrap samples in 25144.7517683506 seconds.
Smallest p-value: 2e-07


In [14]:
# Display the p-value of every coefficient so individual entries can be inspected.
# NOTE(review): the recorded output contains entries above 1 (e.g. 1.0296086),
# which is outside the range of a probability - confirm how 'non_zero_p' is computed.
stats['non_zero_p']

array([2.7946600e-01, 6.1826000e-03, 1.0200000e-05, 1.2166000e-03,
       2.0000000e-07, 1.6292000e-03, 1.8914460e-01, 5.9930580e-01,
       9.8274220e-01, 1.0296086e+00, 6.7662600e-02, 3.8470000e-01,
       8.0931520e-01, 1.3114360e-01, 2.9795840e-01, 3.8682200e-01,
       9.5472940e-01, 4.8350280e-01, 7.0462820e-01, 6.0632400e-01,
       3.9099080e-01, 8.2364200e-01, 5.8610200e-01, 8.5840780e-01,
       4.7769820e-01, 3.9948460e-01, 4.0053700e-01, 6.5321380e-01,
       8.3633660e-01, 9.6939380e-01, 9.1711180e-01, 8.0633120e-01,
       4.1299320e-01, 7.7146020e-01, 6.3875940e-01, 7.1180120e-01,
       4.4948980e-01, 6.1988460e-01, 9.5792640e-01, 1.0677572e+00,
       5.7302040e-01, 5.3390720e-01, 5.7672480e-01, 5.9279080e-01,
       6.3670260e-01, 1.1420252e+00, 7.1598460e-01, 7.0768800e-01,
       7.1696060e-01, 7.3127860e-01, 1.2725548e+00, 9.2470940e-01,
       7.6938600e-01, 1.1914636e+00, 1.3423810e+00, 1.1254466e+00,
       7.2885900e-01, 7.2934480e-01, 7.2853800e-01, 7.2979180e