In [1]:
import numpy as np
import pandas as pd
import sys
import os 
if os.path.basename(os.getcwd()) == 'notebooks':
    os.chdir('..')
import glob

import sys
print(sys.path)

from sglm import utils, glm_fit

#d1 - T430
#d2 - T434

['/Users/liviamurray/photometry_preprocessing/sabatini-glm-workflow/notebooks', '/Users/liviamurray/opt/anaconda3/envs/photometry/lib/python311.zip', '/Users/liviamurray/opt/anaconda3/envs/photometry/lib/python3.11', '/Users/liviamurray/opt/anaconda3/envs/photometry/lib/python3.11/lib-dynload', '', '/Users/liviamurray/opt/anaconda3/envs/photometry/lib/python3.11/site-packages']


## Create a project

#### First, let's create a new project. The project directory will create a data and results folder and a config file.

#### You will need to edit the config file with the particular glm params you wish to use. Fields that are necessary to edit are: predictors, predictors_shift_bounds, response, and the glm_keyword_args.

#### You will also need to move your data into the data folder.

In [2]:
project_name = 'D1_all_glm'
project_dir = r'/Volumes/Neurobio/MICROSCOPE/Livia/glm_output'

utils.create_new_project(project_name, project_dir)

Project directory already exists!


'/Volumes/Neurobio/MICROSCOPE/Livia/glm_output/D1_all_glm/config.yaml'

# Import and Format Data

Input data should conform to the following convention and be saved as a *.csv:

Indices / Unique Row Identifiers:
* SessionName -- Any order is acceptable
* TrialNumber-- Must be in chronological order, but does not need to start from zero
* Timestamp -- Must be in chronological order, but does not need to start from zero

Columns (Predictors + Responses):
* Predictors - binary
* Reponses - e.g. neural responses (analog or binary)

Example, shown below is dummy data depicting a trial_0 that last four response timestamps:
| SessionName | TrialNumber | Timestamp | predictor_1 | predictor_2 | predictor_3 | response_1 | response_2 |
| --- | --- | --- | --- | --- | --- | --- | --- |
| session_0 | trial_0 | -1 | 0 | 0 | 0 | 1 | 0.3 |
| session_0 | trial_0 | 0 | 0 | 0 | 0 | 0 | 1.4 |
| session_0 | trial_0 | 1 | 0 | 0 | 0 | 1 | 2.3 |
| session_0 | trial_0 | 2 | 0 | 1 | 0 | 1 | 0.3 |
| session_0 | trial_1 | -2 | 0 | 0 | 0 | 0 | 1.4 |
| session_0 | trial_1 | -1 | 0 | 0 | 0 | 1 | 2.3 |
| session_0 | trial_1 | 0 | 1 | 0 | 0 | 0 | 1.4 |
| session_0 | trial_1 | 1 | 0 | 0 | 0 | 1 | 2.3 |
| session_1 | trial_0 | 5 | 0 | 0 | 0 | 0 | 1.4 |
| session_1 | trial_0 | 6 | 1 | 0 | 0 | 1 | 2.3 |
| session_1 | trial_0 | 7 | 0 | 0 | 0 | 0 | 1.4 |
| session_1 | trial_0 | 8 | 0 | 0 | 0 | 1 | 2.3 |
| session_1 | trial_1 | 9 | 0 | 0 | 0 | 0 | 1.4 |
| session_1 | trial_1 | 10 | 0 | 0 | 0 | 1 | 2.3 |
....

#### Now, let's get set up to start our project

In [3]:
project_path = os.path.join(project_dir, project_name)
files = os.listdir(project_path)

assert 'data' in files, 'data folder not found! {}'.format(files)
assert 'results' in files, 'results folder not found! {}'.format(files)
assert 'config.yaml' in files, 'config.yaml not found! {}'.format(files)

#### If needed, use the following function to combine multiple sessions into one csv. You will need a filename you wish to call your output_csv

In [4]:
# output_csv = 'output.csv'
output_csv = 'D1_all_glmFormat.csv'

# utils.combine_csvs(project_path, output_csv)

#### Next, we'll open the data and set the columns you wish to use as fixed indices

In [5]:
input_file = os.path.join(project_path, 'data', output_csv)
index_col = ['SessionName', 'TrialNumber', 'Timestamp']

df = utils.read_data(input_file, index_col)

print('Your dataframe has {} rows and {} columns'.format(df.shape[0], df.shape[1]))
df

Your dataframe has 1789648 rows and 17 columns


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Cue,ENL_Licks_L,ENL_Licks_R,Select_L,Select_R,ENLP_L,ENLP_R,Consumption_R_R,Consumption_R_L,Cons_more_R_R,Cons_more_R_L,Consumption_UR_R,Consumption_UR_L,Cons_more_UR_R,Cons_more_UR_L,z_grnR,z_grnL
SessionName,TrialNumber,Timestamp,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
T429_2023_07_12,2.0,50.697830,0.0,0.0,0.0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,
T429_2023_07_12,2.0,50.718474,0.0,0.0,0.0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,
T429_2023_07_12,2.0,50.739118,0.0,0.0,0.0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,
T429_2023_07_12,2.0,50.759762,0.0,0.0,0.0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,
T429_2023_07_12,2.0,50.780406,0.0,0.0,0.0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
T442_2023_10_04,6913.0,2310.062899,0.0,0.0,0.0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.548984,-0.562397
T442_2023_10_04,6913.0,2310.083543,0.0,0.0,0.0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.615215,-0.683538
T442_2023_10_04,6913.0,2310.104187,0.0,0.0,0.0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.679485,-0.781860
T442_2023_10_04,6913.0,2310.124831,0.0,0.0,0.0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.705743,-0.835182


#### You can now explore and add to the dataframe. As an example, you may want to add various "predictors" or "features" to explore. You can use the example below as inspiration

In [6]:
# #Identify the individual licks that have specific meaning in the tasks: 
# #lick 1, lick 2 and lick 3 are "operant licks" on different training days
# #licknon1-3 are all the other licks

# df_source = df.copy()
# srs_lick = df_source.groupby(['SessionName', 'TrialNumber'])['Lick'].cumsum()
# srs_lick_count = srs_lick * df_source['Lick']
# df_lick_count_dummies = pd.get_dummies(srs_lick_count).drop(0, axis=1)
# df_lick_count_dummies = df_lick_count_dummies[[1,2,3]]
# df_lick_count_dummies['non1-3'] = df_source['Lick'] - df_lick_count_dummies.sum(axis=1)
# df_lick_count_dummies.columns = [f'lick_{original_column_name}' for original_column_name in df_lick_count_dummies.columns]

# # Columns lick and lick_1, lick_2, lick_3, lick_non-13 should not all be used together
# # as predictors because of multicollinearity.
# df_source = pd.concat([df_source, df_lick_count_dummies], axis=1)
# df_source

# assert np.all(df_source['Lick'] == df_source[['lick_1', 'lick_2', 'lick_3', 'lick_non1-3']].sum(axis=1)), 'Column lick should equal the sum of all other lick columns.'

#### Friendly reminder, the df we have imported is mutli-index, meaning, it's organization is dependent on 3-columns that we have set in index_col. Therefore, we can use "groupby" if you need to split the organization. 

In [7]:
# reIndex = df_source.groupby(level=[0, 1])

## Load your fitting paramaters and set up your train/test data

In [8]:
config_file = os.path.join(project_path, 'config.yaml')
config = utils.load_config(config_file)
config

{'Project': {'project_name': 'D1_all_glm',
  'project_path': '/Volumes/Neurobio/MICROSCOPE/Livia/glm_output/D1_all_glm'},
 'glm_params': {'glm_keyword_args': {'alpha': [0.05,
    0.075,
    0.08,
    0.1,
    0.2,
    0.3,
    0.4,
    0.5,
    0.6,
    0.7,
    0.8],
   'cv': 5,
   'fit_intercept': True,
   'l1_ratio': [0.005,
    0.0075,
    0.008,
    0.009,
    0.01,
    0.02,
    0.03,
    0.04,
    0.05,
    0.075,
    0.08,
    0.1],
   'max_iter': 10000,
   'n_alphas': 100,
   'n_jobs': -1,
   'score_metric': 'r2',
   'selection': 'cyclic',
   'warm_start': False},
  'predictors': ['Cue',
   'ENL_Licks_L',
   'ENL_Licks_R',
   'Select_L',
   'Select_R',
   'ENLP_L',
   'ENLP_R',
   'Consumption_R_R',
   'Cons_more_R_R',
   'Consumption_R_L',
   'Cons_more_R_L',
   'Consumption_UR_R',
   'Cons_more_UR_R',
   'Consumption_UR_L',
   'Cons_more_UR_L'],
  'predictors_shift_bounds': {'Cue': [-3, 3],
   'ENL_Licks_L': [-3, 3],
   'ENL_Licks_R': [-3, 3],
   'Select_L': [-3, 3],
   'Sel

#### Shift responses and predictors. If you do not want to shift your predictors by an amount you set, feel free to comment out the entire "predictors_shift_bounds" in config.yaml. We will then use the default set when we created the config file.

In [9]:
response_shift, df_predictors_shift, shifted_params = glm_fit.shift_predictors(config, df)
print('Your dataframe was shifted using: {}'.format(shifted_params))


Your dataframe was shifted using: [('Cue', [-3, 3]), ('ENL_Licks_L', [-3, 3]), ('ENL_Licks_R', [-3, 3]), ('Select_L', [-3, 3]), ('Select_R', [-3, 3]), ('ENLP_L', [-3, 3]), ('ENLP_R', [-3, 3]), ('Consumption_R_R', [-3, 3]), ('Cons_more_R_R', [-3, 3]), ('Consumption_R_L', [-3, 3]), ('Cons_more_R_L', [-3, 3]), ('Consumption_UR_R', [-3, 3]), ('Cons_more_UR_R', [-3, 3]), ('Consumption_UR_L', [-3, 3]), ('Cons_more_UR_L', [-3, 3])]


In [10]:
response_shift 
temp = response_shift.values.flatten()

### Create your test/train datasets

In [11]:
X_train,X_test, y_train, y_test = glm_fit.split_data(df_predictors_shift, temp, config)

print('Training data has {} rows and {} columns'.format(X_train.shape[0], X_train.shape[1]))
print('Testing data has {} rows and {} columns'.format(X_test.shape[0], X_test.shape[1]))


Training data has 1317213 rows and 105 columns
Testing data has 329304 rows and 105 columns


In [12]:
# y_test1 = (y_test.values).flatten()


## Now, we're ready to run our GLM!

### We have two different options. If you know which params you would like to use, you can use the glm_fit.fit_glm function. If you would like tune your hyperparams to determine which are the best to use, you can use the glm_fit.fit_tuned_glm function. 

In [13]:
# # Fit the model
# model, y_pred, score, beta, intercept, sparse_beta = glm_fit.fit_glm(config, X_train, X_test, y_train, y_test)
# print('Your model can account for {} percent of your data'.format(score*100))

In [14]:
# Fit the model with cross validation: remember, your alphas and l1_ratios should be lists
tuned_model, y_pred, score, beta, best_params = glm_fit.fit_tuned_glm(config, X_train, X_test, y_train, y_test)
print('Your model can account for {} percent of your data'.format(score*100))

ypred: [-0.01895279 -0.01895279 -0.01895279 ... -0.01895279 -0.01895279
 -0.01895279]
y: [-1.23502479 -0.53241151 -0.43062487 ...  0.07630479  0.60218541
 -0.28818469]
Your model can account for 1.3622700810245414 percent of your data


In [15]:
best_params


{'alpha': 0.05, 'l1_ratio': 0.005}

1.0129437838088062 -state
1.002032663595287 -state
1.0056594783224626 -state


4.058554671042813 - reward


5.296818236678524 - alpha': 0.05, 'l1_ratio': 0.01, rewards
5.365408986535803 ^^


## Save your outputs

In [16]:
#Create your model dictonary, this should include all the information you wish to save
model_dict = {'model': model,
              'y_pred': y_pred,
              'score': score,
              'beta': beta,
              'intercept': intercept,
              'sparse_beta': sparse_beta,}

#Save your model dictionary
glm_fit.save_model(model_dict, config)

NameError: name 'model' is not defined

## Generate and save figures

In [None]:
# glm_fit.plot_and_save(config, y_pred, y_test, beta, df_predictors_shift)