In [None]:
import pickle
import pandas as pd
import numpy as np
import itertools
import matplotlib.pyplot as plt
import glob as glob
import os
import sys; sys.path.append("../../..")  # Allows access to all the scripts/modules in the larger directory
from pyuoi.linear_model import UoI_L1Logistic
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import KFold
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import hamming_loss
from collections import defaultdict
from utils import calc_loadings
from scipy.stats import pearsonr
import plotly.graph_objects as go
from plotly.offline import init_notebook_mode, iplot

### Load consolidated decoding dataframe

In [None]:
# Absolute path to the pickled, consolidated ("glom") output of the degraded-decoding parameter search.
# NOTE(review): this is a machine-specific path; the notebook will not run elsewhere without editing it.
decoding_glom_path = '/home/marcush/Data/TsaoLabData/neural_control_output/degraded_decoding_param_search/degraded_decoding_param_search_glom.pickle'
# pickle.load can execute arbitrary code, so this file must come from a trusted source.
with open(decoding_glom_path, 'rb') as f:
    dat_decode = pickle.load(f) 

# Wrap the loaded records in a DataFrame. Cells below read the columns 'data_path', 'data_file',
# 'loader_args', 'dim', 'fold_idx', 'dimreduc_method', 'coef' and 'results_file'.
df_decode = pd.DataFrame(dat_decode)

In [None]:
# Take the first preloaded-data pickle found under the data directory recorded in the first row of df_decode.
# NOTE(review): glob order is not guaranteed and [0] raises IndexError when nothing matches — confirm
# that any preloaded file is acceptable here (the cells visible in this notebook only read 'degradedIDs' from it).
preloaded_data_path = glob.glob(df_decode['data_path'][0] + "/preloaded/preloaded_data_*.pickle")[0]
with open(preloaded_data_path, 'rb') as f:
    preload_dat = pickle.load(f) 

In [None]:
def make_hashable(d):
    """Convert a (possibly nested) dict into a sorted tuple of (key, value) pairs.

    Values that are themselves dicts are converted recursively; every other
    value is kept unchanged, so it must already be hashable for the result to
    be usable as a set member or dict key.
    """
    pairs = []
    for key, value in d.items():
        if isinstance(value, dict):
            value = make_hashable(value)
        pairs.append((key, value))
    pairs.sort()
    return tuple(pairs)


# Reduce the per-row loader_args dicts to their distinct values by hashing each one,
# then turn every distinct hash back into a plain dict.
unique_hashes = {make_hashable(d) for d in df_decode['loader_args']}
unique_dicts = list(map(dict, unique_hashes))

# Add the fields that are part of the preload key but not stored in loader_args:
# the full path of the data file (taken from the first row) and the two thresholds, left unset.
for u in unique_dicts:
    u.update(
        data_path=df_decode['data_path'][0] + "/" + df_decode['data_file'][0],
        spike_threshold=None,
        trial_threshold=None,
    )

In [None]:
# Index file mapping a sorted tuple of loader arguments to the ID of the preloaded-data pickle built with them.
preload_dict_path = df_decode['data_path'][0] + "/preloaded/preloadDict.pickle"

# NOTE: pickle.load can execute arbitrary code; only load trusted files.
with open(preload_dict_path, 'rb') as file:
    preloadDict = pickle.load(file)

# Load the preloaded data for every region whose loader arguments appear in the index.
loaded_data_by_region = {}
for arg_dict in unique_dicts:
    # preloadDict is keyed by the sorted (key, value) tuple of the loader arguments.
    arg_tuple = tuple(sorted(arg_dict.items()))
    if arg_tuple not in preloadDict:
        continue

    preloadID = preloadDict[arg_tuple]
    loaded_data_path = os.path.dirname(preload_dict_path) + f"/preloaded_data_{preloadID}.pickle"

    if arg_dict['region'] in ('AM', 'ML'):
        with open(loaded_data_path, 'rb') as file:
            loaded_data_by_region[arg_dict['region']] = pickle.load(file)

# Fail with an explicit message instead of a NameError on the lines below.
missing_regions = [reg for reg in ('AM', 'ML') if reg not in loaded_data_by_region]
if missing_regions:
    raise KeyError(f"No preloaded data found via {preload_dict_path} for region(s): {missing_regions}")

AM_loaded_data = loaded_data_by_region['AM']
ML_loaded_data = loaded_data_by_region['ML']

# Collapse axis 1 of 'spike_rates'; the result is indexed as a 2-D (rows, features) array below.
# Presumably spike_rates is (trials, time bins, neurons) — TODO confirm the axis meaning.
AM_spikes = np.sum(AM_loaded_data['spike_rates'], 1)
ML_spikes = np.sum(ML_loaded_data['spike_rates'], 1)

In [None]:
# IDs read from the first preloaded file; not used again in the cells visible here.
degradedIDs = preload_dat['degradedIDs']
# Sorted unique values of the sweep parameters stored in the decoding dataframe.
dimensions = np.unique(df_decode['dim'])
# NOTE: despite the name, this is the array of distinct fold indices, not a count.
n_folds = np.unique(df_decode['fold_idx'])
regions = np.unique(df_decode['loader_args'].apply(lambda x: x.get('region')))
dimreduc_methods = np.unique(df_decode['dimreduc_method'])
# Per-trial labels used as decoding targets below. They are taken from the AM data and are also
# paired with ML_spikes — assumes both regions share the same trials in the same order; TODO confirm.
stimIDs = AM_loaded_data['StimIDs']
degraded_trial_IDs = AM_loaded_data['stratifiedIDs']

In [None]:
# Quick look at which fields the preloaded data object provides.
preload_dat.keys()

# Decoding whether a trial is degraded vs. clear (using sklearn's logistic regression with its default L2 regularization)

In [None]:
# 10-fold cross-validated logistic regression decoding degraded vs clear trials from AM activity.
num_splits = 10
# NOTE(review): KFold is used without shuffling, so each fold is a contiguous run of rows —
# confirm that trial order is not confounded with the labels.
kf = KFold(num_splits)
AM_weights_degclear = np.zeros((num_splits, AM_spikes.shape[1]))  # per-fold |coef| for each feature
losses = np.zeros(num_splits)  # per-fold Hamming loss on the held-out rows

for i, (train_index, test_index) in enumerate(kf.split(AM_spikes)):

    Xtrain = AM_spikes[train_index,:]
    Ytrain = degraded_trial_IDs[train_index]

    Xtest = AM_spikes[test_index,:]
    Ytest = degraded_trial_IDs[test_index]

    # Standardize with statistics from the training fold only and apply that same
    # transform to the test fold. Re-fitting the scaler on the test fold would put
    # train and test features on different scales.
    scaler = StandardScaler()
    Xtrain = scaler.fit_transform(Xtrain)
    Xtest = scaler.transform(Xtest)

    clf = LogisticRegression().fit(Xtrain, Ytrain)
    predictions = clf.predict(Xtest)
    loss = hamming_loss(Ytest, predictions)
    losses[i] = loss

    # Feature weight for this fold: mean absolute coefficient over the rows of clf.coef_.
    AM_weights_degclear[i, :] = np.mean(np.abs(clf.coef_), 0)

# Average the per-fold weights into one value per feature.
AM_weights_degclear = np.mean(AM_weights_degclear, 0)
print(f"Average loss for region AM on degraded vs clear: {np.mean(losses)}")

In [None]:
# 10-fold cross-validated logistic regression decoding degraded vs clear trials from ML activity.
num_splits = 10
# NOTE(review): KFold is used without shuffling, so each fold is a contiguous run of rows —
# confirm that trial order is not confounded with the labels.
kf = KFold(num_splits)
ML_weights_degclear = np.zeros((num_splits, ML_spikes.shape[1]))  # per-fold |coef| for each feature
losses = np.zeros(num_splits)  # per-fold Hamming loss on the held-out rows

for i, (train_index, test_index) in enumerate(kf.split(ML_spikes)):

    Xtrain = ML_spikes[train_index,:]
    Ytrain = degraded_trial_IDs[train_index]

    Xtest = ML_spikes[test_index,:]
    Ytest = degraded_trial_IDs[test_index]

    # Standardize with statistics from the training fold only and apply that same
    # transform to the test fold. Re-fitting the scaler on the test fold would put
    # train and test features on different scales.
    scaler = StandardScaler()
    Xtrain = scaler.fit_transform(Xtrain)
    Xtest = scaler.transform(Xtest)

    clf = LogisticRegression().fit(Xtrain, Ytrain)
    predictions = clf.predict(Xtest)
    loss = hamming_loss(Ytest, predictions)
    losses[i] = loss

    # Feature weight for this fold: mean absolute coefficient over the rows of clf.coef_.
    ML_weights_degclear[i, :] = np.mean(np.abs(clf.coef_), 0)

# Average the per-fold weights into one value per feature.
ML_weights_degclear = np.mean(ML_weights_degclear, 0)
print(f"Average loss for region ML on degraded vs clear: {np.mean(losses)}")

# Decoding stim ID

In [None]:
# 10-fold cross-validated multinomial logistic regression decoding stimulus ID from AM activity.
num_splits = 10
# NOTE(review): KFold is used without shuffling, so each fold is a contiguous run of rows —
# confirm that trial order is not confounded with the labels.
kf = KFold(num_splits)
AM_weights_stimID = np.zeros((num_splits, AM_spikes.shape[1]))  # per-fold |coef| for each feature
losses = np.zeros(num_splits)  # per-fold Hamming loss on the held-out rows

for i, (train_index, test_index) in enumerate(kf.split(AM_spikes)):

    Xtrain = AM_spikes[train_index,:]
    Ytrain = stimIDs[train_index]

    Xtest = AM_spikes[test_index,:]
    Ytest = stimIDs[test_index]

    # Standardize with statistics from the training fold only and apply that same
    # transform to the test fold. Re-fitting the scaler on the test fold would put
    # train and test features on different scales.
    scaler = StandardScaler()
    Xtrain = scaler.fit_transform(Xtrain)
    Xtest = scaler.transform(Xtest)

    clf = LogisticRegression(multi_class="multinomial").fit(Xtrain, Ytrain)
    predictions = clf.predict(Xtest)
    loss = hamming_loss(Ytest, predictions)
    losses[i] = loss

    # Feature weight for this fold: mean absolute coefficient over the rows of clf.coef_.
    AM_weights_stimID[i, :] = np.mean(np.abs(clf.coef_), 0)

# Average the per-fold weights into one value per feature.
AM_weights_stimID = np.mean(AM_weights_stimID, 0)
# The message previously said "ML" although this cell decodes from AM.
print(f"Average loss for region AM on Stim IDs: {np.mean(losses)}")

In [None]:
# 10-fold cross-validated multinomial logistic regression decoding stimulus ID from ML activity.
num_splits = 10
# NOTE(review): KFold is used without shuffling, so each fold is a contiguous run of rows —
# confirm that trial order is not confounded with the labels.
kf = KFold(num_splits)
ML_weights_stimID = np.zeros((num_splits, ML_spikes.shape[1]))  # per-fold |coef| for each feature
losses = np.zeros(num_splits)  # per-fold Hamming loss on the held-out rows

for i, (train_index, test_index) in enumerate(kf.split(ML_spikes)):

    Xtrain = ML_spikes[train_index,:]
    Ytrain = stimIDs[train_index]

    Xtest = ML_spikes[test_index,:]
    Ytest = stimIDs[test_index]

    # Standardize with statistics from the training fold only and apply that same
    # transform to the test fold. Re-fitting the scaler on the test fold would put
    # train and test features on different scales.
    scaler = StandardScaler()
    Xtrain = scaler.fit_transform(Xtrain)
    Xtest = scaler.transform(Xtest)

    clf = LogisticRegression(multi_class="multinomial").fit(Xtrain, Ytrain)
    predictions = clf.predict(Xtest)
    loss = hamming_loss(Ytest, predictions)
    losses[i] = loss

    # Feature weight for this fold: mean absolute coefficient over the rows of clf.coef_.
    ML_weights_stimID[i, :] = np.mean(np.abs(clf.coef_), 0)

# Average the per-fold weights into one value per feature.
ML_weights_stimID = np.mean(ML_weights_stimID, 0)
print(f"Average loss for region ML on Stim IDs: {np.mean(losses)}")

# Comparing importance scores to logistic regression weights for binary classification (degraded vs clear)

In [None]:
# The 'results_file' entry of the first decoding row is used to locate the saved CCA model.
ccamodel_dir = df_decode['results_file'][0]
# Number of CCA dimensions of the model to load; set to None to pick the first CCA_*.pickle found.
CCA_dims = 25 #None

if CCA_dims is None:
    # NOTE(review): this branch globs inside `ccamodel_dir` itself, while the branch below looks in
    # its parent directory — confirm which location is intended. [0] raises IndexError on no match.
    ccamodel_path = glob.glob(ccamodel_dir+"/CCA_*.pickle")[0]
else:
    ccamodel_path = os.path.dirname(ccamodel_dir)+f"/CCA_{CCA_dims}_dims.pickle"

# NOTE: pickle.load can execute arbitrary code; only load trusted files.
with open(ccamodel_path, 'rb') as file:
    ccamodel = pickle.load(file)

In [None]:
def recursive_defaultdict():
    """Return a defaultdict whose missing keys are filled with further recursive defaultdicts.

    This allows assignments such as ``d[a][b][c] = value`` without creating the
    intermediate levels first.
    """
    nested = defaultdict(recursive_defaultdict)
    return nested
# importance_scores[region][method][dim] -> one importance value per feature of that region.
importance_scores = recursive_defaultdict()

# Region of every decoding row, computed once instead of once per (dim, method, fold).
row_regions = df_decode['loader_args'].apply(lambda x: x.get('region'))

for reg in regions:
    # ML uses the x-side CCA rotations; every other region uses the y-side.
    if reg == 'ML':
        cca_proj = ccamodel.x_rotations_
    else:
        cca_proj = ccamodel.y_rotations_

    region_mask = row_regions == reg

    for dim in dimensions:

        # CCA importance: loadings of the first `dim` columns of the rotation matrix.
        importance_scores[reg]['CCA'][dim] = calc_loadings(cca_proj[:, 0:dim])

        for method in dimreduc_methods:
            all_scores = np.zeros((len(n_folds), cca_proj.shape[0]))

            # Fill rows by position so the fold labels need not be exactly 0..len(n_folds)-1.
            for fold_row, n_fold in enumerate(n_folds):

                # First matching row's 'coef' for this (region, dim, fold, method);
                # .iloc[0] raises IndexError when the combination is absent.
                coef = df_decode[region_mask &
                                 (df_decode['dim'] == dim) & (df_decode['fold_idx'] == n_fold) &
                                 (df_decode['dimreduc_method'] == method)]['coef'].iloc[0]

                all_scores[fold_row, :] = calc_loadings(coef)

            # Average the per-fold loadings into one score per feature.
            importance_scores[reg][method][dim] = np.mean(all_scores, 0)



In [None]:
# Compare the AM binary-decoder weights with each subspace method's importance scores
# at one fixed dimensionality (FFC is stored under 'PCA', FBC under 'LQGCA').
region = 'AM'
DIM = 39
reg_importance = AM_weights_degclear
FFC_importance = importance_scores[region]['PCA'][DIM]
FBC_importance = importance_scores[region]['LQGCA'][DIM]
CCA_importance = importance_scores[region]['CCA'][DIM]

# Pearson correlation of the regression weights with each importance vector.
# p_value is overwritten on every call and ends up holding the CCA p-value.
RegVFFC_corr, p_value = pearsonr(reg_importance, FFC_importance)
RegVFBC_corr, p_value = pearsonr(reg_importance, FBC_importance)
RegVCCA_corr, p_value = pearsonr(reg_importance, CCA_importance)

# One scatter plot per method, in the order FFC, FBC, CCA.
panels = (
    ('FFC', FFC_importance, RegVFFC_corr),
    ('FBC', FBC_importance, RegVFBC_corr),
    ('CCA', CCA_importance, RegVCCA_corr),
)
for method_label, method_scores, method_corr in panels:
    plt.scatter(reg_importance, method_scores, marker='x', s=2, color='r')
    plt.title(f'Region {region}: {method_label} vs Regression Correlation: {np.round(method_corr, 3)}')
    plt.xlabel('Logistic Regression Coefficients (Binary, Degraded v Clear)')
    plt.ylabel(f'{method_label} Importance Scores')
    plt.show()


In [None]:
# Compare the ML binary-decoder weights with each subspace method's importance scores
# at one fixed dimensionality (FFC is stored under 'PCA', FBC under 'LQGCA').
region = 'ML'
DIM = 21
reg_importance = ML_weights_degclear
FFC_importance = importance_scores[region]['PCA'][DIM]
FBC_importance = importance_scores[region]['LQGCA'][DIM]
CCA_importance = importance_scores[region]['CCA'][DIM]

# Pearson correlation of the (squeezed) regression weights with each importance vector.
# p_value is overwritten on every call and ends up holding the CCA p-value.
RegVFFC_corr, p_value = pearsonr(np.squeeze(reg_importance), FFC_importance)
RegVFBC_corr, p_value = pearsonr(np.squeeze(reg_importance), FBC_importance)
RegVCCA_corr, p_value = pearsonr(np.squeeze(reg_importance), CCA_importance)

# One scatter plot per method, in the order FFC, FBC, CCA.
panels = (
    ('FFC', FFC_importance, RegVFFC_corr),
    ('FBC', FBC_importance, RegVFBC_corr),
    ('CCA', CCA_importance, RegVCCA_corr),
)
for method_label, method_scores, method_corr in panels:
    plt.scatter(reg_importance, method_scores, marker='x', s=2, color='r')
    plt.title(f'Region {region}: {method_label} vs Regression Correlation: {np.round(method_corr, 3)}')
    plt.xlabel('Logistic Regression Coefficients (Binary, Degraded v Clear)')
    plt.ylabel(f'{method_label} Importance Scores')
    plt.show()


# Comparing importance scores to logistic regression weights for classification on stimIDs using sklearn's LogisticRegression

In [None]:
# Compare the AM stimulus-ID decoder weights with each subspace method's importance scores
# at one fixed dimensionality (FFC is stored under 'PCA', FBC under 'LQGCA').
region = 'AM'
DIM = 39
FFC_importance = importance_scores[region]['PCA'][DIM]
FBC_importance = importance_scores[region]['LQGCA'][DIM]
CCA_importance = importance_scores[region]['CCA'][DIM]

# Pearson correlation of the regression weights with each importance vector.
# p_value is overwritten on every call and ends up holding the CCA p-value.
RegVFFC_corr, p_value = pearsonr(AM_weights_stimID, FFC_importance)
RegVFBC_corr, p_value = pearsonr(AM_weights_stimID, FBC_importance)
RegVCCA_corr, p_value = pearsonr(AM_weights_stimID, CCA_importance)

# One scatter plot per method, in the order FFC, FBC, CCA.
panels = (
    ('FFC', FFC_importance, RegVFFC_corr),
    ('FBC', FBC_importance, RegVFBC_corr),
    ('CCA', CCA_importance, RegVCCA_corr),
)
for method_label, method_scores, method_corr in panels:
    plt.scatter(AM_weights_stimID, method_scores, marker='x', s=2, color='r')
    plt.title(f'Region {region}: {method_label} vs Regression Correlation: {np.round(method_corr, 3)}')
    plt.xlabel('LogReg Regression Coefficients (Multinomial, StimIDs)')
    plt.ylabel(f'{method_label} Importance Scores')
    plt.show()


In [None]:
# Compare the ML stimulus-ID decoder weights with each subspace method's importance scores
# at one fixed dimensionality (FFC is stored under 'PCA', FBC under 'LQGCA').
region = 'ML'
DIM = 21
FFC_importance = importance_scores[region]['PCA'][DIM]
FBC_importance = importance_scores[region]['LQGCA'][DIM]
CCA_importance = importance_scores[region]['CCA'][DIM]


# Pearson correlation of the regression weights with each importance vector.
# p_value is overwritten on every call and ends up holding the CCA p-value.
RegVFFC_corr, p_value = pearsonr(ML_weights_stimID, FFC_importance)
RegVFBC_corr, p_value = pearsonr(ML_weights_stimID, FBC_importance)
RegVCCA_corr, p_value = pearsonr(ML_weights_stimID, CCA_importance)



plt.scatter(ML_weights_stimID, FFC_importance, marker='x', s=2, color='r')
plt.title(f'Region {region}: FFC vs Regression Correlation: {np.round(RegVFFC_corr, 3)}')
plt.xlabel('LogReg Regression Coefficients (Multinomial, StimIDs)')
plt.ylabel(f'FFC Importance Scores')
plt.show()


plt.scatter(ML_weights_stimID, FBC_importance, marker='x', s=2, color='r')
# Fixed: the title referenced the undefined name `RegbVFBC_corr`, which raised a NameError.
plt.title(f'Region {region}: FBC vs Regression Correlation: {np.round(RegVFBC_corr, 3)}')
plt.xlabel('LogReg Regression Coefficients (Multinomial, StimIDs)')
plt.ylabel(f'FBC Importance Scores')
plt.show()


plt.scatter(ML_weights_stimID, CCA_importance, marker='x', s=2, color='r')
plt.title(f'Region {region}: CCA vs Regression Correlation: {np.round(RegVCCA_corr, 3)}')
plt.xlabel('LogReg Regression Coefficients (Multinomial, StimIDs)')
plt.ylabel(f'CCA Importance Scores')
plt.show()


### Plotting correlations against each other

In [None]:
# Correlations per condition, ordered [FFC, FBC, CCA] to match the x/y/z axes below.
# These are hard-coded literals, presumably copied from the outputs of the comparison cells
# above; they are NOT recomputed when those cells change — TODO confirm / derive from variables.
AM_binary = [0.209, -0.018, 0.299]
AM_multiclass = [-0.08, -0.083, 0.048]

ML_binary = [0.123, -0.187, 0.194]
ML_multiclass = [0.269, -0.43, 0.282]

# Rows: method (FFC, FBC, CCA); columns: condition, in the same order as `labels`.
corrs = np.zeros((3, 4))
corrs[:, 0] = AM_binary
corrs[:, 1] = AM_multiclass
corrs[:, 2] = ML_binary
corrs[:, 3] = ML_multiclass

x =  corrs[0,:]
y = corrs[1,:]
z = corrs[2,:]

# NOTE(review): marker colors are given as nested RGBA lists — confirm plotly renders these as
# intended rather than requiring 'rgba(...)' strings.
colors = [[1,0,0, 1],[1,0,0, 0.6],[0,0,1, 1],[0,0,1, 0.6]]
labels = ["AM binary", "AM multiclass", "ML binary", "ML multiclass"]

init_notebook_mode(connected=True)
fig = go.Figure(data=[go.Scatter3d( x=x, y=y, z=z, mode='markers', marker=dict( size=5, color=colors, opacity=0.8 ), text=labels, textposition='top center'  )])

axis_limits = [-0.5,0.5]
# The title no longer interpolates `region`: that variable is left over from an earlier cell,
# while this figure shows both the AM and the ML points.
fig.update_layout( scene=dict( xaxis=dict(title='FFC vs Reg Weight', range=axis_limits, zeroline=True, zerolinewidth=2, zerolinecolor='black', showline=True, showgrid=True, gridcolor='lightgray'), 
                              yaxis=dict(title='FBC vs Reg Weight',  range=axis_limits, zeroline=True, zerolinewidth=2, zerolinecolor='black', showline=True, showgrid=True, gridcolor='lightgray'),  
                              zaxis=dict(title='CCA vs Reg Weight', range=axis_limits, zeroline=True, zerolinewidth=2, zerolinecolor='black', showline=True, showgrid=True, gridcolor='lightgray'),
                              camera=dict(eye=dict(x=1.25, y=1.25, z=1.25)) ), title='AM and ML subspace importance scores vs regression weight', height=500 )


fig.show()
# NOTE(review): the output file name says "ML" although the figure contains both regions;
# left unchanged in case something else reads this file.
fig.write_html("ML_importance_scores.html")
#iplot(fig)


# Tried using Union of Intersections (UoI) decoding, but the code base is pretty outdated/doesn't perform that well
### The idea was that we have data from two different areas that have different levels of face selectivity, different numbers of neurons, and various other uncontrolled parameters. How, then, are we to compare differences in their decoding accuracy given these biases? The hope was that UoI regression would account for these biases and give a best-case, unbiased estimate of accuracy.
### Unfortunately, for binary decoding (where we have a serious, 90/10 imbalance of degraded vs clear trial types) it just outputs that all trials are degraded, and for multiclass classification (stimulus ID) the code simply does not run/returns syntax errors from deep in the code
### Here is the code that I had been using:


In [None]:
# Load previously saved UoI models from the directory containing the first row's results file.
# NOTE(review): these are the same paths that the save cell further down writes, so on a fresh
# checkout the files may not exist yet; the loaded objects are also overwritten by later cells.
# pickle.load can execute arbitrary code; only load trusted files.
uoi_ML_save_path = os.path.dirname(df_decode['results_file'][0]) + f"/UOI_ML.pickle"
with open(uoi_ML_save_path, 'rb') as file:
    uoi_ML = pickle.load(file)


uoi_AM_save_path = os.path.dirname(df_decode['results_file'][0]) + f"/UOI_AM.pickle"
with open(uoi_AM_save_path, 'rb') as file:
    uoi_AM = pickle.load(file)

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Score a plain logistic regression with UoI's BIC scoring helper on the AM degraded-vs-clear task.
X = AM_spikes
y = degraded_trial_IDs
indices = np.arange(len(X))

# Stratified 80/20 split; `indices` is split alongside X and y so the train/test row numbers are kept.
X_train, X_test, y_train, y_test, train_indices, test_indices = train_test_split(X, y, indices, test_size=0.2, random_state=42, stratify=y)
    

# Logistic regression fit on the training rows (features are not standardized in this cell).
fitter = LogisticRegression().fit(X_train, y_train)
# Boolean mask that selects every feature.
support = np.ones(X.shape[1]).astype(bool)


# Unfitted UoI estimator; this replaces any uoi_AM loaded earlier.
uoi_AM = UoI_L1Logistic(estimation_score='BIC')
assert uoi_AM._estimation_target == 0
# classes_ is assigned by hand because the estimator is never fit in this cell.
uoi_AM.classes_ = np.unique(y)
# NOTE(review): _score_predictions is a private pyuoi method; its argument order and the sign
# convention behind the -1 factor are not visible here — confirm against the installed pyuoi version.
score = -1 * uoi_AM._score_predictions('BIC', fitter, X, y, support, (train_indices, test_indices))

In [None]:
# Add up the predicted class probabilities over the held-out rows; the built-in sum adds the rows
# together, giving one total per column if predict_proba returns a 2-D array.
# NOTE(review): when cells run top to bottom, uoi_AM is the unfitted estimator created in the
# previous cell, so predict_proba is expected to fail here — confirm the intended cell order.
sum(uoi_AM.predict_proba(AM_spikes[test_indices, :]), 0)

In [None]:
# Fit UoI L1-logistic regression on the training portion of a stratified 80/20 split of the AM data.
X = AM_spikes
y = degraded_trial_IDs
indices = np.arange(len(X))

# Same split parameters (test_size=0.2, random_state=42, stratify=y) as the manual-scoring cell above.
X_train, X_test, y_train, y_test, train_indices, test_indices = train_test_split(X, y, indices, test_size=0.2, random_state=42, stratify=y)

# Fit on the training rows only; this replaces the previous uoi_AM object.
uoi_AM = UoI_L1Logistic(estimation_score='BIC', estimation_target='train', random_state=10).fit(AM_spikes[train_indices, :], degraded_trial_IDs[train_indices])


In [None]:
# Refit on all rows (no held-out split) for both regions with identical settings.
# Both fits use degraded_trial_IDs, which is read from the AM data — assumes ML rows are the same
# trials in the same order; TODO confirm.
uoi_AM = UoI_L1Logistic(estimation_score='BIC', estimation_target='train', random_state=10).fit(AM_spikes, degraded_trial_IDs)
uoi_ML = UoI_L1Logistic(estimation_score='BIC',  estimation_target='train', random_state=10).fit(ML_spikes, degraded_trial_IDs)


In [None]:
# Save the fitted UoI models next to the first row's results file, overwriting any existing
# UOI_ML.pickle / UOI_AM.pickle (the files an earlier cell in this notebook loads).
uoi_ML_save_path = os.path.dirname(df_decode['results_file'][0]) + f"/UOI_ML.pickle"
with open(uoi_ML_save_path, 'wb') as file:
    pickle.dump(uoi_ML, file)


uoi_AM_save_path = os.path.dirname(df_decode['results_file'][0]) + f"/UOI_AM.pickle"
with open(uoi_AM_save_path, 'wb') as file:
    pickle.dump(uoi_AM, file)

In [None]:
# NOTE(review): this is a bare attribute reference with no call, so the cell only displays the
# `score` attribute itself — confirm whether a call such as score(X, y) was intended.
uoi_AM.score