# SVM Classification Scores
- Calculate SVM scores with the specific (randomly generated) splits used in Bayesian analysis classification
- calculate/ confirm these scores with Logistic regression model
- save these scores to seperate files


In [1]:
import numpy as np
import pandas as pd
from data_frame_tools import *
import matplotlib.pyplot as plt
from sklearn.svm import LinearSVC
from sklearn.model_selection import train_test_split 
import pickle
from sklearn.manifold import TSNE,MDS

In [2]:
from sklearn.linear_model import LogisticRegression
#get myself some svm scores
def SVM_one_neuron_changed(X, y, test_size = 1/5, num_splits=15):
    
    # Which SVM Optimization problem do we solve?
    n_samples = X.shape[0] * (1-test_size) # Number of spike trains in the training set
    n_features = X.shape[1]  # Number of time points in the spike trains
    dual_param = (n_samples < n_features)

    # Define the SVM model
    model_SVM = LinearSVC(dual=dual_param, max_iter=10000, tol = 0.0001, random_state=651)
    #add a tolerance here - what happens if tol is 0.001 instead of 0.0001?


    split_crs = [] 
    #for all five tastes:
    #splits = [805, 252, 43, 815, 318, 951, 405, 80, 53, 691, 137, 616, 788, 234, 875]
    #for 0 vs 1 AND 0 vs 4 AND four taste (no water):
    splits = [305, 428, 488, 211, 729, 914,  26,  30, 727, 377, 547, 541, 185, 833, 470]
    #for 2 vs 3 
    #splits = [473, 643, 408, 557, 269,  53, 319, 240, 111, 441, 699, 681, 180, 392,  58]

    for j in splits:                          # Use several splits of training and testing sets for robustness

        X_train,X_test,y_train,y_test = train_test_split(X,y,                 # This function is from sklearn
                                                         test_size = test_size,random_state= j, # Default: 2/3 of data to train and 1/3 to test
                                                         shuffle = True,
                                                         stratify = y)        # Sample from each taste

        model_SVM.fit(X_train,y_train)                   # Re-fit the classifier with the training set
        split_crs.append(model_SVM.score(X_test,y_test))  # Fit the testing set and record score

    svm_rate = np.mean(split_crs)  # After scores from each split have been obtained, 
                                                             # record the average

    return svm_rate


# This function returns one classification rate per neuron in the dataframe it is given.

def SVM_all_neurons_changed(dataFrame, test_size = 1/5, num_splits=15):
    
    if 'Trial' in dataFrame.columns:
        start_index = dataFrame.columns.get_loc('Trial') + 1
    else:
        start_index = 0
        
    # This will be the returned array, consisting of one classification rate per neuron
    all_SVM_rates = []

    # Iterate through all neurons
    for neuron in dataFrame['Neuron'].unique():
        
        neuron_df = dataFrame[dataFrame['Neuron']==neuron] # Select all spike trains from this neuron
        X = neuron_df.iloc[:,start_index:]  # X is the data. It has the shape (n_observations, n_times)
        y = np.array(neuron_df['Taste'])                           # y is the labels. We're classifying based on taste.
        
        all_SVM_rates.append(SVM_one_neuron_changed(X, y, test_size=test_size, num_splits=num_splits))

    return all_SVM_rates

def logistic_reg_one_neuron(X, y, test_size = 1/5, num_splits=15):
    
    #n_samples = X.shape[0] * (1-test_size) # Number of spike trains in the training set
    #n_features = X.shape[1]  # Number of time points in the spike trains
    #dual_param = (n_samples < n_features)
    

    split_crs = [] 
    #for all five tastes:
    #splits = [805, 252, 43, 815, 318, 951, 405, 80, 53, 691, 137, 616, 788, 234, 875]
    #for 0 vs 1 AND 0 vs 4 AND four tastes (no water):
    splits = [305, 428, 488, 211, 729, 914,  26,  30, 727, 377, 547, 541, 185, 833, 470]
    #for 2 vs 3 
    #splits = [473, 643, 408, 557, 269,  53, 319, 240, 111, 441, 699, 681, 180, 392,  58]

    for j in splits:                          # Use several splits of training and testing sets for robustness

        X_train,X_test,y_train,y_test = train_test_split(X,y,                 # This function is from sklearn
                                                         test_size = test_size,random_state= j, # Default: 2/3 of data to train and 1/3 to test
                                                         shuffle = True,
                                                         stratify = y)        # Sample from each taste

        clf = LogisticRegression(random_state=j).fit(X_train, y_train)
        #model_SVM.fit(X_train,y_train)                   # Re-fit the classifier with the training set
        split_crs.append(clf.score(X_test,y_test))  # Fit the testing set and record score

    log_reg_rate = np.mean(split_crs)  # After scores from each split have been obtained, 
                                                             # record the average

    return log_reg_rate

def logreg_all_neurons(dataFrame, test_size = 1/5, num_splits=15):
    
    if 'Trial' in dataFrame.columns:
        start_index = dataFrame.columns.get_loc('Trial') + 1
    else:
        start_index = 0
        
    # This will be the returned array, consisting of one classification rate per neuron
    all_logreg_rates = []

    # Iterate through all neurons
    for neuron in dataFrame['Neuron'].unique():
        
        neuron_df = dataFrame[dataFrame['Neuron']==neuron] # Select all spike trains from this neuron
        X = neuron_df.iloc[:,start_index:]  # X is the data. It has the shape (n_observations, n_times)
        y = np.array(neuron_df['Taste'])                           # y is the labels. We're classifying based on taste.
        
        all_logreg_rates.append(logistic_reg_one_neuron(X, y, test_size=test_size, num_splits=num_splits))

    return all_logreg_rates

In [3]:
def split_data_Li(neuronDF, licksDF):
    #this fctn takes dataframes and splits off the first 5 lI of data, writes in np arrray. 
    
    yvals = np.zeros(shape=(len(licksDF),1000))
    for row in range(len(licksDF)):
        y = np.zeros(1000)
        
        l = np.array(licksDF.iloc[row, 4:]).astype(int)
        ltimes = np.nonzero(l)
        
        nfires = np.array(neuronDF.iloc[row, 4:]).astype(float)
        
        if len(ltimes[0]) >= 6:
            nfires = nfires[ltimes[0][0]:ltimes[0][5]]
            li_len = ltimes[0][5] - ltimes[0][0]
        else:
            nfires = nfires
            li_len = len(nfires)
        
        
        if sum(nfires) != 0.0:
        #tspikes tells you times of where spikes should fall
            n=np.nonzero(nfires)
            tspikes = np.array([nf/li_len for nf in n])
            #(np.nonzero(nfires)/(li_len)) #+ li
            tspikes = ((np.round(tspikes, decimals=3))*1000).astype(int)
        
            for i in tspikes[0]:
                if 0 <= i < 1000:
                    y[i] = 1.0
        
            yvals[row,:] = y
    
    return yvals

In [4]:
clean_spike_train_df =pd.read_pickle(
'C:/Users/nasha/OneDrive - Florida State University/TomRichard/Paper/dataFolder/clean_spike_train_df.pickle')
clean_spike_train_df = clean_spike_train_df[~clean_spike_train_df['Taste'].isin([4])]

post_stim = truncate(clean_spike_train_df)
# ^ this function just takes the post-taste data, deleting any pre-taste data.
neuronDF = post_stim[(post_stim['Recording Type'] == 'Neuron')]
LickDF = post_stim[(post_stim['Recording Type'] == 'Lick')]

redo svm with normalized data for only 1000 / 5LI only. 

In [5]:
normalized_short_spikes = split_data_Li(neuronDF, LickDF)
normalized_short_spikes.shape

(44248, 1000)

In [6]:
norm_short = pd.concat([neuronDF.iloc[:,:4].reset_index(drop=True),pd.DataFrame(normalized_short_spikes)], axis = 1)

In [7]:
norm_short

Unnamed: 0,Recording Type,Taste,Neuron,Trial,0,1,2,3,4,5,...,990,991,992,993,994,995,996,997,998,999
0,Neuron,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,Neuron,0,0,1,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,Neuron,0,0,2,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,Neuron,0,0,3,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,Neuron,0,0,4,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
44243,Neuron,3,528,17,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
44244,Neuron,3,528,18,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
44245,Neuron,3,528,19,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
44246,Neuron,3,528,20,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [8]:
smoothing_window = 100

smoothed_spikes = smooth_all_spike_trains(norm_short, smoothing_window)

log_reg_rates = logreg_all_neurons(smoothed_spikes)

In [9]:
log_reg_rates

[0.6851851851851852,
 0.36666666666666675,
 0.3518518518518519,
 0.28888888888888886,
 0.32222222222222224,
 0.2375,
 0.3,
 0.23333333333333334,
 0.2833333333333333,
 0.25,
 0.25833333333333336,
 0.2875,
 0.17916666666666667,
 0.24814814814814815,
 0.4592592592592593,
 0.26666666666666666,
 0.2592592592592593,
 0.2740740740740741,
 0.2555555555555556,
 0.21481481481481482,
 0.3222222222222223,
 0.3074074074074074,
 0.33703703703703697,
 0.24074074074074076,
 0.29629629629629634,
 0.24814814814814815,
 0.22222222222222227,
 0.3777777777777779,
 0.4222222222222223,
 0.2777777777777778,
 0.28518518518518515,
 0.2555555555555556,
 0.25925925925925924,
 0.23703703703703702,
 0.2037037037037037,
 0.22222222222222227,
 0.23703703703703705,
 0.2333333333333333,
 0.2962962962962964,
 0.35555555555555557,
 0.35555555555555557,
 0.24074074074074076,
 0.4481481481481481,
 0.22962962962962966,
 0.4740740740740741,
 0.2444444444444444,
 0.24444444444444444,
 0.25,
 0.3541666666666667,
 0.41666666666

In [10]:
neurons = smoothed_spikes['Neuron'].unique()
n_neurons = len(neurons)

neuron_list = np.array([neurons])
neuron_list = neuron_list.reshape((neuron_list.shape[0]*neuron_list.shape[1]))



In [11]:
signal_types_list = np.array(['post-taste'] * n_neurons)

classification_Logreg_rate_array = np.zeros(shape=(len(log_reg_rates)))
classification_Logreg_rate_array[:n_neurons] = log_reg_rates
#classification_rate_array[n_neurons:(n_neurons*2)] = pre_taste_svm_rates
#classification_rate_array[(n_neurons*2):] = post_taste_svm_rates

logreg_alltastes = pd.DataFrame()
logreg_alltastes['Neuron'] = neuron_list
logreg_alltastes['Signal Type'] = signal_types_list
logreg_alltastes['LogReg Rate'] = classification_Logreg_rate_array
logreg_alltastes

Unnamed: 0,Neuron,Signal Type,LogReg Rate
0,0,post-taste,0.685185
1,1,post-taste,0.366667
2,2,post-taste,0.351852
3,3,post-taste,0.288889
4,4,post-taste,0.322222
...,...,...,...
524,524,post-taste,0.296296
525,525,post-taste,0.566667
526,526,post-taste,0.255556
527,527,post-taste,0.255556


In [12]:
filename = 'LogReg_fourtastes_no_water.pkl'
with open(filename, 'wb') as file:
    pickle.dump(logreg_alltastes, file)

In [14]:
post_taste_svm_rates = SVM_all_neurons_changed(smoothed_spikes)

In [15]:
signal_types_list = np.array(['post-taste'] * n_neurons)

classification_rate_array = np.zeros(shape=(len(post_taste_svm_rates)))
classification_rate_array[:n_neurons] = post_taste_svm_rates
#classification_rate_array[n_neurons:(n_neurons*2)] = pre_taste_svm_rates
#classification_rate_array[(n_neurons*2):] = post_taste_svm_rates

svm_all_tastes_df = pd.DataFrame()
svm_all_tastes_df['Neuron'] = neuron_list
svm_all_tastes_df['Signal Type'] = signal_types_list
svm_all_tastes_df['SVM Rate'] = classification_rate_array
svm_all_tastes_df

Unnamed: 0,Neuron,Signal Type,SVM Rate
0,0,post-taste,0.674074
1,1,post-taste,0.370370
2,2,post-taste,0.337037
3,3,post-taste,0.292593
4,4,post-taste,0.270370
...,...,...,...
524,524,post-taste,0.425926
525,525,post-taste,0.600000
526,526,post-taste,0.355556
527,527,post-taste,0.322222


In [16]:
filename = 'SVM_4tastes_no_water_15split.pkl'
with open(filename, 'wb') as file:
    pickle.dump(svm_all_tastes_df, file)