In [2]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split

%matplotlib inline

# Load dataset

In [1]:
def load_data(filename):
    '''
    Read a CSV file and hand back its contents as a bare array

    param:   filename (str) - filename/path of the CSV file to read
    return:  values - the parsed table's values, without column names or index
    '''
    # NOTE: downstream code expects samples x features - double check this for new files
    table = pd.read_csv(filename)
    values = table.values
    return values

In [2]:
def norm_data(data):
    '''
    Normalize data to z-values (0 mean and 1 std dev), feature by feature

    Features with zero spread (constant columns) are only centered: dividing
    by a std dev of 0 would fill the column with NaN/inf, which the later
    PCA / LDA steps cannot handle. Their scale is treated as 1 instead, so
    they come out as all zeros.

    param:   data - data values (n_samples, n_features)
    return:  data - normalized data, same shape as the input
    '''
    means = data.mean(axis=0)    # mean for each feature
    stdevs = data.std(axis=0)    # std dev for each feature

    # swap zero std devs for 1 so constant features do not divide by zero
    scales = np.where(stdevs == 0, 1.0, stdevs)

    return (data - means) / scales    # normalized data (Z-score)

# Covariance matrix?

# PCA 

In [6]:
from sklearn.decomposition import PCA
def pca(data, n_components=None):
    '''
    Does PCA on normalized data set. Can optionally set fewer components

    params:
        data - normalized data (n_samples, n_features)
        n_components - number of components to keep, handed straight to sklearn's PCA;
            the default None leaves the choice to sklearn (keep all components)
    return:
        data_pc: data transformed onto principal components, array, shape (n_samples, n_components)
        components:  principal axes in feature space, array, shape (n_components, n_features)
        weights: fraction of variance explained by each of the selected components. array, shape (n_components,)
        model: the fitted PCA model (e.g. to transform further data or to pass to biplot)
    '''
    # create PCA model (called `model` so it does not shadow this function's own name)
    model = PCA(n_components=n_components)

    # fit model to data AND project the data onto the components;
    # plain fit() only returns the estimator itself, not the transformed data
    data_pc = model.fit_transform(data)

    # obtain components and components' weights
    components = model.components_
    weights = model.explained_variance_ratio_

    return data_pc, components, weights, model

In [5]:
def cum_var_plot(weights, desired_var):
    '''
    Cumulative variance plot (number of components vs cumulative variance captured) with calculated number
    of PCs required to get to a certain desired variance explained

    params:
        weights: fraction of variance explained by each of the selected components. array, shape (n_components,)
        desired_var:  fraction of variance (same scale as weights) to find number of PCs for
    return
        pcs_req:   number of pcs required to capture at least desired variance; if even all
                   pcs together fall short of it, the total number of pcs
        captured_var:   exact variance captured by pcs_req
    raises
        ValueError: if weights is empty
    '''
    INDEX_SHIFT = 1    # component counts are 1-based, array indices are 0-based

    # cumulative variance captured
    cum_var = np.cumsum(weights)
    if cum_var.size == 0:
        raise ValueError('weights must contain at least one component')

    # cum_var never decreases, so searchsorted yields the 0-based index of the first
    # entry >= desired_var; clamp it because rounding (or a truncated PCA) can leave
    # the final total just below the requested variance
    first_idx = min(int(np.searchsorted(cum_var, desired_var)), cum_var.size - INDEX_SHIFT)
    pcs_req = first_idx + INDEX_SHIFT

    # actual variance captured
    captured_var = cum_var[pcs_req - INDEX_SHIFT]

    # plotting
    plt.plot(range(INDEX_SHIFT, len(cum_var) + INDEX_SHIFT), cum_var)
    plt.axvline(x=pcs_req, ymin=0, ymax=1, color='k', linestyle='--')
    plt.xlabel('Number of components')
    plt.ylabel('Cumulative variance captured')
    plt.title('Cumulative Variance Captured by Principal Components')

    return pcs_req, captured_var

In [None]:
def biplot(data, dim=2, pca=None):
    '''
    Creates biplot for data: scatter of the samples mapped onto the top principal components

    params
        data - normalized data, array shape (n_samples, n_features)
        dim - number of leading pcs to keep, 2 or 3; only PC1 vs PC2 is drawn for now
        pca - fitted pca model (needs transform() and explained_variance_ratio_).
              Required: it only carries a default because it follows the optional dim
    raises
        ValueError - if no pca model is given or dim is not 2 or 3
    '''
    PC1_IDX = 0
    PC2_IDX = 1

    if pca is None:
        raise ValueError('biplot needs a fitted pca model (pass pca=...)')
    if dim not in (2, 3):
        raise ValueError('dim must be 2 or 3')

    # top 2/3 pc's and their variance explained
    top_pcs = pca.transform(data)[:, :dim]
    top_var = pca.explained_variance_ratio_[:dim]

    # plotting
    plt.scatter(top_pcs[:, PC1_IDX], top_pcs[:, PC2_IDX])
    # axis labels report the variance explained by each pc, not the projected data
    plt.xlabel('PC1 ({:.1%})'.format(top_var[PC1_IDX]))
    plt.ylabel('PC2 ({:.1%})'.format(top_var[PC2_IDX]))
    # zero lines span the whole axes by default; ymin/ymax and xmin/xmax take
    # axes fractions (0-1), not data coordinates, so they are left alone
    plt.axvline(x=0, color='k', linestyle='--')
    plt.axhline(y=0, color='k', linestyle='--')

# LDA Classification

In [None]:
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.metrics import precision_recall_curve
def lda(data, n_trials=20):
    '''
    Linear discriminant analysis (LDA) classifier for input data into 3 stimulus conditions (left, right, no stimulus)

    Labels are generated by position, so the rows of data must be ordered by condition:
    n_trials 'left' trials, then n_trials 'right' trials, then n_trials 'none' trials.

    params:
        data - normalized and potentially transformed to pcs, 3 * n_trials rows
        n_trials - number of trials per stimulus condition
    return:
        lda_model - the fitted LinearDiscriminantAnalysis classifier
        lda_acc - accuracy on the held-out test split, in percent
        cm - confusion matrix for the test split (rows = true condition,
             columns = predicted condition, both ordered left, right, none)
    raises:
        ValueError - if data does not have 3 * n_trials rows
    '''
    CLASSES = ['left', 'right', 'none']        # label values, in row-block order
    CLASS_NAMES = ['Left', 'Right', 'None']    # display names, same order as CLASSES

    if len(data) != len(CLASSES) * n_trials:
        raise ValueError('data must have {} rows ({} conditions x {} trials), got {}'.format(
            len(CLASSES) * n_trials, len(CLASSES), n_trials, len(data)))

    labels = np.repeat(np.array(CLASSES), n_trials)

    # TRANSFORM DATA TO LOWER DIM
    # Automatically sets test as 0.25
    # Random state so that it's the same every run
    X_train, X_test, y_train, y_test = train_test_split(data, labels, random_state=0)

    # train LDA classifier (fit needs an instance, not the class itself)
    # look into using LDA for dimensionality reduction??
    lda_model = LinearDiscriminantAnalysis().fit(X_train, y_train)
    y_predict = lda_model.predict(X_test)

    # model accuracy for X_test
    lda_acc = 100 * accuracy_score(y_test, y_predict)
    print('Accuracy:', round(lda_acc, 2), '%')
    # labels= pins the class order; without it sklearn sorts the labels
    # (left, none, right) and the display names would land on the wrong rows
    print(classification_report(y_test, y_predict, labels=CLASSES, target_names=CLASS_NAMES))

    # creating a confusion matrix
    cm = confusion_matrix(y_test, y_predict, labels=CLASSES)

    # show the confusion matrix as an annotated heatmap
    fig, ax = plt.subplots()
    ax.imshow(cm, cmap='Blues')
    ax.set_xticks(range(len(CLASSES)))
    ax.set_xticklabels(CLASS_NAMES)
    ax.set_yticks(range(len(CLASSES)))
    ax.set_yticklabels(CLASS_NAMES)
    ax.set_xlabel('Predicted condition')
    ax.set_ylabel('True condition')
    ax.set_title('LDA confusion matrix')
    for (row, col), count in np.ndenumerate(cm):
        ax.text(col, row, count, ha='center', va='center')

    plt.show()

    return lda_model, lda_acc, cm

### Receiver Operating Characteristic (ROC)

ROC curves plot the true positive rate (the fraction of actual positives that are predicted positive) on the Y axis against the false positive rate (the fraction of actual negatives that are predicted positive) on the X axis. The top-left corner of the plot is the "ideal" point, with a false positive rate of zero and a true positive rate of one.  
  
In order to extend ROC curve and ROC area to multi-label classification, it is necessary to binarize the output (one vs all). One ROC curve can be drawn per label and/or one can draw a ROC curve by considering each element of the label indicator matrix as a binary prediction (micro-averaging).  
  
Another evaluation for multi-label classification is macro-averaging, which gives equal weight to the classification of each label.  
https://scikit-learn.org/stable/auto_examples/model_selection/plot_roc.html#plot-roc-curves-for-the-multilabel-problem

In [None]:
def roc_curve():
    '''
    Placeholder for the ROC / precision-recall evaluation of the classifier (not written yet)

    NOTE: sklearn.metrics also has a function called roc_curve; importing it by its
    bare name would clash with this one.

    raises:
        NotImplementedError - always, until the curves are implemented
    '''
    # references to work from:
    #https://towardsdatascience.com/the-5-classification-evaluation-metrics-you-must-know-aa97784ff226
    #https://stackoverflow.com/questions/56090541/how-to-plot-precision-and-recall-of-multiclass-classifier/56092736
    #https://scikit-learn.org/stable/auto_examples/model_selection/plot_precision_recall.html#in-multi-label-settings

    # Precision-Recall curve
    raise NotImplementedError('roc_curve is not implemented yet')

### TODO: Add regularization step

### TODO: Add bootstrapping?