In [54]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns; sns.set_style('white')  # plot formatting
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import StratifiedKFold


# Algorithms
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.svm import SVC, LinearSVC
from sklearn.metrics import accuracy_score
from sklearn.neighbors import KNeighborsClassifier

# Import data
data = pd.read_csv('Data/Filtered_Data.csv')

In [48]:
data.head()

Unnamed: 0,participant,open,closed,"label (0 for not depressed, 1 for depressed)",BDI
0,509,491.109212,408.343868,0,7
1,519,324.802986,963.380921,0,6
2,542,450.667603,451.559918,0,0
3,545,356.680814,790.636981,0,0
4,546,700.472825,270.410014,0,5


In [49]:
X = data[['open', 'closed']]
Y = data['label (0 for not depressed, 1 for depressed)']

In [95]:
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.40, random_state=42) # 50/50 vs 80/20 vs 60/40

## I think we should show how the results change between all the splits

In [96]:
X_train.shape

(48, 2)

In [97]:
y_train.shape

(48,)

In [7]:
# pipe = Pipeline([('std', StandardScaler()),
#                  ('classifier1', LinearDiscriminantAnalysis()),
#                  ('classifier2', SVC())])
#                  #('classifier3', KNeighborsClassifier())])

# # Parameters to search
# search_space = [{'classifier1': [LinearDiscriminantAnalysis()],
#                  'classifier1__solver': ['svd', 'lsqr', 'eigen']},
#                 {'classifier2': [SVC()],
#                  'classifier2__kernel': ['linear'],
#                  'classifier2__C': np.logspace(-4, 4, 9)
#                 #{'classifier3': [KNeighborsClassifier()],
#                  #'classifier3__weights': ['uniform', 'distance']
#                  #'classifier3__n_neighbors': [i for i in range(1,11)]
#                 }]

# # Create grid search 
# clf1 = GridSearchCV(pipe, search_space, cv=StratifiedKFold(n_splits=5), 
#                    scoring='accuracy', refit=True,
#                    verbose=0)

# # Fit grid search
# best_model = clf1.fit(X_train, y_train)

In [98]:
pipe_LDA = Pipeline([('classifier', LinearDiscriminantAnalysis())])
search_space1 = [{'classifier': [LinearDiscriminantAnalysis()],
                 'classifier__solver': ['svd', 'lsqr', 'eigen']}]
# Create grid search 
clf = GridSearchCV(pipe_LDA, search_space1, cv=StratifiedKFold(n_splits=10), 
                   scoring='accuracy', refit=True,
                   verbose=0)

# Fit grid search
best_model_LDA = clf.fit(X_train, y_train)



In [102]:
#best_model_LDA.cv_results_

In [100]:
best_model_LDA.cv_results_['params'][ np.argmin(best_model_LDA.cv_results_['rank_test_score']) ]

{'classifier': LinearDiscriminantAnalysis(n_components=None, priors=None, shrinkage=None,
                            solver='svd', store_covariance=False, tol=0.0001),
 'classifier__solver': 'svd'}

In [103]:
pipe_SVC = Pipeline([('std', StandardScaler()),
                    ('classifier', SVC())])
search_space2 = [{'classifier': [SVC()],
                 'classifier__kernel': ['linear'],
                 'classifier__C': np.logspace(-4, 4, 9)}]
# Create grid search 
clf2 = GridSearchCV(pipe_SVC, search_space2, cv=StratifiedKFold(n_splits=10), 
                   scoring='accuracy', refit=True,
                   verbose=0)

# Fit grid search
best_model_SVC = clf2.fit(X_train, y_train)



In [104]:
best_model_SVC.cv_results_

{'mean_fit_time': array([0.00318358, 0.00357521, 0.0033808 , 0.00286665, 0.00334518,
        0.0032371 , 0.00441017, 0.0124315 , 0.11741135]),
 'std_fit_time': array([0.00038027, 0.00121996, 0.0005203 , 0.00030364, 0.00054248,
        0.00073181, 0.00103055, 0.00563522, 0.07283075]),
 'mean_score_time': array([0.00114021, 0.00130341, 0.0011523 , 0.0010581 , 0.00118663,
        0.00105841, 0.00112052, 0.00114248, 0.00173681]),
 'std_score_time': array([4.58136298e-05, 2.85871382e-04, 1.02883540e-04, 8.35737212e-05,
        2.02124051e-04, 9.07161890e-05, 1.44018589e-04, 1.32417149e-04,
        7.36791875e-04]),
 'param_classifier': masked_array(data=[SVC(C=0.1, cache_size=200, class_weight=None, coef0=0.0,
     decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
     kernel='linear', max_iter=-1, probability=False, random_state=None,
     shrinking=True, tol=0.001, verbose=False),
                    SVC(C=0.1, cache_size=200, class_weight=None, coef0=0.0,
     decision_fu

In [105]:
best_model_SVC.cv_results_['params'][ np.argmin(best_model_SVC.cv_results_['rank_test_score']) ]

{'classifier': SVC(C=0.1, cache_size=200, class_weight=None, coef0=0.0,
     decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
     kernel='linear', max_iter=-1, probability=False, random_state=None,
     shrinking=True, tol=0.001, verbose=False),
 'classifier__C': 0.1,
 'classifier__kernel': 'linear'}

In [106]:
pipe_KNN = Pipeline([('classifier', KNeighborsClassifier())])
search_space3 = [{'classifier': [KNeighborsClassifier()],
                 'classifier__weights': ['uniform', 'distance'],
                 'classifier__n_neighbors': [i for i in range(1,11)],
                 'classifier__p': [1, 2]
                }]
# Create grid search 
clf3 = GridSearchCV(pipe_KNN, search_space3, cv=StratifiedKFold(n_splits=10), 
                   scoring='accuracy', refit=True,
                   verbose=0)

# Fit grid search
best_model_KNN = clf3.fit(X_train, y_train)



In [107]:
best_model_KNN.cv_results_

{'mean_fit_time': array([0.00296512, 0.00205672, 0.00165081, 0.00170298, 0.00161142,
        0.00171916, 0.0016814 , 0.0015981 , 0.00173035, 0.00164161,
        0.00157478, 0.00164106, 0.00158093, 0.00164323, 0.00174952,
        0.00183029, 0.00174389, 0.00188568, 0.00168126, 0.00171022,
        0.00188472, 0.00166135, 0.0016777 , 0.00204525, 0.00186439,
        0.00223441, 0.00226955, 0.00237579, 0.00229225, 0.00233028,
        0.00230794, 0.00215409, 0.00211444, 0.00221739, 0.00220706,
        0.00217772, 0.00172358, 0.00165153, 0.00156231, 0.00181451]),
 'std_fit_time': array([5.87850819e-04, 2.87028151e-04, 1.03303499e-04, 1.59733075e-04,
        5.61616003e-05, 1.20811569e-04, 1.44374215e-04, 6.54188997e-05,
        1.05961339e-04, 8.06720680e-05, 3.52328213e-05, 1.43423092e-04,
        3.25008251e-05, 1.03090925e-04, 2.62385122e-04, 1.41763567e-04,
        7.74980829e-05, 9.73531960e-05, 4.59991006e-05, 4.57286337e-05,
        1.49555262e-04, 8.21769599e-05, 1.35668376e-04, 6.805

In [108]:
best_model_KNN.cv_results_['params'][ np.argmin(best_model_KNN.cv_results_['rank_test_score']) ]

{'classifier': KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                      metric_params=None, n_jobs=None, n_neighbors=6, p=1,
                      weights='distance'),
 'classifier__n_neighbors': 6,
 'classifier__p': 1,
 'classifier__weights': 'distance'}

## Train Best Models

### LDA 

In [109]:
best_LDA_model = LinearDiscriminantAnalysis(solver='svd')
best_LDA_model.fit(X_train, y_train)

LinearDiscriminantAnalysis(n_components=None, priors=None, shrinkage=None,
                           solver='svd', store_covariance=False, tol=0.0001)

In [110]:
y_pred_LDA = best_LDA_model.predict(X_test)

In [111]:
accuracy_LDA = accuracy_score(y_pred_LDA, y_test)

In [112]:
accuracy_LDA

0.65625

### SVM

In [113]:
best_SVM_model = SVC(C=0.1, kernel='linear', max_iter=-1, random_state=None,verbose=False)
best_SVM_model.fit(X_train, y_train)

y_pred_SVM = best_SVM_model.predict(X_test)

accuracy_SVM = accuracy_score(y_pred_SVM, y_test)

In [114]:
accuracy_SVM

0.59375

### KNN

The first KNN  was a random test, as it had the best results with the 80/20 split

In [115]:
best_KNN_model1 = KNeighborsClassifier(n_neighbors=4, p=2,weights='uniform') 
best_KNN_model1.fit(X_train, y_train)

y_pred_KNN1 = best_KNN_model1.predict(X_test)

accuracy_KNN1 = accuracy_score(y_pred_KNN1, y_test)

In [116]:
accuracy_KNN1

0.53125

In [117]:
best_KNN_model2 = KNeighborsClassifier(n_neighbors=6, p=1,weights='distance')
best_KNN_model2.fit(X_train, y_train)

y_pred_KNN2 = best_KNN_model2.predict(X_test)

accuracy_KNN2 = accuracy_score(y_pred_KNN2, y_test)

accuracy_KNN2

0.625