## Final Project 
## Brainster DS x Parkinson's Disease Specifications

### Import libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
from functools import partial
import re
import pickle
import joblib
from scipy.stats import skew, kurtosis
from scipy import stats
from sklearn.model_selection import cross_validate, cross_val_score, train_test_split, KFold, StratifiedKFold
from sklearn import metrics
from sklearn.metrics import accuracy_score, f1_score, classification_report, confusion_matrix,recall_score,precision_score,roc_curve,auc,roc_auc_score
from sklearn.metrics import ConfusionMatrixDisplay
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from catboost import CatBoostClassifier


In [2]:
df_users = pd.read_csv("df_user.csv")
df_keys = pd.read_csv("df_keys.csv")

In [3]:
# duplicate rows
print(f'Number of duplicated row: {df_keys.duplicated().sum()}')

# drop duplicates
df_keys.drop_duplicates()

# check df shape
print(f'Records dropped.The new shape of the DataFrame is: {df_keys.shape}')

Number of duplicated row: 26835
Records dropped.The new shape of the DataFrame is: (3662927, 8)


In [4]:
hold_by_user =  df_keys[df_keys['Hand'] != 'S'].groupby(['ID', 'Hand'])['HoldTime'].agg(['mean','std', skew, kurtosis])
latency_by_user = df_keys[np.in1d(df_keys['Direction'], ['LL', 'LR', 'RL', 'RR'])].groupby(['ID', 'Direction'])['LatencyTime'].agg(['mean','std',skew, kurtosis])

hold_by_user_flat = hold_by_user.unstack()
hold_by_user_flat.columns = ['_'.join(col).strip() for col in hold_by_user_flat.columns.values]
hold_by_user_flat['mean_hold_diff'] = hold_by_user_flat['mean_L'] - hold_by_user_flat['mean_R']

latency_by_user_flat = latency_by_user.unstack()
latency_by_user_flat.columns = ['_'.join(col).strip() for col in latency_by_user_flat.columns.values]
latency_by_user_flat['mean_LR_RL_diff'] = latency_by_user_flat['mean_LR'] - latency_by_user_flat['mean_RL']
latency_by_user_flat['mean_LL_RR_diff'] = latency_by_user_flat['mean_LL'] - latency_by_user_flat['mean_RR']



In [5]:
combined = pd.concat([hold_by_user_flat, latency_by_user_flat], axis=1)
full_set = pd.merge(combined.reset_index(), df_users[['ID', 'Parkinsons']], on='ID')
full_set.set_index('ID', inplace=True)
full_set.dropna(inplace=True)

In [6]:
full_set['Parkinsons'] = full_set['Parkinsons'].astype(int)


In [7]:
X = full_set.drop(columns=['Parkinsons'])
y = full_set['Parkinsons']

In [8]:
X_train, X_test, y_train, y_test = train_test_split(full_set.drop(columns=['Parkinsons']), full_set['Parkinsons'], test_size=0.2, random_state=42)

In [9]:
lda_model = LDA()
lda_model.fit(X_train, y_train)

In [10]:
model_cat = CatBoostClassifier(learning_rate=0.03,iterations=1000,depth=8,loss_function='Logloss',min_data_in_leaf=5,random_seed=42)
model_cat.fit(lda_model.transform(X_train), y_train,verbose=False)  

<catboost.core.CatBoostClassifier at 0x1ffc32d0950>

In [11]:
pred = model_cat.predict_proba(lda_model.transform(X_test))
auc = roc_auc_score(y_test, pred[:, 1])
auc

0.9097222222222222