In [1]:
import pickle
from pathlib import Path

import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler

from mlcog import evaluation, models as m
from mlcog import tuning, bootstrap, evaluation 
from mlcog.utils import io

#### *Lexical and semantic psycholinguistic features*

In [2]:
# Load the pickled training table of linguistic features and preview the first
# rows (the recorded output shows the columns pid, label and data).
ling_train = pd.read_pickle('../data/features/ling_train.pkl')
ling_train.head()

Unnamed: 0,pid,label,data
0,267,0,"[81, 58.92, 54.78, 83.36, 20.23, 4.26, 8.64, 9..."
1,273,0,"[110, 80.0, 71.57, 20.79, 47.86, 10.0, 13.64, ..."
2,298,0,"[92, 84.34, 93.93, 44.75, 35.66, 7.08, 14.13, ..."
3,307,0,"[83, 56.34, 74.41, 13.52, 76.34, 7.55, 7.23, 7..."
4,312,0,"[128, 64.33, 75.49, 48.9, 30.91, 5.82, 11.72, ..."


In [3]:
# Shape of the feature vector stored in the `data` column for index label 0.
ling_train.data[0].shape   

(100,)

In [4]:
# Each row of `data` holds one feature vector; stack them into a single array
# with one row per sample.
X = ling_train['data']
X_train_2d = np.stack(X.to_numpy())
y = ling_train['label']
# Standardise the features. The fitted `scaler` is kept because the test-set
# cell transforms the held-out data with these training statistics.
scaler = StandardScaler().fit(X_train_2d)
X_scaled_train = scaler.transform(X_train_2d)

In [16]:
# Sanity check: print the absolute path returned by `get_cv_model_path` for the
# 'cv_ling' feature set and the 'lr' model. The helper is reached through the
# `io` module imported in the first cell, so no extra import is needed here.
print(io.get_cv_model_path("cv_ling", "lr").resolve())

/Users/marialima/Desktop/GitHub-ML-cog-code/data/cv_eval/cv_ling/10fcv_lr.pkl


In [17]:
# Candidate estimators and their search spaces; both are looked up by model name.
models = m.create_models()
param_grids = m.create_param_grids()

# Model selection and cross-validation: collect one result per model.
results_ling = [
    tuning.crossval(name, model, param_grids[name], X_scaled_train, y, feature_set='cv_ling')
    for name, model in models.items()
]

# Tabulate the per-model results for display.
df_eval_cv_ling = pd.DataFrame(results_ling)
df_eval_cv_ling

Fitting 10 folds for each of 50 candidates, totalling 500 fits
Fitting 10 folds for each of 50 candidates, totalling 500 fits
Fitting 10 folds for each of 50 candidates, totalling 500 fits
Fitting 10 folds for each of 50 candidates, totalling 500 fits




Fitting 10 folds for each of 50 candidates, totalling 500 fits


Unnamed: 0,Model,Sensitivity,Specificity,Roc_auc,Accuracy
0,lr,70.4 (10.0),77.3 (9.2),78.6 (6.0),73.6 (6.5)
1,svm,75.0 (14.5),69.5 (14.3),78.1 (5.2),72.2 (8.6)
2,rf,78.6 (14.3),73.4 (15.3),83.3 (7.7),76.0 (9.0)
3,nn,69.4 (18.2),70.7 (17.2),73.4 (12.9),69.9 (7.9)
4,xgboost,79.3 (10.9),78.4 (18.7),86.6 (8.9),78.9 (8.7)


In [None]:
# Disabled on purpose: uncomment the two lines below to write the
# cross-validation summary table (`df_eval_cv_ling`) to CSV.
# PATH_SAVE_DF = "../data/cv_eval/cv_ling/"
# df_eval_cv_ling.to_csv(PATH_SAVE_DF + "classif_results.csv", index=False)

#### Evaluation on unseen test set

In [7]:
# Load the pickled held-out test table of linguistic features and preview the
# first rows (the recorded output shows the columns pid, label and data).
ling_test = pd.read_pickle('../data/features/ling_test.pkl')
ling_test.head()

Unnamed: 0,pid,label,data
0,58,0,"[57, 47.02, 94.48, 1.0, 5.65, 8.14, 14.04, 89...."
1,64,1,"[57, 62.72, 70.59, 14.46, 74.93, 5.7, 15.79, 9..."
2,70,1,"[62, 95.94, 97.31, 25.64, 6.37, 4.43, 6.45, 95..."
3,71,1,"[156, 1.0, 99.0, 1.31, 49.66, 19.5, 13.46, 97...."
4,65,0,"[107, 88.19, 94.26, 10.52, 48.73, 15.29, 12.15..."


In [None]:
# Display name -> short model code passed to `io.load_best_params`.
# Presumably the code is combined with `prefix` to locate the saved CV
# artefacts (e.g. 10fcv_lr.pkl) - confirm against mlcog.utils.io.
model_map_class = {
    'Logistic Regression': 'lr',
    'SVM': 'svm',
    'Random Forest': 'rf',
    'Neural Network': 'nn',
    'XGBoost': 'xgboost',
}
prefix = "10fcv_"

# Stack the held-out feature vectors into one array, one row per sample.
X_test = ling_test['data']
y_test = ling_test['label']
X_test_2d = np.stack(X_test.to_numpy())
# Transform the test set with the scaler already fit on the training data.
# NOTE: `scaler` is rebound by the GPT section further down, so the
# linguistic-feature scaling cell must be the one executed most recently.
X_scaled_test = scaler.transform(X_test_2d)

best_hyperparams = io.load_best_params(model_map_class, prefix, feature_set='cv_ling')
best_hyperparams

{'Logistic Regression': LogisticRegression(C=np.float64(0.14024971326600363), max_iter=10000,
                    penalty='l1', random_state=42, solver='liblinear'),
 'SVM': SVC(C=np.float64(0.002460422958018418), gamma=np.float64(0.002154434690031882),
     kernel='linear', probability=True, random_state=42),
 'Random Forest': RandomForestClassifier(max_depth=12, min_samples_leaf=3, random_state=42),
 'Neural Network': MLPClassifier(activation='logistic', alpha=np.float64(0.00016676611460145484),
               batch_size=64, hidden_layer_sizes=(400,),
               learning_rate='adaptive',
               learning_rate_init=np.float64(0.0018595691547710724),
               max_iter=10000, random_state=42, solver='sgd'),
 'XGBoost': XGBClassifier(base_score=None, booster=None, callbacks=None,
               colsample_bylevel=None, colsample_bynode=None,
               colsample_bytree=np.float64(0.7835939392709834), device=None,
               early_stopping_rounds=None, enable_categ

In [9]:
# Fit each tuned model on the training data and evaluate it on the held-out
# test set. `probs` is saved and inspected in later cells.
evaluation_bootstrap, probs = bootstrap.fit_and_evaluate_bootstrap_classification(
    best_hyperparams,
    X_scaled_train,
    y,
    X_scaled_test,
    y_test,
)
evaluation_bootstrap_df = pd.DataFrame(evaluation_bootstrap)

# Round, then express every column after the first (`Model`) on a 0-100 scale.
df = evaluation_bootstrap_df.round(3)
df[df.columns[1:]] = df[df.columns[1:]] * 100

# Summarise each model exactly once. Iterating over `df.Model` directly would
# visit every row, so a model name that appears on several rows would be
# summarised repeatedly with an identical result. `unique()` keeps the order of
# first appearance, so the dictionary comes out the same.
results_dict = {
    model_name: evaluation.extract_results_classif_test(df[df.Model == model_name])
    for model_name in df.Model.unique()
}
results_dict

{'Logistic Regression': ('74.9 (72.0 - 77.7)',
  '85.0 (80.7 - 89.3)',
  '87.0 (84.7 - 89.2)',
  '80.0 (77.5 - 82.5)'),
 'SVM': ('70.3 (63.8 - 76.8)',
  '88.9 (86.2 - 91.5)',
  '88.3 (86.5 - 90.1)',
  '79.7 (77.1 - 82.3)'),
 'Random Forest': ('70.0 (67.1 - 72.9)',
  '85.8 (80.9 - 90.7)',
  '86.0 (83.7 - 88.3)',
  '78.0 (75.7 - 80.3)'),
 'Neural Network': ('69.4 (64.0 - 74.9)',
  '70.8 (65.6 - 76.1)',
  '76.3 (73.5 - 79.0)',
  '70.1 (67.0 - 73.3)'),
 'XGBoost': ('70.0 (64.0 - 76.0)',
  '83.6 (80.3 - 86.9)',
  '84.1 (81.7 - 86.4)',
  '76.9 (73.6 - 80.2)')}

In [12]:
# Inspect the 'Random Forest' entry of `probs`; the recorded output below shows
# a list of arrays.
probs['Random Forest']

[array([0.43254762, 0.69669048, 0.5745    , 0.7807381 , 0.32828571,
        0.44903571, 0.47847222, 0.33959921, 0.65063492, 0.68153535,
        0.32308333, 0.655     , 0.70813492, 0.18923016, 0.78346429,
        0.73216667, 0.70180952, 0.15771429, 0.41917063, 0.35080952,
        0.15977778, 0.84396825, 0.52545238, 0.81939286, 0.76988095,
        0.53707143, 0.52425397, 0.54383333, 0.3485873 , 0.59759524,
        0.54421429, 0.2506746 , 0.6698171 , 0.43969048, 0.46483333,
        0.58957143, 0.40088492, 0.40395238, 0.37697619, 0.31428175,
        0.58967424, 0.4630873 , 0.40144841, 0.90678571, 0.39050397,
        0.25007143, 0.4150754 , 0.40128571, 0.33480952, 0.37769048,
        0.64884921, 0.47088492, 0.28621032, 0.42871429, 0.74166667,
        0.25721429, 0.26695238, 0.78818254, 0.44175   , 0.77915476,
        0.52497619, 0.54171429, 0.4644127 , 0.46788095, 0.22758333,
        0.17925   , 0.25966667, 0.53940476, 0.10683333, 0.51561905,
        0.69497619]),
 array([0.40583333, 0.7390

In [None]:
# Save the probs dictionary as a pickle file.
name = 'ling_probs_classif'
probs_path = Path(f'../data/test_eval_probs/{name}.pkl')
# Create the output folder if it is missing, so opening the file for writing
# does not raise FileNotFoundError on a fresh checkout.
probs_path.parent.mkdir(parents=True, exist_ok=True)
with open(probs_path, 'wb') as f:
    pickle.dump(probs, f)

___

#### *GPT embeddings*

In [4]:
# Load the GPT-embedding training table with the same loader used for the
# linguistic feature tables, then preview the first rows.
# NOTE: unpickling can execute arbitrary code - only load files you trust.
gpt_train = pd.read_pickle('../data/features/gpt_train.pkl')
gpt_train.head()

  gpt_train = pickle.load(f)


Unnamed: 0,data,label,pid
0,"[0.005286967847496271, 0.024346468970179558, -...",1,24
1,"[0.022915663197636604, 0.011132195591926575, -...",1,25
2,"[-0.0063759456388652325, 0.015673918649554253,...",1,27
3,"[0.018051227554678917, -0.0007191338227130473,...",1,28
4,"[0.023917796090245247, 0.04508865252137184, -0...",1,31


In [5]:
# Shape of the embedding stored in the `data` column for index label 0.
gpt_train.data[0].shape    

(1536,)

In [None]:
# NOTE: this cell rebinds X, y, scaler and X_scaled_train, replacing the
# linguistic-feature versions created earlier in the notebook.
# Each row of `data` holds one embedding; stack them into a single array with
# one row per sample.
X = gpt_train['data']
X_train_2d = np.stack(X.to_numpy())
y = gpt_train['label']
# Standardise the embedding dimensions using statistics from the training data.
scaler = StandardScaler().fit(X_train_2d)
X_scaled_train = scaler.transform(X_train_2d)

In [15]:
# Sanity check: print the absolute path returned by `get_cv_model_path` for the
# 'cv_gpt' feature set and the 'lr' model. The helper is reached through the
# `io` module imported in the first cell, so no extra import is needed here.
print(io.get_cv_model_path("cv_gpt", "lr").resolve())

/Users/marialima/Desktop/GitHub-ML-cog-code/data/cv_eval/cv_gpt/10fcv_lr.pkl


In [8]:
# Candidate estimators and their search spaces; both are looked up by model name.
models = m.create_models()
param_grids = m.create_param_grids()

# Model selection and cross-validation on the GPT embeddings: one result per model.
all_results_gpt = [
    tuning.crossval(name, model, param_grids[name], X_scaled_train, y, feature_set='cv_gpt')
    for name, model in models.items()
]

# Tabulate the per-model results for display.
df_eval_cv_gpt = pd.DataFrame(all_results_gpt)
df_eval_cv_gpt

Fitting 10 folds for each of 50 candidates, totalling 500 fits
Fitting 10 folds for each of 50 candidates, totalling 500 fits
Fitting 10 folds for each of 50 candidates, totalling 500 fits
Fitting 10 folds for each of 50 candidates, totalling 500 fits




Fitting 10 folds for each of 50 candidates, totalling 500 fits


Unnamed: 0,Model,Sensitivity,Specificity,Roc_auc,Accuracy
0,lr,76.0 (12.0),86.2 (13.1),87.0 (8.2),80.7 (10.7)
1,svm,78.3 (12.7),81.2 (15.1),86.3 (8.6),79.5 (9.1)
2,rf,82.8 (9.4),68.4 (17.8),83.2 (9.7),76.0 (10.7)
3,nn,79.3 (13.5),76.2 (18.1),87.5 (7.7),77.7 (12.3)
4,xgboost,81.8 (9.0),74.6 (17.7),83.8 (10.9),78.3 (11.4)


In [None]:
# Write the GPT cross-validation summary table to CSV, without the index column.
PATH_SAVE_DF = "../data/cv_eval/cv_gpt/"
df_eval_cv_gpt.to_csv(f"{PATH_SAVE_DF}classif_results.csv", index=False)


NOTE:
- The same steps as above can be applied to evaluate the GPT-embedding models on the unseen test set (`../data/features/gpt_test.pkl`).
- Similarly, these methods are applied to the external DementiaBank dataset and the pilot dataset.