In [21]:
import os, warnings

def merge_pythonwarnings(existing, additions):
    """Return a PYTHONWARNINGS value with `additions` appended to `existing`.

    Parameters
    ----------
    existing : str or None
        Current value of the PYTHONWARNINGS environment variable
        (comma-separated filter specs); None or "" means unset.
    additions : iterable of str
        Filter specs (``action:message:category``) to make sure are present.

    Returns
    -------
    str
        Comma-separated specs: the existing ones first (order kept), followed by
        any addition not already present. Calling it again with its own output
        returns the same string, so re-running the cell does not grow the value.
    """
    entries = [spec.strip() for spec in (existing or "").split(",") if spec.strip()]
    for spec in additions:
        if spec not in entries:
            entries.append(spec)
    return ",".join(entries)


# Make worker processes (joblib/loky) ignore both warnings.
# Worker processes inherit this environment variable. Filters already configured
# by the user/CI are preserved; ours are appended, and filters listed later take
# precedence over earlier ones.
WORKER_WARNING_FILTERS = (
    "ignore:Got `batch_size` less than 1 or larger than sample size:UserWarning",
    "ignore:A worker stopped while some jobs were given to the executor:UserWarning",
)
os.environ["PYTHONWARNINGS"] = merge_pythonwarnings(
    os.environ.get("PYTHONWARNINGS"), WORKER_WARNING_FILTERS
)

# Suppress in the current process as well.
# Each pattern is a regex matched against the start of the warning message.
for _message_pattern in (
    # Batch size clipping warnings
    r"Got `batch_size` less than 1 or larger than sample size\.?(?:\s*It is going to be clipped\.)?",
    r".*batch_size.*clipped",
    # Worker stopped warnings (joblib / sklearn)
    r"A worker stopped while some jobs were given to the executor",
):
    warnings.filterwarnings("ignore", message=_message_pattern, category=UserWarning)

In [2]:
import numpy as np
import pandas as pd
import pickle
from sklearn.preprocessing import StandardScaler

from mlcog import evaluation, models as m
from mlcog.utils.io import get_cv_model_path
from mlcog import tuning, bootstrap, evaluation 
from mlcog.utils import io

#### *Lexical and semantic psycholinguistic features*

In [3]:
# Training table for the lexical/semantic feature set (columns: pid, label, data).
# NOTE(review): unpickling can execute arbitrary code - only load trusted files.
ling_train_path = '../data/features/ling_train.pkl'
ling_train = pd.read_pickle(ling_train_path)
ling_train.head()

Unnamed: 0,pid,label,data
0,267,0,"[81, 58.92, 54.78, 83.36, 20.23, 4.26, 8.64, 9..."
1,273,0,"[110, 80.0, 71.57, 20.79, 47.86, 10.0, 13.64, ..."
2,298,0,"[92, 84.34, 93.93, 44.75, 35.66, 7.08, 14.13, ..."
3,307,0,"[83, 56.34, 74.41, 13.52, 76.34, 7.55, 7.23, 7..."
4,312,0,"[128, 64.33, 75.49, 48.9, 30.91, 5.82, 11.72, ..."


In [3]:
# Sanity check: shape of the feature vector stored in the first row's `data` entry.
ling_train.data[0].shape   

(100,)

In [4]:
# Build the 2-D design matrix by stacking the per-row feature vectors held in `data`.
X_train = ling_train['data']
y = ling_train['label']
X_train_2d = np.stack(X_train.values)

# Normalization: fit the scaler on the training split, then apply it to the same data.
# NOTE(review): the scaler is fit on the whole training split before cross-validation,
# so validation folds contribute to the scaling statistics - confirm this is acceptable.
scaler = StandardScaler().fit(X_train_2d)
X_scaled_train = scaler.transform(X_train_2d)

# Candidate estimators and their search spaces; both are looked up by the same model name.
models = m.create_models()
param_grids = m.create_param_grids_()

# Model selection and cross-validation: one result record per candidate model.
results_ling = [
    tuning.crossval_(name, model, param_grids[name], X_scaled_train, y, feature_set='cv_ling')
    for name, model in models.items()
]

df_eval_cv_ling = pd.DataFrame(results_ling)
df_eval_cv_ling

Fitting 10 folds for each of 50 candidates, totalling 500 fits
Fitting 10 folds for each of 50 candidates, totalling 500 fits
Fitting 10 folds for each of 50 candidates, totalling 500 fits
Fitting 10 folds for each of 50 candidates, totalling 500 fits
Fitting 10 folds for each of 50 candidates, totalling 500 fits


Unnamed: 0,Model,Sensitivity,Specificity,Roc_auc,Accuracy
0,lr,70.4 (10.0),77.3 (9.2),78.6 (6.0),73.6 (6.5)
1,svm,75.0 (14.5),69.5 (14.3),78.1 (5.2),72.2 (8.6)
2,rf,78.8 (16.7),72.1 (13.4),83.5 (8.9),75.3 (9.4)
3,nn,69.4 (18.2),70.7 (17.2),73.4 (12.9),69.9 (7.9)
4,xgboost,79.3 (10.9),78.4 (18.7),86.6 (8.9),78.9 (8.7)


In [5]:
# Write the cross-validation summary for the linguistic features to CSV (no index column).
PATH_SAVE_DF = "../data/cv_eval/cv_ling/"
cv_ling_csv = f"{PATH_SAVE_DF}classif_results.csv"
df_eval_cv_ling.to_csv(cv_ling_csv, index=False)

##### Evaluation on unseen test set

In [5]:
# Display name -> short model key (the short keys match the `Model` column of the CV table).
model_map_class = dict([
    ('Logistic Regression', 'lr'),
    ('SVM', 'svm'),
    ('Random Forest', 'rf'),
    ('Neural Network', 'nn'),
    ('XGBoost', 'xgboost'),
])
prefix = "10fcv_"  # not used within this cell
# Load the tuned hyper-parameters saved for the 'cv_ling' feature set, then show one entry.
best_hyperparams = io.load_best_params(model_map_class, feature_set='cv_ling')
best_hyperparams['Random Forest']

In [8]:
# Held-out test table, read the same way as the training table.
ling_test = pd.read_pickle('../data/features/ling_test.pkl')

X_test = ling_test['data']
y_test = ling_test['label']
X_test_2d = np.stack(X_test.values)
# Reuse the scaler object already fit on the training data (no refit on test data).
X_scaled_test = scaler.transform(X_test_2d)

# Fit with the selected hyper-parameters on the training data and evaluate on the test data.
evaluation_bootstrap, probs = bootstrap.fit_and_evaluate_bootstrap_classification(
    best_hyperparams, X_scaled_train, y, X_scaled_test, y_test
)

# Round to 3 decimals, then scale every column after the first by 100.
# NOTE(review): assumes the first column is `Model` and the rest are fractions - confirm.
evaluation_bootstrap_df = pd.DataFrame(evaluation_bootstrap)
df = evaluation_bootstrap_df.round(3)
metric_columns = df.columns[1:]
df[metric_columns] = df[metric_columns] * 100

# One formatted result per model name found in the `Model` column.
results_dict = {
    model_name: evaluation.extract_results_classif_test(df[df.Model == model_name])
    for model_name in df.Model
}
results_dict["Random Forest"]

('69.4 (66.4 - 72.5)',
 '83.3 (78.0 - 88.7)',
 '85.7 (83.8 - 87.6)',
 '76.5 (74.4 - 78.6)')

In [12]:
# Display the first entry stored under the 'Random Forest' key of `probs`.
# NOTE(review): index 0 presumably selects the first bootstrap fit - confirm against
# bootstrap.fit_and_evaluate_bootstrap_classification.
probs['Random Forest'][0]

array([0.24378571, 0.5242619 , 0.60224603, 0.83431746, 0.24952381,
       0.34054762, 0.5516746 , 0.16938095, 0.54938095, 0.71452381,
       0.43909524, 0.44961111, 0.59985714, 0.19633333, 0.65466667,
       0.83136508, 0.55935714, 0.25355556, 0.24871429, 0.40209524,
       0.17602381, 0.84638095, 0.5132381 , 0.85795238, 0.69538095,
       0.58938095, 0.39034921, 0.68904762, 0.38628571, 0.54380952,
       0.77684921, 0.30096825, 0.61783333, 0.42660317, 0.48219048,
       0.6377619 , 0.30633333, 0.30989683, 0.3994127 , 0.35953968,
       0.53088889, 0.5074127 , 0.33202381, 0.89990476, 0.27163492,
       0.12519048, 0.36547619, 0.393     , 0.52159524, 0.35722222,
       0.70088889, 0.553     , 0.186     , 0.36026984, 0.80166667,
       0.27247619, 0.29590476, 0.829     , 0.44177778, 0.70854762,
       0.45446032, 0.32738095, 0.47414286, 0.40652381, 0.38407937,
       0.11785714, 0.30519048, 0.67712698, 0.1588254 , 0.49387302,
       0.85557143])

In [14]:
# Save the probs dictionary as a pickle file
name = 'ling_probs_classif'
probs_path = f'../data/test_eval_probs/{name}.pkl'
with open(probs_path, 'wb') as fh:
    pickle.dump(probs, fh)

**NOTE**: Similarly, independent evaluation of the best-performing RF-NLP model can be applied to the external DementiaBank dataset (```./data/features/ext.pkl```) and the pilot dataset (```./data/features/pilot.pkl```). See the results in the paper.

#### *GPT embeddings*

In [15]:
# Training table for the GPT-embedding feature set (columns: data, label, pid).
# NOTE(review): unpickling can execute arbitrary code - only load trusted files.
gpt_train_path = '../data/features/gpt_train.pkl'
gpt_train = pd.read_pickle(gpt_train_path)
gpt_train.head()

Unnamed: 0,data,label,pid
0,"[0.005286967847496271, 0.024346468970179558, -...",1,24
1,"[0.022915663197636604, 0.011132195591926575, -...",1,25
2,"[-0.0063759456388652325, 0.015673918649554253,...",1,27
3,"[0.018051227554678917, -0.0007191338227130473,...",1,28
4,"[0.023917796090245247, 0.04508865252137184, -0...",1,31


In [16]:
# Sanity check: shape of the embedding vector stored in the first row's `data` entry.
gpt_train.data[0].shape    

(1536,)

In [5]:
# Stack the per-row embedding vectors in `data` into one array (one row per sample).
X_gpt_train = np.stack(gpt_train['data'].values)   
# Labels as a plain array, row-aligned with X_gpt_train.
y_gpt_train = gpt_train['label'].values

In [22]:
# Build the 2-D design matrix by stacking the per-row embedding vectors held in `data`.
# GPT-specific names are used throughout so this cell does not rebind the notebook-wide
# `scaler` / `X_train_2d`; the linguistic test-set cell calls `scaler.transform(...)` and
# would otherwise pick up a scaler fit on the embeddings if cells are re-run out of order.
X_gpt_train_2d = np.stack(gpt_train['data'].values)
y_gpt = gpt_train['label']

# Normalization with a scaler dedicated to the GPT embeddings.
# NOTE(review): the scaler is fit on the whole training split before cross-validation,
# so validation folds contribute to the scaling statistics - confirm this is acceptable.
gpt_scaler = StandardScaler()
X_gpt_scaled_train = gpt_scaler.fit_transform(X_gpt_train_2d)

# Candidate estimators and their search spaces; both are looked up by the same model name.
models = m.create_models()
param_grids = m.create_param_grids_()

# Model selection and cross-validation: one result record per candidate model.
all_results_gpt = []
for name, model in models.items():
    result_gpt = tuning.crossval_(name, model, param_grids[name], X_gpt_scaled_train, y_gpt, feature_set = 'cv_gpt')
    all_results_gpt.append(result_gpt)

df_eval_cv_gpt = pd.DataFrame(all_results_gpt)
df_eval_cv_gpt

Fitting 10 folds for each of 50 candidates, totalling 500 fits
Fitting 10 folds for each of 50 candidates, totalling 500 fits
Fitting 10 folds for each of 50 candidates, totalling 500 fits
Fitting 10 folds for each of 50 candidates, totalling 500 fits
Fitting 10 folds for each of 50 candidates, totalling 500 fits


Unnamed: 0,Model,Sensitivity,Specificity,Roc_auc,Accuracy
0,lr,76.0 (12.0),86.2 (13.1),87.0 (8.2),80.7 (10.7)
1,svm,78.3 (12.7),81.2 (15.1),86.3 (8.6),79.5 (9.1)
2,rf,82.8 (9.4),68.4 (17.8),83.5 (9.7),76.0 (10.7)
3,nn,79.3 (13.5),76.2 (18.1),87.5 (7.7),77.7 (12.3)
4,xgboost,81.8 (9.0),74.6 (17.7),83.8 (10.9),78.3 (11.4)


In [19]:
# Write the cross-validation summary for the GPT embeddings to CSV (no index column).
PATH_SAVE_DF = "../data/cv_eval/cv_gpt/"
cv_gpt_csv = f"{PATH_SAVE_DF}classif_results.csv"
df_eval_cv_gpt.to_csv(cv_gpt_csv, index=False)


**NOTE**:
- The same steps as above can be applied to evaluate the best-performing model trained on GPT embeddings on the unseen test set, using ```./data/features/gpt_test.pkl```.