In [12]:
import pandas as pd
import numpy as np
import os
import joblib
import pickle
from sklearn.preprocessing import StandardScaler
from mlcog import bootstrap, tuning, models as m
from mlcog.utils import io

In [2]:
# Load the three pre-computed feature tables (training, test and pilot sets).
# NOTE: unpickling can execute arbitrary code; only load files you generated yourself.
ling_train, ling_test, pilot = (
    pd.read_pickle(f'../data/features/{stem}.pkl')
    for stem in ('ling_train', 'ling_test', 'pilot')
)

#### Test set (from 70:30 train-test ratio)

In [7]:
# Display the test-set feature table read from ling_test.pkl.
ling_test

Unnamed: 0,pid,label,data
0,58,0,"[57, 47.02, 94.48, 1.0, 5.65, 8.14, 14.04, 89...."
1,64,1,"[57, 62.72, 70.59, 14.46, 74.93, 5.7, 15.79, 9..."
2,70,1,"[62, 95.94, 97.31, 25.64, 6.37, 4.43, 6.45, 95..."
3,71,1,"[156, 1.0, 99.0, 1.31, 49.66, 19.5, 13.46, 97...."
4,65,0,"[107, 88.19, 94.26, 10.52, 48.73, 15.29, 12.15..."
...,...,...,...
66,68,0,"[113, 86.9, 79.24, 53.37, 20.23, 12.56, 15.93,..."
67,69,0,"[111, 99.0, 93.46, 98.8, 32.75, 6.53, 15.32, 8..."
68,55,0,"[248, 74.04, 86.1, 6.29, 25.43, 9.92, 15.32, 8..."
69,41,1,"[49, 97.86, 74.89, 66.12, 82.06, 12.25, 14.29,..."


In [3]:
# Columns retained from the participant metadata file (plus the derived `pid`).
selected_columns = ['age', 'gender', 'mmse', 'dx', 'adressfname', 'test', 'pid']

# Read the metadata, derive `pid` from the three digits that end `adressfname`,
# then narrow the frame to the selected columns.
df_all = (
    pd.read_csv('../data/dx-mmse.csv')
      .assign(pid=lambda meta: meta['adressfname'].str.extract(r'(\d{3})$'))
      .loc[:, selected_columns]
)

# Partition the rows on the `test` flag.
df_train = df_all.loc[df_all['test'].eq(False)]
df_test = df_all.loc[df_all['test'].eq(True)]
df_test.head()

Unnamed: 0,age,gender,mmse,dx,adressfname,test,pid
2,74,female,30.0,Control,adrso004,True,4
4,65,female,28.0,Control,adrso006,True,6
7,68,female,29.0,Control,adrso009,True,9
9,71,female,30.0,Control,adrso011,True,11
11,70,female,29.0,Control,adrso013,True,13


In [4]:
# Attach participant metadata to each training feature row; the left join keeps
# every row of ling_train.
df_reg = pd.merge(ling_train, df_train, how='left', on='pid')
# Round MMSE and cast it to int (it is used as the regression target below).
df_reg = df_reg.assign(mmse=df_reg['mmse'].round().astype(int))
df_reg.head()

Unnamed: 0,pid,label,data,age,gender,mmse,dx,adressfname,test
0,267,0,"[81, 58.92, 54.78, 83.36, 20.23, 4.26, 8.64, 9...",64,female,30,Control,adrso267,False
1,273,0,"[110, 80.0, 71.57, 20.79, 47.86, 10.0, 13.64, ...",67,female,30,Control,adrso273,False
2,298,0,"[92, 84.34, 93.93, 44.75, 35.66, 7.08, 14.13, ...",68,female,30,Control,adrso298,False
3,307,0,"[83, 56.34, 74.41, 13.52, 76.34, 7.55, 7.23, 7...",70,male,29,Control,adrso307,False
4,312,0,"[128, 64.33, 75.49, 48.9, 30.91, 5.82, 11.72, ...",67,female,29,Control,adrso312,False


In [8]:
# Training inputs (one feature vector per row of `data`) and MMSE target.
X = df_reg['data']
y = df_reg['mmse']
# Stack the per-row feature vectors into a single 2-D array.
X_train_2d = np.stack(X.to_numpy())

# Standardise the features; this fitted scaler is reused for the evaluation sets.
# NOTE(review): the scaler is fit on the full training matrix before the
# cross-validation call below, so validation folds contribute to the scaling
# statistics - confirm this is acceptable for the reported CV scores.
scaler = StandardScaler()
X_scaled_train = scaler.fit_transform(X_train_2d)

# Candidate regressors and their hyper-parameter search spaces, keyed by model name.
models_regression = m.create_regression_models()
param_grids_regression = m.create_param_grids_regression()

# Model selection and cross-validation: one result record per candidate model.
results = [
    tuning.crossvalidation_regression(
        model_name,
        estimator,
        param_grids_regression[model_name],
        X_scaled_train,
        y,
        feature_set='cv_ling',
    )
    for model_name, estimator in models_regression.items()
]

df_eval_cv = pd.DataFrame(results)
df_eval_cv

Fitting 10 folds for each of 50 candidates, totalling 500 fits
Fitting 10 folds for each of 50 candidates, totalling 500 fits




Fitting 10 folds for each of 48 candidates, totalling 480 fits
Fitting 10 folds for each of 50 candidates, totalling 500 fits
Fitting 10 folds for each of 50 candidates, totalling 500 fits


Unnamed: 0,Model,RMSE,MAE
0,Ridge,7.5 (1.3),6.0 (1.2)
1,SVR,6.4 (1.2),5.3 (0.9)
2,RFR,5.9 (0.7),4.8 (0.5)
3,MLP,8.3 (1.8),6.6 (1.7)
4,XGBoost,5.9 (0.8),4.8 (0.5)


In [9]:
# Reload a persisted regressor for inspection. The file name pattern
# ("10fcv_<model>.pkl" under ../data/cv_eval/<feature set>/) suggests it was
# written by the cross-validation step - TODO confirm.
feature_abbr = 'cv_ling'
model_abbr = 'reg_rfr'
filename = f"10fcv_{model_abbr}.pkl"
file_path = os.path.join("../data/cv_eval/", feature_abbr, filename)

# Fail loudly when the artifact is missing. Silently skipping the load would
# leave `best_model_reg` undefined on a fresh kernel (NameError on the next
# line) or, worse, bound to a stale object from an earlier run.
if not os.path.exists(file_path):
    raise FileNotFoundError(
        f"Saved model not found at {file_path!r}; run the cross-validation cell first."
    )

# NOTE: joblib.load unpickles the file, which can execute arbitrary code;
# only load artifacts produced by this project.
best_model_reg = joblib.load(file_path)
best_model_reg

0,1,2
,n_estimators,100
,criterion,'squared_error'
,max_depth,
,min_samples_split,2
,min_samples_leaf,2
,min_weight_fraction_leaf,0.0
,max_features,1.0
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [5]:
# Per-participant grouping table for the test set; tally rows per `mmse_split` value.
test_groups = pd.read_csv('../data/test_groups.csv')
test_groups['mmse_split'].value_counts()

mmse_split
cn        36
ad-mod    21
ad-mil    11
ad-sev     2
Name: count, dtype: int64

In [12]:
# Preview the first rows of the test-set grouping table read from test_groups.csv.
test_groups.head()

Unnamed: 0,adressfname,gender,dx,age,mmse,pid,mmse_split,age_split
0,adrso108,male,ProbableAD,65,19.0,1,ad-mod,60-69
1,adrso171,female,Control,57,27.0,10,cn,50-59
2,adrso184,female,Control,78,30.0,11,cn,70-80
3,adrso293,female,Control,57,30.0,12,cn,50-59
4,adrso113,female,ProbableAD,69,20.0,13,ad-mod,60-69


In [6]:
# Harmonize the join key first: cast `pid` to str and strip whitespace on both
# sides so the index join below matches on identical values.
ling = ling_test.copy()
groups = test_groups.copy()
ling['pid']   = ling['pid'].astype(str).str.strip()
groups['pid'] = groups['pid'].astype(str).str.strip()

# Left-join on the `pid` index (keeps every feature row), then filter and cast
# in the same chain. Building the result in one chain avoids assigning a column
# on a boolean-filtered slice, which triggers SettingWithCopyWarning and may
# write to a temporary copy.
df_reg_test = (
    ling.set_index('pid')
        .join(groups.set_index('pid'), how='left', rsuffix='_grp')
        .reset_index()
        # NOTE: remove outlier without MMSE
        .loc[lambda joined: joined['pid'] != '54']
        # Round MMSE and cast to int, matching the training target.
        .assign(mmse=lambda kept: kept['mmse'].round().astype(int))
)
df_reg_test.head()

Unnamed: 0,pid,label,data,adressfname,gender,dx,age,mmse,mmse_split,age_split
0,58,0,"[57, 47.02, 94.48, 1.0, 5.65, 8.14, 14.04, 89....",adrso013,female,Control,70,29,cn,70-80
1,64,1,"[57, 62.72, 70.59, 14.46, 74.93, 5.7, 15.79, 9...",adrso038,female,ProbableAD,65,24,ad-mil,60-69
2,70,1,"[62, 95.94, 97.31, 25.64, 6.37, 4.43, 6.45, 95...",adrso214,male,ProbableAD,56,18,ad-mod,50-59
3,71,1,"[156, 1.0, 99.0, 1.31, 49.66, 19.5, 13.46, 97....",adrso083,male,ProbableAD,78,24,ad-mil,70-80
4,65,0,"[107, 88.19, 94.26, 10.52, 48.73, 15.29, 12.15...",adrso166,female,Control,58,30,cn,50-59


In [9]:
# Test-set target and inputs.
y_test = df_reg_test['mmse']
X_test = df_reg_test['data']
# Stack the per-row feature vectors into a single 2-D array.
X_test_2d = np.stack(X_test.to_numpy())
# Apply the scaler already fit on the training data (no refitting on test data).
X_scaled_test = scaler.transform(X_test_2d)

# Model display name -> abbreviation handed to the parameter loader.
model_map_reg = dict(
    Ridge='rr',
    SVR='svr',
    RFR='rfr',
    MLP='mlp',
    XGBoost='xgb',
)
best_hyperparams = io.load_best_params(model_map_reg, feature_set='cv_ling', reg=True)
best_hyperparams

{'Ridge': Ridge(alpha=np.float64(7.579479953348009)),
 'SVR': SVR(C=np.float64(8.28752236376816), gamma='auto'),
 'RFR': RandomForestRegressor(min_samples_leaf=2, random_state=42),
 'MLP': MLPRegressor(alpha=np.float64(0.0021143813626634373), batch_size=16,
              hidden_layer_sizes=(400,), learning_rate='adaptive',
              learning_rate_init=np.float64(0.015226341829186323),
              max_iter=10000, random_state=42),
 'XGBoost': XGBRegressor(base_score=None, booster=None, callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=np.float64(0.8114452379095001), device=None,
              early_stopping_rounds=None, enable_categorical=False,
              eval_metric=None, feature_types=None, feature_weights=None,
              gamma=np.float64(0.0341389859975072), grow_policy=None,
              importance_type=None, interaction_constraints=None,
              learning_rate=np.float64(0.02550451635058231), max_bin=None

In [11]:
# Evaluate the tuned models on the test set using the modified bootstrap function.
# The helper receives the tuned estimators plus the scaled train/test data and
# returns (per-model metric records, predictions); its internals are not shown here.
# The result is bound to `bootstrap_metrics` rather than `eval` so the Python
# builtin `eval` is not shadowed for the rest of the kernel session.
bootstrap_metrics, preds = bootstrap.fit_and_evaluate_bootstrap_regression(
    best_hyperparams, X_scaled_train, y, X_scaled_test, y_test
)
evaluation_bootstrap_df = pd.DataFrame(bootstrap_metrics)
# Round to one decimal for display.
df = evaluation_bootstrap_df.round(1)
df

Unnamed: 0,Model,MAE Mean,MAE Lower CI,MAE Upper CI,RMSE Mean,RMSE Lower CI,RMSE Upper CI
0,Ridge,5.1,4.8,5.4,7.0,6.5,7.6
1,SVR,4.1,4.0,4.2,5.0,4.9,5.1
2,RFR,3.7,3.7,3.8,4.7,4.6,4.8
3,MLP,5.8,5.5,6.1,7.3,7.0,7.7
4,XGBoost,3.7,3.6,3.9,4.7,4.6,4.9


In [13]:
# Save the `preds` object returned by the bootstrap evaluation as a pickle file.
name = 'ling_preds_reg'
out_dir = '../data/test_eval_probs'
# Create the output directory if it does not exist yet, so a fresh checkout
# does not fail with FileNotFoundError on open().
os.makedirs(out_dir, exist_ok=True)
with open(f'{out_dir}/{name}.pkl', 'wb') as f:
    pickle.dump(preds, f)

#### Pilot dataset

In [14]:
# Evaluate the tuned models on the pilot dataset.
# TODO(review): the original note here read "CHANGE feature_set =" - confirm
# that 'cv_ling' is the intended feature set for the pilot evaluation.
#
# CAUTION: this cell rebinds X_test, y_test, X_scaled_test, preds and df, which
# the test-set evaluation also uses. Re-run the test-set cells before saving
# or inspecting test-set results after executing this one.

X_test = pilot['data']
# NOTE(review): pilot MMSE is used as stored (the train/test targets are
# rounded and cast to int) - confirm that is intended.
y_test = pilot['mmse']
X_test_2d = np.stack(X_test.values)
# Transform the pilot set with the scaler object already fit on training data.
X_scaled_test = scaler.transform(X_test_2d)

# Model display name -> abbreviation handed to the parameter loader.
model_map_reg = {
        'Ridge': 'rr',
        'SVR': 'svr',
        'RFR': 'rfr',
        'MLP': 'mlp',
        'XGBoost': 'xgb',
    }
best_hyperparams = io.load_best_params(model_map_reg, feature_set = 'cv_ling', reg=True)

# Bound to `bootstrap_metrics` rather than `eval` so the Python builtin `eval`
# is not shadowed for the rest of the kernel session.
bootstrap_metrics, preds = bootstrap.fit_and_evaluate_bootstrap_regression(
    best_hyperparams, X_scaled_train, y, X_scaled_test, y_test
)
evaluation_bootstrap_df = pd.DataFrame(bootstrap_metrics)
# Round to one decimal for display.
df = evaluation_bootstrap_df.round(1)
df

Unnamed: 0,Model,MAE Mean,MAE Lower CI,MAE Upper CI,RMSE Mean,RMSE Lower CI,RMSE Upper CI
0,Ridge,7.9,6.6,9.1,9.7,8.6,10.9
1,SVR,3.7,3.4,3.9,4.5,4.2,4.7
2,RFR,3.3,3.1,3.5,4.2,3.9,4.4
3,MLP,7.0,6.4,7.7,8.8,8.2,9.3
4,XGBoost,3.5,3.2,3.7,4.2,3.9,4.5
