In [None]:
import utils_ml_analysis as ml # Check utils_ml_analysis.py for the implementation of the methods
import pandas as pd
import pickle
import numpy as np

In [None]:
# Download the spaCy model required for the NLP tasks.
# The download is skipped when the model package is already installed, so
# re-running this cell (or the whole notebook) does not repeat the download.
import spacy
from spacy.cli import download

SPACY_MODEL = "en_core_web_sm"
if not spacy.util.is_package(SPACY_MODEL):
    download(SPACY_MODEL)

## 1. Linguistic feature extraction

In [None]:
# Run this cell to define and apply a function that extracts linguistic features (Brunet, Honore, CTTR, PIDensity, Duplic) from a text file

def text_analysis_features(file_path, lang='en'):
    """Compute linguistic features for a single plain-text file.

    Parameters
    ----------
    file_path : str
        Path to the file. Only paths ending in ".txt" are processed.
    lang : str, optional
        Language code forwarded to ``ml.calculate_ling_nlp`` (default 'en').

    Returns
    -------
    pandas.DataFrame
        A one-row frame with the columns 'Brunet', 'Honore', 'CTTR',
        'PIDensity' and 'Duplic', or an empty frame when ``file_path`` does
        not end in ".txt".
    """
    rows = []

    if file_path.endswith(".txt"):
        # The file is decoded as ISO-8859-1.
        with open(file_path, 'r', encoding='ISO-8859-1') as handle:
            transcript = handle.read()

        # Exactly five values are expected back, in this order.
        brunet, honore, cttr, pid, duplic = ml.calculate_ling_nlp(transcript, lang=lang)

        column_names = ('Brunet', 'Honore', 'CTTR', 'PIDensity', 'Duplic')
        rows.append(dict(zip(column_names, (brunet, honore, cttr, pid, duplic))))

    return pd.DataFrame(rows)

# Example: extract the features from the example text file and display the resulting dataframe
p = './data/ad-example.txt'
df_ad_features = text_analysis_features(p)
df_ad_features

In [None]:
# Run to load the precomputed linguistic features and inspect the dataframe
# NOTE(review): pd.read_pickle unpickles the file, which can execute arbitrary code;
# only load pickle files that come from a trusted source.
liwc_train = pd.read_pickle('./data/train.pkl')
liwc_train.head()

## 2. Cognitive Impairment Detection (binary classification)

#### *10-fold cross-validation*

In [None]:
from sklearn.preprocessing import StandardScaler
# TODO: Load liwc_train features and assign features (X) and labels (y)
# TODO: Convert list of feature vectors into a 2D NumPy array for processing
# TODO: Normalize features for model compatibility and improved performance using StandardScaler()
# TODO: Apply fit_transform to the training data. Set X_scaled_train

In [None]:
# TODO: Check and complete the functions below in utils_ml_analysis.py
models = ml.create_models()
param_grids = ml.create_param_grids()

results = []
# TODO: Check and complete cross-validation and hyperparameter tuning using ml.crossval() under utils_ml_analysis.py
# X_scaled_train and y must already be defined (see the TODOs in the previous cell).
# param_grids is indexed with the same keys as models, so both must share their model names.
for name, model in models.items():
    result = ml.crossval(name, model, param_grids[name], X_scaled_train, y, feature_set = 'cv_hyperparam_liwc')
    results.append(result)

# Aggregate cross-validation results into a DataFrame df_eval_cv
df_eval_cv = pd.DataFrame(results)
df_eval_cv

In [None]:
# TODO: Uncomment and run to save the cross-validation results

# PATH_SAVE_DF = "./data/"
# df_eval_cv.to_csv(PATH_SAVE_DF + "results_cv.csv", index=False)

#### *Evaluation on test set*

In [None]:
# Load the test-set dataframe and preview its first rows
# NOTE(review): pd.read_pickle can execute arbitrary code; only load pickles from a trusted source.
liwc_test = pd.read_pickle('./data/test.pkl')
liwc_test.head()

In [None]:
# TODO: Assign test set features and labels using liwc_test (loaded above from ./data/test.pkl)
# TODO: Convert test features to 2D NumPy array
# TODO: Apply training-set scaler object to normalize test features (transform only; do not refit on the test data)

# Load best hyperparameters from previous cross-validation
# (uses the same feature_set value, 'cv_hyperparam_liwc', that was passed to ml.crossval above)
best_hyperparams = ml.load_best_params(feature_set = 'cv_hyperparam_liwc')
best_hyperparams

Uncomment to evaluate results with bootstrapping (ensure variable names are compatible with your previous code)

In [None]:
# evaluation_bootstrap, probs = ml.fit_and_evaluate_bootstrap_(best_hyperparams, X_scaled_train, y, X_scaled_test, y_test)
# evaluation_bootstrap_df = pd.DataFrame(evaluation_bootstrap)
# df = evaluation_bootstrap_df.round(3)
# df[df.columns[1:]] = df[df.columns[1:]] * 100
# results_dict = {}
# for model_name in df.Model:
#     results = ml.extract_results_classif_test(df[df.Model == model_name])
#     results_dict[model_name] = results
# results_dict

Uncomment to save the probs dictionary as a pickle file for later processing

In [None]:
# name = 'results_eval_probs'
# with open(f'./data/{name}.pkl', 'wb') as f:
#     pickle.dump(probs, f)

#### *Compute performance metrics using GPT embeddings from the transcripts*

In [None]:
# Load the pickled GPT-embedding objects for the train and test splits
# (presumably DataFrames, given the df_ prefix; confirm by inspecting them).
# NOTE(review): pickle.load can execute arbitrary code; only open pickles from a trusted source.
with open("./data/gpt_train.pkl", 'rb') as f:
    df_gpt_train = pickle.load(f)

with open('./data/gpt_test.pkl', 'rb') as f:
    df_gpt_test = pickle.load(f)

In [None]:
# Re-create the model and hyperparameter-grid dictionaries for the GPT-embedding experiment
models = ml.create_models()
param_grids = ml.create_param_grids()

# TODO: Like before, perform 10-fold cross-validation with hyperparameter tuning for each model
# NOTE(review): presumably a feature_set other than 'cv_hyperparam_liwc' should be passed to ml.crossval
# here so the stored LIWC hyperparameters are not overwritten; confirm in utils_ml_analysis.py
# TODO: Convert results to DataFrame 'df_eval_cv' for inspection
# Note: reusing the name 'df_eval_cv' replaces the LIWC results frame created earlier; save it first if needed

In [None]:
# PATH_SAVE_DF = "./data/"
# df_eval_cv.to_csv(PATH_SAVE_DF + "results_cv_gpt.csv", index=False)

#### *Compare and discuss results (NLP vs. GPT)*