In [1]:
import sys
import os

sys.path.append(os.path.abspath("..")) 

In [2]:
import re
import random
import numpy as np

from collections import Counter
from textstat import flesch_reading_ease
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.manifold import TSNE
import nltk
nltk.download('punkt_tab')
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
nltk.download("punkt")
nltk.download("stopwords")
nltk.download('wordnet')
from nltk import pos_tag
nltk.download('averaged_perceptron_tagger_eng')
nltk.download("averaged_perceptron_tagger")
from nltk.util import ngrams
import spacy
from spacy.cli import download
download("en_core_web_sm")
nlp = spacy.load("en_core_web_sm")

[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\mohse\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\mohse\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\mohse\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\mohse\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     C:\Users\mohse\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\mohse\AppData\Roaming\nltk_data...
[nltk_data]   Package average

[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


### Import utils functions

In [3]:
from utils import *
from utils_fe_models import grid_search_logistic, grid_search_svc, genetic_algorithm_xgb, genetic_algorithm_randomforest

[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\mohse\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\mohse\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\mohse\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\mohse\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     C:\Users\mohse\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\mohse\AppData\Roaming\nltk_data...
[nltk_data]   Package average

### Extract features: 

- Structural Features

- Disfluency Features

- Functional Word Usage

- Lexical Features

- Complexity & Readability

- Repetition Features

In [4]:
def extract_features(sentences):
    """Compute transcript-level structural, disfluency, lexical and readability features.

    Args:
        sentences: Sized sequence of raw sentence strings for one transcript.
            Disfluency markers are matched on the raw (non-lowercased) text.

    Returns:
        A ``(features, filtered_data)`` tuple:

        * ``features`` -- the full dictionary, including the ``unique_words``
          set and the ``disfluency_counts`` Counter (consumed by
          ``extract_word_features`` / ``extract_pos_features``).
        * ``filtered_data`` -- a flat, scalar-only copy without
          ``unique_words`` and with each disfluency counter expanded to a
          ``disfluency_<name>`` key.

        For an empty ``sentences`` every numeric feature is 0 and all
        ``disfluency_*`` keys are still present, so the schema does not depend
        on the input.
    """
    # Word groups are matched against lower-cased tokens, so every entry must
    # itself be lower case (the former "I" entry could never match).
    function_words = {"the", "and", "but", "or", "if", "then", "that", "which",
                      "who", "in", "on", "at", "to", "of", "for"}
    pronouns = {"he", "she", "it", "they", "we", "i", "you", "his", "her",
                "their", "mine", "yours"}

    # (feature name, regex) pairs; order defines the output column order.
    # NOTE(review): the self-correction pattern matches only the literal
    # "[/] [/]", and each such match is also counted twice as a false start.
    # The labels may not match CHAT conventions ([/] repetition, [//]
    # retracing, [+ exc] exclusion post-code) -- TODO confirm before relying
    # on these names.
    disfluency_patterns = (
        ('hesitations', r'&uh|&eh|&um'),
        ('false_starts', r'\[\/\]'),
        ('self_corrections', r'\[\/\] \[\/\]'),
        ('exclamations', r'\[\+ exc\]'),
    )

    features = {
        # Structural features
        'total_sentences': len(sentences),   # Number of input sentences
        'total_words': 0,                     # Token count over all sentences
        'unique_words': set(),                # Distinct lower-cased tokens (not in flat output)
        'avg_sentence_length': 0,             # Tokens per sentence
        # Disfluency features (pre-seeded so the keys exist even without sentences)
        'disfluency_counts': Counter({name: 0 for name, _ in disfluency_patterns}),
        # Functional word usage
        'function_word_count': 0,
        'pronoun_count': 0,
        # Lexical feature
        'lexical_diversity': 0,               # Unique tokens / total tokens
        # Complexity & readability (means over sentences)
        'sentence_complexity': 0,
        'flesch_reading_ease': 0,
        # Repetition feature: placeholder, never updated by this function
        'repetitions': 0,
    }

    sentence_lengths = []
    readability_scores = []

    for sentence in sentences:
        words = word_tokenize(sentence.lower())
        features['total_words'] += len(words)
        features['unique_words'].update(words)
        features['function_word_count'] += sum(1 for word in words if word in function_words)
        features['pronoun_count'] += sum(1 for word in words if word in pronouns)

        for name, pattern in disfluency_patterns:
            features['disfluency_counts'][name] += len(re.findall(pattern, sentence))

        sentence_lengths.append(len(words))
        readability_scores.append(flesch_reading_ease(sentence))

    # Averages are only defined when there is at least one sentence; otherwise
    # the scalar zero defaults above are kept.
    if features['total_sentences'] > 0:
        features['avg_sentence_length'] = features['total_words'] / features['total_sentences']
        features['lexical_diversity'] = (
            len(features['unique_words']) / features['total_words']
            if features['total_words'] > 0 else 0
        )
        features['sentence_complexity'] = sum(sentence_lengths) / len(sentence_lengths)
        features['flesch_reading_ease'] = sum(readability_scores) / len(readability_scores)

    # Flat copy for tabular use: drop the word set, expand the Counter.
    filtered_data = {k: v for k, v in features.items() if k != 'unique_words'}
    disfluency_data = dict(filtered_data.pop('disfluency_counts'))
    for key, value in disfluency_data.items():
        filtered_data[f'disfluency_{key}'] = value

    return features, filtered_data


### Extract features:

- Lexical Diversity Features

- Stopword Features

- Rare Word Features

In [5]:
def extract_word_features(features):
    """Derive vocabulary-level features from the output of ``extract_features``.

    Args:
        features: Dictionary holding ``unique_words`` (collection of distinct
            tokens) and ``total_words`` (token count).

    Returns:
        Dictionary with ``vocab_size``, ``type_token_ratio``,
        ``stopword_count`` and ``rare_word_count``. Both counts are taken over
        the distinct words (types), not over every token occurrence.
    """
    unique_words = features['unique_words']
    total_words = features['total_words']

    # Lexical diversity
    vocab_size = len(unique_words)
    ttr = vocab_size / total_words if total_words > 0 else 0

    # The English stopword list is used both as the stopword set and as the
    # approximation of "common vocabulary", so read it once.
    stop_words = set(stopwords.words('english'))
    stopword_count = sum(1 for word in unique_words if word.lower() in stop_words)

    # "Rare" means "not in the stopword list", so it is the exact complement
    # of stopword_count within the vocabulary.
    rare_word_count = vocab_size - stopword_count

    return {
        'vocab_size': vocab_size,              # Number of distinct words
        'type_token_ratio': ttr,               # Distinct words / total words
        'stopword_count': stopword_count,      # Distinct words that are stopwords
        'rare_word_count': rare_word_count     # Distinct words that are not stopwords
    }


### Extract features:

- POS Tagging Features


In [6]:
def extract_pos_features(features):
    """Count coarse part-of-speech classes over a transcript's vocabulary.

    Args:
        features: Dictionary holding ``unique_words`` (the distinct tokens).

    Returns:
        Dictionary with ``noun_count``, ``verb_count``, ``adj_count``,
        ``adv_count`` and ``pronoun_count``.

    NOTE(review): tagging runs on the alphabetically sorted distinct words,
    so the tagger sees no sentence context and each word counts once however
    often it was spoken -- confirm this is the intended definition.
    """
    vocabulary = sorted(features['unique_words'])
    tag_totals = Counter(tag for _, tag in pos_tag(vocabulary))

    # Output key -> Penn Treebank tags that contribute to it.
    tag_groups = {
        'noun_count': ('NN', 'NNS', 'NNP', 'NNPS'),
        'verb_count': ('VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ'),
        'adj_count': ('JJ', 'JJR', 'JJS'),
        'adv_count': ('RB', 'RBR', 'RBS'),
        'pronoun_count': ('PRP', 'PRP$'),
    }

    return {
        name: sum(tag_totals[tag] for tag in tags)
        for name, tags in tag_groups.items()
    }


### Extract features:

- N-gram Frequency Features

- N-gram Diversity Features

In [7]:
def extract_ngram_features(sentences, n=2):
    """Summarise word n-gram usage across a transcript.

    N-grams are built per sentence from lower-cased tokens, so they never
    span a sentence boundary.

    Args:
        sentences: Iterable of sentence strings.
        n: N-gram order (2 = bigrams, 3 = trigrams, ...).

    Returns:
        Dictionary with ``most_common_{n}grams`` (up to five
        ``(ngram, count)`` pairs) and ``unique_{n}gram_ratio`` (distinct
        n-grams divided by total n-grams, 0 when there are none).
    """
    ngram_counts = Counter()
    for sentence in sentences:
        tokens = word_tokenize(sentence.lower())
        ngram_counts.update(ngrams(tokens, n))

    total_ngrams = sum(ngram_counts.values())
    if total_ngrams > 0:
        ngram_diversity = len(ngram_counts) / total_ngrams
    else:
        ngram_diversity = 0

    return {
        f'most_common_{n}grams': ngram_counts.most_common(5),
        f'unique_{n}gram_ratio': ngram_diversity,
    }


### Extract features:

- Sentence Structure Features

- Parse Tree and Syntax Analysis

- Grammar Error Analysis

- Ratio Calculations

In [8]:
def extract_syntactic_features(sentences):
    """Compute dependency-parse based features for one transcript.

    The sentences are joined with single spaces and parsed as one document by
    the module-level spaCy pipeline ``nlp``; sentence boundaries are then the
    ones spaCy detects, which is why the sentence total is reported separately
    as ``total_sentences_prime``.

    Args:
        sentences: Iterable of sentence strings.

    Returns:
        Dictionary with ``parse_tree_depth`` (mean over sentences of the
        largest ancestor count of any token), the raw counts
        ``grammar_error_count``, ``passive_voice_count``,
        ``subordination_count``, ``fragment_count``,
        ``total_sentences_prime``, ``total_clauses`` (never updated here,
        always 0) and the per-sentence ratios ``passive_voice_ratio``,
        ``subordination_ratio``, ``fragment_ratio``. The ratio keys are always
        present (0.0 when no sentence was detected) so the schema is stable.
    """
    doc = nlp(" ".join(sentences))
    # Materialise once: used for both the total and the per-sentence loop.
    sentence_spans = list(doc.sents)

    features = {
        "parse_tree_depth": 0,            # Summed below, averaged at the end
        "grammar_error_count": 0,         # Subject/verb number mismatches (heuristic)
        "passive_voice_count": 0,         # Tokens with dependency label "auxpass"
        "subordination_count": 0,         # "mark" tokens attached to an "advcl" head
        "fragment_count": 0,              # Sentences lacking a subject or a VERB token
        "total_sentences_prime": len(sentence_spans),
        "total_clauses": 0                # Placeholder: not computed by this function
    }

    for sent in sentence_spans:
        # Depth of a token = number of its ancestors in the dependency tree.
        depths = [len(list(token.ancestors)) for token in sent]
        if depths:
            features["parse_tree_depth"] += max(depths)

        has_subject = False
        has_verb = False
        for token in sent:
            if token.dep_ == "auxpass":
                features["passive_voice_count"] += 1
            if token.dep_ == "mark" and token.head.dep_ == "advcl":
                features["subordination_count"] += 1
            if token.dep_ in ("nsubj", "nsubjpass"):
                has_subject = True
            # NOTE(review): only pos_ == "VERB" counts, so a sentence whose
            # sole verb is tagged AUX would be flagged as a fragment --
            # confirm this is intended.
            if token.pos_ == "VERB":
                has_verb = True

        if not (has_subject and has_verb):
            features["fragment_count"] += 1

    # Grammar heuristic: a subject and its VERB head should agree on plurality.
    # NOTE(review): a token without a Number feature is treated as non-plural,
    # so a plural subject whose verb carries no Number feature is counted as
    # an error -- verify against the tagger's morphology output.
    for token in doc:
        if token.dep_ in ("nsubj", "nsubjpass"):
            head = token.head
            if head.pos_ == "VERB":
                subj_plural = token.morph.get("Number") == ["Plur"]
                verb_plural = head.morph.get("Number") == ["Plur"]
                if subj_plural != verb_plural:
                    features["grammar_error_count"] += 1

    sentence_total = features["total_sentences_prime"]
    if sentence_total > 0:
        features["parse_tree_depth"] /= sentence_total  # Mean depth per sentence

    # Always emit the ratio keys, in a fixed order, even for an empty document.
    for name in ("passive_voice", "subordination", "fragment"):
        if sentence_total > 0:
            features[f"{name}_ratio"] = features[f"{name}_count"] / sentence_total
        else:
            features[f"{name}_ratio"] = 0.0

    return features


### Extract all features

In [9]:
def extract_all_features(patients_data):
    """Build one feature row per patient and return them as a DataFrame.

    Args:
        patients_data: Iterable in which each element is the list of sentence
            strings of one patient.

    Returns:
        ``pandas.DataFrame`` with one row per patient, in input order, and a
        default integer index.

    NOTE(review): both ``extract_features`` and ``extract_pos_features`` emit
    a ``pronoun_count`` key. Because the POS features are merged later, the
    POS-based value silently replaces the word-list based one.
    """
    rows = []

    for sentences in patients_data:
        base_features, flat_features = extract_features(sentences)

        # Later updates win on key collisions; an overwritten key keeps the
        # position of its first insertion.
        row = dict(flat_features)
        row.update(extract_word_features(base_features))
        row.update(extract_pos_features(base_features))
        row.update(extract_syntactic_features(sentences))

        # Only the diversity ratios of the n-gram features are tabular.
        row["unique_2gram_ratio"] = extract_ngram_features(sentences, n=2)["unique_2gram_ratio"]
        row["unique_3gram_ratio"] = extract_ngram_features(sentences, n=3)["unique_3gram_ratio"]

        rows.append(row)

    return pd.DataFrame(rows)

### Extract all sentences for each patient into a list. The output, `all_sentences`, is a 2D list (one list of sentences per patient).

In [10]:
# Transcript directories, relative to this notebook's working directory:
# cc / cd are the two training classes (labelled 0 / 1 further down in this
# notebook), and `test` is the held-out set.
train_cc = "../ADReSS-IS2020-data/train/transcription/cc"
train_cd = "../ADReSS-IS2020-data/train/transcription/cd"
test = "../ADReSS-IS2020-data-test/test/transcription"
# extract_all_sentences comes from utils; each result is passed to
# extract_all_features below, which iterates it as one sentence list per patient.
all_sentences_cc = extract_all_sentences(train_cc)

# In the analysis conducted in analysis.ipynb for the CD dataset,
# we identified an outlier. Upon further review,
# we concluded that removing this outlier would not lead to improved results.
# Therefore, we have decided to retain all sentences, including the outlier,
# to maintain the integrity of the analysis.
all_sentences_cd = extract_all_sentences(train_cd)
all_sentences_test = extract_all_sentences(test)

### Create dataframe for each class

In [11]:
# Seed Python's and NumPy's global RNGs before feature extraction.
# NOTE(review): none of the extract_* functions defined in this notebook draw
# random numbers themselves; the seeds only matter if a library they call does.
random.seed(42)
np.random.seed(42)
# One feature row per patient for each split (default integer index for now).
df_cc = extract_all_features(all_sentences_cc)
df_cd = extract_all_features(all_sentences_cd)
df_test = extract_all_features(all_sentences_test)

### Extract meta features

In [12]:
# Semicolon-separated metadata files for the two training classes and the test set.
meta_data_cc = pd.read_csv("../ADReSS-IS2020-data/train/cc_meta_data.txt", delimiter=";")
meta_data_cd = pd.read_csv("../ADReSS-IS2020-data/train/cd_meta_data.txt", delimiter=";")
test_data = pd.read_csv("../ADReSS-IS2020-data-test/test/test_labels.txt", delimiter=";")

# Split the test labels off the metadata in one step: pop returns the column
# and removes it from test_data. The column name carries a trailing space.
labels = test_data.pop("Label ")

In [13]:
# clean_meta_data comes from utils; the cleaned frames are used below for
# their index and their "age" / "gender" columns.
meta_data_cd_cleaned = clean_meta_data(meta_data_cd)
meta_data_cc_cleaned = clean_meta_data(meta_data_cc)
# NOTE(review): the held-out frame is passed with test_data=False -- confirm in
# utils.clean_meta_data that this flag value is intended for the test split.
test_data_cleaned = clean_meta_data(test_data, test_data=False)

In [14]:
# Re-index each feature frame with the metadata index, then copy age/gender
# across (after the re-index these assignments align on that shared index).
# NOTE(review): the index is attached purely by position, so this assumes
# extract_all_sentences returns patients in the same order as the rows of the
# metadata file -- verify, otherwise features and metadata are mismatched.
df_cd.index = meta_data_cd_cleaned.index
df_cd["age"] = meta_data_cd_cleaned["age"]
df_cd["gender"] = meta_data_cd_cleaned["gender"]

df_cc.index = meta_data_cc_cleaned.index
df_cc["age"] = meta_data_cc_cleaned["age"]
df_cc["gender"] = meta_data_cc_cleaned["gender"]

df_test.index = test_data_cleaned.index
df_test["age"] = test_data_cleaned["age"]
df_test["gender"] = test_data_cleaned["gender"]

# Class labels: 1 for the cd frame, 0 for the cc frame.
df_cd["label"] = 1
df_cc["label"] = 0

### Combine CC and CD to create training dataset

In [15]:
df_train = pd.concat([df_cd, df_cc])
df_train.head()

Unnamed: 0_level_0,total_sentences,total_words,avg_sentence_length,function_word_count,pronoun_count,lexical_diversity,sentence_complexity,flesch_reading_ease,repetitions,disfluency_hesitations,...,total_sentences_prime,total_clauses,passive_voice_ratio,subordination_ratio,fragment_ratio,unique_2gram_ratio,unique_3gram_ratio,age,gender,label
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
S079,13,229,17.615385,29,4,0.353712,17.615385,93.709231,0,12,...,15,0,0.0,0.0,0.666667,0.694444,0.852217,59,1,1
S080,7,76,10.857143,3,2,0.5,10.857143,94.635714,0,1,...,9,0,0.0,0.0,0.888889,0.73913,0.822581,55,1,1
S081,17,229,13.470588,35,5,0.375546,13.470588,87.482941,0,0,...,20,0,0.0,0.0,0.65,0.683962,0.810256,69,1,1
S082,22,396,18.0,38,5,0.295455,18.0,103.934091,0,20,...,22,0,0.0,0.045455,0.363636,0.644385,0.803977,66,1,1
S083,12,153,12.75,18,6,0.477124,12.75,91.811667,0,7,...,13,0,0.0,0.0,0.538462,0.758865,0.844961,52,1,1


### Define X_train, y_train, X_test, y_test

In [16]:
# Training matrix: every column of df_train except the target.
X_train = df_train.drop(columns=["label"])
y_train = df_train['label']  
# df_test never received a "label" column, so it is used as-is (same object,
# not a copy); the test targets come from the labels file loaded earlier.
X_test = df_test
y_test = labels

### Visualization using TSNE

In [17]:
# 2-D t-SNE embedding of the training features with a fixed random_state.
# NOTE(review): X_train is passed as-is, without any scaling in this notebook.
tsne = TSNE(n_components=2, perplexity=30, random_state=42)
X_tsne = tsne.fit_transform(X_train)
# plot_tsne comes from utils; it receives the embedding and the class labels.
plot_tsne(X_tsne, y_train)

**The visualization above does not show a clear class separation based on handcrafted features. We expect a challenging classification problem for all models.** 

### Initialize models with default parameters

In [18]:
classifiers = all_models()

### Cross validation

In [19]:
metrics_cross = get_crossvalidation_metrics(classifiers, X_train, y_train)
plot_metrics_table(metrics_cross, title="Evaluation Metrics on cross validation")


Precision is ill-defined and being set to 0.0 due to no predicted samples. Use `zero_division` parameter to control this behavior.



### Test data

In [20]:
metrics_test = get_model_metrics(classifiers, X_train, y_train, X_test, y_test)
plot_metrics_table(metrics_test, title="Evaluation Metrics on test data")

In [21]:
plot_confusion_matrices_with_roc(classifiers, X_train, y_train, X_test, y_test, title="Confusion Matrices on test (1: dementia, 0: non-dementia)")

**So far, we have observed that all models—except for the SVM—show improved performance compared to those trained with meta-data features. However, these results can likely be improved further through fine-tuning. Among the models, Random Forest and the Voting Classifier appear to be more robust than the others. The XGB classifier performs poorly during cross-validation but delivers good results on the test data, which may suggest that it is less reliable than the other models. Meanwhile, the SVM performs poorly both in cross-validation and on the test data.**

# Fine tuning

**Fine-tuning is performed using Grid Search for the Logistic Regression and SVM models, while a Genetic Algorithm is used for XGBoost and Random Forest, as it is generally faster and more efficient than Grid Search for these models. Since the genetic-algorithm search is time-consuming, those calls are commented out and their saved best parameters are hard-coded in the following cells; the grid searches are still executed.**

### XGBClassifier

In [22]:
random.seed(42)
np.random.seed(42)
# best_params_xgb = genetic_algorithm_xgb(X_train, y_train)
# print("Best parameters found:")
# for k, v in best_params_xgb.items():
#     print(f"{k}: {v}")

In [23]:
# Saved output of the (commented-out) genetic_algorithm_xgb search above.
best_params_xgb = dict(
    n_estimators=316,
    learning_rate=0.398796960207327,
    subsample=0.5210061390013416,
    colsample_bytree=0.5158902671829911,
    reg_alpha=4.523028560852915,
)

In [24]:
random.seed(42)
np.random.seed(42)
# params_bound_rf = PARAM_BOUNDS = {
#             'n_estimators': (100, 700),       
#             'max_depth': (3, 30),                
#             'min_samples_split': (2, 20),        
#             'min_samples_leaf': (1, 20),        
#         }
# best_params_rf = genetic_algorithm_randomforest(X_train, y_train, params_bound=params_bound_rf)
# print("Best parameters found:")
# for k, v in best_params_rf.items():
#     print(f"{k}: {v}")


In [25]:
# Saved output of the (commented-out) genetic_algorithm_randomforest search above.
best_params_rf = dict(
    n_estimators=130,
    max_depth=27,
    min_samples_split=17,
    min_samples_leaf=13,
)

In [26]:
random.seed(42)
np.random.seed(42)
best_lg, best_lg_params, best_lg_score = grid_search_logistic(X_train, y_train)

Fitting 5 folds for each of 12 candidates, totalling 60 fits


In [27]:
random.seed(42)
np.random.seed(42)
best_svc, best_svc_params, best_svc_score = grid_search_svc(X_train, y_train)

Fitting 5 folds for each of 18 candidates, totalling 90 fits


# After fine-tuning

### Initialize the models with fine tuned parameters

In [28]:
classifiers_tuned = all_models(lg_params=best_lg_params, svc_params=best_svc_params, xgb_params=best_params_xgb, rf_params=best_params_rf)

### Cross validation

In [29]:
metrics_cross_tuned = get_crossvalidation_metrics(classifiers_tuned, X_train, y_train)
plot_metrics_table(metrics_cross_tuned, title="Evaluation Metrics on cross validation after fine tuning")

### Test data

In [30]:
metrics_test_tuned = get_model_metrics(classifiers_tuned, X_train, y_train, X_test, y_test)
plot_metrics_table(metrics_test_tuned, title="Evaluation Metrics on test data after fine tuning")

### Confusion Matrix and ROC curve

In [31]:
plot_confusion_matrices_with_roc(classifiers_tuned, X_train, y_train, X_test, y_test, title="Confusion Matrices on test after fine tuning (1: dementia, 0: non-dementia)")

**Fine-tuning has improved the performance of the models. Among them, the XGB classifier stands out as the most reliable and robust, showing relatively strong scores for both cross-validation and test data after fine-tuning compared to other models.**

# Comparison between before and after fine tuning

### Cross validation

In [32]:
plot_metrics_comparison(metrics_cross, metrics_cross_tuned)

### Test data

In [33]:
plot_metrics_comparison(metrics_test, metrics_test_tuned)