**Note: We included all necessary parameters to reproduce the results obtained here. However, the FastText model can behave differently across systems due to factors like hardware, threading, and underlying library versions. As a result, you may observe slight variations in performance on the training and test data.**

In [1]:
import sys
import os

sys.path.append(os.path.abspath("..")) 

In [2]:
import random
import numpy as np
import pandas as pd
import fasttext
from sklearn.manifold import TSNE

### import utils functions

In [3]:
from utils import *
from utils_fasttext import *

[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\mohse\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\mohse\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\mohse\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\mohse\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     C:\Users\mohse\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\mohse\AppData\Roaming\nltk_data...
[nltk_data]   Package average

### Extract all sentences for each patient into a list. Each `all_sentences_*` output is a 2D list (one list of sentences per patient).

In [4]:
# Transcription folders (relative to this notebook): the cc and cd training
# groups and the test set.
train_cc = "../ADReSS-IS2020-data/train/transcription/cc"
train_cd = "../ADReSS-IS2020-data/train/transcription/cd"
test = "../ADReSS-IS2020-data-test/test/transcription"

# One extraction call per folder, evaluated in cc -> cd -> test order.
all_sentences_cc, all_sentences_cd, all_sentences_test = (
    extract_all_sentences(folder) for folder in (train_cc, train_cd, test)
)

### Apply the cleaning step to all sentences of both the training and test datasets. The output is a 2D list.

In [5]:
# Fix both global RNGs before cleaning.
# NOTE(review): presumably clean_text has a stochastic step — confirm in utils.
random.seed(42)
np.random.seed(42)

# Clean every sentence of every patient, group by group (cc, cd, test),
# keeping the patient -> sentences nesting of the inputs.
cleaned_healthy_speech, cleaned_dementia_speech, cleaned_test_speech = [
    [[clean_text(utterance) for utterance in patient] for patient in group]
    for group in (all_sentences_cc, all_sentences_cd, all_sentences_test)
]

### Combine CC and CD to make training dataset

In [6]:
# Training set = healthy (cc) patients followed by dementia (cd) patients.
cleaned_train_speech = [*cleaned_healthy_speech, *cleaned_dementia_speech]

### Join sentences into a single string for each patient in the training and test datasets

In [7]:
# fastText takes one string per document, so join each patient's sentences.
clean_texts_train = [" ".join(sentences) for sentences in cleaned_train_speech]

# Labels follow the order in which the training set was assembled:
# healthy (0) first, then dementia (1). The counts are derived from the actual
# group sizes instead of being hard-coded, so a change in the data cannot
# silently misalign texts and labels.
y_train = [0] * len(cleaned_healthy_speech) + [1] * len(cleaned_dementia_speech)
assert len(y_train) == len(clean_texts_train), "training labels and texts are misaligned"

clean_texts_test = [" ".join(sentences) for sentences in cleaned_test_speech]
test_data = pd.read_csv("../ADReSS-IS2020-data-test/test/test_labels.txt", delimiter=";")
# Extract test labels (the column name used here ends with a space).
# NOTE(review): assumes the rows of test_labels.txt are in the same order as the
# transcripts returned by extract_all_sentences — confirm.
y_test = test_data["Label "]
assert len(y_test) == len(clean_texts_test), "test labels and texts are misaligned"


### Cross-validation with default parameters of fasttext

In [8]:
# Cross-validate fastText on the training texts without passing any custom parameters.
n_splits = 5
X, y = np.array(clean_texts_train), np.array(y_train)
metrics_cross = cross_validate_fasttext(X, y, n_splits=n_splits)
print(f"Average FastText Metrics ({n_splits}-Fold CV):", metrics_cross, sep="\n")


Average FastText Metrics (5-Fold CV):
{'Accuracy': np.float64(0.723), 'Precision': np.float64(0.866), 'Recall': np.float64(0.54), 'F1 Score': np.float64(0.646), 'AUC': np.float64(0.84)}


In [9]:
# Show the cross-validation metrics of the default-parameter run as a table.
plot_metrics_table(metrics_cross, title="FastText Metrics (cross validation)")

### Test data

In [10]:
# Write the fastText training file: one "__label__<label> <text>" line per patient.
training_lines = (
    f"__label__{label} {text}\n" for text, label in zip(clean_texts_train, y_train)
)
with open("train_data.txt", "w", encoding="utf-8") as f:
    f.writelines(training_lines)

In [11]:
# Train a supervised fastText classifier on train_data.txt with no hyperparameters
# overridden; seed and thread are pinned (seed=42, thread=1).
# NOTE(review): presumably these match what cross_validate_fasttext uses by
# default — confirm in utils_fasttext.
model = fasttext.train_supervised(input="train_data.txt", seed=42, thread=1)

In [12]:
# Visualize training data using t-SNE.
# Each entry of clean_texts_train is a whole transcript, so embed it with
# get_sentence_vector. get_word_vector would treat the entire transcript as one
# out-of-vocabulary token and would not yield a document embedding.
# get_sentence_vector rejects newlines, hence the defensive replace.
embeddings = np.array(
    [model.get_sentence_vector(text.replace("\n", " ")) for text in clean_texts_train]
)

# Apply t-SNE to reduce dimensions to 2D
tsne = TSNE(n_components=2, perplexity=50, random_state=42, init='random')
reduced_embeddings = tsne.fit_transform(embeddings)
plot_tsne(reduced_embeddings, y_train)

In [13]:
# Evaluate the default-parameter model on the test transcripts and
# show the resulting metrics as a table.
metrics_test = get_metrics_fasttext(model, clean_texts_test, y_test)
plot_metrics_table(metrics_test, title="Fasttext Metrics (Test data)")

In [14]:
# Confusion-matrix / ROC plots for the default-parameter model on the test set
# (going by the helper's name; see utils_fasttext for what is actually drawn).
plot_confusion_matrices_with_roc_fasttext(model, clean_texts_test, y_test, "Fasttext")

{'accuracy': 0.688,
 'precision': 0.8,
 'recall': 0.5,
 'f1': 0.615,
 'auc': np.float64(0.816)}

### Fine tuning

In [15]:
# best_hyperparams = genetic_algorithm_fasttext(clean_texts_train, y_train)

In [16]:
# Hyperparameters recorded from an earlier genetic_algorithm_fasttext run
# (the search call itself is commented out in the previous cell).
best_hyperparams = dict(
    lr=np.float64(0.161),
    epoch=40,
    wordNgrams=1,
    dim=227,
    minCount=4,
    loss='softmax',
)

### Cross-validation after fine tuning

In [17]:
# Same cross-validation as before, this time passing the tuned hyperparameters.
n_splits = 5
X, y = np.array(clean_texts_train), np.array(y_train)
metrics_cross_tuned = cross_validate_fasttext(
    X, y, n_splits=n_splits, params=best_hyperparams
)
print(f"Average FastText Metrics ({n_splits}-Fold CV):", metrics_cross_tuned, sep="\n")


Average FastText Metrics (5-Fold CV):
{'Accuracy': np.float64(0.87), 'Precision': np.float64(0.884), 'Recall': np.float64(0.851), 'F1 Score': np.float64(0.865), 'AUC': np.float64(0.898)}


In [18]:
# Show the cross-validation metrics of the tuned run as a table.
plot_metrics_table(metrics_cross_tuned, title="FastText Metrics (cross validation after fine tuning)")

In [19]:
# Train a fastText classifier on train_data.txt with the tuned hyperparameters;
# seed and thread are pinned to the same values as for the default model.
model_tuned = fasttext.train_supervised(input="train_data.txt", **best_hyperparams, seed=42, thread=1)

In [20]:
# Evaluate the tuned model on the test transcripts and
# show the resulting metrics as a table.
metrics_test_tuned = get_metrics_fasttext(model_tuned, clean_texts_test, y_test)
plot_metrics_table(metrics_test_tuned, title="Fasttext Metrics (Test data after fine tuning)")

In [21]:
# Confusion-matrix / ROC plots for the tuned model on the test set
# (going by the helper's name; see utils_fasttext for what is actually drawn).
plot_confusion_matrices_with_roc_fasttext(model_tuned, clean_texts_test, y_test, "Fasttext")

{'accuracy': 0.854,
 'precision': 0.905,
 'recall': 0.792,
 'f1': 0.844,
 'auc': np.float64(0.896)}

**Fine-tuning the fastText parameters improved its performance. However, it still did not outperform the result we obtained from the svc-tfidf model. (See the file tfidf_models.ipynb inside the tfidf folder.)**

## Comparing the results before and after tuning parameters

In [22]:
# Reformat metrics dictionary to plot comparison

def reformat_metrics(metrics_dict):
    """Return a copy of ``metrics_dict`` shaped for the comparison plot.

    Each metric value is wrapped in a one-element list, and a ``'Model'``
    entry equal to ``["Fasttext"]`` is set. The input dictionary is not
    modified. Values that are already lists are kept as they are, so calling
    the function again on its own output (e.g. when the notebook cell is
    re-run) does not nest the values a second time.

    Args:
        metrics_dict: Mapping from metric name to metric value.

    Returns:
        A new dict with list-valued metrics plus the ``'Model'`` entry.
    """
    reformatted = {
        key: value if isinstance(value, list) else [value]
        for key, value in metrics_dict.items()
    }
    reformatted['Model'] = ["Fasttext"]
    return reformatted

# Reformat all four metric dictionaries (cross-validation and test, before and
# after tuning) for the comparison plots.
metrics_cross = reformat_metrics(metrics_cross)

metrics_cross_tuned = reformat_metrics(metrics_cross_tuned)

metrics_test = reformat_metrics(metrics_test)

metrics_test_tuned = reformat_metrics(metrics_test_tuned)

### Cross validation

In [23]:
# Compare cross-validation metrics before vs. after tuning; the third argument is the plot title.
plot_metrics_comparison(metrics_cross, metrics_cross_tuned, "Model Performance Comparison Before and After tuning (cross validation)")

### Test data

In [24]:
# Compare test-set metrics before vs. after tuning. An explicit title is passed,
# mirroring the cross-validation comparison, so the two figures can be told apart.
plot_metrics_comparison(metrics_test, metrics_test_tuned, "Model Performance Comparison Before and After tuning (test data)")

### Remove txt files generated to train fasttext

In [25]:
# Delete the temporary training file written for fastText.
# Only filesystem errors (e.g. the file is already gone) are reported and
# ignored; anything else is allowed to surface.
filename_train = "train_data.txt"
try:
    os.remove(filename_train)
except OSError as e:
    print(e)