In [None]:
!pip install transformers torch scikit-learn pandas
!pip install fasttext
!pip install deap
!pip install evaluate

In [6]:
# Colab-specific absolute Google Drive path; the relative data paths used later
# in this notebook are resolved against this directory.
%cd "/content/drive/MyDrive/project550-main"

/content/drive/MyDrive/project550-main


**BERT is well suited to distinguishing dementia from normal cognitive function because it models linguistic context and structure, which matters here because dementia can impair a person's speech and writing. Its sensitivity to subtle linguistic phenomena such as incoherence, repetition, and grammatical errors can support the detection of dementia. Because BERT is pretrained on very large text corpora, it can be fine-tuned on small task-specific datasets, which makes it efficient at capturing dementia-related language patterns. BERT therefore tends to outperform earlier models in accuracy and sensitivity on this classification problem.**

In [7]:
import random
import numpy as np
import pandas as pd
from sklearn.manifold import TSNE

**import utils functions**

In [8]:
from utils import extract_all_sentences, clean_text
from utils_models import *
from finetuning_models import genetic_algorithm_fasttext

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger_eng.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


**Extract all sentences for each patient and collect them in a list. The output, all_sentences, is a 2D list (one list of sentences per patient).**

In [9]:
# Transcript folders: training controls (cc), training dementia group (cd),
# and the held-out test transcripts.
train_cc, train_cd, test = (
    "ADReSS-IS2020-data/train/transcription/cc",
    "ADReSS-IS2020-data/train/transcription/cd",
    "ADReSS-IS2020-data-test/test/transcription",
)

# One extraction call per folder, in the same order as the folders above.
all_sentences_cc, all_sentences_cd, all_sentences_test = (
    extract_all_sentences(folder) for folder in (train_cc, train_cd, test)
)

**Apply the cleaning step to all_sentences for both the training and the test datasets. The output is a 2D list.**

In [10]:
# Fix the Python and NumPy global seeds before any further processing.
random.seed(42)
np.random.seed(42)

# Clean every sentence while keeping the per-patient nesting
# (outer list: patients, inner list: that patient's cleaned sentences).
cleaned_healthy_speech = [
    list(map(clean_text, sentence_list)) for sentence_list in all_sentences_cc
]
cleaned_dementia_speech = [
    list(map(clean_text, sentence_list)) for sentence_list in all_sentences_cd
]
cleaned_test_speech = [
    list(map(clean_text, sentence_list)) for sentence_list in all_sentences_test
]

**Combine CC and CD to make training dataset**

In [11]:
# Healthy transcripts first, dementia transcripts second: the training labels
# built in the next cell (zeros followed by ones) rely on this ordering.
cleaned_train_speech = cleaned_healthy_speech + cleaned_dementia_speech

**Join the sentences into a single string for each patient in the training and test datasets**

In [12]:
# BERT takes one string per patient, so each patient's sentences are joined with spaces.
clean_texts_train = [" ".join(sentences) for sentences in cleaned_train_speech]

# Labels follow the concatenation order used for the training texts:
# healthy transcripts (0) first, then dementia transcripts (1).
# The counts come from the data instead of a hard-coded 54/54 split.
y_train = [0] * len(cleaned_healthy_speech) + [1] * len(cleaned_dementia_speech)
assert len(y_train) == len(clean_texts_train), "training labels and texts are misaligned"

clean_texts_test = [" ".join(sentences) for sentences in cleaned_test_speech]
test_data = pd.read_csv("ADReSS-IS2020-data-test/test/test_labels.txt", delimiter=";")
# Extract test labels (the column name in the label file ends with a space).
# NOTE(review): this assumes the rows of the label file are in the same order as
# the transcripts returned by extract_all_sentences(test) -- confirm.
y_test = test_data["Label "]
assert len(y_test) == len(clean_texts_test), "test labels and texts are misaligned"

**Custom PyTorch Dataset for BERT Model Input and Label Handling**

In [15]:
from torch.utils.data import Dataset
import torch

class BERTDataset(Dataset):
    """Map-style dataset pairing model inputs with class labels.

    Two input layouts are supported:

    * A mapping of field name -> per-sample values (anything exposing
      ``.items()``, such as the output of ``tokenize_function``).
      ``__getitem__`` then returns a dict of tensors with an added
      ``'labels'`` entry.
    * Any other indexable collection holding one feature array/tensor per
      sample (for example precomputed embeddings). ``__getitem__`` then
      returns a ``(features, label)`` tuple, which is what loops of the form
      ``for xb, yb in loader`` unpack.

    Args:
        encodings: Model inputs in one of the two layouts above.
        labels: Indexable collection of labels, aligned with ``encodings``.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        # as_tensor accepts lists, NumPy values and existing tensors alike and
        # avoids the copy warning torch.tensor() emits when given a tensor.
        label = torch.as_tensor(self.labels[idx])
        if hasattr(self.encodings, "items"):
            item = {key: torch.as_tensor(val[idx]) for key, val in self.encodings.items()}
            item['labels'] = label
            return item
        return torch.as_tensor(self.encodings[idx]), label

    def __len__(self):
        # The label collection defines the number of samples.
        return len(self.labels)


**Tokenization Function and DataLoader Setup for BERT Model**

In [16]:
from torch.utils.data import Dataset, DataLoader

from transformers import BertTokenizer

# `tokenize_function` reads the module-level `tokenizer`. It is created here so
# that the cells calling the function work on a fresh top-to-bottom run; the
# model cell further down re-creates the same tokenizer.
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")


def tokenize_function(texts, max_length=512):
    """Tokenize texts with the module-level ``tokenizer``.

    Every text is truncated and padded to exactly ``max_length`` tokens.

    Args:
        texts: A string or list of strings to tokenize.
        max_length: Target sequence length; defaults to the previously
            hard-coded value of 512.

    Returns:
        Whatever ``tokenizer(...)`` returns for these arguments.
    """
    return tokenizer(texts, padding='max_length', truncation=True, max_length=max_length)

In [17]:
# Tokenize both splits, then wrap each encoding with its labels.
train_encodings, test_encodings = (
    tokenize_function(texts) for texts in (clean_texts_train, clean_texts_test)
)
train_dataset, test_dataset = (
    BERTDataset(train_encodings, y_train),
    BERTDataset(test_encodings, y_test),
)

**Stratified K-Fold Cross-Validation Setup for Model Evaluation**

In [18]:
from sklearn.model_selection import StratifiedKFold

# Shuffled, stratified 5-fold splitter with a fixed seed; reused by the
# cross-validation code later in this notebook.
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

In [19]:
import matplotlib.pyplot as plt
import pandas as pd

def plot_metrics_table(metrics, title="BERT Metrics"):
    """Draw a grouped bar chart of classification metrics.

    Args:
        metrics: One of two layouts.

            * ``{metric_name: [score, ...]}`` -- the layout of the metric
              dicts built in this notebook (one score per fold or run). Each
              list position becomes one group of bars and the dict keys label
              the bars. The previous implementation raised a length-mismatch
              ``ValueError`` for this layout unless every list happened to
              contain exactly four scores.
            * Any other layout accepted by ``pd.DataFrame`` (for example
              ``{label: {metric_name: value}}``). It is handled as before:
              the frame is transposed and its four columns are renamed to
              Accuracy / Precision / Recall / F1-Score in that order.
        title: Title shown above the chart.
    """
    flat_layout = (
        isinstance(metrics, dict)
        and len(metrics) > 0
        and all(isinstance(scores, (list, tuple)) for scores in metrics.values())
    )

    df = pd.DataFrame(metrics)
    if not flat_layout:
        # Legacy path: rows become the outer keys, columns the four metrics.
        df = df.T
        df.columns = ['Accuracy', 'Precision', 'Recall', 'F1-Score']

    ax = df.plot(kind='bar', figsize=(10, 6))
    ax.set_title(title)
    ax.set_ylabel('Scores')
    plt.show()

# Notebook-level container with one list per metric. Note that the
# `cross_validate` function defined later in this notebook builds and returns
# its own local dict of the same name, so this one is not filled by it.
metrics_cross = {'Accuracy': [], 'Precision': [], 'Recall': [], 'F1-Score': []}

In [20]:
import evaluate
from sklearn.metrics import precision_score, recall_score, confusion_matrix, roc_curve, auc

accuracy_metric = evaluate.load("accuracy")
f1_metric = evaluate.load("f1")

def compute_metrics(eval_pred):
    """Compute evaluation metrics from a ``(logits, labels)`` pair.

    Args:
        eval_pred: Pair ``(logits, labels)``; ``logits`` has one column per
            class and ``labels`` holds the true class indices.

    Returns:
        dict with ``accuracy``, macro-averaged ``f1`` / ``precision`` /
        ``recall`` and ``roc_auc``.
    """
    logits, labels = eval_pred
    predictions = logits.argmax(axis=-1)

    # Local names avoid shadowing sklearn's accuracy_score / f1_score functions.
    accuracy = accuracy_metric.compute(predictions=predictions, references=labels)["accuracy"]
    macro_f1 = f1_metric.compute(predictions=predictions, references=labels, average="macro")["f1"]

    precision = precision_score(labels, predictions, average="macro")
    recall = recall_score(labels, predictions, average="macro")

    # A ROC curve needs a continuous score: built from hard 0/1 predictions it
    # collapses to a single operating point. For two classes the logit margin
    # orders samples exactly like the softmax probability of class 1.
    if logits.ndim == 2 and logits.shape[-1] == 2:
        positive_scores = logits[:, 1] - logits[:, 0]
    else:
        # Any other layout keeps the previous behaviour (hard predictions).
        positive_scores = predictions
    fpr, tpr, _ = roc_curve(labels, positive_scores)
    roc_auc = auc(fpr, tpr)

    return {
        "accuracy": accuracy,
        "f1": macro_f1,
        "precision": precision,
        "recall": recall,
        "roc_auc": roc_auc
    }

Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/6.79k [00:00<?, ?B/s]

**5-Fold Cross-Validation with BERT using Trainer and Early Stopping**

In [None]:
def train_one_fold(fold, model, tokenizer, train_dataset, train_index, val_index):
    """Fine-tune ``model`` on one cross-validation fold and score the held-out part.

    Args:
        fold: Zero-based fold number (used for messages and output folders).
        model: Model handed to ``Trainer``.
        tokenizer: Tokenizer handed to ``Trainer``.
        train_dataset: Full training dataset that both subsets are drawn from.
        train_index: Indices of the samples used for training in this fold.
        val_index: Indices of the samples held out for validation in this fold.

    Returns:
        dict with the keys 'Accuracy', 'Precision', 'Recall' and 'F1-Score'
        computed on the validation subset (binary averaging).
    """
    print(f"Training fold {fold + 1}...")

    # Views onto the shared dataset for this fold.
    make_subset = torch.utils.data.Subset
    fit_split = make_subset(train_dataset, train_index)
    held_out_split = make_subset(train_dataset, val_index)

    trainer = Trainer(
        model=model,
        args=TrainingArguments(
            output_dir=f"./results/fold_{fold+1}",
            logging_dir=f"./logs/fold_{fold+1}",
            num_train_epochs=10,
            per_device_train_batch_size=4,
            per_device_eval_batch_size=4,
            weight_decay=0.01,
            seed=42,
            # Evaluate, checkpoint and log once per epoch.
            eval_strategy="epoch",
            save_strategy="epoch",
            logging_strategy="epoch",
            logging_steps=1,
            report_to="none",
            load_best_model_at_end=True,
        ),
        train_dataset=fit_split,
        eval_dataset=held_out_split,
        tokenizer=tokenizer,
        compute_metrics=compute_metrics,
        callbacks=[EarlyStoppingCallback(early_stopping_patience=3)],
    )
    trainer.train()

    # Score the validation subset with the model left in place after training.
    raw_outputs, label_ids, _ = trainer.predict(held_out_split)
    preds = raw_outputs.argmax(axis=-1)

    accuracy = accuracy_score(label_ids, preds)
    precision, recall, f1, _ = precision_recall_fscore_support(label_ids, preds, average='binary')

    print(f"Fold {fold + 1} metrics:")
    print(f"Accuracy: {accuracy:.4f}, Precision: {precision:.4f}, Recall: {recall:.4f}, F1-Score: {f1:.4f}")

    metric_names = ('Accuracy', 'Precision', 'Recall', 'F1-Score')
    return dict(zip(metric_names, (accuracy, precision, recall, f1)))


**Full Cross_Validation**

In [None]:
def cross_validate(model, tokenizer, train_dataset, kf, y_train):
    """Run k-fold cross-validation and collect the per-fold metrics.

    Every fold starts from the weights ``model`` had when this function was
    called. Without that reset the same model object would keep training from
    fold to fold, so later folds would be validated on samples the model had
    already been trained on.

    Args:
        model: Model to fine-tune; it is trained in place and, on return,
            holds the weights left by the last fold.
        tokenizer: Tokenizer forwarded to ``train_one_fold``.
        train_dataset: Dataset that the folds are drawn from.
        kf: Splitter exposing ``split(X, y)`` (e.g. ``StratifiedKFold``).
        y_train: Labels used by the splitter.

    Returns:
        dict mapping 'Accuracy', 'Precision', 'Recall' and 'F1-Score' to a
        list with one value per fold.
    """
    import copy  # local import so this cell does not depend on another cell

    metrics_cross = {
        'Accuracy': [],
        'Precision': [],
        'Recall': [],
        'F1-Score': []
    }

    # Snapshot of the starting weights, restored at the beginning of each fold.
    initial_state = copy.deepcopy(model.state_dict())

    for fold, (train_index, val_index) in enumerate(kf.split(train_dataset, y_train)):
        model.load_state_dict(initial_state)
        fold_metrics = train_one_fold(fold, model, tokenizer, train_dataset, train_index, val_index)
        for key in metrics_cross:
            metrics_cross[key].append(fold_metrics[key])

    return metrics_cross

**Model Evaluation on Test Set with Metrics Calculation**

In [None]:
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

def evaluate_on_test_set(trainer, test_dataset, average='binary'):
    """Score a trainer's model on a dataset and print the resulting metrics.

    Args:
        trainer: Object exposing ``model`` and ``predict(dataset)``; ``predict``
            must return a 3-tuple starting with the raw predictions and the
            true label ids.
        test_dataset: Dataset passed to ``trainer.predict``.
        average: Averaging mode forwarded to
            ``precision_recall_fscore_support``.

    Returns:
        dict mapping 'Accuracy', 'Precision', 'Recall' and 'F1-Score' to
        single-element lists.
    """
    trainer.model.eval()

    raw_outputs, label_ids, _ = trainer.predict(test_dataset)
    preds = raw_outputs.argmax(axis=-1)

    accuracy = accuracy_score(label_ids, preds)
    precision, recall, f1, _ = precision_recall_fscore_support(label_ids, preds, average=average)

    print("\nTest Set Metrics:")
    print(f"Accuracy: {accuracy:.4f}, Precision: {precision:.4f}, Recall: {recall:.4f}, F1-Score: {f1:.4f}")

    # Each value is wrapped in a list, matching the shape of the per-fold metric dicts.
    metric_names = ('Accuracy', 'Precision', 'Recall', 'F1-Score')
    return {name: [value] for name, value in zip(metric_names, (accuracy, precision, recall, f1))}


**1.1 BertForSequenceClassification:**

Fine-tunes BERT with a classification head for end-to-end training and direct class predictions, making it ideal for classification tasks.

In [14]:
from transformers import BertForSequenceClassification, BertTokenizer # Import BertTokenizer

# Module-level tokenizer read by `tokenize_function`, plus a BERT model with a
# two-class classification head loaded from the same checkpoint.
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
model_BertForSequenceClassification = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Cross-validate the sequence-classification model and show the per-fold metrics.
metrics = cross_validate(
    model_BertForSequenceClassification,
    tokenizer,
    train_dataset,
    kf,
    y_train,
)
print("Cross-Validation Metrics:", metrics, sep="\n")

**Test data**

In [None]:
# `trainer` only exists as a local variable inside `train_one_fold`, so it is
# not defined at notebook scope. Build an evaluation-only Trainer around the
# model object that was passed to `cross_validate` (training updates that
# object in place).
# NOTE(review): this scores the weights left by cross-validation; consider
# retraining on the full training set before reporting test results.
trainer = Trainer(
    model=model_BertForSequenceClassification,
    args=TrainingArguments(
        output_dir="./results/test_eval",
        per_device_eval_batch_size=4,
        report_to="none",
    ),
)
metrics_test = evaluate_on_test_set(trainer, test_dataset)
# Re-display the cross-validation metrics stored in `metrics`.
print("Cross-Validation Metrics:")
print(metrics)

**1.2 BERT for Feature Extraction and CNN Classification**

This model combines BERT with a CNN architecture for text classification. It first uses a pre-trained BERT model to extract contextualized token embeddings from the input text. The output from BERT’s last hidden layer is then transposed and passed through several 1D convolutional layers with varying kernel sizes (2, 3, 4), each followed by a ReLU activation and max pooling across the sequence. The resulting feature maps are concatenated, passed through a dropout layer for regularization, and finally fed into a fully connected layer to produce logits for classification. If labels are provided during the forward pass, the model also computes the cross-entropy loss.

In [None]:
import torch
import torch.nn as nn
from transformers import BertModel

class BERT_CNN_Classifier(nn.Module):
    """BERT encoder followed by parallel 1-D convolutions and a linear classifier.

    The encoder's last hidden states are convolved with kernels of width 2, 3
    and 4 (100 filters each), passed through ReLU, max-pooled over the sequence
    axis, concatenated, regularised with dropout and mapped to ``num_labels``
    logits.
    """

    def __init__(self, bert_model_name="bert-base-uncased", num_labels=2):
        super().__init__()
        # 768 is the channel count of the hidden states fed to the convolutions.
        hidden_size, filters_per_kernel = 768, 100

        self.bert = BertModel.from_pretrained(bert_model_name)

        conv_layers = []
        for width in (2, 3, 4):
            conv_layers.append(
                nn.Conv1d(in_channels=hidden_size, out_channels=filters_per_kernel, kernel_size=width)
            )
        self.convs = nn.ModuleList(conv_layers)

        self.dropout = nn.Dropout(0.3)
        self.fc = nn.Linear(filters_per_kernel * len(self.convs), num_labels)

    def forward(self, input_ids, attention_mask=None, token_type_ids=None, labels=None):
        """Return ``{'loss': ..., 'logits': ...}``; ``loss`` is ``None`` without labels."""
        encoded = self.bert(input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids)
        # Conv1d expects (batch, channels, length), so swap the last two axes.
        channels_first = encoded.last_hidden_state.transpose(1, 2)

        pooled = []
        for conv in self.convs:
            activated = torch.relu(conv(channels_first))
            # Max over the sequence axis keeps one value per filter.
            pooled.append(activated.max(dim=2).values)

        features = self.dropout(torch.cat(pooled, dim=1))
        logits = self.fc(features)

        # Cross-entropy loss is only computed when labels are supplied.
        loss = nn.CrossEntropyLoss()(logits, labels) if labels is not None else None

        return {'loss': loss, 'logits': logits}


In [None]:
# BERT + CNN classifier with a two-class output, loaded from the pretrained
# "bert-base-uncased" checkpoint.
model_BERT_CNN_Classifier = BERT_CNN_Classifier("bert-base-uncased", num_labels=2)

In [None]:
# Cross-validate the BERT + CNN classifier and show the per-fold metrics.
metrics = cross_validate(
    model_BERT_CNN_Classifier,
    tokenizer,
    train_dataset,
    kf,
    y_train,
)
print("Cross-Validation Metrics:", metrics, sep="\n")

**Test data**

In [None]:
# `trainer` only exists as a local variable inside `train_one_fold`, so it is
# not defined at notebook scope. Build an evaluation-only Trainer around the
# model object that was passed to `cross_validate` (training updates that
# object in place).
# NOTE(review): this scores the weights left by cross-validation; consider
# retraining on the full training set before reporting test results.
trainer = Trainer(
    model=model_BERT_CNN_Classifier,
    args=TrainingArguments(
        output_dir="./results/test_eval",
        per_device_eval_batch_size=4,
        report_to="none",
    ),
)
metrics_test = evaluate_on_test_set(trainer, test_dataset)
# Re-display the cross-validation metrics stored in `metrics`.
print("Cross-Validation Metrics:")
print(metrics)

**1.3 Freezing the First Two Layers of BERT for Fine-Tuning**

This code freezes the weights of the first two layers of the BERT model during training. By iterating over the model's parameters, it checks if the parameter belongs to the first or second encoder layers (layer 0 or layer 1), and sets requires_grad to False for these layers, preventing their weights from being updated during backpropagation. This technique is commonly used in transfer learning when you want to retain the general features learned by earlier layers of the model and only fine-tune the higher layers for a specific task, saving computational resources and reducing the risk of overfitting.

In [None]:
# Rebind the name to a freshly constructed BERT + CNN classifier, so the
# freezing experiment does not reuse the instance created for section 1.2.
model_BERT_CNN_Classifier = BERT_CNN_Classifier("bert-base-uncased", num_labels=2)

In [None]:
# Freeze encoder layers 0 and 1 only. The trailing dot in each prefix matters:
# a plain substring test for "encoder.layer.1" also matches layers 10 and 11
# and would freeze those as well.
for name, param in model_BERT_CNN_Classifier.bert.named_parameters():
    if name.startswith(("encoder.layer.0.", "encoder.layer.1.")):
        param.requires_grad = False


In [None]:
# Cross-validate the partially frozen BERT + CNN classifier and show the per-fold metrics.
metrics = cross_validate(
    model_BERT_CNN_Classifier,
    tokenizer,
    train_dataset,
    kf,
    y_train,
)
print("Cross-Validation Metrics:", metrics, sep="\n")

**Test data**

In [None]:
# `trainer` only exists as a local variable inside `train_one_fold`, so it is
# not defined at notebook scope. Build an evaluation-only Trainer around the
# model object that was passed to `cross_validate` (training updates that
# object in place).
# NOTE(review): this scores the weights left by cross-validation; consider
# retraining on the full training set before reporting test results.
trainer = Trainer(
    model=model_BERT_CNN_Classifier,
    args=TrainingArguments(
        output_dir="./results/test_eval",
        per_device_eval_batch_size=4,
        report_to="none",
    ),
)
metrics_test = evaluate_on_test_set(trainer, test_dataset)
# Re-display the cross-validation metrics stored in `metrics`.
print("Cross-Validation Metrics:")
print(metrics)

----------------

**2. BERT Token Embeddings**

Extracts token embeddings from BERT for use in a separate classifier (e.g. SVM or logistic regression). This offers more flexibility but requires additional training and setup. (By default, each embedding has 768 features.)

In [93]:
from transformers import BertTokenizer, BertModel

# Rebinds the module-level `tokenizer` and loads the plain BERT encoder (no
# classification head) that the embedding helper below reads as `bert_model`.
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
bert_model = BertModel.from_pretrained("bert-base-uncased")

In [94]:
import torch

def get_bert_token_embeddings(text, max_len=128):
    """Return the last-layer BERT hidden states for ``text``.

    Uses the module-level ``tokenizer`` and ``bert_model``. The text is
    truncated and padded to ``max_len`` tokens, and the model runs under
    ``torch.no_grad()`` so no gradients are tracked.

    Args:
        text: Text to embed.
        max_len: Token length to pad/truncate to. Defaults to 128, the value
            that was previously hard-coded; the parameter matches the later
            redefinition of this helper in the notebook.

    Returns:
        ``last_hidden_state`` with a leading dimension of size 1 removed.
    """
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding="max_length", max_length=max_len)
    with torch.no_grad():
        outputs = bert_model(**inputs)
    return outputs.last_hidden_state.squeeze(0)  # Shape: (seq_len, 768)


# One embedding tensor per training transcript.
bert_embeddings = [get_bert_token_embeddings(text) for text in clean_texts_train]
# torch.stack assembles the batch in a single call; passing a Python list of
# tensors to np.array is a slow conversion path.
X = torch.stack(bert_embeddings).cpu().numpy()
y = np.array(y_train)

# Same procedure for the test transcripts.
bert_embeddings_test = [get_bert_token_embeddings(text) for text in clean_texts_test]
X_test = torch.stack(bert_embeddings_test).cpu().numpy()
y1_test = np.array(y_test)

**Normalize data**

In [None]:
from sklearn.preprocessing import StandardScaler

# StandardScaler only accepts 2-D input, but the token-level embeddings built
# above are 3-D (samples, tokens, features). Averaging over the token axis
# gives one fixed-length vector per transcript; 2-D input passes through unchanged.
# NOTE(review): mean pooling is an assumption -- confirm whether mean pooling,
# the first ([CLS]) token, or another reduction was intended.
X_features = X.mean(axis=1) if X.ndim == 3 else X
X_test_features = X_test.mean(axis=1) if X_test.ndim == 3 else X_test

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_features)
# Reuse the statistics fitted on the training data; re-fitting on the test set
# would scale the two sets inconsistently and use test information.
X_test_scaled = scaler.transform(X_test_features)

**Cross-validation with normalized x and default y**

In [91]:
from sklearn.model_selection import cross_validate
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier

# Classical classifiers compared on the BERT features.
models = {
    'Logistic Regression': LogisticRegression(max_iter=5000),
    'SVM': SVC(kernel='linear'),
    'Random Forest': RandomForestClassifier(n_estimators=100, random_state=42),
    # Seeded like the random forest so that repeated runs give the same scores.
    'Gradient Boosting': GradientBoostingClassifier(n_estimators=100, learning_rate=0.1, random_state=42),
    'KNN': KNeighborsClassifier(n_neighbors=5),
}

# Scorer names passed to cross_validate; results are stored under 'test_<name>'.
scoring = ['accuracy', 'precision', 'recall', 'f1', 'roc_auc']

**Print Results**

In [92]:
# 5-fold cross-validation of every classifier; print the mean of each metric.
for model_name, model in models.items():
    print(f"\nEvaluating {model_name}...")
    cv_results = cross_validate(model, X_scaled, y, cv=5, scoring=scoring)

    # Average the per-fold scores once, then report them in scoring order.
    fold_means = {score: np.mean(cv_results[f'test_{score}']) for score in scoring}

    print(f"\n{model_name} Average Metrics (5-Fold Cross-Validation):")
    for score, avg in fold_means.items():
        print(f"{score.title()}: {avg:.3f}")


Evaluating Logistic Regression...

Logistic Regression Average Metrics (5-Fold Cross-Validation):
Accuracy: 0.750
Precision: 0.772
Recall: 0.704
F1: 0.730
Roc_Auc: 0.839

Evaluating SVM...

SVM Average Metrics (5-Fold Cross-Validation):
Accuracy: 0.741
Precision: 0.755
Recall: 0.704
F1: 0.723
Roc_Auc: 0.826

Evaluating Random Forest...

Random Forest Average Metrics (5-Fold Cross-Validation):
Accuracy: 0.648
Precision: 0.659
Recall: 0.629
F1: 0.642
Roc_Auc: 0.692

Evaluating Gradient Boosting...

Gradient Boosting Average Metrics (5-Fold Cross-Validation):
Accuracy: 0.649
Precision: 0.681
Recall: 0.593
F1: 0.628
Roc_Auc: 0.699

Evaluating KNN...

KNN Average Metrics (5-Fold Cross-Validation):
Accuracy: 0.575
Precision: 0.588
Recall: 0.504
F1: 0.539
Roc_Auc: 0.590


**2.2 Build a Simple CNN for Text Classification**

In [None]:
import torch.nn as nn
import torch.nn.functional as F

class CNNClassifier(nn.Module):
    """Text CNN over a sequence of embedding vectors.

    Three parallel 1-D convolutions (kernel widths 3, 4 and 5, 100 filters
    each) are applied to the input, passed through ReLU and max-pooled over
    the sequence axis. The pooled features are concatenated, regularised with
    dropout and mapped to ``num_classes`` logits.
    """

    def __init__(self, embedding_dim=768, num_classes=2):
        super().__init__()
        n_filters = 100

        def make_conv(width):
            # One convolution per kernel width, all reading the same input channels.
            return nn.Conv1d(in_channels=embedding_dim, out_channels=n_filters, kernel_size=width)

        self.conv1 = make_conv(3)
        self.conv2 = make_conv(4)
        self.conv3 = make_conv(5)
        self.dropout = nn.Dropout(0.5)
        self.fc = nn.Linear(3 * n_filters, num_classes)

    def forward(self, x):
        """Map ``x`` of shape (batch_size, seq_len, embedding_dim) to class logits."""
        # Conv1d expects (batch_size, embedding_dim, seq_len).
        channels_first = x.permute(0, 2, 1)
        pooled = [
            F.relu(conv(channels_first)).max(dim=2).values
            for conv in (self.conv1, self.conv2, self.conv3)
        ]
        features = self.dropout(torch.cat(pooled, dim=1))
        return self.fc(features)


In [35]:
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Model on the chosen device, with its loss and optimiser.
model = CNNClassifier()
model.to(device)

criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(params=model.parameters(), lr=1e-4)

**Prepare Input to Feed into CNN**

In [44]:
def get_bert_token_embeddings(text, max_len=128):
    """Return a ``(max_len, hidden)`` tensor of last-layer BERT hidden states.

    Uses the module-level ``tokenizer`` and ``bert_model``; the forward pass
    runs under ``torch.no_grad()``. The result is zero-padded or cut so that
    its first dimension equals ``max_len``.

    NOTE(review): the tokenizer call already pads and truncates to ``max_len``,
    so the two adjustments below presumably never trigger -- confirm.
    """
    encoded = tokenizer(text, return_tensors="pt", truncation=True, padding="max_length", max_length=max_len)
    with torch.no_grad():
        hidden = bert_model(**encoded).last_hidden_state.squeeze(0)  # shape: (seq_len, 768)

    n_tokens = hidden.shape[0]
    if n_tokens < max_len:
        # Too short: append rows of zeros up to max_len.
        filler = torch.zeros(max_len - n_tokens, hidden.shape[1])
        return torch.cat([hidden, filler], dim=0)
    if n_tokens > max_len:
        # Too long: keep the first max_len rows.
        return hidden[:max_len, :]
    return hidden


In [46]:
from sklearn.metrics import classification_report
import numpy as np

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Token-level BERT features: one tensor per transcript.
bert_embeddings = [get_bert_token_embeddings(text) for text in clean_texts_train]
# torch.stack assembles the batch in a single call; passing a Python list of
# tensors to np.array is a slow conversion path.
X = torch.stack(bert_embeddings).cpu().numpy()
y = np.array(y_train)

bert_embeddings_test = [get_bert_token_embeddings(text) for text in clean_texts_test]
X_test = torch.stack(bert_embeddings_test).cpu().numpy()
y1_test = np.array(y_test)

# Seed torch so weight initialisation, dropout and batch shuffling are repeatable.
torch.manual_seed(42)

for fold, (train_idx, val_idx) in enumerate(kf.split(X, y)):
    print(f"\n--- Fold {fold + 1} ---")

    X_train_fold, y_train_fold = X[train_idx], y[train_idx]
    X_val_fold, y_val_fold = X[val_idx], y[val_idx]

    # TensorDataset yields (features, label) tuples, which is what the loops
    # below unpack. Fold-specific names keep the tokenised `train_dataset`
    # used by the BERT cells intact.
    fold_train_dataset = torch.utils.data.TensorDataset(
        torch.as_tensor(X_train_fold), torch.as_tensor(y_train_fold)
    )
    fold_val_dataset = torch.utils.data.TensorDataset(
        torch.as_tensor(X_val_fold), torch.as_tensor(y_val_fold)
    )

    train_loader = DataLoader(fold_train_dataset, batch_size=8, shuffle=True)
    val_loader = DataLoader(fold_val_dataset, batch_size=8)

    # Initialize model: a fresh network, optimiser and loss for every fold.
    model = CNNClassifier().to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)
    criterion = nn.CrossEntropyLoss()

    # Training loop
    for epoch in range(3):
        model.train()
        total_loss = 0
        for xb, yb in train_loader:
            xb, yb = xb.to(device), yb.to(device)
            optimizer.zero_grad()
            preds = model(xb)
            loss = criterion(preds, yb)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
        # Sum of the batch losses over the epoch (not the mean).
        print(f"Epoch {epoch+1}, Loss: {total_loss:.4f}")

    # Validation
    model.eval()
    val_preds, val_labels = [], []
    with torch.no_grad():
        for xb, yb in val_loader:
            xb = xb.to(device)
            outputs = model(xb)
            preds = torch.argmax(outputs, dim=1).cpu().numpy()
            val_preds.extend(preds)
            val_labels.extend(yb.numpy())

    print(classification_report(val_labels, val_preds))

# After the loop, `model` is the network trained on the last fold.



--- Fold 1 ---
Epoch 1, Loss: 7.7253
Epoch 2, Loss: 6.4008
Epoch 3, Loss: 5.1468
              precision    recall  f1-score   support

           0       0.71      0.91      0.80        11
           1       0.88      0.64      0.74        11

    accuracy                           0.77        22
   macro avg       0.79      0.77      0.77        22
weighted avg       0.79      0.77      0.77        22


--- Fold 2 ---
Epoch 1, Loss: 7.5017
Epoch 2, Loss: 6.1446
Epoch 3, Loss: 5.1660
              precision    recall  f1-score   support

           0       1.00      0.73      0.84        11
           1       0.79      1.00      0.88        11

    accuracy                           0.86        22
   macro avg       0.89      0.86      0.86        22
weighted avg       0.89      0.86      0.86        22


--- Fold 3 ---
Epoch 1, Loss: 7.9761
Epoch 2, Loss: 6.3527
Epoch 3, Loss: 5.0921
              precision    recall  f1-score   support

           0       1.00      1.00      1.00  

In [47]:

# TensorDataset yields (features, label) tuples, which is what the loop below
# unpacks. A dedicated name keeps the tokenised `test_dataset` used by the BERT
# cells intact.
cnn_test_dataset = torch.utils.data.TensorDataset(
    torch.stack(bert_embeddings_test), torch.as_tensor(y1_test)
)
test_loader = DataLoader(cnn_test_dataset, batch_size=8)

# `model` is whatever the preceding cell left behind (the last fold's CNN).
model.eval()
test_preds, test_labels = [], []
with torch.no_grad():
    for xb, yb in test_loader:
        xb = xb.to(device)
        outputs = model(xb)
        preds = torch.argmax(outputs, dim=1).cpu().numpy()
        test_preds.extend(preds)
        test_labels.extend(yb.numpy())

print("\n--- Final Evaluation on Test Set ---")
print(classification_report(test_labels, test_preds))



--- Final Evaluation on Test Set ---
              precision    recall  f1-score   support

           0       0.52      0.50      0.51        24
           1       0.52      0.54      0.53        24

    accuracy                           0.52        48
   macro avg       0.52      0.52      0.52        48
weighted avg       0.52      0.52      0.52        48

