In [1]:
import sys
import os
sys.path.append(os.path.abspath("..")) 

In [2]:
import random
import numpy as np
from sklearn.manifold import TSNE
import torch
from transformers import BertTokenizer, BertModel
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset
from transformers import BertForSequenceClassification, BertTokenizer

  from .autonotebook import tqdm as notebook_tqdm


### Set seed for reproducibility

In [3]:
def set_seed(seed=42):
    """Seed every random number generator used in this notebook.

    Seeds Python's ``random`` module, NumPy's global generator and PyTorch
    (CPU and, when CUDA is available, every CUDA device), then configures
    cuDNN for deterministic behaviour.

    Args:
        seed: Integer seed applied to all generators. Defaults to 42.
    """
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        # Pass the caller's seed through; a hard-coded value here would
        # silently override it on GPU machines.
        torch.cuda.manual_seed_all(seed)
    # Favour run-to-run repeatability over cuDNN's benchmark-based kernel selection.
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False


set_seed(42)

### import utils functions

In [4]:
from utils import *
from utils_bert_cnn import *

[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\mohse\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\mohse\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\mohse\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\mohse\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     C:\Users\mohse\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\mohse\AppData\Roaming\nltk_data...
[nltk_data]   Package average

### Combine the augmented data with the original data

In [5]:
# Build one corpus per class from the cleaned text file and the file of
# augmented sentences. combine_clean_and_augmented is presumably provided by
# the star imports above.
# NOTE(review): both results are later concatenated with `+` and measured with
# len(), so list-like return values are assumed — confirm.
clean_aug_healthy = combine_clean_and_augmented(
    clean_path='aug_clean_txtfiles/clean_healthy.txt',
    cont_augmented_path='aug_clean_txtfiles/cont_augmented_sentences_healthy.txt'
)

clean_aug_dementia = combine_clean_and_augmented(
    clean_path='aug_clean_txtfiles/clean_dementia.txt',
    cont_augmented_path='aug_clean_txtfiles/cont_augmented_sentences_dementia.txt'
)

### Prepare train, validation and test data

**Note: We don't use cross-validation because of limited time and GPU resources.**

In [6]:
# Train dataset
# Label convention: 0 = healthy, 1 = dementia (labels follow the concatenation order).
clean_texts_train = clean_aug_healthy + clean_aug_dementia
y_train = [0] * len(clean_aug_healthy) + [1] * len(clean_aug_dementia)

# Split the training data into training and validation sets
# (80/20, stratified on the label, fixed random_state so the split is repeatable).
texts_train, texts_val, labels_train, labels_val = train_test_split(
    clean_texts_train, y_train, test_size=0.2, stratify=y_train, random_state=42
)

# Test dataset
clean_texts_test = read_file("aug_clean_txtfiles/clean_test.txt")
# NOTE(review): `pd` is not imported in this notebook's import cells; presumably
# it arrives through the star imports — confirm.
test_data = pd.read_csv("../ADReSS-IS2020-data-test/test/test_labels.txt", delimiter=";")
# Extract test labels
# NOTE(review): the column key has a trailing space ("Label ") and must match the file header exactly.
# NOTE(review): assumes the rows of test_labels.txt are in the same order as the
# texts in clean_test.txt — confirm.
y_test = test_data["Label "]

### Fine-tune pretrained BERT with a classification head (BertForSequenceClassification) for end-to-end training and direct class predictions, which makes it well suited to classification tasks

In [7]:
# Build Bert Dataset 
class BERTDataset(Dataset):
    """Map-style dataset pairing tokenizer encodings with class labels.

    ``encodings`` maps each field name to an indexable collection of per-sample
    values; ``labels`` is an indexable collection with one label per sample.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        # The sample count is defined by the label collection.
        return len(self.labels)

    def __getitem__(self, idx):
        """Return sample ``idx`` as a dict of tensors, with the label under ``'labels'``."""
        sample = {}
        for field, values in self.encodings.items():
            sample[field] = torch.tensor(values[idx])
        sample['labels'] = torch.tensor(self.labels[idx])
        return sample

**Tokenizes text data using the BERT tokenizer (bert-base-uncased) with padding and truncation up to a max length of 512 tokens. It then creates BERT-compatible datasets (BERTDataset) for training, validation, and testing by applying the tokenizer to the respective text sets and pairing them with their labels.**

In [8]:
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
def tokenize_function(texts):
    """Tokenize ``texts`` with the module-level tokenizer.

    Every sequence is truncated and padded to exactly 512 tokens.
    """
    return tokenizer(texts, padding='max_length', truncation=True, max_length=512)

# Encode each split and pair the encodings with that split's labels.
train_encodings = tokenize_function(texts_train)
train_dataset_bert = BERTDataset(train_encodings, labels_train)
val_encodings = tokenize_function(texts_val)
val_dataset_bert = BERTDataset(val_encodings, labels_val)
test_encodings = tokenize_function(clean_texts_test)
# y_test is the column read from the labels file, not a plain list like the other label sets.
test_dataset_bert = BERTDataset(test_encodings, y_test)

### Initialize Bert model

In [9]:
# Load pretrained BERT with a two-class classification head.
model_BERT = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)
# Prefer the GPU when one is available; otherwise stay on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model_BERT.to(device)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

### Train and save bert model

In [10]:
# The following lines are commented out because training takes a long time on CPU. The trained model is already saved in the saved_models folder.
# train_bert_model(model_BERT, train_dataset_bert, epochs=20, batch_size=4)
# torch.save(model_BERT.state_dict(), "saved_models/bert_model.pth")

### Load bert model

In [11]:
# Restore the saved BERT weights; map_location lets the checkpoint load on a machine without a GPU.
# NOTE(review): torch.load unpickles the file — only load checkpoints from a trusted source.
model_BERT.load_state_dict(torch.load("saved_models/bert_model.pth", map_location=torch.device('cpu')))

<All keys matched successfully>

### Evaluate bert model on validation

In [12]:
# Evaluate the loaded BERT classifier on the validation split; the returned
# metrics are handed to plot_metrics_table for display.
val_metrics_bert = evaluate_bert_model(model_BERT, val_dataset_bert)
plot_metrics_table(val_metrics_bert)

### Evaluate bert model on test data

In [13]:
# Same evaluation on the test set; test_metrics_bert is reused in the final comparison cell.
test_metrics_bert = evaluate_bert_model(model_BERT, test_dataset_bert)
plot_metrics_table(test_metrics_bert)

### plot confusion matrix and roc curve for test data

In [14]:
# Confusion matrix and ROC curve of the BERT classifier on the test set.
plot_confusion_matrix_and_roc_for_bert(model_BERT, test_dataset_bert)

**The BERT model shows low recall, meaning it struggles to identify positive instances (class 1). The confusion matrix indicates a high number of false negatives, where positive cases are misclassified as negative. While accuracy and precision are reasonable, improving recall would require addressing class imbalance or adjusting the decision threshold.**

## Bert Embeddings + CNN Classification

**In the following, we combine BERT with a CNN architecture for text classification. We extract token embeddings from BERT (768 features per token by default) for use in a separate CNN classifier, which offers more flexibility but requires additional training and setup. The output of BERT’s last hidden layer is transposed and passed through several 1D convolutional layers with different kernel sizes (2, 3, 4), each followed by a ReLU activation and max pooling across the sequence. The resulting feature maps are concatenated, passed through a dropout layer for regularization, and finally fed into a fully connected layer to produce the classification logits. Note that the `CNNClassifier` defined below returns only the logits; the cross-entropy loss is not computed inside its forward pass.**

In [15]:
# Initialize tokenizer and bert model
# NOTE: this rebinds `tokenizer` to a fresh instance of the same checkpoint used
# in the earlier tokenization cell. BertModel is the encoder without a
# classification head, used here only to produce embeddings.
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
bert_model = BertModel.from_pretrained("bert-base-uncased")

In [16]:
def get_bert_token_embeddings(text):
    """Return the last hidden state of the module-level BERT model for ``text``.

    The text is tokenized with truncation and padding to 128 tokens and run
    through ``bert_model`` with gradient tracking disabled. A leading
    dimension of size 1 is squeezed away before returning.
    """
    encoded = tokenizer(
        text,
        return_tensors="pt",
        truncation=True,
        padding="max_length",
        max_length=128,
    )
    with torch.no_grad():
        hidden_states = bert_model(**encoded).last_hidden_state
    # Shape: (seq_len, 768)
    return hidden_states.squeeze(0)


In [17]:
from sklearn.model_selection import train_test_split

# Embed every training text individually (one BERT forward pass per text).
bert_embeddings_train = [get_bert_token_embeddings(text) for text in clean_texts_train]
# NOTE(review): unlike the split used for the BERT classifier, this split is not
# stratified, so the validation set here can differ in membership and class
# balance from the one BERT was evaluated on.
bert_train, bert_val, y_train_split, y_val_split = train_test_split(
    bert_embeddings_train, y_train, test_size=0.2, random_state=42
)

# Test texts are embedded the same way; no split is needed.
bert_embeddings_test = [get_bert_token_embeddings(text) for text in clean_texts_test]

### Pad to consistent size & create dataset

In [18]:
import torch.nn.functional as F

def pad_embeddings(embeddings, max_len=128):
    """Bring every embedding to ``max_len`` rows and stack them into one tensor.

    Embeddings with fewer than ``max_len`` rows get zero rows appended at the
    end; all others are cut down to their first ``max_len`` rows.
    """
    def _fit(emb):
        missing = max_len - emb.size(0)
        if missing > 0:
            # Pad spec: (last dim left, last dim right, rows before, rows after).
            return F.pad(emb, (0, 0, 0, missing))
        return emb[:max_len]

    return torch.stack([_fit(emb) for emb in embeddings])

# Stack each split's embeddings into a single tensor (default max_len of 128 rows per sample).
X_train_tensor = pad_embeddings(bert_train)
X_val_tensor = pad_embeddings(bert_val)
X_test_tensor = pad_embeddings(bert_embeddings_test)

# Labels as int64 tensors; y_test is the column read from the test labels file.
y_train_tensor = torch.tensor(y_train_split, dtype=torch.long)
y_val_tensor = torch.tensor(y_val_split, dtype=torch.long)
y_test_tensor = torch.tensor(y_test, dtype=torch.long)


### Build Dataset

**The EmbeddingDataset class handles pre-computed embeddings for classification tasks, using them as input_ids and creating dummy attention masks. It prepares data for training, validation, and testing by returning embeddings and labels in a dictionary format**

In [19]:
class EmbeddingDataset(torch.utils.data.Dataset):
    """Dataset over pre-computed embeddings and their class labels.

    Each item is a dict holding the embedding under ``'input_ids'``, an
    all-ones ``'attention_mask'`` whose length equals the embedding's first
    dimension, and the label as a ``torch.long`` tensor under ``'labels'``.

    Args:
        embeddings: Indexable collection of embedding tensors.
        labels: Indexable collection of labels (a tensor or a plain sequence).
    """

    def __init__(self, embeddings, labels):
        self.embeddings = embeddings
        self.labels = labels

    def __len__(self):
        return len(self.embeddings)

    def __getitem__(self, idx):
        embedding = self.embeddings[idx]
        label = self.labels[idx]
        if isinstance(label, torch.Tensor):
            # torch.tensor(<tensor>) emits a copy-construct UserWarning on every
            # access; clone().detach() yields the same independent copy without it.
            label = label.clone().detach().to(torch.long)
        else:
            label = torch.tensor(label, dtype=torch.long)
        return {
            'input_ids': embedding,  # Treat embeddings as input_ids
            'attention_mask': torch.ones(embedding.shape[0]),  # dummy mask
            'labels': label,
        }


# One dataset per split, each pairing the stacked embeddings with its label tensor.
train_dataset_cnn = EmbeddingDataset(X_train_tensor, y_train_tensor)
val_dataset_cnn = EmbeddingDataset(X_val_tensor, y_val_tensor)
test_dataset_cnn = EmbeddingDataset(X_test_tensor, y_test_tensor)


### Visualization using TSNE

In [20]:
# Extract embeddings from the train dataset
embeddings = [item['input_ids'] for item in train_dataset_cnn]  # Extract embeddings (input_ids)
# Stack embeddings into a tensor
embeddings_tensor = torch.stack(embeddings) 

# Average pooling over dim=1 (the token axis) gives one vector per sample.
# Padded positions are included in the mean because the embeddings were
# computed with padding to a fixed length.
embeddings_flattened = embeddings_tensor.mean(dim=1)

# Extract labels
labels = [item['labels'] for item in train_dataset_cnn]
labels_tensor = torch.tensor(labels)

# Convert tensor labels to list of integers. This is necessary for the plot_tsne function.
labels_list = labels_tensor.tolist()

# Apply t-SNE to reduce dimensions to 2D (fixed random_state for a repeatable layout)
tsne = TSNE(n_components=2, perplexity=30, random_state=42)
X_2d = tsne.fit_transform(embeddings_flattened)

# Plot the t-SNE visualization using the preprocessed labels
plot_tsne(X_2d, labels_list, title="t-SNE Visualization for CNN Model")


To copy construct from a tensor, it is recommended to use sourceTensor.clone().detach() or sourceTensor.clone().detach().requires_grad_(True), rather than torch.tensor(sourceTensor).



**The t-SNE visualization indicates that the two classes are not well-separated in the embedding space, showing that distinguishing between the classes could be challenging for the model.**

### CNNClassifier with tuned parameters

**The classifier is built with PyTorch on top of BERT embeddings. It applies 1D convolutional layers with different kernel sizes (2, 3, 4) to the BERT output, which has shape (batch_size, seq_len, 768). After the convolutions it performs max pooling over the sequence, concatenates the results, applies dropout for regularization, and finally passes the output through a fully connected (Linear) layer that produces one score per class (num_labels).**

In [21]:
import torch
import torch.nn as nn
from transformers import BertModel

class CNNClassifier(nn.Module):
    """1D-CNN classifier over sequences of token embeddings.

    One ``Conv1d`` branch per kernel size is applied along the sequence axis.
    Each branch is passed through ReLU and max-pooled over the sequence, the
    pooled features are concatenated, regularised with dropout and projected
    to ``num_labels`` logits by a linear layer.

    Args:
        in_channels: Size of each token embedding. Defaults to 768.
        num_labels: Number of output classes. Defaults to 2.
        kernel_sizes: Kernel width of each convolution branch. Defaults to
            ``(2, 3, 4)``.
        num_filters: Output channels of every branch. Defaults to 100.
        dropout: Dropout probability applied before the linear layer.
            Defaults to 0.3.

    Raises:
        ValueError: If ``kernel_sizes`` is empty.
    """

    def __init__(self, in_channels=768, num_labels=2, kernel_sizes=(2, 3, 4),
                 num_filters=100, dropout=0.3):
        super().__init__()
        kernel_sizes = tuple(kernel_sizes)
        if not kernel_sizes:
            raise ValueError("kernel_sizes must contain at least one kernel size")
        # Attribute names and creation order are unchanged, so state dicts saved
        # with the default configuration keep loading.
        self.convs = nn.ModuleList([
            nn.Conv1d(in_channels=in_channels, out_channels=num_filters, kernel_size=k)
            for k in kernel_sizes
        ])
        self.dropout = nn.Dropout(dropout)
        self.fc = nn.Linear(num_filters * len(kernel_sizes), num_labels)

    def forward(self, x):  # x shape: (batch_size, seq_len, in_channels)
        """Return logits of shape (batch_size, num_labels).

        ``seq_len`` must be at least the largest kernel size, otherwise the
        corresponding convolution cannot be applied.
        """
        x = x.permute(0, 2, 1)  # shape: (batch_size, in_channels, seq_len)
        # Max over the sequence axis keeps the strongest response of each filter.
        pooled = [torch.relu(conv(x)).max(dim=2)[0] for conv in self.convs]
        features = torch.cat(pooled, dim=1)
        features = self.dropout(features)
        return self.fc(features)



### Initialize CNNClassifier

In [22]:
# Two-class CNN head; every other constructor argument keeps its default.
model_BERT_CNN_Classifier = CNNClassifier(num_labels=2)

### Train and save CNNClassifier

In [23]:
# The following lines are commented out because training takes a long time on CPU. The trained model is already saved in the saved_models folder.
# train_cnn_model(model_BERT_CNN_Classifier, train_dataset_cnn, epochs=50, batch_size=4)
# torch.save(model_BERT_CNN_Classifier.state_dict(), "saved_models/cnn_model.pth")

### Load CNNClassifier

In [24]:
# Restore the saved CNN weights; map_location lets the checkpoint load on a machine without a GPU.
# NOTE(review): torch.load unpickles the file — only load checkpoints from a trusted source.
model_BERT_CNN_Classifier.load_state_dict(torch.load("saved_models/cnn_model.pth", map_location=torch.device('cpu')))

<All keys matched successfully>

### Evaluate CNNClassifier on validation

In [25]:
# Evaluate the loaded CNN classifier on the validation split; the returned
# metrics are handed to plot_metrics_table for display.
val_metrics_cnn = evaluate_cnn_model(model_BERT_CNN_Classifier, val_dataset_cnn)
plot_metrics_table(val_metrics_cnn)


To copy construct from a tensor, it is recommended to use sourceTensor.clone().detach() or sourceTensor.clone().detach().requires_grad_(True), rather than torch.tensor(sourceTensor).



### Evaluate CNNClassifier on test data

In [26]:
# Same evaluation on the test set; test_metrics_cnn is reused in the final comparison cell.
test_metrics_cnn = evaluate_cnn_model(model_BERT_CNN_Classifier, test_dataset_cnn)
plot_metrics_table(test_metrics_cnn)


To copy construct from a tensor, it is recommended to use sourceTensor.clone().detach() or sourceTensor.clone().detach().requires_grad_(True), rather than torch.tensor(sourceTensor).



In [27]:
# Confusion matrix and ROC curve of the CNN classifier on the test set.
plot_confusion_matrix_and_roc_for_cnn(model_BERT_CNN_Classifier, test_dataset_cnn, model_name="CNN")


To copy construct from a tensor, it is recommended to use sourceTensor.clone().detach() or sourceTensor.clone().detach().requires_grad_(True), rather than torch.tensor(sourceTensor).



**The CNN model shows a slight improvement in recall, indicating a better ability to identify positive instances compared to previous iterations. Both the validation and test data yield similar, moderate accuracy, suggesting the model generalizes well across datasets. This consistency makes the CNN model more reliable, as it maintains stable performance without overfitting, providing a robust solution for classification.**

## Compare Results from Bert Model and CNNClassifier

In [28]:
# Compare the test-set metrics of both models; the CNN metrics are passed first, the BERT metrics second.
plot_metrics_comparison_bert_cnn(test_metrics_cnn, test_metrics_bert, title="")