# Data Exploration

This notebook is for exploring and visualizing the data.

In [30]:
import sys
sys.path.append('/home/mjnk/Study/MoE_Alzheimer_Detection')
sys.path.append('/work/21013299/Minh/MoE_Alzheimer_Detection')

In [31]:
sys.path

['/home/21013299/.conda/envs/minh_ml/lib/python311.zip',
 '/home/21013299/.conda/envs/minh_ml/lib/python3.11',
 '/home/21013299/.conda/envs/minh_ml/lib/python3.11/lib-dynload',
 '',
 '/home/21013299/.conda/envs/minh_ml/lib/python3.11/site-packages',
 '/home/mjnk/Study/MoE_Alzheimer_Detection',
 '/work/21013299/Minh/MoE_Alzheimer_Detection',
 '/home/mjnk/Study/MoE_Alzheimer_Detection',
 '/work/21013299/Minh/MoE_Alzheimer_Detection']

In [32]:
import os
import re

import pandas as pd
import numpy as np
import librosa
import matplotlib.pyplot as plt
import seaborn as sns
import pylangacq

from tqdm import tqdm
import string
punctuations = string.punctuation

from scripts.utils import *

In [33]:
# Define PATH
ADReSS2020_DATAPATH = "../data/ADReSS-IS2020-data"
ADReSS2020_TRAINPATH = os.path.join(ADReSS2020_DATAPATH, "train")
ADReSS2020_TESTPATH = os.path.join(ADReSS2020_DATAPATH, "test")

TRANSCRIPT_NAME = "transcription"

In [34]:
def get_chat_data(split):
    """
    Get the CHAT data from the ADReSS-IS2020 dataset.
    
    Parameters:
        split (str): The split to load (either "train" or "test").
        data_path (str): The path to the ADReSS-IS2020 dataset.
    
    Returns:
        DataFrame: A DataFrame containing the CHAT data and labels.
    """
    path = ADReSS2020_TRAINPATH if split == "train" else ADReSS2020_TESTPATH
    # Define the path to the transcript files.
    transcript_path = os.path.join(path, TRANSCRIPT_NAME)
    # Read the CHAT data.
    reader = pylangacq.read_chat(transcript_path)

    file_paths = reader.file_paths()
    data = []

    test_df = pd.read_csv(ADReSS2020_DATAPATH + '/2020Labels.txt', delimiter=';', skipinitialspace=True)
    test_df = test_df.drop(columns=['age', 'mmse', 'gender'], axis=1)

    # Read and merge utterances from each file.
    for file_path in file_paths:
        # Read and merge *PAR utterances.
        utterances = read_par_utterances(file_path)

        # Tokenize and merge tokens from each utterance.
        all_tokens = []
        for utt in utterances:
            tokens = tokenize_and_merge(utt)
            all_tokens.extend([token for token in tokens if token not in list(punctuations)])

        if split == 'test':
            label = test_df[test_df['ID'] == os.path.basename(file_path).split('.')[0] + ' '].Label.iloc[0]
        
        elif split == 'train':
            label = 0 if 'cc' in file_path else 1

        data.append((all_tokens, label))

    return pd.DataFrame(data, columns=['tokens', 'label'])

In [35]:
train_data = get_chat_data('train')

In [36]:
train_data

Unnamed: 0,tokens,label
0,"[tell, me, everything, that, you, see, going, ...",0
1,"[just, look, at, the, picture, and, tell, me, ...",0
2,"[okay, [+ exc], there's, a, little, boy, and, ...",0
3,"[are, you, ready, [+ exc], mhm, &um, well, the...",0
4,"[okay, there's, the, picture, okay, [+ exc], g...",0
...,...,...
103,"[well, the, boy, on, the, chair, [: stool], [*...",1
104,"[everything, that, you, see, happening, in, th...",1
105,"[picture, and, tell, me, everything, that, you...",1
106,"[okay, the, picture, tell, me, everything, +/....",1


In [37]:
import string
import re

speacial_token = []
# Check each token in the train data to see if there is any token that contains punctuations excepts the tokens like "there's", "I'm", "I've", etc.
for tokens in train_data.tokens:
    for token in tokens:
        # Define common contractions pattern
        contractions_pattern = r"(?i)(\'s|\'m|\'ve|n\'t|\'re|\'d|\'ll)"
        
        # Check if token contains punctuation but is not a contraction
        if any(p in token for p in string.punctuation):
            if not re.search(contractions_pattern, token) and token not in speacial_token:
                speacial_token.append(token)

speacial_token

['&uh',
 'an(d)',
 '[: overflowing]',
 '[* s:r]',
 '<walk with a>',
 '[//]',
 '&c',
 '&s',
 'stealin(g)',
 'o(f)',
 'takin(g)',
 '<when the>',
 '[/]',
 '[+ exc]',
 'out_of',
 'jar,',
 '+"/.',
 '+"',
 '&ou',
 '&um',
 '&=laughs',
 '&hm',
 '&k',
 '+...',
 '<what are>',
 '+/?',
 '+<',
 '&d',
 '+/.',
 '&m',
 '&shor',
 '(..)',
 '(.)',
 '<has her>',
 '<hand up>',
 '+".',
 'doin(g)',
 '+//.[+',
 'exc]',
 '+//.',
 '[>]',
 '[<]',
 '[+ gram]',
 '<many &dish or>',
 '<the cookie jar>',
 '<(i)s saying>',
 '&t',
 'of_course',
 '<the cups>',
 '&ah',
 '&i',
 '[: kind_of]',
 '<washin(g) dishes>',
 '(...)',
 '<if that>',
 '<what have>',
 '&=clears:throat',
 '&wai',
 'cups,',
 'oh,',
 '<the water>',
 '<is that>',
 '+..?',
 'no,',
 '+//?',
 'happening,',
 '(be)cause',
 '<stool is>',
 '<she looks>',
 '<the sink>',
 '&l',
 'dryin(g)',
 'splashin(g)',
 'fallin(g)',
 'gettin(g)',
 'puttin(g)',
 'somethin(g)',
 'lookin(g)',
 'spillin(g)',
 'snowin(g)',
 '&f',
 '+,',
 'grabbin(g)',
 'reachin(g)',
 'touchin(g)',


In [38]:
# Save the special tokens to a file
with open(ADReSS2020_DATAPATH + '/special_tokens.txt', 'w') as f:
    for token in speacial_token:
        f.write(token + '\n')

In [39]:
import random
def load_special_tokens(file_path):
    """Load special tokens from file"""
    with open(file_path, 'r') as f:
        return [line.strip() for line in f if line.strip()]

def add_brackets(token):
    """Add <> brackets around a token"""
    return f"<{token}>"

def augment_dataset_with_brackets(tokens_list:list, special_tokens, 
                                bracket_prob=0.1, 
                                max_brackets_per_seq=2,
                                special_token_prob=0.2):
    """
    Augment dataset by:
    1. Randomly adding <> to existing tokens
    2. Adding special tokens from file
    
    Args:
        tokens_list: List of token sequences
        special_tokens: List of special tokens from file
        bracket_prob: Probability of adding brackets to tokens in a sequence
        max_brackets_per_seq: Maximum tokens to bracket in each sequence
        special_token_prob: Probability of adding special tokens to a sequence
    """
    augmented_dataset = []
    
    for tokens in tokens_list:
        augmented_tokens = tokens.copy()
        
        # 1. Randomly add brackets to existing tokens
        if random.random() < bracket_prob:
            # Find eligible tokens (those without brackets)
            eligible_positions = [i for i, token in enumerate(tokens) 
                               if '<' not in token and '>' not in token]
            
            if eligible_positions:
                num_to_modify = random.randint(1, min(len(eligible_positions), max_brackets_per_seq))
                positions_to_modify = random.sample(eligible_positions, num_to_modify)
                
                for pos in positions_to_modify:
                    augmented_tokens[pos] = add_brackets(augmented_tokens[pos])
        
        # 2. Add special tokens with probability
        if random.random() < special_token_prob:
            # Add 1-3 random special tokens
            num_special = random.randint(1, 3)
            selected_special = random.sample(special_tokens, min(num_special, len(special_tokens)))
            augmented_tokens.insert(random.randint(0, len(augmented_tokens)), selected_special)
        
        augmented_dataset.append(augmented_tokens)
    
    return augmented_dataset

# Example usage:
special_tokens_path = '/work/21013299/Minh/MoE_Alzheimer_Detection/data/ADReSS-IS2020-data/special_tokens.txt'
special_tokens = load_special_tokens(special_tokens_path)

# Test on a single example
example_tokens = train_data.tokens[0]
augmented = augment_dataset_with_brackets(
    [example_tokens],
    special_tokens,
    bracket_prob=0.3,
    max_brackets_per_seq=2,
    special_token_prob=0.3
)

print("Original:", example_tokens)
print("Augmented:", augmented[0])

# Apply to full dataset
augmented_dataset = augment_dataset_with_brackets(
    train_data.tokens,
    special_tokens
)
augmented_dataset

Original: ['tell', 'me', 'everything', 'that', 'you', 'see', 'going', 'on', 'in', 'that', 'picture', 'everything', 'that', 'you', 'see', 'happening', 'well', "there's", 'a', 'mother', 'standing', 'there', '&uh', '&uh', 'washing', 'the', 'dishes', 'an(d)', 'the', 'sink', 'is', 'overspilling', '[: overflowing]', '[* s:r]', 'an(d)', '&uh', 'the', "window's", 'open', 'and', 'outside', 'the', 'window', "there's", 'a', '<walk with a>', '[//]', '&c', 'curved', 'walk', 'with', 'a', 'garden', 'and', 'you', 'can', 'see', 'another', '&uh', '&uh', 'building', 'there', 'looks', 'like', 'a', 'garage', 'or', 'something', 'with', 'curtains', 'and', 'the', 'grass', 'in', 'the', 'garden', 'and', 'there', 'are', 'dishes', '[//]', '&uh', '&uh', 'two', 'cups', 'and', 'a', 'saucer', 'on', 'the', 'sink', 'and', '&uh', "she's", 'getting', 'her', 'feet', 'wet', 'from', 'the', 'overflow', 'of', 'the', 'water', 'from', 'the', 'sink', 'she', 'seems', 'to', 'be', 'oblivious', 'to', 'the', 'fact', 'that', 'the', '&

[['tell',
  'me',
  'everything',
  'that',
  'you',
  'see',
  'going',
  'on',
  'in',
  'that',
  'picture',
  'everything',
  'that',
  'you',
  'see',
  'happening',
  'well',
  "there's",
  'a',
  'mother',
  'standing',
  'there',
  '&uh',
  '&uh',
  'washing',
  'the',
  'dishes',
  'an(d)',
  'the',
  'sink',
  'is',
  'overspilling',
  '[: overflowing]',
  '[* s:r]',
  'an(d)',
  '&uh',
  'the',
  "window's",
  'open',
  'and',
  'outside',
  'the',
  'window',
  "there's",
  'a',
  '<walk with a>',
  '[//]',
  '&c',
  'curved',
  'walk',
  'with',
  'a',
  'garden',
  'and',
  'you',
  'can',
  'see',
  'another',
  '&uh',
  '&uh',
  'building',
  'there',
  'looks',
  'like',
  'a',
  'garage',
  'or',
  'something',
  'with',
  'curtains',
  'and',
  'the',
  'grass',
  'in',
  'the',
  'garden',
  'and',
  'there',
  'are',
  'dishes',
  '[//]',
  '&uh',
  '&uh',
  'two',
  'cups',
  'and',
  'a',
  'saucer',
  'on',
  'the',
  'sink',
  'and',
  '&uh',
  "she's",
  'gett

In [57]:
import random

def load_special_tokens(file_path):
    """Load special tokens from file"""
    with open(file_path, 'r') as f:
        # Load all special tokens and regular tokens
        return [line.strip() for line in f if line.strip()]

def add_brackets(token):
    """Add <> brackets around a token"""
    return f"<{token}>"

def augment_dataset_with_special_tokens(tokens_list, special_tokens, 
                                      bracket_prob=0.2, 
                                      max_brackets_per_seq=2,
                                      num_special_tokens=2):
    """
    Augment dataset by:
    1. Adding special tokens from file
    2. Randomly adding <> to some existing tokens
    """
    augmented_dataset = []
    
    for tokens in tokens_list:
        augmented_tokens = tokens.copy()
        
        # 1. Add random special tokens - insert them one by one
        selected_special = random.sample(special_tokens, 
                                       k=min(num_special_tokens, len(special_tokens)))
        for special_token in selected_special:
            insert_pos = random.randint(0, len(augmented_tokens))
            augmented_tokens.insert(insert_pos, special_token)
        
        # 2. Randomly add brackets to existing tokens
        if random.random() < bracket_prob:
            eligible_positions = [i for i, token in enumerate(tokens) 
                               if '<' not in token and '>' not in token]
            
            if eligible_positions:
                num_to_modify = random.randint(1, min(len(eligible_positions), 
                                                    max_brackets_per_seq))
                positions_to_modify = random.sample(eligible_positions, num_to_modify)
                
                for pos in positions_to_modify:
                    augmented_tokens[pos] = add_brackets(augmented_tokens[pos])
        
        augmented_dataset.append(augmented_tokens)
    
    return augmented_dataset

# Example usage:
special_tokens_path = '/work/21013299/Minh/MoE_Alzheimer_Detection/data/ADReSS-IS2020-data/special_tokens.txt'
special_tokens = load_special_tokens(special_tokens_path)

# Test on a single example
example_tokens = train_data.tokens[5]
augmented = augment_dataset_with_special_tokens(
    [example_tokens],
    special_tokens,
    bracket_prob=0.3,
    max_brackets_per_seq=2,
    num_special_tokens=2
)

print("Original:", example_tokens)
print("Augmented:", augmented[0])

Original: ['okay', 'and', "here's", 'the', 'picture', '(.)', 'okay', '[+ exc]', 'mother', 'is', 'drying', 'the', 'dishes', 'but', 'the', 'water', 'is', 'going', 'out', 'over', 'the', 'sink', 'onto', 'the', 'floor', '&ah', "it's", 'a', 'pretty', 'day', 'outside', 'lots', 'of', 'flowers', '[+ gram]', 'there', 'are', 'three', 'dishes', 'left', '(.)', 'to', 'wash', '[//]', 'dry', 'I', 'guess', "she's", 'standing', 'in', 'the', 'water', 'looks', 'dangerous', '[+ gram]', 'the', 'children', '&i', 'are', 'getting', 'into', 'the', 'cookie', 'jar', 'the', "boy's", 'up', 'there', 'on', 'the', 'stool', 'and', "that's", 'almost', 'falling', 'over', "he's", 'got', 'one', 'cookie', 'in', 'his', 'hand', "he's", 'handing', 'it', 'to', 'the', 'little', 'girl', 'and', "he's", 'getting', 'another', 'one', 'out_of', 'the', 'cookie', 'jar', 'cupboard', '(..)', 'did', 'you', 'say', 'action', 'that', 'is', 'going', 'on', '[+ exc]', 'uhhuh', '&uh', 'the', 'little', 'girl', 'is', '[//]', 'kinda', '[: kind_of]',

In [59]:
# Apply to full dataset
augmented_dataset = augment_dataset_with_special_tokens(
    train_data.tokens,
    special_tokens
)

In [60]:
from transformers import ModernBertModel, ModernBertConfig

# Initializing a ModernBert style configuration
configuration = ModernBertConfig()

# Initializing a model from the modernbert-base style configuration
model = ModernBertModel(configuration)

# Accessing the model configuration
configuration = model.config
configuration

ModernBertConfig {
  "_attn_implementation_autoset": true,
  "attention_bias": false,
  "attention_dropout": 0.0,
  "bos_token_id": 50281,
  "classifier_activation": "gelu",
  "classifier_bias": false,
  "classifier_dropout": 0.0,
  "classifier_pooling": "cls",
  "cls_token_id": 50281,
  "decoder_bias": true,
  "deterministic_flash_attn": false,
  "embedding_dropout": 0.0,
  "eos_token_id": 50282,
  "global_attn_every_n_layers": 3,
  "global_rope_theta": 160000.0,
  "hidden_activation": "gelu",
  "hidden_size": 768,
  "initializer_cutoff_factor": 2.0,
  "initializer_range": 0.02,
  "intermediate_size": 1152,
  "local_attention": 128,
  "local_rope_theta": 10000.0,
  "max_position_embeddings": 8192,
  "mlp_bias": false,
  "mlp_dropout": 0.0,
  "model_type": "modernbert",
  "norm_bias": false,
  "norm_eps": 1e-05,
  "num_attention_heads": 12,
  "num_hidden_layers": 22,
  "pad_token_id": 50283,
  "reference_compile": null,
  "repad_logits_with_grad": false,
  "sep_token_id": 50282,
  "spa

In [61]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cpu')

In [62]:
from transformers import AutoTokenizer, ModernBertModel
import torch

tokenizer = AutoTokenizer.from_pretrained("answerdotai/ModernBERT-base")
model = ModernBertModel.from_pretrained("answerdotai/ModernBERT-base")

inputs = tokenizer("Hello, my dog is cute", return_tensors="pt").to(device)
outputs = model.to(device)(**inputs)

last_hidden_states = outputs.last_hidden_state
last_hidden_states

tensor([[[ 0.3147, -0.5636, -0.7742,  ..., -0.3935,  0.1778, -0.5275],
         [-0.6348, -1.3041,  0.1655,  ..., -0.4116, -1.4177,  0.8489],
         [ 0.0905,  0.0599,  0.2130,  ...,  0.1283, -0.0392,  0.0715],
         ...,
         [-0.2144, -0.9171,  0.2220,  ..., -1.3775, -0.1156,  0.1317],
         [ 0.9564, -0.9251,  0.3561,  ..., -0.4667, -0.0707, -0.2410],
         [ 0.2560, -0.0717,  0.0984,  ...,  0.0122,  0.0510, -0.1633]]],
       grad_fn=<NativeLayerNormBackward0>)

In [64]:
augmentd_tokens = augment_dataset_with_special_tokens([train_data.tokens[5]], special_tokens, bracket_prob=0.3, max_brackets_per_seq=2, num_special_tokens=2)[0]
tokenizer(" ".join(augmentd_tokens), 
          return_tensors='pt', 
          truncation=True, 
          padding='max_length', 
          max_length=300)['input_ids'].shape

torch.Size([1, 300])

In [48]:
last_hidden_states.shape

torch.Size([1, 8, 768])

In [11]:
tokenizer(train_data.tokens.apply(" ".join).to_list(), return_tensors='pt', truncation=True, padding=True, max_length=300)

{'input_ids': tensor([[50281, 38878,   479,  ..., 50283, 50283, 50283],
        [50281,  6309,  1007,  ..., 50283, 50283, 50283],
        [50281,   536,   333,  ..., 50283, 50283, 50283],
        ...,
        [50281, 22659,   285,  ..., 50283, 50283, 50283],
        [50281,   536,   333,  ..., 50283, 50283, 50283],
        [50281, 22659,  8261,  ..., 50283, 50283, 50283]]), 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]])}

In [12]:
outputs

BaseModelOutput(last_hidden_state=tensor([[[ 0.3147, -0.5636, -0.7742,  ..., -0.3935,  0.1778, -0.5275],
         [-0.6348, -1.3041,  0.1655,  ..., -0.4116, -1.4177,  0.8489],
         [ 0.0905,  0.0599,  0.2130,  ...,  0.1283, -0.0392,  0.0715],
         ...,
         [-0.2144, -0.9171,  0.2220,  ..., -1.3775, -0.1156,  0.1317],
         [ 0.9564, -0.9251,  0.3561,  ..., -0.4667, -0.0707, -0.2410],
         [ 0.2560, -0.0717,  0.0984,  ...,  0.0122,  0.0510, -0.1633]]],
       grad_fn=<NativeLayerNormBackward0>), hidden_states=None, attentions=None)

In [13]:
last_hidden_states.shape

torch.Size([1, 8, 768])

In [16]:
import torch
torch.cuda.is_available()

True