# Data Exploration

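This notebook loads the ADReSS-IS2020 CHAT transcripts into a DataFrame of tokens and labels, then explores how to tokenize and embed the transcripts with ModernBERT.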

In [2]:
import sys
from pathlib import Path

# Make the project root importable (needed for `scripts.utils`) without
# hardcoding a machine-specific absolute path. The notebook reads its data
# from "../data", so the project root is taken to be the parent of the
# working directory.
PROJECT_ROOT = Path.cwd().resolve().parent
if str(PROJECT_ROOT) not in sys.path:  # keep the cell idempotent on re-run
    sys.path.append(str(PROJECT_ROOT))

In [3]:
import os
import re

import pandas as pd
import numpy as np
import librosa
import matplotlib.pyplot as plt
import seaborn as sns
import pylangacq

from tqdm import tqdm
import string
punctuations = string.punctuation

from scripts.utils import *

In [4]:
# Dataset locations, relative to the notebook's working directory.
ADReSS2020_DATAPATH = "../data/ADReSS-IS2020-data"

# The train and test splits live in sibling sub-directories of the dataset root.
ADReSS2020_TRAINPATH, ADReSS2020_TESTPATH = (
    os.path.join(ADReSS2020_DATAPATH, split_dir) for split_dir in ("train", "test")
)

# Name of the sub-directory (inside each split) that holds the transcripts.
TRANSCRIPT_NAME = "transcription"

In [5]:
def get_chat_data(split):
    """
    Load the *PAR tokens and labels for one split of the ADReSS-IS2020 dataset.

    Parameters:
        split (str): The split to load (either "train" or "test").

    Returns:
        DataFrame: One row per transcript file, with columns 'tokens'
            (the merged token list with punctuation tokens removed) and
            'label'.

    Raises:
        ValueError: If `split` is neither "train" nor "test".
        KeyError: If a test transcript has no entry in 2020Labels.txt.
    """
    # Fail fast: an unknown split used to fall through to the test directory
    # and then crash on an unbound `label`.
    if split not in ("train", "test"):
        raise ValueError(f'split must be "train" or "test", got {split!r}')

    path = ADReSS2020_TRAINPATH if split == "train" else ADReSS2020_TESTPATH
    # Define the path to the transcript files.
    transcript_path = os.path.join(path, TRANSCRIPT_NAME)
    # Read the CHAT data.
    reader = pylangacq.read_chat(transcript_path)

    # Test labels come from a separate ';'-delimited file; only read it when needed.
    label_by_id = {}
    if split == "test":
        labels_df = pd.read_csv(
            os.path.join(ADReSS2020_DATAPATH, '2020Labels.txt'),
            delimiter=';',
            skipinitialspace=True,
        )
        # IDs in the file carry trailing padding; strip it so the lookup does
        # not depend on an exact number of spaces.
        stripped_ids = labels_df['ID'].astype(str).str.strip()
        for file_id, file_label in zip(stripped_ids, labels_df['Label']):
            # First occurrence wins, matching the previous `.iloc[0]` lookup.
            label_by_id.setdefault(file_id, file_label)

    # Built once instead of once per token; a list keeps the original
    # equality-based membership semantics.
    punctuation_tokens = list(punctuations)

    data = []
    # Read and merge utterances from each file.
    for file_path in reader.file_paths():
        # Read and merge *PAR utterances.
        utterances = read_par_utterances(file_path)

        # Tokenize each utterance and keep every non-punctuation token.
        all_tokens = [
            token
            for utt in utterances
            for token in tokenize_and_merge(utt)
            if token not in punctuation_tokens
        ]

        if split == "test":
            file_id = os.path.basename(file_path).split('.')[0]
            if file_id not in label_by_id:
                raise KeyError(f"No label for test transcript {file_id!r} in 2020Labels.txt")
            label = label_by_id[file_id]
        else:
            # NOTE(review): substring match on the whole path; presumably the
            # control transcripts sit in a 'cc' directory -- confirm no other
            # path component contains 'cc'.
            label = 0 if 'cc' in file_path else 1

        data.append((all_tokens, label))

    return pd.DataFrame(data, columns=['tokens', 'label'])

In [6]:
train_data = get_chat_data('train')

In [7]:
from transformers import ModernBertConfig, ModernBertModel

# Build a ModernBERT model directly from the library's default configuration.
model = ModernBertModel(ModernBertConfig())

# Show the configuration the model carries.
configuration = model.config
configuration

  from .autonotebook import tqdm as notebook_tqdm


ModernBertConfig {
  "_attn_implementation_autoset": true,
  "attention_bias": false,
  "attention_dropout": 0.0,
  "bos_token_id": 50281,
  "classifier_activation": "gelu",
  "classifier_bias": false,
  "classifier_dropout": 0.0,
  "classifier_pooling": "cls",
  "cls_token_id": 50281,
  "decoder_bias": true,
  "deterministic_flash_attn": false,
  "embedding_dropout": 0.0,
  "eos_token_id": 50282,
  "global_attn_every_n_layers": 3,
  "global_rope_theta": 160000.0,
  "hidden_activation": "gelu",
  "hidden_size": 768,
  "initializer_cutoff_factor": 2.0,
  "initializer_range": 0.02,
  "intermediate_size": 1152,
  "local_attention": 128,
  "local_rope_theta": 10000.0,
  "max_position_embeddings": 8192,
  "mlp_bias": false,
  "mlp_dropout": 0.0,
  "model_type": "modernbert",
  "norm_bias": false,
  "norm_eps": 1e-05,
  "num_attention_heads": 12,
  "num_hidden_layers": 22,
  "pad_token_id": 50283,
  "reference_compile": null,
  "repad_logits_with_grad": false,
  "sep_token_id": 50282,
  "spa

In [8]:
# torch is first imported in a later cell, so import it here as well;
# otherwise this cell raises NameError on a fresh kernel (Restart & Run All).
import torch

# Use the GPU when PyTorch can see one; otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cuda')

In [9]:
from transformers import AutoTokenizer, ModernBertModel
import torch

# Load the pretrained tokenizer and encoder from the same checkpoint.
checkpoint = "answerdotai/ModernBERT-base"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = ModernBertModel.from_pretrained(checkpoint)

# Encode one example sentence and place the tensors on `device`.
inputs = tokenizer("Hello, my dog is cute", return_tensors="pt").to(device)

# Move the model to the same device, then run the forward pass.
model.to(device)
outputs = model(**inputs)

# Hidden states of the final layer for the example sentence.
last_hidden_states = outputs.last_hidden_state
last_hidden_states

tensor([[[ 0.3147, -0.5636, -0.7742,  ..., -0.3935,  0.1778, -0.5275],
         [-0.6348, -1.3041,  0.1655,  ..., -0.4116, -1.4177,  0.8489],
         [ 0.0905,  0.0599,  0.2130,  ...,  0.1283, -0.0392,  0.0715],
         ...,
         [-0.2144, -0.9171,  0.2220,  ..., -1.3775, -0.1156,  0.1317],
         [ 0.9564, -0.9251,  0.3561,  ..., -0.4667, -0.0707, -0.2410],
         [ 0.2560, -0.0717,  0.0984,  ...,  0.0122,  0.0510, -0.1633]]],
       device='cuda:0', grad_fn=<NativeLayerNormBackward0>)

In [9]:
# Join the tokens of one training transcript back into a single string.
sample_text = " ".join(train_data.iloc[5]['tokens'])

# Tokenize with truncation and padding to a fixed length of 300.
sample_encoding = tokenizer(
    sample_text,
    return_tensors='pt',
    truncation=True,
    padding='max_length',
    max_length=300,
)
sample_encoding['input_ids'].shape

torch.Size([1, 300])

In [10]:
# Shape of the final-layer hidden states for the single example sentence
# (presumably batch x sequence length x hidden size -- confirm against the model docs).
last_hidden_states.shape

torch.Size([1, 8, 768])

In [11]:
# Rebuild each transcript as one space-separated string.
transcript_texts = [" ".join(token_list) for token_list in train_data.tokens]

# Tokenize the whole training set as one batch, truncating at 300 tokens
# and padding to the longest sequence in the batch.
tokenizer(
    transcript_texts,
    return_tensors='pt',
    truncation=True,
    padding=True,
    max_length=300,
)

{'input_ids': tensor([[50281, 38878,   479,  ..., 50283, 50283, 50283],
        [50281,  6309,  1007,  ..., 50283, 50283, 50283],
        [50281,   536,   333,  ..., 50283, 50283, 50283],
        ...,
        [50281, 22659,   285,  ..., 50283, 50283, 50283],
        [50281,   536,   333,  ..., 50283, 50283, 50283],
        [50281, 22659,  8261,  ..., 50283, 50283, 50283]]), 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]])}

In [12]:
# Display the full model output object from the example-sentence forward pass.
outputs

BaseModelOutput(last_hidden_state=tensor([[[ 0.3147, -0.5636, -0.7742,  ..., -0.3935,  0.1778, -0.5275],
         [-0.6348, -1.3041,  0.1655,  ..., -0.4116, -1.4177,  0.8489],
         [ 0.0905,  0.0599,  0.2130,  ...,  0.1283, -0.0392,  0.0715],
         ...,
         [-0.2144, -0.9171,  0.2220,  ..., -1.3775, -0.1156,  0.1317],
         [ 0.9564, -0.9251,  0.3561,  ..., -0.4667, -0.0707, -0.2410],
         [ 0.2560, -0.0717,  0.0984,  ...,  0.0122,  0.0510, -0.1633]]],
       grad_fn=<NativeLayerNormBackward0>), hidden_states=None, attentions=None)

In [13]:
# Re-check the shape of the example sentence's final-layer hidden states.
last_hidden_states.shape

torch.Size([1, 8, 768])

In [16]:
import torch
# Sanity check: report whether PyTorch can see a CUDA device.
torch.cuda.is_available()

True