# Data Exploration

This notebook is for exploring and visualizing the data.

In [232]:
import os
import re

import pandas as pd
import numpy as np
import librosa
import matplotlib.pyplot as plt
import seaborn as sns
import pylangacq

from tqdm import tqdm
import string
punctuations = string.punctuation

In [233]:
# Define PATH
ADReSS2020_DATAPATH = "../data/ADReSS-IS2020-data"
ADReSS2020_TRAINPATH = os.path.join(ADReSS2020_DATAPATH, "train")
ADReSS2020_TESTPATH = os.path.join(ADReSS2020_DATAPATH, "test")

TRANSCRIPT_NAME = "transcription"

In [327]:
def read_par_utterances(file_path):
    """
    Read a CHAT file and return a list of merged *PAR (and *INV) utterances.
    This function merges continuation lines and removes trailing time codes.
    """
    utterances = []
    current_utterance = None

    with open(file_path, 'r', encoding="utf-8") as f:
        for line in f:
            line = line.rstrip("\n")
            # New utterance: lines starting with *PAR: or *INV:
            if line.startswith("*PAR:") or line.startswith("*INV:"):
                # If an utterance is in progress, finish and store it.
                if current_utterance is not None:
                    # Remove any trailing time code (text between two  symbols)
                    current_utterance = re.sub(r'.*?', '', current_utterance).strip()
                    utterances.append(current_utterance)
                # Start a new utterance (remove the marker)
                if line.startswith("*PAR:"):
                    current_utterance = line[len("*PAR:"):].strip()
                else:
                    current_utterance = line[len("*INV:"):].strip()
            # Continuation lines (indented or containing a time code marker) are appended.
            elif current_utterance is not None and ('' in line):
                current_utterance += " " + line.strip()
            # Otherwise, ignore the line.
    
    # Append the final utterance if one is in progress.
    if current_utterance:
        current_utterance = re.sub(r'.*?', '', current_utterance).strip()
        utterances.append(current_utterance)
    
    return utterances

def is_retracing(token):
    """
    Determine if a token is a retracing token that should be merged with the previous token.
    
    Returns True for tokens matching patterns like:
      - o(f)
      - fallin(g)
      - an(d)
      - stealin(g)
    (case-insensitive)
    
    But returns False for tokens such as (.), (..), or (...).
    """
    pattern = re.compile(r'^(an|o|stealin|takin)\([^)]*\)$', re.IGNORECASE)
    if pattern.match(token):
        return True
    return False

def merge_annotation_tokens(tokens, start_index):
    """
    Merge tokens that are part of an annotation enclosed in brackets.
    This function supports both square-bracket annotations (e.g., "[+ exc]") and
    angle-bracket annotations (e.g., "<walk with a>").
    
    Returns a tuple of (merged_token, next_index).
    """
    token = tokens[start_index]
    if token.startswith('['):
        closing = ']'
    elif token.startswith('<'):
        closing = '>'
    else:
        return token, start_index + 1

    merged = token
    i = start_index
    # If the token already ends with the closing bracket, return it.
    if merged.endswith(closing):
        return merged, i + 1
    i += 1
    # Merge subsequent tokens until we find one that ends with the closing bracket.
    while i < len(tokens) and not tokens[i].endswith(closing):
        merged += " " + tokens[i]
        i += 1
    if i < len(tokens):
        merged += " " + tokens[i]
        i += 1
    return merged, i

def tokenize_and_merge(utterance, linking_token="{<miss_spell_token>}"):
    """
    Tokenize an utterance into tokens with the following custom behavior:
    
      - If a token is immediately followed by a retracing token 
        (e.g., 'o(f)', 'fallin(g)', 'an(d)', 'stealin(g)'), merge them into a single token 
        by concatenating with the linking_token.
      - Merge bracketed annotations so that tokens like "[+ exc]" or "[: overflowing]" 
        and angle-bracket annotations like "<walk with a>" remain intact.
    
    Returns a list of tokens.
    """
    tokens = utterance.split()
    merged_tokens = []
    i = 0
    while i < len(tokens):
        token = tokens[i]
        
        # If the token begins with '[' or '<' but does not end with the corresponding closing bracket,
        # merge the entire annotation.
        if (token.startswith('[') and not token.endswith(']')) or (token.startswith('<') and not token.endswith('>')):
            merged_token, i = merge_annotation_tokens(tokens, i)
            merged_tokens.append(merged_token)
            continue

        # Check if the next token is a retracing token.
        # print(tokens[i+1]) if i + 1 < len(tokens) else None
        if i + 1 < len(tokens) and is_retracing(str(tokens[i+1])):
            print(tokens[i+1])
            # Merge current token with the retracing token into a single token.
            merged_tokens.append(token)
            merged_tokens.append(linking_token + tokens[i+1])
            i += 2
            continue
        
        merged_tokens.append(token)
        i += 1
        
    return merged_tokens

In [329]:
# Read the 
transcript_path = os.path.join(ADReSS2020_TRAINPATH, TRANSCRIPT_NAME)
reader = pylangacq.read_chat(transcript_path)

# Replace with your actual CHAT file path.
file_path = reader.file_paths()[80]
print(file_path)

# Read and merge *PAR utterances.
utterances = read_par_utterances(file_path)

# Tokenize and merge tokens from each utterance.
all_tokens = []
for utt in utterances:
    tokens = tokenize_and_merge(utt)
    all_tokens.extend([token for token in tokens if token not in list(punctuations)])

print(all_tokens)

../data/ADReSS-IS2020-data/train/transcription/cd/S118.cha
['okay', "let's", 'try', 'something', 'different', 'okay', 'tell', 'me', 'what', 'you', 'see', 'in', 'that', 'picture', 'oh', "there's", 'a', 'cookie', 'jar', 'and', 'a', 'youngster', 'with', 'a', "<I don't know what he got>", '[//]', 'cookie', 'jar', 'and', 'the', 'boy', 'has', 'a', 'shirt', 'with', 'a', 'cookie', '&j', 'jar', 'too', 'I', 'guess', 'the', 'girl', 'has', 'one', 'too', 'a', 'jar', 'an(d)', "that's", 'the', 'stool', 'and', 'this', 'is', '&uh', 'with', 'a', 'kitchen', 'thing', '&k', 'dishes', '[+ gram]', 'and', 'this', 'is', 'water', 'somebody', 'spilled', '(.)', 'this', 'is', 'some', 'more', 'junk', '[+ es]', 'what', 'do', 'you', 'call', 'this', '[+ exc]', 'I', "don't", 'know', '[+ exc]', 'I', 'guess', "it's", 'another', 'for', 'the', '(.)', '&=sighs', '+...', 'I', "don't", 'know', "what's", 'hɑɹ@u', '[: x@n]', '[* n:uk]', 'in', 'there', '[+ jar]', '[+ exc]', 'the', 'tent@u', '[: x@n]', '[* n:uk]', 'very', 'intere

In [331]:
import re

# Build a pattern that matches our specific tokens.
pattern = re.compile(r'^(an|o|stealin|takin)\([^)]*\)$', re.IGNORECASE)

# Test tokens
tokens = ['an(d)', 'o(f)', 'an(d)', 'stealin(g)', 'takin(g)', 'o(.)', 'an(abc)', 'something_else', '(...)']

for token in all_tokens:
    if pattern.match(token):
        print(f"{token}: MATCH")
    else:
        print(f"{token}: NO MATCH")


okay: NO MATCH
let's: NO MATCH
try: NO MATCH
something: NO MATCH
different: NO MATCH
okay: NO MATCH
tell: NO MATCH
me: NO MATCH
what: NO MATCH
you: NO MATCH
see: NO MATCH
in: NO MATCH
that: NO MATCH
picture: NO MATCH
oh: NO MATCH
there's: NO MATCH
a: NO MATCH
cookie: NO MATCH
jar: NO MATCH
and: NO MATCH
a: NO MATCH
youngster: NO MATCH
with: NO MATCH
a: NO MATCH
<I don't know what he got>: NO MATCH
[//]: NO MATCH
cookie: NO MATCH
jar: NO MATCH
and: NO MATCH
the: NO MATCH
boy: NO MATCH
has: NO MATCH
a: NO MATCH
shirt: NO MATCH
with: NO MATCH
a: NO MATCH
cookie: NO MATCH
&j: NO MATCH
jar: NO MATCH
too: NO MATCH
I: NO MATCH
guess: NO MATCH
the: NO MATCH
girl: NO MATCH
has: NO MATCH
one: NO MATCH
too: NO MATCH
a: NO MATCH
jar: NO MATCH
an(d): MATCH
that's: NO MATCH
the: NO MATCH
stool: NO MATCH
and: NO MATCH
this: NO MATCH
is: NO MATCH
&uh: NO MATCH
with: NO MATCH
a: NO MATCH
kitchen: NO MATCH
thing: NO MATCH
&k: NO MATCH
dishes: NO MATCH
[+ gram]: NO MATCH
and: NO MATCH
this: NO MATCH
is: 