Sources

https://github.com/makcedward/nlpaug

https://github.com/dsfsi/textaugment

In [None]:
!pip install deap
!pip install fasttext
!pip install nltk
!pip install nlpaug
!pip install gensim

In [5]:
# %cd "/content/drive/MyDrive/project550-main"

/content/drive/MyDrive/project550-main


**Two types of data augmentation are applied—synonym replacement using WordNet and contextual word substitution using BERT—on both training and test sets. The augmented sentences are saved separately and their semantic similarity with the original sentences is evaluated using cosine similarity.**

In [2]:
import nlpaug.augmenter.word as naw
import nlpaug.augmenter.sentence as nas

In [3]:
import random
import numpy as np
import pandas as pd
from sklearn.manifold import TSNE

**import utils functions**

In [6]:
from utils import extract_all_sentences, clean_text
from utils_models import *


[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger_eng.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


**Extract all sentences for each patient and put into a list. all_sentences is 2D list as an output.**

In [7]:
train_cc = "ADReSS-IS2020-data/train/transcription/cc"
train_cd = "ADReSS-IS2020-data/train/transcription/cd"
test = "ADReSS-IS2020-data-test/test/transcription"
all_sentences_cc = extract_all_sentences(train_cc)
all_sentences_cd = extract_all_sentences(train_cd)
all_sentences_test = extract_all_sentences(test)

**Apply cleaning step on all_sentences both for training and testing dataset. Output is a 2D list.**

In [8]:
random.seed(42)
np.random.seed(42)
cleaned_healthy_speech = [
    [clean_text(sentence) for sentence in sentence_list]
    for sentence_list in all_sentences_cc
]

cleaned_dementia_speech = [
    [clean_text(sentence) for sentence in sentence_list]
    for sentence_list in all_sentences_cd
]

cleaned_test_speech = [
    [clean_text(sentence) for sentence in sentence_list]
    for sentence_list in all_sentences_test
]

**Combine CC and CD to make training dataset**

In [9]:
cleaned_train_speech = cleaned_healthy_speech + cleaned_dementia_speech

**1. Synonym Augmentation**

In [10]:
syn_aug = naw.SynonymAug(aug_src='wordnet')
def sysonym_augment(sentence):
    try:
        result = syn_aug.augment(sentence)
        return result if isinstance(result, list) else [result]
    except Exception as e:
        print(f"Error augmenting sentence: {e}")
        return []

In [11]:
augmented_train_speech = [sum([sysonym_augment(s) for s in group], []) for group in cleaned_train_speech]
with open("sys_augmented_sentences_train.txt", 'w', encoding='utf-8') as f:
    for sentence_group in augmented_train_speech:
        joined_sentences = ", ".join(sentence_group)
        f.write(f"[{joined_sentences}]\n")

In [12]:
augmented_test_speech = [sum([sysonym_augment(s) for s in group], []) for group in cleaned_test_speech]
with open("sys_augmented_sentences_test.txt", 'w', encoding='utf-8') as f:
    for sentence_group in augmented_test_speech:
        joined_sentences = ", ".join(sentence_group)
        f.write(f"[{joined_sentences}]\n")


**2. Contextual Augmentation**

In [13]:
cont_aug = naw.ContextualWordEmbsAug(model_path='bert-base-uncased', action="substitute")
def contextual_augment(sentence):
    try:
        result = cont_aug.augment(sentence)
        return result if isinstance(result, list) else [result]
    except Exception as e:
        print(f"Error augmenting sentence: {e}")
        return []


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

In [14]:
augmented_train_speech_cont = [sum([contextual_augment(s) for s in group], []) for group in cleaned_train_speech]
with open("cont_augmented_sentences_train.txt", 'w', encoding='utf-8') as f:
    for sentence_group in augmented_train_speech_cont:
        joined_sentences = ", ".join(sentence_group)
        f.write(f"[{joined_sentences}]\n")

In [15]:
augmented_test_speech_cont = [sum([contextual_augment(s) for s in group], []) for group in cleaned_test_speech]
with open("cont_augmented_sentences_test.txt", 'w', encoding='utf-8') as f:
    for sentence_group in augmented_test_speech_cont:
        joined_sentences = ", ".join(sentence_group)
        f.write(f"[{joined_sentences}]\n")

**Check The Similarity of Augmented and original data**

In [19]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

def cosine_sim(sentence1, sentence2):
    vectorizer = TfidfVectorizer(token_pattern=r"(?u)\b\w+\b", analyzer='char').fit_transform([sentence1, sentence2])
    cosine_sim_matrix = cosine_similarity(vectorizer[0:1], vectorizer[1:2])
    return cosine_sim_matrix[0][0]

In [20]:
flat_clean_train = [s for group in cleaned_train_speech for s in group]

with open('cont_augmented_sentences_train.txt', encoding='utf-8') as f:
    aug_train_sentences = [line.strip().strip('[]') for line in f.readlines()]
    flat_aug_train = [s.strip() for group in aug_train_sentences for s in group.split(',') if s.strip()]

for clean, aug in zip(flat_clean_train, flat_aug_train):
    sim = cosine_sim(clean, aug)
    print(f"Clean: {clean}\nAugmented: {aug}\nCosine Similarity: {sim:.4f}\n{'-'*50}")




[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Clean: plate cup gram
Augmented: could see old hedge
Cosine Similarity: 0.3447
--------------------------------------------------
Clean: outside see outside hedge
Augmented: outside cup of jar you think
Cosine Similarity: 0.6214
--------------------------------------------------
Clean: outside cup cook jar would hata
Augmented: the counter kitchen
Cosine Similarity: 0.5383
--------------------------------------------------
Clean: call counter kitchen
Augmented: curtain dance
Cosine Similarity: 0.7180
--------------------------------------------------
Clean: curtain gram
Augmented: thomas gram
Cosine Similarity: 0.5824
--------------------------------------------------
Clean: window gram
Augmented: 2007 see go 2018
Cosine Similarity: 0.1447
--------------------------------------------------
Clean: action see go etc
Augmented: okay when
Cosine Similarity: 0.4117
--------------------------------------------------
Clean: okay

**Combine the augment data and original one**

In [38]:
def combine_clean_and_augmented(clean_path, cont_augmented_path, combined_path):
    with open(clean_path, 'r', encoding='utf-8') as clean_file, \
         open(cont_augmented_path, 'r', encoding='utf-8') as augmented_file, \
         open(combined_path, 'w', encoding='utf-8') as out_file:

        for clean_line, aug_line in zip(clean_file, augmented_file):
            clean_line = clean_line.strip()
            aug_line = aug_line.strip()
            out_file.write(f"{clean_line}\n")
            out_file.write(f"{aug_line}\n")

    print(f"Combined file saved at: {combined_path}")

In [32]:
cleaned_train_path = 'clean_train.txt'

with open(cleaned_train_path, 'w', encoding='utf-8') as f:
    for sentence_group in cleaned_train_speech:
        joined_sentences = ", ".join(sentence_group)
        f.write(f"[{joined_sentences}]\n")

print(f"Cleaned train speech saved at: {cleaned_train_path}")
cleaned_test_path = 'clean_test.txt'

with open(cleaned_test_path, 'w', encoding='utf-8') as f:
    for sentence_group in cleaned_test_speech:
        joined_sentences = ", ".join(sentence_group)
        f.write(f"[{joined_sentences}]\n")

print(f"Cleaned test speech saved at: {cleaned_test_path}")


Cleaned train speech saved at: clean_train.txt
Cleaned test speech saved at: clean_test.txt


In [39]:
combine_clean_and_augmented(
    clean_path='clean_train.txt',
    cont_augmented_path='cont_augmented_sentences_train.txt',
    combined_path='combined_clean_augmented_train.txt'
)

combine_clean_and_augmented(
    clean_path='clean_test.txt',
    cont_augmented_path='cont_augmented_sentences_test.txt',
    combined_path='combined_clean_augmented_test.txt'
)

Combined file saved at: combined_clean_augmented_train.txt
Combined file saved at: combined_clean_augmented_test.txt
