In [3]:
import nlpaug.augmenter.word as naw
from rich.progress import track
import random
import tqdm
import numpy as np
import torch

In [4]:
# Report which GPU (if any) this kernel sees. Querying the current CUDA device
# raises when CUDA is unavailable, so guard the lookup and fall back to a CPU
# label; this keeps the cell runnable on CPU-only machines.
if torch.cuda.is_available():
    current_device = torch.cuda.current_device()
    device_name = torch.cuda.get_device_name(current_device)
else:
    current_device = None
    device_name = "cpu"
device_name

'NVIDIA GeForce RTX 4090 Laptop GPU'

In [6]:
# Pick the device string handed to the model-based augmenter: "cuda" when a
# CUDA device is usable, otherwise "cpu".
cuda_available = torch.cuda.is_available()
print("Is CUDA available? :", cuda_available)

deviceName = "cuda" if cuda_available else "cpu"

Is CUDA available? : True


In [2]:
# Read the collected entries back from the .npy file; the bare name on the
# last line makes the notebook display the loaded array.
loaded_list = np.load(file='tinnitus_collected_v4.npy')
loaded_list

array(["25.0 21.0 22.0 20.0<freq_info>0.38 0.28 0.2<initial_thi_score>hearing loss and dementia:anxiety, annoyance:Relax while breathing slowly:I can't sleep.:stress:Try not to think about tinnitus (ignore):I want to sleep well until morning.:::mild headache:nervous:think of good things:anxiety with dizziness:worry:Listen to music. Refresh yourself with a phone call with a friend:to sleep until morning::Listen to music to help you sleep while breathing slowly and deeply:forgetfulness. Dementia worries:unrest:Chill out with a friend on speakerphone:falling asleep again:stress:take a deep breath. to ignore:seems to get worse:mild headache feeling:to ignore feel good thoughts:try not to worry:try to think comfortably:take a deep breath. feel good thoughts:I'm afraid it will get worse:stress:Listening to music and thinking about what makes you feel good makes you feel at ease:why does it sound so loud:momentary annoyance:Take a slow, deep breath, clear your mind, and fall asleep right away

### Augmentation for algorithm 2 (Augmentation #1)

In [3]:
# Augmentation pipeline #1 (algorithm 2). The dict's insertion order matters:
# continuous_augumentation chains the augmenters in the order they appear here.
augmentations = {
    "synonym_replace1": naw.SynonymAug(aug_src='wordnet', aug_min=10, aug_max=20),
    "random_substitute": naw.ContextualWordEmbsAug(
        model_path="distilbert-base-uncased",
        device=deviceName,
        action="substitute",
        aug_p=0.5,
        top_k=10,
    ),
    "synonym_replace2": naw.SynonymAug(aug_src='wordnet', aug_min=5, aug_max=10),
}

[nltk_data] Downloading package wordnet to
[nltk_data]     /home/eaglewatch/nltk_data...
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /home/eaglewatch/nltk_data...


### Augmentation for algorithm 3 (Augmentation #2)

In [4]:
# augmentations = {}
# augmentations["synonym_replace1"] = naw.SynonymAug(aug_src='wordnet', aug_min=10, aug_max=20)
# augmentations["random_substitute"] = naw.ContextualWordEmbsAug(model_path="roberta-base", device=deviceName, action="substitute", aug_p=0.9, top_k=20)
# augmentations["synonym_replace2"] = naw.SynonymAug(aug_src='wordnet', aug_min=5, aug_max=10)
# augmentations["random_swap"] = naw.RandomWordAug(action="swap")
# augmentations["synonym_replace3"] = naw.SynonymAug(aug_src='wordnet', aug_min=10, aug_max=10)

### Augmentation for algorithm 4 (Augmentation #3)

In [5]:
# augmentations = {}
# augmentations["random_swap"] = naw.RandomWordAug(action="swap")
# augmentations["random_substitute"] = naw.ContextualWordEmbsAug(model_path="roberta-base", device=deviceName, action="substitute", aug_p=0.7, top_k=30)
# augmentations["synonym_replace2"] = naw.SynonymAug(aug_src='wordnet', aug_min=5, aug_max=10)
# augmentations["synonym_replace1"] = naw.SynonymAug(aug_src='wordnet', aug_min=10, aug_max=20)

In [6]:
# Display the active augmenter mapping; its key order is the order in which
# the augmenters are chained.
# NOTE(review): the saved output of this cell lists a `random_swap` pipeline,
# which does not match the uncommented configuration cell above - re-run the
# notebook from a fresh kernel before trusting it.
augmentations

{'random_swap': <nlpaug.augmenter.word.random.RandomWordAug at 0x7f1d74480f10>,
 'random_substitute': <nlpaug.augmenter.word.context_word_embs.ContextualWordEmbsAug at 0x7f1d744edeb0>,
 'synonym_replace2': <nlpaug.augmenter.word.synonym.SynonymAug at 0x7f1d74480ee0>,
 'synonym_replace1': <nlpaug.augmenter.word.synonym.SynonymAug at 0x7f1d74480c40>}

### continuous_augumentation for algorithm 2 and 3 (augment 1 and 2)

In [7]:
def continuous_augumentation(augmentations, original_text):
    """Run ``original_text`` through the augmenters and return the new text.

    Note: a later cell in this notebook defines a function with the same name
    (adding an ``augmentation_set`` parameter); whichever cell ran last is the
    definition in effect.

    Args:
        augmentations: Mapping of name -> augmenter. Every augmenter must
            expose ``augment(text)``. The mapping's iteration order is the
            order in which the augmenters are chained, and it must contain
            the key ``"synonym_replace1"`` (used for single-word input).
        original_text: The text to augment.

    Returns:
        The augmented string. Single-word input is passed through
        ``synonym_replace1`` only; longer input is passed through every
        augmenter in turn. Input containing no words is returned unchanged.

    Raises:
        KeyError: If the input is a single word and ``augmentations`` has no
            ``"synonym_replace1"`` entry.
    """

    def _apply(augmenter, text):
        # Recent nlpaug releases are expected to return a list of strings
        # (older ones a bare string for a single input) - TODO confirm against
        # the installed version. An empty or missing result would make a blind
        # ``[0]`` raise IndexError, so keep the incoming text in that case.
        result = augmenter.augment(text)
        if isinstance(result, str):
            candidate = result
        elif result is not None and len(result) > 0:
            candidate = result[0]
        else:
            candidate = ""
        return candidate or text

    # split() without an argument ignores leading/trailing/repeated whitespace,
    # so "stress " still counts as a single word.
    word_count = len(original_text.split())
    if word_count == 0:
        return original_text
    if word_count == 1:
        return _apply(augmentations["synonym_replace1"], original_text)

    text = original_text
    for augmenter in augmentations.values():
        text = _apply(augmenter, text)
    return text

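### continuous_augumentation for algorithm 4 (augment 3)

Running the cell below redefines `continuous_augumentation` from the previous section. With the default `augmentation_set=1` it chains every augmenter, as the earlier version does; pass `augmentation_set=3` to leave `synonym_replace1` out of the chain for multi-word text.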

In [27]:
def continuous_augumentation(augmentations, original_text, augmentation_set=1):
    """Run ``original_text`` through the augmenters and return the new text.

    Args:
        augmentations: Mapping of name -> augmenter. Every augmenter must
            expose ``augment(text)``. The mapping's iteration order is the
            order in which the augmenters are chained, and it must contain
            the key ``"synonym_replace1"`` (used for single-word input).
        original_text: The text to augment.
        augmentation_set: When equal to 3, the ``"synonym_replace1"``
            augmenter is left out of the chain for multi-word input. Any
            other value chains every augmenter.

    Returns:
        The augmented string. Single-word input is passed through
        ``synonym_replace1`` only (regardless of ``augmentation_set``); longer
        input is passed through the chain. Input containing no words is
        returned unchanged.

    Raises:
        KeyError: If the input is a single word and ``augmentations`` has no
            ``"synonym_replace1"`` entry.
    """

    def _apply(augmenter, text):
        # Recent nlpaug releases are expected to return a list of strings
        # (older ones a bare string for a single input) - TODO confirm against
        # the installed version. An empty or missing result would make a blind
        # ``[0]`` raise IndexError, so keep the incoming text in that case.
        result = augmenter.augment(text)
        if isinstance(result, str):
            candidate = result
        elif result is not None and len(result) > 0:
            candidate = result[0]
        else:
            candidate = ""
        return candidate or text

    # split() without an argument ignores leading/trailing/repeated whitespace,
    # so "stress " still counts as a single word.
    word_count = len(original_text.split())
    if word_count == 0:
        return original_text
    if word_count == 1:
        return _apply(augmentations["synonym_replace1"], original_text)

    text = original_text
    for name, augmenter in augmentations.items():
        if augmentation_set == 3 and name == "synonym_replace1":
            continue
        text = _apply(augmenter, text)
    return text

In [28]:
# Short multi-word sample used to try out the augmentation chain by hand.
original_text = "sample dataset augmentation example"

In [29]:
# Run the sample through the chain using the function's default arguments.
# The result presumably differs from run to run (no random seed is set in this
# notebook) - verify before relying on the saved output.
continuous_augumentation(augmentations, original_text)

'sample verbal description of.'

### Bootstrap sampling with error

In [30]:
def bootstrap_sample_with_error(value, error_rate, error_percentage, total_samples, rng=None):
    """Return the mean of ``total_samples`` randomly perturbed copies of ``value``.

    Each copy is, with probability ``error_rate``, shifted by a uniform random
    amount of at most ``error_percentage`` percent of ``value`` in either
    direction; otherwise it is ``value`` itself.

    Args:
        value: The number to perturb.
        error_rate: Probability (compared against a draw from [0, 1)) that a
            given copy is perturbed.
        error_percentage: Maximum size of the perturbation, as a percentage
            of ``value``.
        total_samples: Number of copies to average; must be positive.
        rng: Optional object providing ``random()`` and ``uniform(a, b)``
            (e.g. ``random.Random(seed)``) for reproducible draws. Defaults
            to the global ``random`` module state.

    Returns:
        The arithmetic mean of the generated copies.

    Raises:
        ValueError: If ``total_samples`` is not positive (the mean would
            otherwise fail with an opaque ZeroDivisionError).
    """
    if total_samples <= 0:
        raise ValueError(f"total_samples must be positive, got {total_samples}")

    source = random if rng is None else rng
    samples = []
    for _ in range(total_samples):
        # The draw order (random() first, uniform() only when perturbing) is
        # kept so a given seed always produces the same sequence of samples.
        if source.random() < error_rate:
            error_amount = value * (error_percentage / 100)
            samples.append(value + source.uniform(-error_amount, error_amount))
        else:
            samples.append(value)
    return sum(samples) / len(samples)

In [31]:
def get_string_data_boostrapping_sampling(data_string, error_rate=0.3, error_percentage=20, total_samples=5):
    """Perturb every number in a whitespace-separated string of numbers.

    Example input: ``"25.0 21.0 22.0 20.0"``.

    Args:
        data_string: Numbers separated by whitespace.
        error_rate: Forwarded to ``bootstrap_sample_with_error``.
        error_percentage: Forwarded to ``bootstrap_sample_with_error``.
        total_samples: Forwarded to ``bootstrap_sample_with_error``.

    Returns:
        The perturbed numbers, each formatted with two decimals and joined by
        single spaces, in the same order as the input.
    """
    # Convert everything up front so a malformed token fails before any
    # random numbers are drawn.
    values = list(map(float, data_string.split()))

    formatted = []
    for number in values:
        mean_value = bootstrap_sample_with_error(number, error_rate, error_percentage, total_samples)
        formatted.append(f"{mean_value:.2f}")
    return " ".join(formatted)

In [32]:
# Show the first raw entry to inspect the record layout before parsing it.
loaded_list[0]

"25.0 21.0 22.0 20.0<freq_info>0.38 0.28 0.2<initial_thi_score>hearing loss and dementia:anxiety, annoyance:Relax while breathing slowly:I can't sleep.:stress:Try not to think about tinnitus (ignore):I want to sleep well until morning.:::mild headache:nervous:think of good things:anxiety with dizziness:worry:Listen to music. Refresh yourself with a phone call with a friend:to sleep until morning::Listen to music to help you sleep while breathing slowly and deeply:forgetfulness. Dementia worries:unrest:Chill out with a friend on speakerphone:falling asleep again:stress:take a deep breath. to ignore:seems to get worse:mild headache feeling:to ignore feel good thoughts:try not to worry:try to think comfortably:take a deep breath. feel good thoughts:I'm afraid it will get worse:stress:Listening to music and thinking about what makes you feel good makes you feel at ease:why does it sound so loud:momentary annoyance:Take a slow, deep breath, clear your mind, and fall asleep right away if you

In [33]:
# Dry run on the first entry: pull out the diary section (the text between the
# "<initial_thi_score>" and "<diaries>" markers), augment each ":"-separated
# segment, and stitch the segments back together.
each_entry = loaded_list[0]
diaries = (
    each_entry
    .split("<initial_thi_score>")[1]
    .split("<diaries>")[0]
)
diaries_splits = diaries.split(":")

# Empty segments stay empty so the number of ":"-separated slots is unchanged.
augument_diaries = [
    continuous_augumentation(augmentations, segment) if len(segment) > 0 else ""
    for segment in diaries_splits
]
updated_diaries = ":".join(augument_diaries)

updated_diaries

"release and aid Edit:, cyber onslaught:While get slow Edit:Give notice expect. . south.:accent:Essay them oregon incommode twice find above them:Want developer constitute gravel lay off presently.:::worry picture:flighty:Of come everything unitedly:with capture disorder:headache:To get laid firstly. More | Get another penis for beaver state only a:until game keep on:::To do while on the improve sleep but slowly non rather by nature:. Mind syndrome Edit:ferment:Pall until the insensate bite trash 🙂:Asleep … hard:tension:cryptic towards the 2d. Misplace information technology:to The obvious after:mild non tough:ignore Top ten interception proportion:non only if the verruca:to reprize thus far over again:a electrocution off. Repetition as fume disappearance:' m a need peerless choke no worse:accent:To fathom all bad astir what else would you be bad makes you more times bad:information technology seem we appear super oil production:infliction element:Select this deep, steady breath, relax

### Algorithm 2, 3, and 4

In [None]:
# --- Generation settings (edit these, then re-run the cell) -------------------
# Number of source entries to expand; the section headings below assume 42.
num_entries = 42

# Augmented copies generated per source entry:
#   train: repeat = 1000
#   valid: repeat = 250
#   test : repeat = 50 and add the original to test at the end
repeat = 1000

# Noise applied to the numeric fields:
#   Algorithm 2 and 3: e_rate = 0.3, e_pct = 40
#   Algorithm 4      : e_rate = 0.5, e_pct = 50
e_rate = 0.3
e_pct = 40

# NOTE(review): continuous_augumentation is called with its default arguments
# below, so an `augmentation_set=3` variant is never selected from this loop -
# confirm whether algorithm 4 is supposed to pass it.

augumented_list = []
for k in range(num_entries):
    each_entry = loaded_list[k]

    # Parse the entry once per entry rather than once per repeat: none of
    # these fields change between repeats. Each marker follows its field:
    #   <freq><freq_info><initial THI><initial_thi_score><diaries><diaries><final THI><final_thi_score>
    freq_info = each_entry.split("<freq_info>")[0]
    initial_thi_score = each_entry.split("<initial_thi_score>")[0].split("<freq_info>")[1]
    after_initial_thi = each_entry.split("<initial_thi_score>")[1]
    diaries = after_initial_thi.split("<diaries>")[0]
    final_thi_score = after_initial_thi.split("<diaries>")[1].replace("<final_thi_score>", "")
    diaries_splits = diaries.split(":")

    for each_repeat in tqdm.tqdm(range(repeat), desc=f"entry {k + 1}/{num_entries}"):
        # Random draws happen in a fixed order (frequencies, initial THI,
        # final THI, then diaries) so the sequence is stable under a seed.
        updated_freq_info = get_string_data_boostrapping_sampling(freq_info, error_rate=e_rate, error_percentage=e_pct, total_samples=5)
        updated_initial_thi_score = get_string_data_boostrapping_sampling(initial_thi_score, error_rate=e_rate, error_percentage=e_pct, total_samples=5)
        updated_final_thi_score = get_string_data_boostrapping_sampling(final_thi_score, error_rate=e_rate, error_percentage=e_pct, total_samples=5)

        # Empty diary segments stay empty so the ":" layout keeps its slots.
        augument_diaries = [
            continuous_augumentation(augmentations, segment) if len(segment) > 0 else ""
            for segment in diaries_splits
        ]
        updated_diaries = ":".join(augument_diaries)

        # Re-assemble the record in the same tagged layout as the input.
        augumented_list.append(
            f"{updated_freq_info}<freq_info>"
            f"{updated_initial_thi_score}<initial_thi_score>"
            f"{updated_diaries}<diaries>"
            f"{updated_final_thi_score}<final_thi_score>"
        )

### For training (42 x 1000 = 42k)

In [None]:
# Save the training split. Every save cell in this notebook writes the same
# `augumented_list`, so a plain "Run All" would store identical data under all
# three file names. Only write when the list has this split's size
# (42 entries x 1000 repeats); otherwise skip and say why.
expected_count = 42 * 1000
if len(augumented_list) == expected_count:
    np.save('augumented_42k.npy', augumented_list)
else:
    print(f"Skipped 'augumented_42k.npy': expected {expected_count} items, got {len(augumented_list)}")

### For Validation (42 x 250 = 10,500 ≈ 10k)

In [None]:
# Save the validation split. Every save cell in this notebook writes the same
# `augumented_list`, so a plain "Run All" would store identical data under all
# three file names. Only write when the list has this split's size
# (42 entries x 250 repeats); otherwise skip and say why.
expected_count = 42 * 250
if len(augumented_list) == expected_count:
    np.save('augumented_10k.npy', augumented_list)
else:
    print(f"Skipped 'augumented_10k.npy': expected {expected_count} items, got {len(augumented_list)}")

### For Testing (42 x 50 = 2,100 ≈ 2k)

In [None]:
# Academically, it should be done this way (original entries added to the test set):
#original_list = loaded_list.tolist()

#total_list = original_list + augumented_list

#np.save('augumented_2k.npy', total_list)

# Save the test split. Every save cell in this notebook writes the same
# `augumented_list`, so a plain "Run All" would store identical data under all
# three file names. Only write when the list has this split's size
# (42 entries x 50 repeats); otherwise skip and say why.
expected_count = 42 * 50
if len(augumented_list) == expected_count:
    np.save('augumented_2k.npy', augumented_list)
else:
    print(f"Skipped 'augumented_2k.npy': expected {expected_count} items, got {len(augumented_list)}")