# Génération d'un dataset augmenté
Le but de ce notebook est d'augmenter le dataset donné afin de tester la résistance du modèle à ces différentes augmentations.
### Imports

In [1]:
import json
import requests
import random
import numpy as np
import tqdm

#NLPAUG IMPORTS
import nlpaug.augmenter.char as char_aug
import nlpaug.augmenter.word as naw

#SKLEARN IMPORTS
import sklearn.model_selection as skms

### Chargement du dataset

In [2]:
#LOAD DATASET
# Read the raw dataset from disk. The encoding is pinned to UTF-8 so that the
# text is decoded the same way on every platform instead of depending on the
# locale's default encoding.
# Later cells iterate over `dataset` and read the 'intent' and 'sentence' keys
# of each item.
with open("dataset.json", "r", encoding="utf-8") as dataset_file:
    dataset = json.load(dataset_file)

In [3]:
# Group the sentences by intent: {intent: [sentence, ...]}.
grouped_dataset = {}
for data in dataset:
    grouped_dataset.setdefault(data.get('intent'), []).append(data.get('sentence'))

# Shuffle every group with a dedicated, seeded generator so that the per-intent
# order (and therefore any split derived from it) is identical on each
# Restart & Run All. A private generator is used so the global `random` state
# is left untouched.
SHUFFLE_SEED = 42
shuffle_rng = random.Random(SHUFFLE_SEED)
for intent, sentences in grouped_dataset.items():
    shuffle_rng.shuffle(sentences)
    print(f"Intent: {intent}, occurence: {len(sentences)}")


Intent: irrelevant, occurence: 3413
Intent: find-restaurant, occurence: 419
Intent: find-hotel, occurence: 299
Intent: provide-showtimes, occurence: 124
Intent: purchase, occurence: 553
Intent: find-around-me, occurence: 362
Intent: find-flight, occurence: 113
Intent: find-train, occurence: 112


In [4]:
# Split each intent's sentences into four equally sized parts, one per
# augmentation type, so every part keeps the same intent distribution.
dataset_1, dataset_2, dataset_3, dataset_4 = [], [], [], []
parts = (dataset_1, dataset_2, dataset_3, dataset_4)

for intent, sentences in grouped_dataset.items():
    # Integer division: the last len(sentences) % 4 sentences are left out.
    quarter = len(sentences) // 4
    for part_index, part in enumerate(parts):
        start = part_index * quarter
        for sentence in sentences[start:start + quarter]:
            part.append({"intent": intent, "sentence": sentence})

print(f"Dataset 1 : {len(dataset_1)}")
print(f"Dataset 2 : {len(dataset_2)}")
print(f"Dataset 3 : {len(dataset_3)}")
print(f"Dataset 4 : {len(dataset_4)}")


Dataset 1 : 1346
Dataset 2 : 1346
Dataset 3 : 1346
Dataset 4 : 1346


In [7]:
#KEYBOARD AUGMENTATION
# Build one record per sentence of dataset_1, keeping the original sentence
# and adding the augmenter's output under "keyboard_aug".
dataset_aug_1 = []
# NOTE(review): the min/max arguments presumably restrict the change to a
# single character in a single word -- confirm against the nlpaug docs.
key_aug_1 = char_aug.KeyboardAug(
    lang="fr",
    aug_char_min=1,
    aug_char_max=1,
    aug_word_min=1,
    aug_word_max=1,
    include_upper_case=True,
    model_path="keyboard/fr-mobile.json",
)
for record in tqdm.tqdm(dataset_1):
    original = record.get("sentence")
    # NOTE(review): the return type of augment() (str vs. list of str) depends
    # on the installed nlpaug version -- verify before consuming this field.
    keyboard_aug = key_aug_1.augment(original, n=1)
    dataset_aug_1.append(
        {
            "intent": record.get("intent"),
            "sentence": original,
            "keyboard_aug": keyboard_aug,
        }
    )

  1%|▏         | 20/1346 [00:02<02:19,  9.52it/s]


KeyboardInterrupt: 

In [5]:
def augment_letter(sentence):
    """Duplicate one randomly chosen non-space character of ``sentence``.

    Args:
        sentence: The text to perturb.

    Returns:
        A copy of ``sentence`` that is one character longer, with the chosen
        character doubled in place. If ``sentence`` is empty or contains only
        spaces there is nothing to duplicate and it is returned unchanged.
    """
    # Collect the eligible positions up front: this avoids both the ValueError
    # of randint(0, -1) on an empty string and an endless retry loop when the
    # string holds nothing but spaces.
    candidates = [position for position, char in enumerate(sentence) if char != " "]
    if not candidates:
        return sentence
    index = random.choice(candidates)
    return f"{sentence[:index]}{sentence[index]}{sentence[index:]}"

In [6]:
#LETTER AUGMENTATION
# Build one record per sentence of dataset_2, keeping the original sentence
# and adding the output of augment_letter under "letter_aug".
dataset_aug_2 = []
for record in tqdm.tqdm(dataset_2):
    original = record.get("sentence")
    letter_aug = augment_letter(original)
    dataset_aug_2.append(
        {
            "intent": record.get("intent"),
            "sentence": original,
            "letter_aug": letter_aug,
        }
    )

100%|██████████| 1346/1346 [00:00<00:00, 258022.54it/s]


In [7]:
def delete_letter(sentence):
    """Remove one randomly chosen non-space character from ``sentence``.

    Args:
        sentence: The text to perturb.

    Returns:
        A copy of ``sentence`` that is one character shorter. If ``sentence``
        is empty or contains only spaces there is nothing to remove and it is
        returned unchanged.
    """
    # Collect the eligible positions up front: this avoids both the ValueError
    # of randint(0, -1) on an empty string and an endless retry loop when the
    # string holds nothing but spaces.
    candidates = [position for position, char in enumerate(sentence) if char != " "]
    if not candidates:
        return sentence
    index = random.choice(candidates)
    return f"{sentence[:index]}{sentence[index + 1:]}"

In [8]:
#LETTER DELETE
# Build one record per sentence of dataset_3, keeping the original sentence
# and adding the output of delete_letter under "letter_del".
dataset_aug_3 = []
for record in tqdm.tqdm(dataset_3):
    original = record.get("sentence")
    letter_del = delete_letter(original)
    dataset_aug_3.append(
        {
            "intent": record.get("intent"),
            "sentence": original,
            "letter_del": letter_del,
        }
    )

100%|██████████| 1346/1346 [00:00<00:00, 223931.35it/s]


In [11]:
#WORD AUGMENTATION
# Build one record per sentence of dataset_4, keeping the original sentence
# and adding the augmenter's output under "word_aug".
dataset_aug_4 = []
# NOTE(review): action="swap" presumably exchanges neighbouring words --
# confirm against the nlpaug docs.
wordswap_aug = naw.RandomWordAug(action="swap")
for record in tqdm.tqdm(dataset_4):
    original = record.get("sentence")
    # NOTE(review): the return type of augment() (str vs. list of str) depends
    # on the installed nlpaug version -- verify before consuming this field.
    word_aug = wordswap_aug.augment(original, n=1)
    dataset_aug_4.append(
        {
            "intent": record.get("intent"),
            "sentence": original,
            "word_aug": word_aug,
        }
    )

  2%|▏         | 24/1346 [00:02<02:18,  9.53it/s]


KeyboardInterrupt: 

In [10]:
#SAVE AUGMENTED DATASETS
# Write each augmented subset to its own JSON file.
# ensure_ascii=False emits accented characters verbatim, so the file encoding
# is pinned to UTF-8; otherwise the platform's locale encoding would be used
# and the write could fail or produce JSON that is not UTF-8.
# The "datasets/" directory must already exist: open() does not create it.
augmented_outputs = {
    "datasets/dataset_keyaug.json": dataset_aug_1,
    "datasets/dataset_letteraug.json": dataset_aug_2,
    "datasets/dataset_letterdel.json": dataset_aug_3,
    "datasets/dataset_wordaug.json": dataset_aug_4,
}
for output_path, records in augmented_outputs.items():
    with open(output_path, "w", encoding="utf-8") as output_file:
        json.dump(records, output_file, ensure_ascii=False)

In [12]:
#MERGE DATASETS
# Concatenate the four augmented subsets into one list, in order 1 to 4.
# Records from different subsets carry different augmentation keys
# ("keyboard_aug", "letter_aug", "letter_del", "word_aug").
dataset_aug = dataset_aug_1 + dataset_aug_2 + dataset_aug_3 + dataset_aug_4

# Show the first merged record as a quick sanity check.
print(dataset_aug[0])

In [13]:
#SAVE AUGMENTED DATASET
# Write the merged dataset to a single JSON file.
# ensure_ascii=False emits accented characters verbatim, so the file encoding
# is pinned to UTF-8 instead of relying on the platform's locale encoding.
# The "datasets/" directory must already exist: open() does not create it.
with open("datasets/dataset_aug.json", "w", encoding="utf-8") as dataset_aug_file:
    json.dump(dataset_aug, dataset_aug_file, ensure_ascii=False)