<img src="imgs/unicamp.png" width="150" height="150">

# Augment

In [None]:
# Default
import os

# Numerical and IO
import numpy as np
import pandas as pd

# Augmentation
import nltk
import nlpaug.augmenter.char as nac
import nlpaug.augmenter.word as naw
import nlpaug.augmenter.sentence as nas
import nlpaug.flow as nafc

from nlpaug.util import Action

In [None]:
# Fetch the NLTK resources, by identifier, that the notebook relies on.
# NOTE(review): presumably needed by the WordNet-based augmenter used further
# down — confirm against the nlpaug requirements.
for resource in ('punkt', 'wordnet', 'averaged_perceptron_tagger'):
    nltk.download(resource)

## Read Data

In [None]:
folder = 'data/'
files_path = os.listdir(folder)
files = {}

# Collect the regular files whose name starts with "<number>_" (or is just a
# number). Anything else in the folder (hidden files, Excel lock files,
# sub-directories, unnumbered files) is skipped instead of breaking the load.
numbered = []
for name in files_path:
    prefix = name.split('_')[0]
    if prefix.isdecimal() and os.path.isfile(os.path.join(folder, name)):
        numbered.append((int(prefix), name))

# Get in order: sort by the numeric prefix (ties broken by file name) so the
# dict is filled in ascending order, even when the numbering has gaps.
for _, file in sorted(numbered):
    frame = pd.read_excel(os.path.join(folder, file))
    # Every sheet is expected to hold exactly two columns: question, answer.
    frame.columns = ['question', 'answer']
    # Drop rows with a missing question or answer.
    frame.dropna(inplace=True)
    files[file] = frame

## Augment

In [None]:
# Shared augmenter instances. Constructing them is loop-invariant work (the
# contextual one is configured with a pretrained model), so they are built
# once on first use and reused by every call to augment().
_AUGMENTERS = []


def _get_augmenters():
    """Return the shared augmenters, constructing them on first use.

    Returns:
        A list ``[contextual, synonym]``: a contextual word-substitution
        augmenter configured with ``distilbert-base-uncased`` followed by a
        WordNet synonym augmenter.
    """
    if not _AUGMENTERS:
        # Build both before caching so a failure never leaves a partial cache.
        built = (
            naw.ContextualWordEmbsAug(
                model_path='distilbert-base-uncased', action="substitute"),
            naw.SynonymAug(aug_src='wordnet'),
        )
        _AUGMENTERS.extend(built)
    return _AUGMENTERS


def augment(text, times=2):
    """Generate augmented variants of ``text``.

    Each round runs the contextual (DistilBERT) substitution augmenter and
    then the WordNet synonym augmenter on the original ``text``; outputs are
    collected in that order, round after round.

    Args:
        text: Text to augment.
        times: Number of rounds to run. Values <= 0 produce an empty list.

    Returns:
        A flat list with the augmented outputs (two augmenter calls per
        round).
    """
    if times <= 0:
        # Nothing to do; also avoids building the augmenters needlessly.
        return []

    augmented = []
    augmenters = _get_augmenters()

    for _ in range(times):
        for aug in augmenters:
            output = aug.augment(text)
            # NOTE(review): depending on the installed nlpaug release,
            # augment() may return a list even for a single input string;
            # flatten so callers always receive a flat list of outputs.
            if isinstance(output, (list, tuple)):
                augmented.extend(output)
            else:
                augmented.append(output)

    return augmented

In [None]:
folder = 'augmented/'

# Make sure the output folder exists before the (slow) augmentation starts,
# otherwise the first to_csv call would fail after all that work.
os.makedirs(folder, exist_ok=True)

# Go through files augmenting
for f, data in files.items():

    result = {'question': [], 'answer': []}

    # Walk the two columns in lockstep; no per-row Series is needed.
    for que, ans in zip(data['question'], data['answer']):

        # Keep the original pair first ...
        result['question'].append(que)
        result['answer'].append(ans)

        # ... then every augmented question, each paired with the same answer.
        augmented = augment(que)
        result['question'].extend(augmented)
        result['answer'].extend([ans] * len(augmented))

    # NOTE(review): the output is CSV but reuses the source file's name, which
    # was loaded with pd.read_excel — so CSV content may end up under a
    # spreadsheet extension. Left unchanged; confirm what downstream readers
    # expect before renaming.
    pd.DataFrame.from_dict(result).to_csv(f'{folder}{f}', index=False)