In [181]:
from textattack.augmentation import \
    EasyDataAugmenter, BackTranslationAugmenter, WordNetAugmenter, CLAREAugmenter, \
    CheckListAugmenter, EmbeddingAugmenter, DeletionAugmenter, CharSwapAugmenter

from itertools import chain
import warnings
warnings.filterwarnings("ignore")

In [None]:
import pandas as pd
# Every line of the training file is read as one string (sep='|' keeps the
# tab-separated line in a single column); field 0 after splitting on the tab
# becomes the label and field 1 becomes the sentence.
# NOTE(review): read_csv will split any line that itself contains '|' --
# confirm the data has none.
df = (
    pd.read_csv('../data/train/cr/train.txt', sep='|', header=None, names=['text'])
    .assign(**{
        # 'class' is computed first, while 'text' still holds the raw line.
        'class': lambda frame: frame['text'].map(lambda line: line.split('\t')[0]),
        'text': lambda frame: frame['text'].map(lambda line: line.split('\t')[1]),
    })
    .loc[:, ['class', 'text']]
)

# First 20 rows, used as a small sample when trying out the augmenters.
df_test = df.iloc[:20]
df_test

In [182]:
# One pre-built augmenter per method, keyed as '<method>_augmenter'.
# All of them start with the same settings: 20% of words swapped and
# 4 transformations per example.
augmenter_dict = {
    f'{prefix}_augmenter': augmenter_cls(pct_words_to_swap=0.2, transformations_per_example=4)
    for prefix, augmenter_cls in (
        ('eda', EasyDataAugmenter),
        ('wordnet', WordNetAugmenter),
        ('backtranslation', BackTranslationAugmenter),
        ('embedding', EmbeddingAugmenter),
        ('deletion', DeletionAugmenter),
        ('charswap', CharSwapAugmenter),
    )
}

[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [183]:
# Short method names; appending '_augmenter' to each gives a key of augmenter_dict.
augmenter_list = [
    'eda',
    'wordnet',
    'backtranslation',
    'embedding',
    'deletion',
    'charswap',
]

In [214]:
def augment_text(df,aug_method,fraction=0.5,label_column='class',target_column='text',pct_words_to_swap=0.2,transformations_per_example=4,include_original=True,random_state=None):
    """Augment a random sample of ``df`` with one of the augmenters in ``augmenter_dict``.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; must contain ``label_column`` and ``target_column``.
    aug_method : str
        Case-insensitive short name; ``aug_method.lower() + '_augmenter'`` must
        be a key of ``augmenter_dict``.
    fraction : float, default 0.5
        Fraction of rows to sample (passed to ``DataFrame.sample(frac=...)``).
    label_column, target_column : str
        Names of the label column and the text column.
    pct_words_to_swap, transformations_per_example
        Values written onto the selected augmenter before it is used.
    include_original : bool, default True
        If true, each sampled text is emitted before its augmented variants.
    random_state : optional
        Forwarded to ``DataFrame.sample`` so the row sample can be reproduced.
        ``None`` (the default) keeps the previous unseeded behaviour.

    Returns
    -------
    pandas.DataFrame
        Columns ``[target_column, label_column]`` with a fresh default index;
        every emitted text carries the label of the row it came from.

    Raises
    ------
    KeyError
        If ``aug_method`` does not name an augmenter in ``augmenter_dict``.

    Notes
    -----
    The augmenter is the shared instance stored in ``augmenter_dict``; the two
    attributes assigned here stay changed after the call returns.
    NOTE(review): whether every augmenter class re-reads these attributes at
    ``augment()`` time is not visible from this cell -- confirm, especially for
    augmenters that build internal sub-augmenters in their constructor.
    """
    suffix = '_augmenter'
    key = aug_method.lower() + suffix
    if key not in augmenter_dict:
        available = sorted(k[:-len(suffix)] for k in augmenter_dict if k.endswith(suffix))
        raise KeyError(f"unknown aug_method {aug_method!r}; available methods: {available}")

    augmenter = augmenter_dict[key]
    augmenter.pct_words_to_swap = pct_words_to_swap
    augmenter.transformations_per_example = transformations_per_example

    # Sample into a new name so the caller's frame is never rebound or touched.
    sampled = df.sample(frac=fraction, random_state=random_state)

    text_list, class_list = [], []
    for label, text in zip(sampled[label_column], sampled[target_column]):
        # Copy so that inserting the original never mutates the augmenter's own list.
        variants = list(augmenter.augment(text))
        if include_original:
            variants.insert(0, text)
        text_list.extend(variants)
        class_list.extend([label] * len(variants))

    return pd.DataFrame({target_column: text_list, label_column: class_list})

In [221]:
# Augment every row of the 20-row sample with the 'eda' augmenter, keeping only the generated texts.
mydf = augment_text(
    df_test,
    'eda',
    fraction=1,
    label_column='class',
    target_column='text',
    pct_words_to_swap=0.2,
    transformations_per_example=4,
    include_original=False,
)

In [222]:
# Display the augmented frame returned by augment_text in the previous cell.
mydf

Unnamed: 0,text,class
0,i have stored around sixty 60 cd 's ( at 160kb...,1
1,i have this around 60 s 'cd ( at 160kbps ) on ...,1
2,i have stored approximately 60 400 's ( at 160...,1
3,i have stored around 60 's ( at 160kbps ) this...,1
4,the memory card fit nicely since i also have p...,1
...,...,...
75,lot also holds a It more diapers.,1
76,"The bottom line as ""Phone is cool one - only f...",1
77,"bottom is ""Phone is cool one - only for the mo...",1
78,"The can billet is ""Phone is coolheaded one - o...",1
