In [23]:
from pathlib import Path
import random

import pandas as pd
from tqdm import tqdm

import nlaugmenter

# Load Data

In [16]:
# Run configuration: input split, output location and the CSV columns to use.
seed = 23  # other seed values noted by the author: 42, 24666
init_df_path = Path(f"../../data/classification/baseline/train_{seed}.csv")  # per-seed input CSV
output_folder = Path("../../data/classification/nlaugmenter")  # folder the augmented CSV is written to
text_column = "Текст"  # column read as the text to augment
label_column = "Тональность"  # column read as the label, copied unchanged to augmented rows

In [36]:
df = pd.read_csv(init_df_path)
print(df.shape)

# Start the output collections with the original rows as plain Python lists;
# later cells extend these lists with augmented samples.
texts = df[text_column].tolist()
labels = df[label_column].tolist()

(11668, 4)


# NL Augmenter

In [29]:
# Uncomment to browse the transformations that ship with nlaugmenter:
# from nlaugmenter import transformations
# help(transformations)

In [30]:
from nlaugmenter.transformations.multilingual_lexicon_perturbation.transformation import MultilingualLexiconPerturbation
from nlaugmenter.transformations.multilingual_back_translation.transformation import MultilingualBackTranslation
from nlaugmenter.transformations.swap_characters.transformation import SwapCharactersPerturbation
from nlaugmenter.transformations.underscore_trick.transformation import UnderscoreTrick
from nlaugmenter.transformations.visual_attack_letters.transformation import VisualAttackLetters
from nlaugmenter.transformations.whitespace_perturbation.transformation import WhitespacePerturbation

In [31]:
# One instance per augmentation strategy; a later cell collects them into the pool
# from which a single augmenter is drawn for every text.

# NOTE(review): the lexicon perturbation is configured with 'en' as source language,
# while the back-translation below treats the input as 'ru' — confirm the direction is intended.
transformation_mlp = MultilingualLexiconPerturbation(
    mlt_src_lang='en',
    mlt_tgt_lang='ru',
    prob_mix=0.9,
    seed=seed,
)

# Back-translation of 'ru' text through an 'en' pivot.
transformation_mlbt = MultilingualBackTranslation(
    src_lang='ru',
    pivot_lang='en',
)

# Character- and whitespace-level perturbations; only the visual attack overrides a default.
transformation_swap = SwapCharactersPerturbation()
transformation_visual = VisualAttackLetters(perturb_pct=0.2)
transformation_wspace = WhitespacePerturbation()

[nltk_data] Downloading package punkt to /Users/a18692338/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [35]:
def do_augmentations(text, augmenters, rng=None):
    """Augment ``text`` with one augmenter picked at random from ``augmenters``.

    Args:
        text: The input passed to the chosen augmenter's ``generate`` method.
        augmenters: Non-empty sequence of objects exposing ``generate(text)``
            that returns an indexable collection of outputs.
        rng: Optional ``random.Random``-like object providing ``choice``.
            When omitted, the global ``random`` module state is used (the
            original behaviour); pass a seeded instance for reproducible picks.

    Returns:
        The first output produced by the chosen augmenter.

    Raises:
        IndexError: If ``augmenters`` is empty or the chosen augmenter
            returns an empty collection.
    """
    # Fall back to the module-level generator so existing two-argument calls keep working.
    chooser = random if rng is None else rng
    augmenter = chooser.choice(augmenters)

    # Only the first candidate is kept, even if the augmenter produces several.
    return augmenter.generate(text)[0]

In [37]:
# Pool of augmenters; exactly one is drawn at random for every source text.
AUGMENTERS = [
    transformation_mlp,
    transformation_mlbt,
    transformation_swap,
    transformation_visual,
    transformation_wspace
]

# Seed the global generator used for the per-row augmenter choice so that
# every run of this cell makes the same sequence of picks.
random.seed(seed)

# Work from fresh lists so that re-running this cell never piles up duplicates.
original_texts = df[text_column].tolist()
original_labels = df[label_column].tolist()
augmented_texts, augmented_labels = [], []
fails = 0

# `total` is required for a real progress bar because zip() has no length.
for text, label in tqdm(zip(original_texts, original_labels), total=len(original_texts)):
    try:
        new_text = do_augmentations(text, augmenters=AUGMENTERS)
    except Exception as exc:
        # Best effort: skip samples an augmenter cannot handle, but report why.
        # `Exception` (not a bare except) keeps Ctrl-C / kernel interrupt working.
        print(f"Fail: {text} ({exc!r})")
        fails += 1
        continue
    augmented_texts.append(new_text)
    augmented_labels.append(label)

# Final collections: all original rows followed by the successfully augmented ones.
texts = original_texts + augmented_texts
labels = original_labels + augmented_labels

print(f"Failed {fails} times.")
print(len(texts), len(labels))

11668it [00:24, 479.00it/s]

Failed 0 times.
23336 23336





In [40]:
# Assemble the output table in one step from the collected texts and labels
# (original rows first, augmented rows after them).
df_done = pd.DataFrame({text_column: texts, label_column: labels})

print(df_done.shape)
df_done.tail(10)

(23336, 2)


In [39]:
# Create the destination folder first: to_csv does not create missing parent directories.
output_folder.mkdir(parents=True, exist_ok=True)
df_done.to_csv(output_folder / f"train_{seed}.csv", index=False, encoding="utf-8")