## Imports

In [1]:

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

from torch.utils.data import Dataset
from transformers import AutoTokenizer, InputFeatures

import time

from utils import load_documents, load_datasets
from Class_Balancing import *

import nlpaug.augmenter.word as naw
import nlpaug.augmenter.sentence as nas

The `xla_device` argument has been deprecated in v4.4.0 of Transformers. It is ignored and you can safely remove it from your `config.json` file.
The `xla_device` argument has been deprecated in v4.4.0 of Transformers. It is ignored and you can safely remove it from your `config.json` file.
The `xla_device` argument has been deprecated in v4.4.0 of Transformers. It is ignored and you can safely remove it from your `config.json` file.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [2]:
import os

# Root folder of the ERASER data. Set the ERASER_DATA_ROOT environment variable
# to run the notebook on another machine; the original local path is kept as
# the fallback so existing setups behave exactly as before.
data_root = os.environ.get('ERASER_DATA_ROOT', r'C:\Users\Maciek\Documents\Studia\Magisterka\eraser')

# `documents` is indexed by document id in get_input(); each entry is iterated
# as a sequence of token lists there.
documents = load_documents(data_root)

# Dataset splits; each one is iterated annotation by annotation in create_dataset().
train, val, test = load_datasets(data_root)

In [3]:
def get_input(ann):
    """Return the full text of the document an annotation refers to.

    The document id is the annotation id when the annotation has no evidences;
    otherwise it is the single ``docid`` shared by all of its evidences.

    Args:
        ann: Annotation object exposing ``all_evidences()`` and ``annotation_id``.
    Returns:
        str: Every token of the document joined by single spaces. Sentences are
            separated by a space as well, so the last token of one sentence is
            never fused with the first token of the next one.
    Raises:
        ValueError: If the evidences point to more than one document
            (the single-element unpacking below fails).
        KeyError: If the resolved document id is missing from ``documents``.
    """
    evidences = ann.all_evidences()
    if len(evidences) == 0:
        docid = ann.annotation_id
    else:
        # Exactly one distinct docid is expected; unpacking enforces it.
        (docid,) = {ev.docid for ev in evidences}
    doc = documents[docid]
    # Empty sentences are skipped so they do not produce double spaces.
    return ' '.join(' '.join(sent) for sent in doc if sent)

In [4]:
def print_color(text, evidences):
    """Print ``text`` with every occurrence of each evidence highlighted.

    Each non-empty evidence string is wrapped in ANSI escape codes (bright
    magenta on, reset off) wherever it occurs; empty evidences are ignored.

    Args:
        text (str): Text to print.
        evidences (Iterable[str]): Substrings to highlight, applied in order.
    Returns:
        None
    """
    for evidence in evidences:
        highlighted = "\033[95m" + evidence + '\x1b[0m'
        if evidence == '':
            # Nothing to highlight for an empty evidence.
            continue
        text = text.replace(evidence, highlighted)
    print(text)

In [5]:
class TrainerDataset(Dataset):
    """Dataset pairing tokenized texts with their labels.

    All inputs are tokenized once, at construction time, with padding and
    truncation enabled and PyTorch tensors requested from the tokenizer.
    """

    def __init__(self, inputs, targets, tokenizer, evidences=None):
        """Store the raw data and tokenize ``inputs``.

        Args:
            inputs: Texts passed to ``tokenizer`` in a single call.
            targets: Labels, indexed in parallel with ``inputs``.
            tokenizer: Callable used to tokenize ``inputs``; also kept on the instance.
            evidences: Optional per-example evidences; stored but not used by this class.
        """
        self.inputs, self.targets = inputs, targets
        self.tokenizer, self.evidences = tokenizer, evidences

        # One batched call: pad, truncate, and return "pt" tensors.
        self.tokenized_inputs = tokenizer(
            inputs,
            padding=True,
            truncation=True,
            return_tensors="pt",
        )

    def __len__(self):
        """Return the number of raw input texts."""
        return len(self.inputs)

    def __getitem__(self, idx):
        """Return the ``InputFeatures`` (ids, attention mask, label) for ``idx``."""
        encoded = self.tokenized_inputs
        return InputFeatures(
            input_ids=encoded['input_ids'][idx],
            attention_mask=encoded['attention_mask'][idx],
            label=self.targets[idx],
        )

In [6]:
def create_dataset(dataset, tokenizer):
    """Build a TrainerDataset from an iterable of annotations.

    Args:
        dataset: Iterable of annotations exposing ``classification`` and
            ``all_evidences()``. It is materialized once, so one-shot
            iterables (e.g. generators) are supported.
        tokenizer: Tokenizer forwarded to ``TrainerDataset``.
    Returns:
        TrainerDataset: Inputs from ``get_input``, binary targets
            (0 for classification ``'NEG'``, 1 otherwise) and the evidence
            texts of every annotation.
    """
    # Materialize once: the three passes below must see the same annotations,
    # which also makes the resulting lists equal in length by construction.
    annotations = list(dataset)

    targets = [0 if ann.classification == 'NEG' else 1 for ann in annotations]
    evidences = [[ev.text for ev in ann.all_evidences()] for ann in annotations]
    inputs = [get_input(ann) for ann in annotations]

    return TrainerDataset(inputs, targets, tokenizer, evidences)

In [7]:
# Tokenizer of the DistilBERT checkpoint used for all splits.
# `num_labels` was dropped from this call: it sizes a classification head
# (model/config argument) and is not a tokenizer setting.
tokenizer = AutoTokenizer.from_pretrained("distilbert/distilbert-base-uncased")

In [8]:
# One TrainerDataset per split, all sharing the same tokenizer.
train_dataset = create_dataset(dataset=train, tokenizer=tokenizer)
eval_dataset = create_dataset(dataset=val, tokenizer=tokenizer)
test_dataset = create_dataset(dataset=test, tokenizer=tokenizer)

In [9]:
import pickle
dataset = "eraser_movie"

# Persist every split as `<dataset>_<split>.obj` in the working directory.
# `with` closes the handle even when pickling fails part-way through.
for split_name, split_dataset in (
    ("train", train_dataset),
    ("eval", eval_dataset),
    ("test", test_dataset),
):
    with open(f'{dataset}_{split_name}.obj', "wb") as filehandler:
        pickle.dump(split_dataset, filehandler)

In [10]:
def get_time(time):
    """ Method converts number of seconds into time in format ___ h __ m __.__ s
    Args:
        time (float): Number of seconds
    Returns:
        str: Time in format ___ h __ m __.__ s
    """
    result = ""
    if time//3600 > 0:
        result += str(int(time//3600)) + " h  "
        time %= 3600
    if time//60 > 0:
        result += str(int(time//60)) + " m  "
        time %= 60
    if time//1 > 0:
        result += str(np.round(time,2)) + " s                      "
    return result


def balance_minority(train_dataset, fun, limited_range=False, random_seed=123, **kwargs):
    """ Method used to balance minority using some function fun (ex. replace_synonym, deepcopy, ...
    New positive examples are produced by calling ``fun`` on randomly chosen
    positive inputs, appended with target 1, and the result is shuffled.
    Args:
        train_dataset (TrainerDataset): Dataset to balance; its ``inputs``, ``targets`` and ``tokenizer`` are read
        fun (Callable): Function used to balance minority, called as ``fun(text, **kwargs)``
        limited_range (bool, optional): In case of time consuming balancing functions, user may choose to limit number of new examples
                                        from the size difference between the samples to the minimum of that difference,
                                        5 * number of positive examples and 10. Also resets the global ``counter`` to 0.
                                        Defaults to False.
        random_seed (int, optional): Random state. Seeds ``random`` and NumPy's global RNG. Defaults to 123.
        **kwargs: Extra keyword arguments forwarded to ``fun``.
    Returns:
        TrainerDataset: Balanced dataset (built without evidences)
    Raises:
        ValueError: If new examples are needed but the dataset has no positive example.
    """

    # Seed BOTH generators before any sampling. The source examples are drawn
    # with np.random below, so seeding only `random` left the selection
    # dependent on whatever state NumPy's global RNG happened to be in.
    # NOTE(review): `random` is not imported explicitly in this notebook;
    # presumably it arrives via `from Class_Balancing import *` — confirm.
    random.seed(random_seed)
    np.random.seed(random_seed)
    if limited_range:
        # NOTE(review): `counter` is not read in this notebook; presumably it is
        # used by balancing functions defined elsewhere — confirm.
        global counter
        counter = 0

    positives = np.array(train_dataset.inputs)[np.array(train_dataset.targets)==1]
    n_positive = len(positives)
    n_negative = len(train_dataset.targets) - n_positive

    generation_count = n_negative - n_positive
    if limited_range:
        # NOTE(review): the hard cap of 10 looks like a debugging value — confirm the intended limit.
        generation_count = min(generation_count, 5*n_positive, 10)
    # Nothing is generated when positives are not the minority.
    generation_count = max(int(generation_count), 0)

    if generation_count > 0 and n_positive == 0:
        raise ValueError("Cannot balance the dataset: it contains no positive examples to augment.")

    new_inputs = []
    start_time = time.time()
    for i in range(generation_count):
        source_text = positives[np.random.randint(n_positive)]
        new_inputs.append(fun(source_text, **kwargs))
        print(f"{i}/{generation_count}, est. time: {get_time((time.time()-start_time)/(i+1)*(generation_count-i))}", end="\r")

    balanced_inputs = train_dataset.inputs + new_inputs
    balanced_targets = train_dataset.targets + [1 for _ in range(generation_count)]

    # Re-seeding before each shuffle applies the same permutation to both
    # equally long lists, which keeps inputs and targets aligned.
    np.random.seed(random_seed)
    np.random.shuffle(balanced_targets)
    np.random.seed(random_seed)
    np.random.shuffle(balanced_inputs)

    return TrainerDataset(balanced_inputs, balanced_targets, train_dataset.tokenizer)

In [13]:

def process_augmentation(x, aug):
    """ Helper method used to convert nlpaug method so that it works with balance_minority method
    If the augmenter returns a list, its first element is used. When the
    augmenter yields nothing usable (``None``, an empty list, or a list whose
    first element is ``None``), a notice is printed and ``x`` is returned unchanged.
    Args:
        x (str): Text to base the augmentation on
        aug: Object exposing ``augment(x)`` (e.g. an augmenter from nlpaug library)
    Returns:
        str: Augmented text, or ``x`` itself when no augmentation was produced
    """
    augmented = aug.augment(x)
    if isinstance(augmented, list):
        # An empty list used to raise IndexError; treat it as "no augmentation".
        augmented = augmented[0] if augmented else None
    if augmented is None:
        print(f"No augmentation applied to: {x}")
        return x
    return augmented



def create_datasets(train_dataset, imbalance = 0.05, random_seed = 123, i=0):
    """ Method used to create imbalanced dataset, and balanced dataset based on it using selection of methods from nlpaug library and random oversampling
    The imbalanced dataset and one balanced dataset per method are pickled into the
    ``nlpaug/`` folder (which must already exist); file names are built from the
    module-level ``dataset`` name, ``int(imbalance*100)``, the method name and ``i``.
    Args:
        train_dataset (TrainerDataset): Dataset the imbalanced dataset is derived from
        imbalance (float): Imbalance level forwarded to add_imbalance (presumably the share of positive examples — confirm against add_imbalance)
        random_seed(int): Random seed used for reproducibility purposes
        i(int | str): Addition to name of resulting files
    Returns:
        None
    """

    # Adding the imbalance
    train_dataset_imbalanced = add_imbalance(train_dataset, imbalance, random_seed = random_seed)

    # Saving imbalanced dataset to file (the handle is closed even if pickling fails)
    with open(f'nlpaug/{dataset}_{int(imbalance*100)}_imbalanced_{i}.obj',"wb") as filehandler:
        pickle.dump(train_dataset_imbalanced,filehandler)

    # (method name used in reports and file names, augmenter or copy function)
    methods = [
        ("Spelling_mistake", naw.SpellingAug()),
        ("ROS", deepcopy),
        ("Synonym replacement", naw.SynonymAug(aug_src='wordnet')),
        ("Contextual_word_embedding", naw.ContextualWordEmbsAug(model_path='bert-base-uncased', action="substitute")),
        ("Summarization", nas.AbstSummAug(model_path='t5-base'))
        # ("Translation", naw.BackTranslationAug(from_model_name='facebook/wmt19-en-de', to_model_name='facebook/wmt19-de-en'))
    ]

    for name, augmenter in methods:
        # Creating datasets for model training
        start_time = time.time()
        if name != "ROS":
            # The closure is consumed within this iteration, so capturing the loop variable is safe.
            balancing_fun = lambda x: process_augmentation(x, aug=augmenter)
        else:
            # Random oversampling: new examples are plain copies of existing positives.
            balancing_fun = deepcopy
        train_dataset_augmented = balance_minority(deepcopy(train_dataset_imbalanced), balancing_fun,  random_seed = random_seed)

        # Quality report printing
        print(f"{name}: {get_time(time.time()-start_time)}                         ")
        _, counts = np.unique(train_dataset_augmented.targets, return_counts=True)
        # Share of the smallest label value among all targets.
        ratio = counts[0]/np.sum(counts)
        print(f"Ratio: {ratio}")
        print("Some examples from positive class")
        print(np.sort((np.array(train_dataset_augmented.inputs)[np.array(train_dataset_augmented.targets)==1]))[-3:])
        print("========================================================================================================================================================")
        print()
        print()

        # Saving created dataset to file (the handle is closed even if pickling fails)
        with open(f'nlpaug/{dataset}_{int(imbalance*100)}_{name}_{i}.obj',"wb") as filehandler:
            pickle.dump(train_dataset_augmented,filehandler)

    return

In [14]:

# Base seed of the whole generation run; it also seeds NumPy's global RNG,
# from which the per-run seeds below are drawn.
random_seed = 123
np.random.seed(random_seed)
# Five dataset families (i = 0..4) are generated for each imbalance level.
for imbalance in [0.1, 0.2]:
    for i in range(5):
        # Each run receives a seed drawn from [0, random_seed).
        # NOTE(review): create_datasets() calls balance_minority(), which calls
        # np.random.seed(...) itself, so the global stream these seeds come from is
        # reseeded by every run. The sequence stays deterministic, but the seeds are
        # not independent draws from the stream seeded above — confirm this is intended.
        create_datasets(train_dataset, imbalance = imbalance, random_seed = np.random.randint(0,random_seed), i=i)
        

Spelling_mistake: 7.71 s                                               
Ratio: 0.5
Some examples from positive class
["with his last two films - shine and snow falling on cedars - australian director scott hicks has proven his cinematic flashbacks to be some of the best out there, and his latest, hearts in atlantis, is no different. its structure - beginning and ending in present day with one long flashback in the middle - is similar to the green mile, which is an bit ironic considering both were based on stephen king books. the parallels do n ' t end there, either. atlantis was adapted by william goldman, who had previously penned the big - screen version of misery and is in the process of working on the script for king ' s dreamcatcher. even the film ' s content is a bit reminiscent of mile. in fact, it ' s the perfect blend of the feel - good ' 60s nostalgia of stand by me (also by king) und mystical powder. hokum of mile. king ' s atlantis is a book comprised of five related short 