# Setting Up

In [None]:
import pandas as pd
import numpy as np
import copy
import random

In [None]:
# Seed the RNGs up front: the augmentation steps below draw from `random`, so without a
# fixed seed a Restart & Run All produces a different dataset every time.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)

# Load the raw premise/hypothesis pairs; every augmented frame below is derived from `df`.
df = pd.read_csv('data.csv')
df.head()

# Positive Sample Generation

### Synonym Replacement

In [None]:
!pip install requests nlpaug

In [None]:
import nlpaug
import nlpaug.augmenter.word as naw

In [None]:
# WordNet-backed synonym augmenter; aug_max=2 presumably caps how many words are replaced
# per sentence (confirm against the nlpaug docs).
aug = naw.SynonymAug(aug_src='wordnet',aug_max=2)
# Deep-copy `df` so the augmented premises do not overwrite the originals.
synonyms = copy.deepcopy(df)
# Replace every premise with the first element of the augmenter's result.
# NOTE(review): the [0] assumes augment() returns a list — confirm for the installed nlpaug version.
synonyms['premise'] = synonyms['premise'].map(lambda x:aug.augment(x,n=1)[0])
synonyms.head()

### Random Deletion

In [None]:
def random_deletion(words, p):
    """Randomly drop words from a sentence.

    Args:
        words: The sentence as a single string; it is split on whitespace.
        p: Deletion probability. A word is kept when a uniform draw in [0, 1]
            is strictly greater than ``p``.

    Returns:
        The resulting sentence as a string, with words joined by single spaces.
        The result is always a string: an empty or one-word sentence is returned
        as-is (whitespace-normalised), and if every word would be deleted a
        single randomly chosen word is returned instead.
    """
    tokens = words.split()

    # Nothing sensible to delete from an empty or one-word sentence.
    if len(tokens) <= 1:
        return ' '.join(tokens)

    # Keep each word independently with probability 1 - p.
    kept = [token for token in tokens if random.uniform(0, 1) > p]

    # Never return an empty sentence: fall back to one random original word.
    if not kept:
        return random.choice(tokens)

    return ' '.join(kept)

In [None]:
# Deep-copy `df` so the deletion-augmented premises do not overwrite the originals.
del_rand = copy.deepcopy(df)
# Run random_deletion over every premise with p=0.2.
del_rand['premise'] = del_rand['premise'].map(lambda x: random_deletion(x, 0.2))
del_rand.head()

### Random Swap

In [None]:
def swap_word(new_words):
    """Swap two randomly chosen, distinct positions of a word list in place.

    Args:
        new_words: List of words. It is mutated in place.

    Returns:
        The same list object. Lists with fewer than two words are returned
        unchanged because there is nothing to swap.
    """
    # An empty or one-word list has no pair of positions to exchange.
    if len(new_words) < 2:
        return new_words

    # random.sample draws two distinct indices, so a swap always happens
    # (no retry loop that can give up without swapping).
    random_idx_1, random_idx_2 = random.sample(range(len(new_words)), 2)
    new_words[random_idx_1], new_words[random_idx_2] = new_words[random_idx_2], new_words[random_idx_1]
    return new_words

def random_swap(words, n):
    """Apply ``swap_word`` to the whitespace-split sentence ``n`` times.

    Args:
        words: The sentence as a single string.
        n: How many times ``swap_word`` is applied.

    Returns:
        The words, after the swaps, joined back together with single spaces.
    """
    # Work on a separate list so the split result itself is never mutated.
    shuffled = list(words.split())

    for _step in range(n):
        shuffled = swap_word(shuffled)

    return ' '.join(shuffled)

# Deep-copy `df` so the swap-augmented premises do not overwrite the originals.
swap_rand = copy.deepcopy(df)
# Apply random_swap with n=1 to every premise.
swap_rand['premise'] = swap_rand['premise'].map(lambda x: random_swap(x, 1))
swap_rand.head()

### Random Insertion

In [None]:
from nltk.corpus import wordnet

def get_synonyms(word):
    """Collect WordNet synonyms for ``word``.

    Every lemma name of every synset returned by ``wordnet.synsets(word)`` is
    lower-cased, has ``_`` and ``-`` turned into spaces, and is stripped of any
    character that is not a lowercase ASCII letter or a space.

    Args:
        word: The word to look up.

    Returns:
        A sorted list of distinct, non-empty synonym strings. The word itself
        (compared case-insensitively) is excluded. Sorting makes the result
        independent of set iteration order, so runs are repeatable.
    """
    allowed_chars = set(' qwertyuiopasdfghjklzxcvbnm')
    synonyms = set()

    for syn in wordnet.synsets(word):
        for l in syn.lemmas():
            synonym = l.name().replace("_", " ").replace("-", " ").lower()
            synonym = "".join(char for char in synonym if char in allowed_chars).strip()
            # Lemmas made only of filtered characters (e.g. digits) collapse to "";
            # inserting those would add nothing but stray spaces.
            if synonym:
                synonyms.add(synonym)

    # Candidates are lower-cased, so compare against the lower-cased input.
    synonyms.discard(word.lower().strip())

    return sorted(synonyms)

def random_insertion(words, n):
    """Call ``add_word`` ``n`` times on the whitespace-split sentence.

    Args:
        words: The sentence as a single string.
        n: How many times ``add_word`` is invoked on the word list.

    Returns:
        The word list, after the calls, joined with single spaces.
    """
    # Separate list so add_word's in-place edits never touch the split result.
    augmented = list(words.split())

    for _step in range(n):
        add_word(augmented)

    return ' '.join(augmented)

def add_word(new_words):
    """Insert a synonym of a randomly chosen word at a random position, in place.

    Up to 10 random words are tried; the first one for which ``get_synonyms``
    returns a non-empty list supplies the synonym. If none of the attempts
    yields a synonym, or the list is empty, the list is left unchanged.

    Args:
        new_words: List of words. It is mutated in place.

    Returns:
        None.
    """
    max_attempts = 10

    # No word to look up and no position to draw from.
    if not new_words:
        return

    synonyms = []
    for _ in range(max_attempts):
        random_word = random.choice(new_words)
        synonyms = get_synonyms(random_word)
        if synonyms:
            # Use the hit immediately, even when it comes on the last attempt.
            break
    else:
        return

    # Pick among all synonyms rather than always taking the first one.
    random_synonym = random.choice(synonyms)
    # Upper bound len(new_words) lets the synonym also land at the very end.
    random_idx = random.randint(0, len(new_words))
    new_words.insert(random_idx, random_synonym)



# Deep-copy `df` so the insertion-augmented premises do not overwrite the originals.
ins_rand = copy.deepcopy(df)
# Apply random_insertion with n=1 to every premise.
ins_rand['premise'] = ins_rand['premise'].map(lambda x: random_insertion(x, 1))
ins_rand.head()

### Backtranslation

In [None]:
!pip install transformers
!pip install sentencepiece

In [None]:
from transformers import MarianMTModel, MarianTokenizer

In [None]:
# First leg of the round trip: tokenizer and model for the checkpoint named "opus-mt-en-fr".
first_model_name = 'Helsinki-NLP/opus-mt-en-fr'
first_model_tkn = MarianTokenizer.from_pretrained(first_model_name)
first_model = MarianMTModel.from_pretrained(first_model_name)

# Second leg: tokenizer and model for the checkpoint named "opus-mt-fr-en".
second_model_name = 'Helsinki-NLP/opus-mt-fr-en'
second_model_tkn = MarianTokenizer.from_pretrained(second_model_name)
second_model = MarianMTModel.from_pretrained(second_model_name)

In [None]:
# Sample English sentences.
# NOTE(review): nothing in the visible notebook reads `original_texts` — confirm it is still needed.
original_texts = ["This article aims to perform the back translation for text data augmentation",
          "It is the 25th article by Zoumana on Medium. He loves to give back to the community",
          "The first model translates from English to French, which is a temporary process",
          "The second model finally translates back all the temporary french text into English"]

In [None]:
def format_batch_texts(language_code, batch_texts):
    """Prefix every text with a ``>>code<<`` language tag.

    Args:
        language_code: Code placed between ``>>`` and ``<<``.
        batch_texts: Iterable of texts to tag.

    Returns:
        A new list with one ``">>code<< text"`` string per input text.
    """
    template = ">>{}<< {}"
    tagged = []
    for sentence in batch_texts:
        tagged.append(template.format(language_code, sentence))
    return tagged

def perform_translation(batch_texts, model, tokenizer, language="fr"):
    """Translate a batch of texts with the given model/tokenizer pair.

    Args:
        batch_texts: Texts to translate.
        model: Object exposing ``generate(**encoded_inputs)``.
        tokenizer: Callable that encodes the tagged texts, and that exposes
            ``decode(ids, skip_special_tokens=True)``.
        language: Language code used to tag the inputs via ``format_batch_texts``.

    Returns:
        A list with one decoded string per generated sequence.
    """
    # Tag each input with the language code before encoding.
    tagged_inputs = format_batch_texts(language, batch_texts)

    # Encode the whole batch at once, padded, as "pt" tensors.
    encoded = tokenizer(tagged_inputs, return_tensors="pt", padding=True)
    generated = model.generate(**encoded)

    # Turn each generated id sequence back into text, dropping special tokens.
    decoded = []
    for token_ids in generated:
        decoded.append(tokenizer.decode(token_ids, skip_special_tokens=True))
    return decoded

def backkaro(x):
    """Back-translate one text: through ``first_model``, then back through ``second_model``.

    Args:
        x: The text to paraphrase.

    Returns:
        The back-translated text as a string.
    """
    # Outbound leg: tag the input with the French target code.
    translated_texts = perform_translation([x], first_model, first_model_tkn, language="fr")
    # Return leg: the target is English, so tag with "en" instead of silently
    # reusing perform_translation's "fr" default.
    back_translated_texts = perform_translation(translated_texts, second_model, second_model_tkn, language="en")
    return back_translated_texts[0]


In [None]:
# Back-translate every premise on a deep copy, leaving `df` itself untouched.
backtr = copy.deepcopy(df)
backtr['premise'] = backtr['premise'].map(backkaro)
backtr.head()

### Text augmentation using pretrained Masked Language Model

In [None]:
!pip install textattack

In [None]:
import textattack

In [None]:
from textattack.augmentation import CLAREAugmenter
clare_aug = CLAREAugmenter()
# Deep-copy `df` so the CLARE-augmented premises do not overwrite the originals.
textaug = copy.deepcopy(df)
# augment() returns a list of augmented strings; keep the first one so the column holds
# plain text like the other augmented frames, falling back to the original premise when
# the list is empty.
# NOTE(review): `textaug` is not part of the `positive_samples` concat later in the
# notebook — confirm whether that is intentional.
textaug['premise'] = textaug['premise'].map(lambda x: (clare_aug.augment(x) or [x])[0])

## Combining all the approaches

In [None]:
# Stack the five augmented copies of `df` into one frame with a fresh 0..n-1 index.
# NOTE(review): the CLARE frame `textaug` is not included here — confirm that is intended.
positive_samples = pd.concat([synonyms, del_rand, swap_rand, ins_rand, backtr], ignore_index=True)
positive_samples

# Negative Sample Generation

### Slide and concatenate approach

In [None]:
# Deep copy so the changes made to `conc` below do not alter `positive_samples`.
conc = copy.deepcopy(positive_samples)

In [None]:
# Slide the premises down by one row so each hypothesis is paired with the previous row's premise.
conc['premise_tr'] = conc['premise'].shift(1)
# shift(1) leaves row 0 empty; fill it with row 1's premise. A single .loc call on the frame
# is used instead of chained indexing (conc[...].loc[...] = ...), which may write to a
# temporary object and leave `conc` unchanged.
conc.loc[0, 'premise_tr'] = conc.loc[1, 'premise']
conc.head()

In [None]:
# Negative pairs: the slid premise next to the untouched hypothesis, every row labelled 0.
slide = pd.DataFrame({
    'premise': conc['premise_tr'],
    'hypothesis': conc['hypothesis'],
    'label': 0,
})
slide.head()

In [None]:
slide.shape

### Antonym Replacement

In [None]:
# Antonym augmenter. This rebinds `aug`, which previously held the SynonymAug instance, so
# re-running the synonym cell after this one would use the antonym augmenter instead.
aug = naw.AntonymAug(name='Antonym_Aug', aug_min=1, aug_max=10, aug_p=0.3, lang='eng', stopwords=None, tokenizer=None,
                     reverse_tokenizer=None, stopwords_regex=None, verbose=0)

# Quick smoke call on a short phrase; the result is not used elsewhere in the visible notebook.
test_sentence_aug = aug.augment("very beautiful")

In [None]:
# Negative samples: antonym-augmented premises, every row labelled 0.
antonyms = copy.deepcopy(df)
# NOTE(review): if the augmenter hands back a premise unchanged (presumably when no antonym
# is found — confirm), that row is still labelled 0 here; consider dropping such rows.
antonyms['premise'] = antonyms['premise'].map(lambda x:aug.augment(x)[0])
antonyms['label'] = 0
antonyms.head()

# Model Training

In [None]:
final_dataset = pd.concat([positive_samples, slide, antonyms], ignore_index=True)

In [None]:
final_dataset

In [None]:
# One text per row: premise and hypothesis joined by the literal string "[sep]" (no spaces).
# NOTE(review): presumably meant as a separator token; whether the encoder's tokenizer treats
# lowercase "[sep]" as special is not established here — confirm.
x = final_dataset["premise"].astype(str) + "[sep]" +  final_dataset["hypothesis"].astype(str)

In [None]:
!pip install -q sentence_transformers

In [None]:
from sentence_transformers import SentenceTransformer
# Load the pretrained sentence encoder and embed every combined premise/hypothesis text.
model = SentenceTransformer("distilbert-base-nli-mean-tokens")
train_statements_embeddings = model.encode(list(x))


In [None]:
from sklearn.model_selection import train_test_split
# Row-wise 80/20 split with a fixed seed.
# NOTE(review): final_dataset stacks several augmented variants of each `df` row, so variants
# of the same original pair can land on both sides of this split — consider a grouped split.
x_train, x_test, y_train, y_test = train_test_split(train_statements_embeddings, final_dataset['label'], test_size=0.2, random_state=42)


Accuracy

In [None]:
from sklearn.linear_model import LogisticRegression
# Fit a logistic-regression classifier on the training embeddings and report its
# score on the held-out split.
l_model = LogisticRegression(solver='liblinear', random_state=0)
l_model.fit(x_train, y_train)
l_model.score(x_test, y_test)

F1 Score

In [None]:
# Predict on the held-out split; `y_preds` is reused by the recall cell below.
y_preds = l_model.predict(x_test)
from sklearn.metrics import f1_score
# NOTE(review): f1_score is called with its default averaging, which presumably requires the
# labels to take exactly two values — confirm against the label set in `final_dataset`.
print(f1_score(y_test,y_preds))

Recall Score

In [None]:
from sklearn.metrics import recall_score
print(recall_score(y_test,y_preds))