This notebook performs data augmentation on the high-quality data by back-translation: each intent is translated into Spanish and then back into English.

#### INSTALL DEPENDENCIES

In [1]:
!pip install transformers
!pip install sentencepiece
!pip install sacremoses

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.27.4-py3-none-any.whl (6.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.8/6.8 MB[0m [31m50.4 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.11.0
  Downloading huggingface_hub-0.13.3-py3-none-any.whl (199 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m199.8/199.8 KB[0m [31m17.2 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.2-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.6/7.6 MB[0m [31m30.0 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.13.3 tokenizers-0.13.2 transformers-4.27.4
Looking in indexes: https://pypi.org/simple, https://us

#### CLONE REPOSITORY AND IMPORT LIBRARIES

In [2]:
!git clone https://github.com/nelson-nsc/COMP0087-NLP-project.git

Cloning into 'ucl-nlp'...
remote: Enumerating objects: 173, done.[K
remote: Counting objects: 100% (36/36), done.[K
remote: Compressing objects: 100% (19/19), done.[K
remote: Total 173 (delta 21), reused 28 (delta 17), pack-reused 137[K
Receiving objects: 100% (173/173), 7.49 MiB | 12.90 MiB/s, done.
Resolving deltas: 100% (63/63), done.


In [3]:
import pandas as pd
from transformers import MarianMTModel, MarianTokenizer

#### DATA AUGMENTATION

In [5]:
## prepare the data to be augmented
TRAIN_CSV = '/content/COMP0087-NLP-project/data/conala-hq-marian/hq_train.csv'
df = pd.read_csv(TRAIN_CSV)

# plain Python list holding the 'intent' column, in row order
intent = df['intent'].tolist()

# define a method for adding the special language token to a batch of text
def format_text(code, txts):
    """Prefix every text in a batch with a ``>>code<<`` language token.

    Args:
        code: language code placed between ``>>`` and ``<<`` (e.g. ``'es'``).
        txts: iterable of input strings.

    Returns:
        A list with one ``'>>{code}<< {txt}'`` string per input, in input order.

    NOTE(review): the Hugging Face Marian documentation describes this prefix
    as a requirement of multilingual checkpoints; the checkpoints loaded in
    this notebook look bilingual (en-es / es-en), so the prefix may be
    unnecessary there -- confirm before changing, since it affects the
    generated data.
    """
    return [f'>>{code}<< {txt}' for txt in txts]


In [6]:
## define the two models

def _load_marian(checkpoint):
    """Load the tokenizer and then the model for `checkpoint`; the model is moved to 'cuda'."""
    tokenizer = MarianTokenizer.from_pretrained(checkpoint)
    model = MarianMTModel.from_pretrained(checkpoint).to('cuda')
    return tokenizer, model


# English -> Spanish
name1 = 'Helsinki-NLP/opus-mt-en-es'
eng2esp_tkn, eng2esp_model = _load_marian(name1)

# Spanish -> English
name2 = 'Helsinki-NLP/opus-mt-es-en'
esp2eng_tkn, esp2eng_model = _load_marian(name2)

Downloading (…)olve/main/source.spm:   0%|          | 0.00/802k [00:00<?, ?B/s]

Downloading (…)olve/main/target.spm:   0%|          | 0.00/826k [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/1.59M [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/44.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.47k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/312M [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/293 [00:00<?, ?B/s]

Downloading (…)olve/main/source.spm:   0%|          | 0.00/826k [00:00<?, ?B/s]

Downloading (…)olve/main/target.spm:   0%|          | 0.00/802k [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/1.59M [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/44.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.44k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/312M [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/293 [00:00<?, ?B/s]

In [7]:
## define translate method
def translate(txts, model, tokenizer, to_language):
    """Translate a batch of texts with a seq2seq model and its tokenizer.

    Args:
        txts: list of source strings.
        model: model exposing ``generate`` and ``device`` (the notebook passes
            ``MarianMTModel`` instances).
        tokenizer: tokenizer paired with ``model``; used both to encode the
            inputs and to decode the generated ids.
        to_language: language code handed to ``format_text`` to build the
            ``>>code<<`` prefix.

    Returns:
        A list of decoded strings, one per generated sequence; an empty list
        when ``txts`` is empty.
    """
    # nothing to translate: do not call the tokenizer/model on an empty batch
    if len(txts) == 0:
        return []

    # prepare the text data into appropriate format
    formated_texts = format_text(to_language, txts)

    # encode as one padded batch; truncation keeps over-long inputs within the
    # tokenizer's maximum length, and the tensors follow the model's device
    # instead of assuming 'cuda'
    encoded = tokenizer(
        formated_texts,
        return_tensors="pt",
        padding=True,
        truncation=True,
    ).to(model.device)

    # translate using model
    translated = model.generate(**encoded)

    # convert the generated token indices back into text
    return tokenizer.batch_decode(translated, skip_special_tokens=True)


In [8]:
## define method to perform back translation
def back_translate(txts):
    """Round-trip a batch of English texts through Spanish.

    Args:
        txts: list of English strings.

    Returns:
        The list produced by translating ``txts`` with the English->Spanish
        model and then translating that result with the Spanish->English model.
    """
    # first hop: english -> spanish
    via_spanish = translate(txts, eng2esp_model, eng2esp_tkn, 'es')

    # second hop: spanish -> english
    return translate(via_spanish, esp2eng_model, esp2eng_tkn, 'en')


In [9]:
## perform the augmentation
# back_translate accepts a list, so feed it mini-batches instead of issuing one
# model call per sentence.
BATCH_SIZE = 32  # sentences per back_translate call; lower it if GPU memory runs out

intent_aug = []
for start in range(0, len(intent), BATCH_SIZE):
    batch = intent[start:start + BATCH_SIZE]
    # keep the lower-casing of the round-tripped text
    intent_aug.extend(txt.lower() for txt in back_translate(batch))

# each source intent must have exactly one augmented counterpart, otherwise the
# pairing with the snippets further down would be misaligned
assert len(intent_aug) == len(intent)



In [None]:
# Pair each augmented intent with the snippet from the same row of the original data.
snippets = df['snippet'].tolist()

# Built from a dict of equal-length lists: pandas raises if the two lists differ
# in length, whereas zip() would silently drop the unmatched tail.
aug_intent_df = pd.DataFrame({'intent': intent_aug, 'snippet': snippets})

In [None]:
# save the augmented data (without the index column)
# NOTE(review): the target is under /content/drive, but no visible cell mounts
# Google Drive -- presumably it is mounted manually beforehand; confirm.
output_csv = '/content/drive/MyDrive/coNaLa-data/train_aug.csv'
aug_intent_df.to_csv(output_csv, index=False)