In [1]:
import numpy as np
import os
from typing import Optional
from transformers import MarianMTModel, MarianTokenizer
import pandas as pd
import torch
from typing import List
from torch import Generator
from peft import LoraConfig, TaskType
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from torch.utils.data import DataLoader, ConcatDataset, Subset, random_split, RandomSampler
# transformer
from transformers.optimization import AdamW, get_scheduler, SchedulerType
# native
from NlpAnalytics import *

# Device used for the tokenised batches further down.
# The availability probe and the device string now agree: the previous version
# tested for CUDA but then selected "mps", so a CUDA machine was handed a
# device it cannot use.
# NOTE(review): to run on Apple-silicon GPUs, probe
# torch.backends.mps.is_available() instead and also move the models to the
# device; the models in this notebook are loaded without an explicit .to(...).
MY_DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

  from .autonotebook import tqdm as notebook_tqdm
[nltk_data] Downloading package stopwords to /Users/lunli/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [8]:
# Load the training split and carve out a small labelled "support" sample.
np.random.seed(1)  # seeds NumPy's global RNG before the random sampling below
PATH = '/Users/lunli/Library/CloudStorage/GoogleDrive-yaojn19880525@gmail.com/My Drive/Colab Notebooks/'
DATASET_NAME = 'amazon'
train_csv = os.path.join(PATH, f'data/{DATASET_NAME}/amazon_train.csv')
df_train = pd.read_csv(train_csv)

# Four randomly drawn texts per label; the result is indexed by
# (label, original row index).
per_label_sample = df_train.groupby('label')['text'].apply(lambda s: s.sample(4))

# Flatten the index into columns: the unnamed original row index shows up as 'level_1'.
df_train_ = pd.DataFrame(per_label_sample).reset_index()
sup_index = list(df_train_['level_1'])

# Keep only the rows that were NOT drawn into the support sample.
df_train = df_train.loc[~df_train.index.isin(sup_index)]


In [10]:
# Write the per-label support sample to the working directory, without the row index.
df_train_.to_csv(path_or_buf='sup_train.csv', index=False)

In [11]:
# Load the validation and test splits from the same dataset folder.
df_valid = pd.read_csv(os.path.join(PATH, f'data/{DATASET_NAME}/amazon_valid.csv'))
df_test = pd.read_csv(os.path.join(PATH, f'data/{DATASET_NAME}/amazon_test.csv'))

# Remove the id / label_text columns from every split.
# errors='ignore' keeps this cell re-runnable: df_train is produced by an
# earlier cell, so on a second run its columns are already gone and a plain
# drop would raise KeyError.
_unused_cols = ['id', 'label_text']
df_train = df_train.drop(columns=_unused_cols, errors='ignore')
df_valid = df_valid.drop(columns=_unused_cols, errors='ignore')
df_test = df_test.drop(columns=_unused_cols, errors='ignore')

### Load tokenizer
tokenizer = BertLoader(load_tokenizer=True).tokenizer

# Wrap each split in a DatasetNLP that tokenizes 'text' and uses 'label' as target.
# NOTE: this rebinds df_train_, which until now held the per-label support
# sample DataFrame created when the training data was loaded.
df_train_ = DatasetNLP(input_df=df_train,
                    tokenizer=tokenizer,
                    cols_to_tokenize=['text'],
                    cols_label=['label'] )
df_valid_ = DatasetNLP(input_df=df_valid,
                    tokenizer=tokenizer,
                    cols_to_tokenize=['text'],
                    cols_label=['label'] )
df_test_ = DatasetNLP(input_df=df_test,
                    tokenizer=tokenizer,
                    cols_to_tokenize=['text'],
                    cols_label=['label'] )

In [12]:
# Checkpoint identifiers for the two translation directions used for back-translation.
first_model_name, second_model_name = (
    'Helsinki-NLP/opus-mt-en-fr',  # English -> French
    'Helsinki-NLP/opus-mt-fr-en',  # French -> English
)

In [13]:
### en -> fr: tokenizer first, then the pretrained model, both looked up by name
first_model_tkn, first_model = (
    MarianTokenizer.from_pretrained(first_model_name),
    MarianMTModel.from_pretrained(first_model_name),
)

### fr -> en: same pair for the reverse direction
second_model_tkn, second_model = (
    MarianTokenizer.from_pretrained(second_model_name),
    MarianMTModel.from_pretrained(second_model_name),
)



In [14]:
# ### sample text
# original_texts = [
#     "This article aims to perform the back translation for text data augmentation",
#     "It is the 25th article by Zoumana on Medium. He loves to give back to the community",
#     "The first model translates from English to French, which is a temporary process",
#     "The second model finally translates back all the temporary french text into English"
# ]

def chunks(text, chunk_size):
    """Cut `text` into consecutive slices of at most `chunk_size` items.

    Works on anything that supports `len()` and slicing (list, str, ...).
    The last slice is shorter when the length is not a multiple of
    `chunk_size`; an empty input yields an empty list.
    """
    return [
        text[start:start + chunk_size]
        for start in range(0, len(text), chunk_size)
    ]

# Generate translation using model (English -> French).
# Only the first 11000 training texts are translated, ten texts per batch.
train_text = list(df_train['text'].values)[:11000]
translated = []
for src_text in chunks(train_text,10):
    # Call the tokenizer directly instead of the deprecated
    # prepare_seq2seq_batch; padding and truncation are spelled out so the
    # batch is still padded to its longest member and over-long inputs are cut.
    # The batch is placed on the device the model actually lives on, which
    # avoids a device mismatch when the model was not moved to MY_DEVICE.
    batch = first_model_tkn(src_text, return_tensors="pt", padding=True, truncation=True).to(first_model.device)
    generated = first_model.generate(**batch)
    translation:List[str] = first_model_tkn.batch_decode(generated, skip_special_tokens=True)
    translated.extend(translation)

`prepare_seq2seq_batch` is deprecated and will be removed in version 5 of HuggingFace Transformers. Use the regular
`__call__` method to prepare your inputs and targets.

Here is a short example:

model_inputs = tokenizer(src_texts, text_target=tgt_texts, ...)

If you either need to use different keyword arguments for the source and target texts, you should do two calls like
this:

model_inputs = tokenizer(src_texts, ...)
labels = tokenizer(text_target=tgt_texts, ...)
model_inputs["labels"] = labels["input_ids"]

See the documentation of your specific tokenizer for more details on the specific arguments to the tokenizer of choice.
For a more complete example, see the implementation of `prepare_seq2seq_batch`.



In [15]:
# translated back (French -> English), ten texts per batch

translated_back = []
for text in chunks(translated,10):
    # Call the tokenizer directly instead of the deprecated
    # prepare_seq2seq_batch; padding and truncation are spelled out so the
    # batch is still padded to its longest member and over-long inputs are cut.
    # The batch is placed on the device the model actually lives on, which
    # avoids a device mismatch when the model was not moved to MY_DEVICE.
    batch_ = second_model_tkn(text, return_tensors="pt", padding=True, truncation=True).to(second_model.device)
    generated_back = second_model.generate(**batch_)
    translation_back:List[str] = second_model_tkn.batch_decode(generated_back, skip_special_tokens=True)
    translated_back.extend(translation_back)

In [16]:
# Lower-case every back-translated sentence.
translated_back = [sentence.lower() for sentence in translated_back]
# generated augmentation data

# def combine_texts(original_texts, back_translated_batch):
#     return set(original_texts + back_translated_batch)

# Trim leading/trailing spaces (only the space character) from the originals.
train_text = list(map(lambda raw: raw.strip(' '), train_text))

# final_augmented = combine_texts(train_text, translated_back)

In [17]:
# Pair every original text with its back-translation. Going through a dict
# means a repeated original text keeps a single row (holding its last translation).
ori_to_aug = dict(zip(train_text, translated_back))
final_train = pd.DataFrame(ori_to_aug.items()).set_axis(['ori_text', 'aug_text'], axis=1)
final_train

Unnamed: 0,ori_text,aug_text
0,wake me up at nine am on friday,wake me up at 9:00 a.m. on friday
1,set an alarm for two hours from now,set an alarm for two hours from now
2,olly quiet,it's very quiet.
3,stop,stop
4,olly pause for ten seconds,full break for 10 seconds
...,...,...
10954,check my email for new emails during the last ...,check my email for new emails during the last ...
10955,send the following email to my sister,send the following email to my sister
10956,did i get an email from mike,did i get an email from mike?
10957,has mike sent me an email,mike sent me an e-mail.


In [18]:
# Write the original/augmented text pairs to the working directory, without the row index.
final_train.to_csv(path_or_buf='final_train.csv', index=False)

In [None]:
def perform_translation(
        batch_texts : list, 
        model : MarianMTModel, 
        tokenizer : MarianTokenizer, 
        language : Optional[str]="fr",
        max_new_tokens : Optional[int]=128, 
        do_sample : Optional[bool]=True, 
        top_k : Optional[int]=50, 
        top_p : Optional[float]=0.9, 
        temperature : Optional[float]=0.9):
    """Translate a batch of texts with a Marian model using sampling-based decoding.

    Args:
        batch_texts: texts to translate.
        model: translation model used for generation.
        tokenizer: tokenizer that matches `model`.
        language: language code handed to `format_batch_texts` together with
            the texts (that helper is not defined in the visible cells --
            presumably it tags each text with the target language; verify).
        max_new_tokens: forwarded to `model.generate`.
        do_sample: forwarded to `model.generate`.
        top_k: forwarded to `model.generate`.
        top_p: forwarded to `model.generate`.
        temperature: forwarded to `model.generate`.

    Returns:
        A list with one decoded string per generated sequence; an empty list
        when `batch_texts` is empty.
    """
    # Nothing to translate: skip tokenisation/generation, which cannot handle an empty batch.
    if len(batch_texts) == 0:
        return []
    # Prepare the text data into appropriate format for the model
    formated_batch_texts = format_batch_texts(language, batch_texts)
    # Tokenise, cut over-long inputs to the model's limit, and place the
    # tensors on the same device as the model so generate() does not hit a
    # device mismatch.
    model_inputs = tokenizer(
        formated_batch_texts,
        return_tensors="pt",
        padding=True,
        truncation=True).to(model.device)
    # Generate translation using model
    translated = model.generate(**model_inputs,
        max_new_tokens=max_new_tokens, 
        do_sample=do_sample, 
        top_k=top_k, 
        top_p=top_p, 
        temperature=temperature)
    # Convert the generated tokens indices back into text
    return tokenizer.batch_decode(translated, skip_special_tokens=True)


In [None]:
def perform_translation_beam(
        batch_texts : list, 
        model : MarianMTModel, 
        tokenizer : MarianTokenizer, 
        language : Optional[str]="fr",
        max_new_tokens : Optional[int]=128):
    """Translate a batch of texts with diverse (grouped) beam search.

    Two sequences are generated per input and only the second one of each
    pair is returned.

    Args:
        batch_texts: texts to translate.
        model: translation model used for generation.
        tokenizer: tokenizer that matches `model`.
        language: language code handed to `format_batch_texts` together with
            the texts (that helper is not defined in the visible cells --
            presumably it tags each text with the target language; verify).
        max_new_tokens: forwarded to `model.generate`.

    Returns:
        A list of decoded strings, one per input text; an empty list when
        `batch_texts` is empty.
    """
    # Nothing to translate: skip tokenisation/generation, which cannot handle an empty batch.
    if len(batch_texts) == 0:
        return []
    # Number of sequences generated per input; also the stride used below to
    # pick one sequence per input, so the two values cannot drift apart.
    n_return = 2
    # Prepare the text data into appropriate format for the model
    formated_batch_texts = format_batch_texts(language, batch_texts)
    # Tokenise, cut over-long inputs to the model's limit, and place the
    # tensors on the same device as the model so generate() does not hit a
    # device mismatch.
    model_inputs = tokenizer(
        formated_batch_texts,
        return_tensors="pt",
        padding=True,
        truncation=True).to(model.device)
    # Generate translation using model
    translated = model.generate(**model_inputs,
        max_new_tokens=max_new_tokens, 
        do_sample=False,
        num_beam_groups=2,
        num_beams=4,
        diversity_penalty=100000.,
        num_return_sequences=n_return)
    # Convert the generated tokens indices back into text.
    # Rows 1, 3, 5, ... are kept, i.e. the last of every n_return rows; this
    # assumes generate() orders its output input by input -- TODO confirm.
    return tokenizer.batch_decode(translated[n_return - 1::n_return], skip_special_tokens=True)



In [None]:
# Sample sentences for the demo. They are defined here because the only other
# definition in the notebook is commented out, which made this cell fail with
# NameError on a fresh kernel.
original_texts = [
    "This article aims to perform the back translation for text data augmentation",
    "It is the 25th article by Zoumana on Medium. He loves to give back to the community",
    "The first model translates from English to French, which is a temporary process",
    "The second model finally translates back all the temporary french text into English"
]
# Check the model translation from the original language (English) to French
translated_texts = perform_translation_beam(original_texts, first_model, first_model_tkn)
# Perform the translation back to English
back_translated_texts  = perform_translation_beam(translated_texts, second_model, second_model_tkn)
# Print each original sentence followed by its back-translation
print('---------------------------------------------')
for org, new in zip(original_texts, back_translated_texts):
    print(org + '\n')
    print(new)
    print('---------------------------------------------')