In [None]:
!pip install transformers
!pip install torch
!pip install datasets

In [None]:
# Base folder on the mounted Google Drive. Other paths in this notebook are built
# by string concatenation onto it, so the trailing slash is required.
mydir = "/content/drive/MyDrive/Dataset/"

In [None]:
import pandas as pd
from google.colab import drive
# Mount Google Drive at /content/drive so the files under `mydir` are reachable.
drive.mount("/content/drive")

Mounted at /content/drive


In [None]:
# Load the raw recipes table; the cells shown in this notebook only read its
# "steps" column.
step = pd.read_csv(mydir + "RAW_recipes.csv")

In [None]:
# Empty single-column frame that will receive the filtered recipe sentences.
df = pd.DataFrame(columns=["text"])

In [None]:
# Filter used to keep only the clean sentences from the original dataset;
# the resulting corpus is used to train the model on the culinary domain.
def is_valid_sentence(i):
    """Return True when the text fragment `i` looks like a clean sentence.

    A fragment is rejected when any of the following holds:
      * it has two characters or fewer;
      * it starts with "s ", "t " or "n ";
      * its first character is a comma, double quote, colon, hyphen or space;
      * its second character is a double quote;
      * its last character is a hyphen or a space;
      * it contains '?', '!' or '~' anywhere.
    """
    if len(i) <= 2:
        return False

    if i.startswith(("s ", "t ", "n ")):
        return False

    first_char, second_char, last_char = i[0], i[1], i[-1]
    bad_start = first_char in (",", '"', ":", "-", " ")
    bad_end = last_char in ("-", " ")
    if bad_start or second_char == '"' or bad_end:
        return False

    return not any(mark in i for mark in ("?", "!", "~"))

In [None]:
# Collect the accepted sentences in a plain list and build the frame once at the
# end: appending with df.loc[len(df)] = ... gets slower with every row and is
# very expensive at ~150k rows.
sentences = df["text"].tolist()   # keep rows already in df so a re-run behaves as before
l = len(sentences)
for x in step["steps"][0:200000]:
  if(l>150000):                 #this is to get at most ~150k rows (checked once per recipe, so it can overshoot slightly)
    break

  # Assumes each entry is the string form of a Python list of quoted steps:
  # x[1:-2] drops the leading "[" and the trailing "']", and splitting on "', "
  # leaves each piece with its opening quote still attached.
  # NOTE(review): is_valid_sentence receives the piece with that first character
  # still attached (it is only dropped when storing, via s[1:]), so its
  # first-character checks look at that character rather than at the first
  # letter of the sentence - confirm this is intended.
  for s in x[1:-2].split("', "):
    if(is_valid_sentence(s)):
      z = len(s.split(" "))
      if(z>3 and z<20):         #discard the sentences with too few or too many words
        sentences.append(s[1:])
        l += 1

df = pd.DataFrame({"text": sentences})

In [None]:
# Number of sentences collected. It can end up slightly above 150000 because the
# cap is only checked once per recipe, not once per sentence.
len(df)

150007

In [None]:
# Persist the filtered sentences to Drive.
# NOTE(review): to_csv is called without index=False, so the row index is
# presumably written as an extra unnamed column that comes back on read_csv -
# consider index=False if nothing depends on that column.
df.to_csv(mydir + "final.csv")

## Training with the MLM task

In [None]:
# Reload the filtered sentence corpus saved earlier, so training can start from
# this cell without re-running the extraction.
df = pd.read_csv(mydir + "final.csv")

In [None]:
from transformers import T5Tokenizer, T5ForConditionalGeneration
import torch

# Load the model and the tokenizer.
# On the very first run the model was loaded from the stock "t5-small" checkpoint;
# presumably later runs resume from the fine-tuned checkpoint named below.
model_name = "moro01525/T5_MLM"
# The tokenizer always comes from "t5-small", independently of model_name.
tokenizer = T5Tokenizer.from_pretrained("t5-small")
model = T5ForConditionalGeneration.from_pretrained(model_name)

In [None]:
# Sanity check: parameter count of the loaded model.
model.num_parameters()

60506624

In [None]:
import random, spacy

# spaCy English pipeline; mask_text relies on its part-of-speech tags.
nlp = spacy.load("en_core_web_sm")
def mask_text(text, mask_token="<extra_id_0>"):
    """Return `text` with a few randomly chosen tokens replaced by `mask_token`.

    Nouns and verbs (spaCy POS tags NOUN / VERB) are preferred as masking
    targets; when the text has none, any token may be masked. The number of
    masks is drawn uniformly between 1 and max(1, 15% of the whitespace-split
    word count), and is never larger than the number of available candidates.

    Args:
        text: The sentence to mask.
        mask_token: The string inserted in place of every masked token.

    Returns:
        The masked sentence. Text with no tokens at all is returned unchanged.
    """
    doc = nlp(text)

    # Keep the token objects (not just their text) so each chosen token can be
    # replaced at its exact character position.
    candidates = [token for token in doc if token.pos_ in ('NOUN', 'VERB')]

    # If there is no NOUN or VERB, fall back to masking any token.
    if not candidates:
        candidates = list(doc)

    # Draw how many tokens to mask (at most 15% of the words, at least one) and
    # clamp it to the number of candidates: random.sample raises ValueError when
    # asked for more items than the population holds.
    words = text.split()
    max_masks = max(1, int(len(words) * 0.15))
    num_masks = min(random.randint(1, max_masks), len(candidates))
    masked_tokens = random.sample(candidates, num_masks)

    # Replace by character offset, right to left so earlier offsets stay valid.
    # A plain str.replace(word, ...) would hit the first substring match instead
    # (e.g. "pan" inside "pancake", or inside an already inserted mask token).
    masked_text = text
    for token in sorted(masked_tokens, key=lambda t: t.idx, reverse=True):
        start = token.idx
        end = start + len(token.text)
        masked_text = masked_text[:start] + mask_token + masked_text[end:]

    return masked_text

In [None]:
class CulinaryMLMDataset(torch.utils.data.Dataset):
    """Dataset that pairs a freshly masked sentence with its original text.

    Every access masks the sentence again with mask_text, so the same index can
    yield a different masked input on different calls.

    Args:
        texts: Indexable collection of sentences.
        tokenizer: Tokenizer used for both the masked input and the target.
        max_length: Length every sequence is padded / truncated to.
    """

    def __init__(self, texts, tokenizer, max_length=50):
        self.texts = texts
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of sentences."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Return input_ids, attention_mask and labels for sentence `idx`.

        Padding positions in `labels` are set to -100 so that they are left out
        of the loss.
        """
        original_text = self.texts[idx]
        masked_text = mask_text(original_text)

        inputs = self.tokenizer(masked_text, return_tensors='pt', truncation=True, padding='max_length', max_length=self.max_length)
        labels = self.tokenizer(original_text, return_tensors='pt', truncation=True, padding='max_length', max_length=self.max_length).input_ids

        # squeeze(0) removes only the batch dimension added by return_tensors='pt';
        # a bare squeeze() would also collapse the sequence axis when max_length == 1.
        input_ids = inputs['input_ids'].squeeze(0)
        attention_mask = inputs['attention_mask'].squeeze(0)
        labels = labels.squeeze(0)

        # Without this, the loss is also computed on the padding that fills most
        # of every max_length-long target.
        pad_id = getattr(self.tokenizer, 'pad_token_id', None)
        if pad_id is not None:
            labels[labels == pad_id] = -100

        return {'input_ids': input_ids, 'attention_mask': attention_mask, 'labels': labels}

In [None]:
# Training window: 30,000 rows per run. The slice bounds are edited by hand
# before each run ("epoch") so that every run sees different rows.
train = df['text'][0:30000].tolist()          #Every epoch the indexes are changed (30'000 rows every epoch)
# 5,000-row window used for evaluation during training.
evaluation = df['text'][95000:100000].tolist()
dataset = CulinaryMLMDataset(train, tokenizer, 50)
# NOTE(review): this name shadows Python's built-in eval(). It is also referenced
# where the Trainer is created, so a rename has to change both places together.
eval = CulinaryMLMDataset(evaluation, tokenizer, 50)

In [None]:
# Folder on Drive used as the Trainer output directory and as the target of
# trainer.save_model.
model_dir = mydir + "T5_MLM"

In [None]:
from transformers import Trainer, TrainingArguments

# Define the training arguments
training_args = TrainingArguments(
    # Checkpoints are written under model_dir.
    output_dir=model_dir,
    # One pass over the current training window per run of this notebook.
    num_train_epochs=1,
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=4,
    # Checkpoint every 10,000 steps, keeping at most the 2 most recent ones.
    save_steps=10_000,
    save_total_limit=2,
    # Evaluate and log every 1,000 steps.
    # NOTE(review): recent transformers releases appear to rename
    # evaluation_strategy to eval_strategy - confirm against the installed version.
    evaluation_strategy="steps",
    eval_steps=1000,
    logging_steps=1000,
)

# Trainer over the masked training dataset, evaluated on the held-out dataset.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset,
    eval_dataset=eval
)

In [None]:
# Run fine-tuning with the Trainer configured above.
trainer.train()

In [None]:
# Save the fine-tuned weights, then reload them from that same folder so that the
# `model` used by the evaluation cells is the checkpoint that was just trained.
# Loading model_name here instead would fetch the Hub copy, which has not yet
# received this run's weights (the push happens in a later cell).
trainer.save_model(model_dir)
model = T5ForConditionalGeneration.from_pretrained(model_dir)

In [None]:
!pip install huggingface_hub
from huggingface_hub import notebook_login
notebook_login()

In [None]:
# Upload the trainer's model to the Hugging Face Hub; the commit message records
# the metrics of the final run.
trainer.push_to_hub(commit_message='T5 dopo 4 epoch (finale), rouge1: 0.93, rouge2: 0.88, rougeL: 0.93; BLEU Score: 0.7; Perplexity: 34.5')

In [None]:
from transformers import T5ForConditionalGeneration, T5Tokenizer
# NOTE(review): load_dataset and load_metric are not used in the cells shown here,
# and load_metric appears to have been removed from newer `datasets` releases -
# confirm against the installed version, since the install cell does not pin one.
from datasets import load_dataset, load_metric
import torch

In [None]:
# Create the test dataset with masked words: each of the 1,000 held-out sentences
# (rows 102000-102999) is masked once. The frame is built in a single step rather
# than by appending one row at a time with ds.loc[len(ds)].
ds = pd.DataFrame({"text": [mask_text(sentence) for sentence in df[102000:103000]["text"]]})

In [None]:
def calculate_perplexity(model, tokenizer, sentences):
    """Return exp(mean per-sentence loss) of `model` over `sentences`.

    Each sentence is tokenized on its own and passed to the model with its own
    input ids as labels; the per-sentence losses are averaged with equal weight
    and the exponential of that average is returned as a Python float.

    Raises:
        ZeroDivisionError: if `sentences` is empty.
    """
    model.eval()

    def sentence_loss(sentence):
        # The same token ids serve as model input and as labels.
        encoded = tokenizer(sentence, return_tensors='pt')
        return model(**encoded, labels=encoded['input_ids']).loss.item()

    running_loss = 0.0
    with torch.no_grad():
        for sentence in sentences:
            running_loss += sentence_loss(sentence)

    mean_loss = running_loss / len(sentences)
    return torch.exp(torch.tensor(mean_loss)).item()

# Perplexity over the masked test sentences; calculate_perplexity uses each
# sentence both as the model input and as the labels.
perplexity = calculate_perplexity(model, tokenizer, ds["text"])
print(f'Perplexity: {perplexity}')

Perplexity: 34.53046417236328


In [None]:
from nltk.translate.bleu_score import sentence_bleu

def calculate_bleu_score(model, tokenizer, masked_sentences, original_sentences):
    """Return the average sentence-level BLEU of the model's reconstructions.

    For every (masked, original) pair the model generates text from the masked
    sentence (beam search with 5 beams, at most 50 tokens); the decoded output,
    split on whitespace, is scored against the whitespace-split original. The
    sum of the scores is divided by len(masked_sentences).
    """
    model.eval()

    def reconstruct(masked):
        # Generate from the masked sentence and decode the first returned sequence.
        encoded = tokenizer(masked, return_tensors='pt')
        generated = model.generate(encoded['input_ids'], max_length=50, num_beams=5, early_stopping=True)
        return tokenizer.decode(generated[0], skip_special_tokens=True)

    bleu_sum = 0.0
    with torch.no_grad():
        for masked, original in zip(masked_sentences, original_sentences):
            hypothesis = reconstruct(masked).split()
            reference = original.split()
            bleu_sum += sentence_bleu([reference], hypothesis)

    return bleu_sum / len(masked_sentences)

# BLEU of the model's reconstructions of the masked test sentences against the
# corresponding original sentences (rows 102000-102999 of df).
bleu_score = calculate_bleu_score(model, tokenizer, ds["text"], df[102000:103000]["text"])
print(f'BLEU Score: {bleu_score}')

BLEU Score: 0.6967825995856721


In [None]:
!pip install rouge_score

In [None]:
from rouge_score import rouge_scorer

def calculate_rouge_score(model, tokenizer, masked_sentences, original_sentences):
    """Return the average ROUGE-1, ROUGE-2 and ROUGE-L F-measures.

    For every (masked, original) pair the model generates text from the masked
    sentence (beam search with 5 beams, at most 50 tokens) and the decoded
    output is scored against the original with stemming enabled. Each metric's
    F-measures are summed and divided by len(masked_sentences).

    Returns:
        A dict with the keys 'rouge1', 'rouge2' and 'rougeL'.
    """
    model.eval()
    metric_names = ['rouge1', 'rouge2', 'rougeL']
    scorer = rouge_scorer.RougeScorer(metric_names, use_stemmer=True)
    totals = {name: 0.0 for name in metric_names}

    with torch.no_grad():
        for masked, original in zip(masked_sentences, original_sentences):
            encoded = tokenizer(masked, return_tensors='pt')
            generated = model.generate(encoded['input_ids'], max_length=50, num_beams=5, early_stopping=True)
            prediction = tokenizer.decode(generated[0], skip_special_tokens=True)
            pair_scores = scorer.score(original, prediction)
            for name in metric_names:
                totals[name] += pair_scores[name].fmeasure

    pair_count = len(masked_sentences)
    return {name: total / pair_count for name, total in totals.items()}

# Compute the ROUGE scores of the model's reconstructions of the masked test
# sentences against the corresponding original sentences.
rouge_scores = calculate_rouge_score(model, tokenizer, ds["text"], df[102000:103000]["text"])
print(f'ROUGE Scores: {rouge_scores}')

ROUGE Scores: {'rouge1': 0.9331053913306484, 'rouge2': 0.8797187884119245, 'rougeL': 0.9329877442718251}
