# Importing Libraries

In [14]:
from transformers import T5TokenizerFast, T5ForConditionalGeneration , TrainingArguments, Trainer
import torch
from torch.utils.data import Dataset
import pandas as pd
from sklearn.model_selection import train_test_split

# Importing Model

In [2]:
# Load model and tokenizer
model_name = "google-t5/t5-base"
tokenizer = T5TokenizerFast.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name)

# Dataset Preprocessing

### Dataset Class

In [13]:
class QADataset(Dataset):
    def __init__(self, dataframe, tokenizer, max_length=512):

        self.data = dataframe
        self.questions = self.data["question"]
        self.answers = self.data["answer"]
        self.context = self.data["context"]
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.questions)

    def __getitem__(self, idx):
        question = self.questions[idx]
        answer = self.answers[idx]
        context = self.context[idx]

        # Tokenize the input (question and answer)
        question_tokenized = self.tokenizer(question, context, padding="max_length", max_length=self.max_length, 
                                            truncation=True, add_special_tokens=True)

        # Get the tokenized answer
        answer_tokenized = self.tokenizer(answer, context, padding="max_length", max_length=128, 
                                            truncation=True, add_special_tokens=True)

        labels = torch.tensor(answer_tokenized['input_ids'], dtype=torch.long)
        labels[labels ==0] = -100

        # Return the tokenized question-answer pair, attention mask, and the start/end positions
        return {
            'input_ids': torch.tensor(question_tokenized['input_ids'], dtype=torch.long),
            'attention_mask': torch.tensor(question_tokenized['attention_mask'], dtype=torch.long),
            'labels': labels,
            'decoder_attention_mask': torch.tensor(answer_tokenized['attention_mask'], dtype=torch.long)
        }

### Data Load

In [4]:
df = pd.read_csv('./qa_dataset_en.csv', nrows=4000) # read only 4000 rows
df.head(5)

Unnamed: 0,context,question,answer
0,More than half of the Makhzen's expenditures w...,Question: What were the consequences of the Ma...,Answer: The Makhzen's expenditures led to a de...
1,"In the 1890s, the French administration and mi...",Question: What were the main reasons behind th...,Answer: The main reasons behind the French ann...
2,"Morocco nominally was ruled by its sultan, the...",Question: What were the main reasons behind th...,Answer: The French saw Morocco as a strategic ...
3,General Hubert Lyautey wanted a more aggressiv...,Question: What was the outcome of the Algecira...,Answer: The Algeciras Conference of 1906 forma...
4,Morocco experienced a famine from 1903 to 1907...,Question: What were the main causes of the Mor...,Answer: The Moroccan Famine from 1903 to 1907 ...


In [5]:
df['question'] = df['question'].str.replace('Question:', '', regex=False) #removing 'Question:' keyword
df['answer'] = df['answer'].str.replace('Answer:', '', regex=False) #removing 'Answer:' keyword
df.head(5)

Unnamed: 0,context,question,answer
0,More than half of the Makhzen's expenditures w...,What were the consequences of the Makhzen's e...,The Makhzen's expenditures led to a deteriora...
1,"In the 1890s, the French administration and mi...",What were the main reasons behind the French ...,The main reasons behind the French annexation...
2,"Morocco nominally was ruled by its sultan, the...",What were the main reasons behind the French'...,The French saw Morocco as a strategic locatio...
3,General Hubert Lyautey wanted a more aggressiv...,What was the outcome of the Algeciras Confere...,The Algeciras Conference of 1906 formalized F...
4,Morocco experienced a famine from 1903 to 1907...,What were the main causes of the Moroccan Fam...,The Moroccan Famine from 1903 to 1907 was cau...


### Data Split

In [6]:
#data split
train_data, val_data = train_test_split(df, test_size=0.2, random_state=42)

# Reset indices
train_data = train_data.reset_index(drop=True)
val_data = val_data.reset_index(drop=True)

#print(train_data['question'])
print("Train Data: ", len(train_data))
print("Val Data: ", len(val_data))

train_dataset = QADataset(train_data, tokenizer)
val_dataset = QADataset(val_data, tokenizer)

Train Data:  2892
Val Data:  724


# Model Training

### Training Arguments

In [7]:
training_args = TrainingArguments(
    output_dir='./results',     
    num_train_epochs=3,        
    per_device_train_batch_size=8,  
    per_device_eval_batch_size=8,   
    warmup_steps=100,               
    weight_decay=0.03,
    learning_rate=0.00001,
    logging_steps=100,
    eval_steps=300,
    eval_strategy="steps",
    save_steps=300,
    load_best_model_at_end=True    
)

### Train

In [8]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
)
trainer.train()

Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


Step,Training Loss,Validation Loss
300,0.0879,0.049377
600,0.0469,0.035549
900,0.0407,0.032908


There were missing keys in the checkpoint model loaded: ['encoder.embed_tokens.weight', 'decoder.embed_tokens.weight', 'lm_head.weight'].


TrainOutput(global_step=1086, training_loss=0.2423902014561999, metrics={'train_runtime': 1021.6739, 'train_samples_per_second': 8.492, 'train_steps_per_second': 1.063, 'total_flos': 5283318658498560.0, 'train_loss': 0.2423902014561999, 'epoch': 3.0})

### Model Save

In [9]:
model.save_pretrained("t5_model")
tokenizer.save_pretrained("t5_tokenizer")

('t5_tokenizer\\tokenizer_config.json',
 't5_tokenizer\\special_tokens_map.json',
 't5_tokenizer\\spiece.model',
 't5_tokenizer\\added_tokens.json',
 't5_tokenizer\\tokenizer.json')

### Load Trained Model

In [10]:
model = T5ForConditionalGeneration.from_pretrained("t5_model")
tokenizer = T5TokenizerFast.from_pretrained("t5_tokenizer")

# Model Evaluation

In [11]:
def predict_answer(context, question, ref_answer=None):
    inputs = tokenizer(question, context, max_length=512, padding="max_length", truncation=True, add_special_tokens=True)
    input_ids = torch.tensor(inputs["input_ids"], dtype=torch.long).to(model.device).unsqueeze(0)
    attention_mask = torch.tensor(inputs["attention_mask"], dtype=torch.long).to(model.device).unsqueeze(0)

    outputs = model.generate(input_ids=input_ids, attention_mask=attention_mask)
  
    predicted_answer = tokenizer.decode(outputs.flatten(), skip_special_tokens=True)
    
    if ref_answer:
        # Load the Bleu metric
        bleu = evaluate.load("google_bleu")
        score = bleu.compute(predictions=[predicted_answer], 
                            references=[ref_answer])
    
        print("Context: ", context)
        print("Question: ", question)
        print("\nReference answer: ", ref_answer)
        print("\nPredicted answer: ", predicted_answer)
        print("\nBLEU Score: ", score)

In [12]:
import evaluate  # Bleu
context =  df.iloc[100]['context']
question =  df.iloc[100]['question']
answer =  df.iloc[100]['answer']

predict_answer(context, question, answer)



Context:  Tangier offers four types of education systems: Arabic, French, Spanish and English. Each offers classes starting from pre-Kindergarten up to the 12th grade, as for German in the three last years of high school. The Baccalauréat, or high school diploma are the diplomas offered after clearing the 12 grades.

Question:   What are the four types of education systems offered in Tangier?

Reference answer:   The four types of education systems offered in Tangier are Arabic, French, Spanish, and English. Each offers classes starting from pre-Kindergarten up to the 12th grade, with the Baccalauréat being the diploma offered after clearing the 12th grade.

Predicted answer:  What are the four types of education systems offered in Tangier?

BLEU Score:  {'google_bleu': 0.16470588235294117}
