# Importing Libraries

In [1]:
from transformers import T5TokenizerFast, T5ForConditionalGeneration , TrainingArguments, Trainer
import torch
from torch.utils.data import Dataset
import pandas as pd
from sklearn.model_selection import train_test_split
import evaluate  # Bleu



# Importing Model

In [2]:
# Load model and tokenizer
model_name = "google-t5/t5-base"
tokenizer = T5TokenizerFast.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name)

# Dataset Preprocessing

### Dataset Class

In [3]:
class QADataset(Dataset):
    def __init__(self, dataframe, tokenizer, max_length=512):

        self.data = dataframe
        self.questions = self.data["question"]
        self.answers = self.data["answer"]
        self.context = self.data["context"]
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.questions)

    def __getitem__(self, idx):
        question = self.questions[idx]
        answer = self.answers[idx]
        context = self.context[idx]

        # Tokenize the input (question and answer)
        question_tokenized = self.tokenizer(question, context, padding="max_length", max_length=self.max_length, 
                                            truncation=True, add_special_tokens=True)

        # Get the tokenized answer
        answer_tokenized = self.tokenizer(answer, context, padding="max_length", max_length=92, 
                                            truncation=True, add_special_tokens=True)

        labels = torch.tensor(answer_tokenized['input_ids'], dtype=torch.long)
        labels[labels ==0] = -100

        # Return the tokenized question-answer pair, attention mask, and the start/end positions
        return {
            'input_ids': torch.tensor(question_tokenized['input_ids'], dtype=torch.long),
            'attention_mask': torch.tensor(question_tokenized['attention_mask'], dtype=torch.long),
            'labels': labels,
            'decoder_attention_mask': torch.tensor(answer_tokenized['attention_mask'], dtype=torch.long)
        }

### Data Load

In [4]:
df = pd.read_csv('./qa_dataset_en.csv', nrows=4000) # read only 4000 rows
df.head(5)

Unnamed: 0,context,question,answer
0,More than half of the Makhzen's expenditures w...,Question: What were the consequences of the Ma...,Answer: The Makhzen's expenditures led to a de...
1,"In the 1890s, the French administration and mi...",Question: What were the main reasons behind th...,Answer: The main reasons behind the French ann...
2,"Morocco nominally was ruled by its sultan, the...",Question: What were the main reasons behind th...,Answer: The French saw Morocco as a strategic ...
3,General Hubert Lyautey wanted a more aggressiv...,Question: What was the outcome of the Algecira...,Answer: The Algeciras Conference of 1906 forma...
4,Morocco experienced a famine from 1903 to 1907...,Question: What were the main causes of the Mor...,Answer: The Moroccan Famine from 1903 to 1907 ...


In [5]:
df['question'] = df['question'].str.replace('Question:', '', regex=False) #removing 'Question:' keyword
df['answer'] = df['answer'].str.replace('Answer:', '', regex=False) #removing 'Answer:' keyword
df.head(5)

Unnamed: 0,context,question,answer
0,More than half of the Makhzen's expenditures w...,What were the consequences of the Makhzen's e...,The Makhzen's expenditures led to a deteriora...
1,"In the 1890s, the French administration and mi...",What were the main reasons behind the French ...,The main reasons behind the French annexation...
2,"Morocco nominally was ruled by its sultan, the...",What were the main reasons behind the French'...,The French saw Morocco as a strategic locatio...
3,General Hubert Lyautey wanted a more aggressiv...,What was the outcome of the Algeciras Confere...,The Algeciras Conference of 1906 formalized F...
4,Morocco experienced a famine from 1903 to 1907...,What were the main causes of the Moroccan Fam...,The Moroccan Famine from 1903 to 1907 was cau...


### Data Split

In [6]:
#data split
train_data, val_data = train_test_split(df, test_size=0.2, random_state=42)

# Reset indices
train_data = train_data.reset_index(drop=True)
val_data = val_data.reset_index(drop=True)

#print(train_data['question'])
print("Train Data: ", len(train_data))
print("Val Data: ", len(val_data))

train_dataset = QADataset(train_data, tokenizer)
val_dataset = QADataset(val_data, tokenizer)

Train Data:  2892
Val Data:  724


# Model Training

### Training Arguments

In [7]:
training_args = TrainingArguments(
    output_dir='./results',     
    num_train_epochs=5,        
    per_device_train_batch_size=4,  
    per_device_eval_batch_size=4,   
    warmup_steps=100,               
    weight_decay=0.03,
    learning_rate=0.00001,
    logging_steps=100,
    eval_steps=200,
    eval_strategy="steps",
    save_steps=200,
    load_best_model_at_end=True    
)

### Train

In [8]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
)
trainer.train()

Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


Step,Training Loss,Validation Loss
200,1.1196,0.934206
400,0.8914,0.793696
600,0.8371,0.740579
800,0.8001,0.716435
1000,0.7859,0.701673
1200,0.7741,0.707219
1400,0.7305,0.684484
1600,0.7183,0.678327
1800,0.7024,0.672765
2000,0.7162,0.667945


There were missing keys in the checkpoint model loaded: ['encoder.embed_tokens.weight', 'decoder.embed_tokens.weight', 'lm_head.weight'].


TrainOutput(global_step=3615, training_loss=0.7641356102823387, metrics={'train_runtime': 1292.1619, 'train_samples_per_second': 11.191, 'train_steps_per_second': 2.798, 'total_flos': 8805531097497600.0, 'train_loss': 0.7641356102823387, 'epoch': 5.0})

### Model Save

In [9]:
model.save_pretrained("t5_model")
tokenizer.save_pretrained("t5_tokenizer")

('t5_tokenizer\\tokenizer_config.json',
 't5_tokenizer\\special_tokens_map.json',
 't5_tokenizer\\spiece.model',
 't5_tokenizer\\added_tokens.json',
 't5_tokenizer\\tokenizer.json')

### Load Trained Model

In [10]:
model = T5ForConditionalGeneration.from_pretrained("t5_model")
tokenizer = T5TokenizerFast.from_pretrained("t5_tokenizer")

# Model Evaluation

In [37]:
def predict_answer(context, question, ref_answer=None):
    inputs = tokenizer(question, context, max_length=512, padding="max_length", truncation=True, add_special_tokens=True)
    input_ids = torch.tensor(inputs["input_ids"], dtype=torch.long).to(model.device).unsqueeze(0)
    attention_mask = torch.tensor(inputs["attention_mask"], dtype=torch.long).to(model.device).unsqueeze(0)

    outputs = model.generate(input_ids=input_ids, attention_mask=attention_mask)
  
    predicted_answer = tokenizer.decode(outputs.flatten(), skip_special_tokens=True)
    
    if ref_answer:   
        print("Context: ", context)
        print("Question: ", question)
        print("\nReference answer: ", ref_answer)
        print("\nPredicted answer: ", predicted_answer)


In [38]:
context =  df.iloc[50]['context']
question =  df.iloc[50]['question']
answer =  df.iloc[50]['answer']

predict_answer(context, question, answer)

Context:  The Greeks knew this town as Tingis and, with some modification, record the Berber legends of its founding. Supposedly Tinjis, daughter of Atlas and widow of Antaeus, slept with Hercules and bore him the son Syphax. After Tinjis' death, Syphax then founded the port and named it in her honour.[13] The gigantic skeleton and tomb of Antaeus were tourist attractions for ancient visitors.[13] The Caves of Hercules, where he supposedly rested on Cape Spartel during his labors, remain one today.[citation needed]

Question:   Who was the daughter of Atlas and widow of Antaeus in Greek mythology?

Reference answer:   The daughter of Atlas and widow of Antaeus in Greek mythology was Tinjis.

Predicted answer:  The daughter of Atlas and widow of Antaeus in Greek mythology was Tinjis


In [39]:
context =  df.iloc[250]['context']
question =  df.iloc[250]['question']
answer =  df.iloc[250]['answer']

predict_answer(context, question, answer)

Context:  Things soon began to fall apart.  A nine-year plague enveloped Morocco in 1598–1607, weakening the country tremendously, and taking al-Mansur in 1603.[100]  His successor Abu Faris Abdallah was acclaimed in Marrakesh, but the jurists of Fez elevated his brother Zidan al-Nasir instead. Zidan managed to prevail and entered Marrakesh in 1609.  But now another brother, Muhammad al-Sheikh al-Ma'mun revolted in the north, and soon Zidan was reduced to Marrakesh.[101]  As Saadian power buckled, Morocco fell into anarchy and fragmented into smaller pieces for much of the next century.  Zidan was driven out of Marrakesh by a religious leader, the self-proclaimed mahdi Ahmed ibn Abi Mahalli in 1612, and was restored only in 1614 with the assistance of another religious leader, Yahya ibn Abdallah, a Sufi marabout from the High Atlas, who subsequently tried to exert his own power over the city from 1618 until his death in 1626.  Zidan somehow found the time and resources during all this 

In [40]:
context =  df.iloc[300]['context']
question =  df.iloc[300]['question']
answer =  df.iloc[300]['answer']

predict_answer(context, question, answer)

Context:  
Casablanca remained a modestly sized port, with a population reaching around 12,000 within a few years of the French conquest and arrival of French colonialists in the town, at first administrators within a sovereign sultanate, in 1906. By 1921, this was to rise to 110,000,[13] largely through the development of bidonvilles.
Question:   What was the population of Casablanca during the French colonial period?

Reference answer:   The population of Casablanca during the French colonial period was around 12,000 in 1906, which grew to 110,000 by 1921.

Predicted answer:  Casablanca's population during the French colonial period was around 12,000 people.


In [43]:
context =  df.iloc[165]['context']
question =  df.iloc[165]['question']
answer =  df.iloc[165]['answer']

predict_answer(context, question, answer)

Context:  Rabat features a Mediterranean climate (Csa) with warm to hot, dry summers and mild, damp winters. Located along the Atlantic Ocean, Rabat has a mild, temperate climate, shifting from cool in winter to warm days in the summer months. The nights are always cool (or cold in winter, it can reach sub 0 °C (32 °F) sometimes), with daytime temperatures generally rising about 7–8 °C (13–14 °F). The winter highs typically reach only 17.2 °C (63.0 °F) in December–February. Summer daytime highs usually hover around 25 °C (77.0 °F), but may occasionally exceed 30 °C (86.0 °F), especially during heat waves. Summer nights are usually pleasant and cool, ranging between 11 °C (51.8 °F) and 19 °C (66.2 °F) and rarely exceeding 20 °C (68.0 °F). Rabat belongs to the sub-humid bioclimatic zone with an average annual precipitation of 560 mm (22 in).

Question:   What is the climate like in Rabat, Morocco?

Reference answer:   Rabat has a Mediterranean climate with warm to hot, dry summers and mi

In [42]:
context =  df.iloc[210]['context']
question =  df.iloc[210]['question']
answer =  df.iloc[210]['answer']

predict_answer(context, question, answer)

Context:  The history of Marrakesh, a city in southern Morocco, stretches back nearly a thousand years. The country of Morocco itself is named after it.

Question:   What is the name of the country that Marrakesh is located in?

Reference answer:   Morocco.

Predicted answer:  The name of the country that Marrakesh is located in is Morocco.
