In [1]:
pip install datasets evaluate transformers[sentencepiece]


Collecting datasets
  Downloading datasets-2.20.0-py3-none-any.whl.metadata (19 kB)
Collecting evaluate
  Downloading evaluate-0.4.2-py3-none-any.whl.metadata (9.3 kB)
Collecting pyarrow>=15.0.0 (from datasets)
  Downloading pyarrow-17.0.0-cp310-cp310-manylinux_2_28_x86_64.whl.metadata (3.3 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting requests>=2.32.2 (from datasets)
  Downloading requests-2.32.3-py3-none-any.whl.metadata (4.6 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.4.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.5.0,>=2023.1.0 (from fsspec[http]<=2024.5.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.5.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-2.20.0-py3-none-any.whl (547 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━

In [2]:
# Import the necessary libraries
import nltk
import nltk
import torch
import transformers
from datasets import load_dataset
import evaluate
import numpy as np
from transformers import T5Tokenizer, DataCollatorForSeq2Seq
from transformers import T5ForConditionalGeneration, Seq2SeqTrainingArguments, Seq2SeqTrainer

#### Load and split the dataset
- Load the Quora Question Answer Dataset from Hugging Face
- Split the dataset into training and testing sets (80% train, 20% test)

In [3]:
# Load the dataset and hold out 20% of the examples for evaluation.
# The split is seeded so that re-running the notebook yields the same train/test
# partition, which keeps the scores of the different models comparable across runs.
dataset = load_dataset("toughdata/quora-question-answer-dataset")
dataset = dataset["train"].train_test_split(test_size=0.2, seed=42)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Downloading readme:   0%|          | 0.00/485 [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/60.9M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/56402 [00:00<?, ? examples/s]

#### Load the tokenizer and model
Use the T5 tokenizer and the Flan-T5 base model (`google/flan-t5-base`) from Hugging Face's transformers library.

In [4]:
# The tokenizer and the model weights are taken from the same checkpoint
MODEL_CHECKPOINT = "google/flan-t5-base"
tokenizer = T5Tokenizer.from_pretrained(MODEL_CHECKPOINT)
model = T5ForConditionalGeneration.from_pretrained(MODEL_CHECKPOINT)

tokenizer_config.json:   0%|          | 0.00/2.54k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.20k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.42M [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


config.json:   0%|          | 0.00/1.40k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/990M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

#### Load the data collator
- The data collator dynamically pads the inputs and labels to the longest sequence in the batch

In [5]:
# Task prefix that is prepended to every question before tokenization
prefix = "answer the question: "

# Batch collator for the seq2seq model; it is handed both the tokenizer and the model
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)


In [6]:
# Apply the preprocess function to the dataset
def preprocess_function(examples):
    """Tokenize a batch of question/answer pairs for seq2seq training.

    Every question is prepended with the module-level ``prefix`` and tokenized
    (truncated to 128 tokens) to form the model inputs. Every answer is
    tokenized as target text (truncated to 512 tokens) and its token ids are
    stored under the ``"labels"`` key of the returned encoding.
    """
    # Model inputs: the prefixed questions
    prompts = [prefix + question for question in examples["question"]]
    batch = tokenizer(prompts, truncation=True, max_length=128)

    # Labels: the token ids of the answers
    targets = tokenizer(text_target=examples["answer"], truncation=True, max_length=512)
    batch["labels"] = targets["input_ids"]
    return batch

# Run preprocess_function over the dataset in batched mode to add the tokenized fields
tokenized_dataset = dataset.map(preprocess_function, batched=True)

Map:   0%|          | 0/45121 [00:00<?, ? examples/s]

Map:   0%|          | 0/11281 [00:00<?, ? examples/s]

In [7]:
# Install the rouge_score library for evaluation
pip install rouge_score

Collecting rouge_score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: rouge_score
  Building wheel for rouge_score (setup.py) ... [?25l[?25hdone
  Created wheel for rouge_score: filename=rouge_score-0.1.2-py3-none-any.whl size=24934 sha256=2cee1e24ec02a61693db2bcd81dd8e932ed1c1bcc35381669ff76e4fc11af739
  Stored in directory: /root/.cache/pip/wheels/5f/dd/89/461065a73be61a532ff8599a28e9beef17985c9e9c31e541b4
Successfully built rouge_score
Installing collected packages: rouge_score
Successfully installed rouge_score-0.1.2


#### Set up ROUGE score for evaluation

In [8]:
# Set up Rouge score for evaluation
# nltk.sent_tokenize needs the Punkt sentence-tokenizer data. Recent NLTK releases
# look it up under the name "punkt_tab" instead of "punkt", so both resources are
# requested to keep this cell working regardless of the installed NLTK version.
for nltk_resource in ("punkt", "punkt_tab"):
    nltk.download(nltk_resource, quiet=True)
metric = evaluate.load("rouge")

Downloading builder script:   0%|          | 0.00/6.27k [00:00<?, ?B/s]

In [9]:
def compute_metrics(eval_preds):
    """Compute ROUGE scores from predicted and reference token ids.

    Args:
        eval_preds: pair ``(preds, labels)`` of token-id arrays. ``preds`` may
            also be a tuple whose first element holds the token ids. Positions
            equal to -100 are treated as padding in both arrays.

    Returns:
        The result of the ROUGE metric's ``compute`` call on the decoded texts.
    """
    preds, labels = eval_preds
    # Some models hand back extra outputs next to the token ids; keep only the ids
    if isinstance(preds, tuple):
        preds = preds[0]

    # -100 is an ignore marker, not a vocabulary id, so it has to be replaced by the
    # pad token before decoding. This is done for the predictions as well, since
    # they can also be padded with -100 when batches are gathered.
    preds = np.where(preds != -100, preds, tokenizer.pad_token_id)
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # rougeLSum expects newline after each sentence
    decoded_preds = ["\n".join(nltk.sent_tokenize(pred.strip())) for pred in decoded_preds]
    decoded_labels = ["\n".join(nltk.sent_tokenize(label.strip())) for label in decoded_labels]

    return metric.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)

In [10]:

# Fine-tuning configuration for the seq2seq trainer
training_args = Seq2SeqTrainingArguments(
    # checkpoint location and how many checkpoints are kept
    output_dir="./results",
    save_total_limit=3,
    push_to_hub=False,
    # optimisation settings
    num_train_epochs=2,
    learning_rate=3e-4,
    weight_decay=0.01,
    per_device_train_batch_size=8,
    # evaluation: once per epoch, scoring generated text
    evaluation_strategy="epoch",
    per_device_eval_batch_size=4,
    predict_with_generate=True,
)




In [None]:
# Wire the model, the tokenized splits, the collator and the metric function together
trainer = Seq2SeqTrainer(
    args=training_args,
    model=model,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    eval_dataset=tokenized_dataset["test"],
    train_dataset=tokenized_dataset["train"],
)

# Start fine-tuning
trainer.train()

Epoch,Training Loss,Validation Loss


In [None]:
# Evaluate the model with the trainer (its eval_dataset is the tokenized "test" split)
eval_results = trainer.evaluate()

# Print the evaluation results; the plotting cells read the 'eval_rouge*' entries from this dict
print("Evaluation Results:", eval_results)


In [None]:
import matplotlib.pyplot as plt

# Bar labels; the later per-model plotting cells reuse this list.
# Note that the bar labelled 'ROUGE-L' shows the 'eval_rougeLsum' entry.
rouge_names = ['ROUGE-1', 'ROUGE-2', 'ROUGE-L']

# Pull the three ROUGE values out of the evaluation results
rouge_1 = eval_results['eval_rouge1']
rouge_2 = eval_results['eval_rouge2']
rouge_l = eval_results['eval_rougeLsum']
scores = [rouge_1, rouge_2, rouge_l]

# Create a bar plot for ROUGE scores (explicit figure/axes instead of pyplot state)
fig, ax = plt.subplots(figsize=(10, 5))
ax.bar(rouge_names, scores, color=['skyblue', 'lightgreen', 'salmon'])
ax.set_xlabel('ROUGE Metrics')
ax.set_ylabel('Scores')
ax.set_title('ROUGE Scores for T5 Model on Quora Dataset')
plt.show()


# Load BERT tokenizer and model

In [None]:
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from transformers import EncoderDecoderModel

# Load BERT tokenizer and model.
# A sequence-classification head produces one vector of class scores per input, so it
# can neither be trained on answer token ids nor generate text to score with ROUGE.
# Two bert-base-uncased checkpoints are therefore combined into an encoder-decoder
# model, which can be trained and evaluated the same way as the T5 model above.
bert_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
bert_model = EncoderDecoderModel.from_encoder_decoder_pretrained(
    'bert-base-uncased', 'bert-base-uncased'
)

# BERT has no dedicated start/end-of-sequence tokens: decoding starts from [CLS] and
# stops at [SEP]. Both the model config and the generation config are updated so
# that training (label shifting) and generate() agree.
for bert_config in (bert_model.config, bert_model.generation_config):
    bert_config.decoder_start_token_id = bert_tokenizer.cls_token_id
    bert_config.eos_token_id = bert_tokenizer.sep_token_id
    bert_config.pad_token_id = bert_tokenizer.pad_token_id
bert_model.config.vocab_size = bert_model.config.decoder.vocab_size


# Preprocess function for BERT
def preprocess_function_bert(examples):
    """Tokenize prefixed questions as inputs and answers as labels (128 tokens max each).

    No padding is applied here; the seq2seq data collator pads every batch and
    masks the label padding so it is ignored by the loss.
    """
    inputs = [prefix + doc for doc in examples["question"]]
    model_inputs = bert_tokenizer(
        inputs, max_length=128, truncation=True, return_token_type_ids=False
    )
    labels = bert_tokenizer(
        examples["answer"], max_length=128, truncation=True, return_token_type_ids=False
    )
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs


def compute_metrics_bert(eval_preds):
    """ROUGE on generated text, decoded with the BERT tokenizer.

    The shared compute_metrics function decodes with the T5 tokenizer, whose
    vocabulary does not match the ids produced here.
    """
    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]
    # -100 marks ignored positions and is not a vocabulary id
    preds = np.where(preds != -100, preds, bert_tokenizer.pad_token_id)
    labels = np.where(labels != -100, labels, bert_tokenizer.pad_token_id)
    decoded_preds = bert_tokenizer.batch_decode(preds, skip_special_tokens=True)
    decoded_labels = bert_tokenizer.batch_decode(labels, skip_special_tokens=True)
    # rougeLsum expects one sentence per line
    decoded_preds = ["\n".join(nltk.sent_tokenize(pred.strip())) for pred in decoded_preds]
    decoded_labels = ["\n".join(nltk.sent_tokenize(label.strip())) for label in decoded_labels]
    return metric.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)


tokenized_dataset_bert = dataset.map(preprocess_function_bert, batched=True)

# Training arguments (own output directory so checkpoints do not mix with the T5 run)
training_args_bert = Seq2SeqTrainingArguments(
    output_dir='./results_bert',
    evaluation_strategy='epoch',
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=4,
    num_train_epochs=3,
    weight_decay=0.01,
    predict_with_generate=True,
    push_to_hub=False,
)

# Trainer
trainer_bert = Seq2SeqTrainer(
    model=bert_model,
    args=training_args_bert,
    train_dataset=tokenized_dataset_bert["train"],
    eval_dataset=tokenized_dataset_bert["test"],
    tokenizer=bert_tokenizer,
    data_collator=DataCollatorForSeq2Seq(bert_tokenizer, model=bert_model),
    compute_metrics=compute_metrics_bert
)

# Train BERT model
trainer_bert.train()


In [None]:
# Final evaluation pass for the BERT trainer
eval_results_bert = trainer_bert.evaluate()

# Pick the three ROUGE values out of the results, in plotting order
rouge_1_bert, rouge_2_bert, rouge_l_bert = (
    eval_results_bert[metric_key]
    for metric_key in ('eval_rouge1', 'eval_rouge2', 'eval_rougeLsum')
)
scores_bert = [rouge_1_bert, rouge_2_bert, rouge_l_bert]

# One bar per ROUGE variant
plt.figure(figsize=(10, 5))
plt.bar(rouge_names, scores_bert, color=['skyblue', 'lightgreen', 'salmon'])
plt.title('ROUGE Scores for BERT Model on Quora Dataset')
plt.ylabel('Scores')
plt.xlabel('ROUGE Metrics')
plt.show()


# Load GPT tokenizer and model

In [None]:
from transformers import GPT2Tokenizer, GPT2LMHeadModel, Trainer, TrainingArguments
from transformers import default_data_collator

# Load GPT tokenizer and model
gpt_tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
# GPT-2 ships without a padding token, so asking the tokenizer to pad fails.
# The end-of-text token is reused as padding (padded positions are masked below).
gpt_tokenizer.pad_token = gpt_tokenizer.eos_token
gpt_model = GPT2LMHeadModel.from_pretrained('gpt2')
gpt_model.config.pad_token_id = gpt_tokenizer.pad_token_id

GPT_QUESTION_MAX_TOKENS = 128
GPT_ANSWER_MAX_TOKENS = 128  # includes the end-of-text token appended to every answer
GPT_MAX_LENGTH = GPT_QUESTION_MAX_TOKENS + GPT_ANSWER_MAX_TOKENS


# Preprocess function for GPT
def preprocess_function_gpt(examples):
    """Build fixed-length causal-LM examples of the form ``question + answer + EOS``.

    A causal LM needs ``labels`` with the same shape as ``input_ids``. The
    question and the answer are therefore joined into a single sequence; the
    labels copy that sequence, with the question tokens and the padding set to
    -100 so that only the answer contributes to the loss and to the metrics.
    """
    # A newline separates the question from the answer that follows it
    questions = [prefix + doc + "\n" for doc in examples["question"]]
    question_ids = gpt_tokenizer(
        questions, max_length=GPT_QUESTION_MAX_TOKENS, truncation=True
    )["input_ids"]
    answer_ids = gpt_tokenizer(
        examples["answer"], max_length=GPT_ANSWER_MAX_TOKENS - 1, truncation=True
    )["input_ids"]

    model_inputs = {"input_ids": [], "attention_mask": [], "labels": []}
    for q_ids, a_ids in zip(question_ids, answer_ids):
        a_ids = a_ids + [gpt_tokenizer.eos_token_id]
        n_real = len(q_ids) + len(a_ids)
        n_pad = GPT_MAX_LENGTH - n_real
        model_inputs["input_ids"].append(q_ids + a_ids + [gpt_tokenizer.pad_token_id] * n_pad)
        model_inputs["attention_mask"].append([1] * n_real + [0] * n_pad)
        model_inputs["labels"].append([-100] * len(q_ids) + a_ids + [-100] * n_pad)
    return model_inputs


def preprocess_logits_for_metrics_gpt(logits, labels):
    """Reduce the logits to predicted token ids before the trainer accumulates them."""
    if isinstance(logits, tuple):
        logits = logits[0]
    return logits.argmax(dim=-1)


def compute_metrics_gpt(eval_preds):
    """ROUGE between the answer tokens and the model's next-token predictions.

    The predictions are teacher-forced (each one is conditioned on the reference
    prefix), not free-running generation. They are decoded with the GPT tokenizer;
    the shared compute_metrics function decodes with the T5 tokenizer instead.
    """
    preds, labels = eval_preds
    # The prediction made at position i is for the token at position i + 1
    preds = preds[:, :-1]
    labels = labels[:, 1:]
    answer_mask = labels != -100
    preds = np.where(answer_mask, preds, gpt_tokenizer.pad_token_id)
    labels = np.where(answer_mask, labels, gpt_tokenizer.pad_token_id)
    decoded_preds = gpt_tokenizer.batch_decode(preds, skip_special_tokens=True)
    decoded_labels = gpt_tokenizer.batch_decode(labels, skip_special_tokens=True)
    # rougeLsum expects one sentence per line
    decoded_preds = ["\n".join(nltk.sent_tokenize(pred.strip())) for pred in decoded_preds]
    decoded_labels = ["\n".join(nltk.sent_tokenize(label.strip())) for label in decoded_labels]
    return metric.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)


tokenized_dataset_gpt = dataset.map(preprocess_function_gpt, batched=True)

# Training arguments (own output directory so checkpoints do not mix with the other runs)
training_args_gpt = TrainingArguments(
    output_dir='./results_gpt',
    evaluation_strategy='epoch',
    learning_rate=5e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=4,
    num_train_epochs=3,
    weight_decay=0.01,
    push_to_hub=False,
)

# Trainer
trainer_gpt = Trainer(
    model=gpt_model,
    args=training_args_gpt,
    train_dataset=tokenized_dataset_gpt["train"],
    eval_dataset=tokenized_dataset_gpt["test"],
    tokenizer=gpt_tokenizer,
    # every example already has the same length, so plain stacking is enough
    data_collator=default_data_collator,
    preprocess_logits_for_metrics=preprocess_logits_for_metrics_gpt,
    compute_metrics=compute_metrics_gpt
)

# Train GPT model
trainer_gpt.train()


In [None]:
# Final evaluation pass for the GPT trainer
eval_results_gpt = trainer_gpt.evaluate()

# Pick the three ROUGE values out of the results, in plotting order
rouge_1_gpt, rouge_2_gpt, rouge_l_gpt = (
    eval_results_gpt[metric_key]
    for metric_key in ('eval_rouge1', 'eval_rouge2', 'eval_rougeLsum')
)
scores_gpt = [rouge_1_gpt, rouge_2_gpt, rouge_l_gpt]

# One bar per ROUGE variant
plt.figure(figsize=(10, 5))
plt.bar(rouge_names, scores_gpt, color=['skyblue', 'lightgreen', 'salmon'])
plt.title('ROUGE Scores for GPT Model on Quora Dataset')
plt.ylabel('Scores')
plt.xlabel('ROUGE Metrics')
plt.show()


In [None]:
# Side-by-side comparison of the ROUGE scores.
# Requires the T5, BERT and GPT evaluation cells above to have been run.
scores_t5 = [rouge_1, rouge_2, rouge_l]
scores_bert = [rouge_1_bert, rouge_2_bert, rouge_l_bert]
scores_gpt = [rouge_1_gpt, rouge_2_gpt, rouge_l_gpt]

# Scores and bar colour per model; the plot below is driven by these mappings,
# so adding a model only needs a new entry in each
scores = {
    "T5": scores_t5,
    "BERT": scores_bert,
    "GPT": scores_gpt
}
bar_colors = {"T5": 'skyblue', "BERT": 'lightgreen', "GPT": 'salmon'}

x = np.arange(len(rouge_names))  # one group of bars per ROUGE metric
width = 0.2  # the width of the bars

fig, ax = plt.subplots(figsize=(10, 6))
for position, (model_name, model_scores) in enumerate(scores.items()):
    # Centre the group of bars on the tick: offsets are -1, 0, +1 bar widths for three models
    offset = (position - (len(scores) - 1) / 2) * width
    ax.bar(x + offset, model_scores, width, label=model_name, color=bar_colors[model_name])

# Axis labels, title, tick labels and legend
ax.set_xlabel('ROUGE Metrics')
ax.set_ylabel('Scores')
ax.set_title('ROUGE Scores Comparison for T5, BERT, and GPT Models on Quora Dataset')
ax.set_xticks(x)
ax.set_xticklabels(rouge_names)
ax.legend()

fig.tight_layout()

plt.show()


# Implementing the PERT model (a self-contained pipeline that fine-tunes `google/flan-t5-base`)

In [None]:
import torch
from transformers import T5Tokenizer, T5ForConditionalGeneration, Seq2SeqTrainer, Seq2SeqTrainingArguments, DataCollatorForSeq2Seq
from datasets import load_dataset
import evaluate
import nltk
import numpy as np
import matplotlib.pyplot as plt

# Load the tokenizer and model
tokenizer = T5Tokenizer.from_pretrained("google/flan-t5-base")
model = T5ForConditionalGeneration.from_pretrained("google/flan-t5-base")

# Load the dataset and hold out 20% for evaluation; the split is seeded so the
# train/test partition is the same on every run
dataset = load_dataset("toughdata/quora-question-answer-dataset")
dataset = dataset["train"].train_test_split(test_size=0.2, seed=42)

# Define the prefix prepended to every question
prefix = "answer the question: "

# Preprocess function
def preprocess_function(examples):
    """Tokenize a batch of question/answer pairs for seq2seq training.

    Questions are prefixed with the module-level ``prefix`` and truncated to
    128 tokens; answers are tokenized as target text, truncated to 512 tokens,
    and their token ids are stored under ``"labels"``.
    """
    prompts = [prefix + question for question in examples["question"]]
    batch = tokenizer(prompts, truncation=True, max_length=128)
    targets = tokenizer(text_target=examples["answer"], truncation=True, max_length=512)
    batch["labels"] = targets["input_ids"]
    return batch

# Tokenize the dataset
tokenized_dataset = dataset.map(preprocess_function, batched=True)

# Data collator
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

# Load evaluation metric.
# nltk.sent_tokenize needs the Punkt sentence-tokenizer data. Recent NLTK releases
# look it up under the name "punkt_tab" instead of "punkt", so both are requested.
for nltk_resource in ("punkt", "punkt_tab"):
    nltk.download(nltk_resource, quiet=True)
metric = evaluate.load("rouge")

# Compute metrics
def compute_metrics(eval_preds):
    """Compute ROUGE scores from predicted and reference token ids.

    Args:
        eval_preds: pair ``(preds, labels)`` of token-id arrays. ``preds`` may
            also be a tuple whose first element holds the token ids. Positions
            equal to -100 are treated as padding in both arrays.

    Returns:
        The result of the ROUGE metric's ``compute`` call on the decoded texts.
    """
    preds, labels = eval_preds
    # Some models hand back extra outputs next to the token ids; keep only the ids
    if isinstance(preds, tuple):
        preds = preds[0]
    # -100 is an ignore marker, not a vocabulary id: replace it with the pad token
    # in the predictions as well as the labels before decoding
    preds = np.where(preds != -100, preds, tokenizer.pad_token_id)
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    # rougeLsum expects one sentence per line
    decoded_preds = ["\n".join(nltk.sent_tokenize(pred.strip())) for pred in decoded_preds]
    decoded_labels = ["\n".join(nltk.sent_tokenize(label.strip())) for label in decoded_labels]
    return metric.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)

# Training arguments.
# This run writes to its own output directory: "./results" is already used by the
# earlier training cells, and sharing it would mix the checkpoints of different runs.
training_args = Seq2SeqTrainingArguments(
    output_dir="./results_pert",
    evaluation_strategy="epoch",
    learning_rate=3e-4,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=4,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=2,
    predict_with_generate=True,
    push_to_hub=False
)

# Trainer setup
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

# Train the model
trainer.train()

# Final evaluation pass for this run
eval_results_pert = trainer.evaluate()

# Pick the three ROUGE values out of the results, in plotting order
rouge_1_pert, rouge_2_pert, rouge_l_pert = (
    eval_results_pert[metric_key]
    for metric_key in ('eval_rouge1', 'eval_rouge2', 'eval_rougeLsum')
)

# Bar labels and the matching scores
rouge_names = ['ROUGE-1', 'ROUGE-2', 'ROUGE-L']
scores_pert = [rouge_1_pert, rouge_2_pert, rouge_l_pert]

# One bar per ROUGE variant
plt.figure(figsize=(10, 5))
plt.bar(rouge_names, scores_pert, color=['skyblue', 'lightgreen', 'salmon'])
plt.title('ROUGE Scores for PERT Model on Quora Dataset')
plt.ylabel('Scores')
plt.xlabel('ROUGE Metrics')
plt.show()
