**Importing quora-question-answer dataset**

In [None]:

import pandas as pd

df = pd.read_json("hf://datasets/toughdata/quora-question-answer-dataset/Quora-QuAD.jsonl", lines=True)
df.head()

In [None]:
df.describe()

In [None]:
df = df.dropna(subset=['question', 'answer'])

In [None]:
df.info()

**Data Preprocessing**

* Tokenize the text
* Remove stop words
* Apply lemmatization to reduce words to their base forms

In [None]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

In [None]:
#downloading necessary nltk data
# 'punkt' / 'punkt_tab' back word_tokenize (recent NLTK releases look up the
# 'punkt_tab' resource, older ones 'punkt'), 'stopwords' backs the stop-word
# list and 'wordnet' backs WordNetLemmatizer.
for nltk_resource in ('punkt', 'punkt_tab', 'stopwords', 'wordnet'):
    nltk.download(nltk_resource)

In [None]:
# Initialize the English stop-word set (a set gives fast membership checks in
# preprocess_text) and the WordNet lemmatizer; both are shared module-level objects.
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

In [None]:
#pre-processing text
def preprocess_text(text):
    """Normalize a text string for modelling.

    The text is lower-cased and tokenized; tokens that are not purely
    alphanumeric or that appear in ``stop_words`` are dropped, and the
    remaining tokens are lemmatized.

    Args:
        text: The raw string to clean.

    Returns:
        The surviving lemmas joined by single spaces.
    """
    lemmas = []
    for token in word_tokenize(text.lower()):
        # Skip punctuation / mixed-symbol tokens and stop words.
        if not token.isalnum() or token in stop_words:
            continue
        lemmas.append(lemmatizer.lemmatize(token))
    return ' '.join(lemmas)

df['processedQuestion'] = df['question'].apply(preprocess_text)
df['processedAnswer'] = df['answer'].apply(preprocess_text)

In [None]:
df.head()

In [None]:
df.info()

**Splitting the data into training and validation sets**

In [None]:
!pip install datasets
!pip install accelerate -U
!pip install --upgrade pip setuptools wheel

!pip install transformers
!pip install git+https://github.com/mlfoundations/open_lm.git


In [None]:
from sklearn.model_selection import train_test_split
from datasets import Dataset, DatasetDict
from open_lm.hf import *

# Hold out 20% of the rows for validation; the fixed random_state keeps the
# split identical across runs.
train_df, val_df = train_test_split(df, test_size=0.2, random_state=42)

# Wrap each pandas frame as a Hugging Face Dataset so it can be mapped and fed to Trainer.
train_dataset, val_dataset = (
    Dataset.from_pandas(frame) for frame in (train_df, val_df)
)

# Group the two splits under the names used by the training cells below.
datasets = DatasetDict({'train': train_dataset, 'validation': val_dataset})

***Using transformers model from Huggingface to test various NLP models on our dataset***

1. GPT model

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer, Trainer, TrainingArguments, AutoModelForSeq2SeqLM

# Load the GPT-2 tokenizer and the matching pretrained causal language model.
tokenizerGPT = AutoTokenizer.from_pretrained("gpt2")
modelGPT = AutoModelForCausalLM.from_pretrained("gpt2")

# Fixed-length padding needs a pad token: register one when the tokenizer has
# none and grow the embedding matrix to cover the enlarged vocabulary.
if tokenizerGPT.pad_token is None:
    tokenizerGPT.add_special_tokens({'pad_token': '[PAD]'})
    modelGPT.resize_token_embeddings(len(tokenizerGPT))

In [None]:
#tokenizing the input data
def preprocess_function(examples):
    """Tokenize a batch of question/answer pairs for causal-LM fine-tuning of GPT-2.

    A decoder-only model predicts the next token of its *own* input, so the
    labels must line up with ``input_ids``. Each example is therefore encoded
    as ``"<question>\\n<answer><eos>"`` and the labels are a copy of the input
    ids in which padding positions are replaced by -100 (the index the loss
    ignores).

    Args:
        examples: Batch dict with ``processedQuestion`` and ``processedAnswer``
            lists of strings.

    Returns:
        The tokenizer output with an added ``labels`` entry.
    """
    eos = tokenizerGPT.eos_token or ''
    texts = [
        f"{question}\n{answer}{eos}"
        for question, answer in zip(examples['processedQuestion'], examples['processedAnswer'])
    ]
    # 512 = the previous 256-token budget for the question plus 256 for the answer.
    inputs = tokenizerGPT(texts, truncation=True, padding='max_length', max_length=512)
    # Mask via attention_mask so padding never contributes to the loss.
    inputs['labels'] = [
        [token_id if keep else -100 for token_id, keep in zip(ids, mask)]
        for ids, mask in zip(inputs['input_ids'], inputs['attention_mask'])
    ]
    return inputs

# Tokenize both splits with the GPT-2 preprocess_function defined above.
tokenized_datasets = datasets.map(preprocess_function, batched=True)

# Define training arguments.
# Each model writes to its own sub-directory so the checkpoints of the five
# training runs in this notebook do not overwrite or mix with one another.
training_args = TrainingArguments(
    output_dir='./results/gpt2',
    eval_strategy='epoch',
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
)

# Create Trainer instance
trainer = Trainer(
    model=modelGPT,
    args=training_args,
    train_dataset=tokenized_datasets['train'],
    eval_dataset=tokenized_datasets['validation'],
)

# Train the model
trainer.train()

2. Pegasus Xsum

In [None]:
# Pegasus is an encoder-decoder model: load it with the seq2seq head so the
# question is read by the encoder and the answer is produced by the decoder.
# (Loading it as a causal LM keeps only the decoder and discards the encoder.)
modelpeg = AutoModelForSeq2SeqLM.from_pretrained('google/pegasus-xsum')
tokenizerpeg = AutoTokenizer.from_pretrained('google/pegasus-xsum')

# Safety net: register a pad token (and resize the embeddings) if the
# tokenizer does not define one.
if tokenizerpeg.pad_token is None:
    tokenizerpeg.add_special_tokens({'pad_token': '[PAD]'})
    modelpeg.resize_token_embeddings(len(tokenizerpeg))

In [None]:
#tokenizing the input data
def preprocess_function(examples):
    """Tokenize a batch of question/answer pairs for Pegasus.

    Questions become the model inputs and answers become the labels. Padding
    positions in the labels are replaced by -100 (the index the loss ignores)
    so the model is not trained to predict padding.

    Args:
        examples: Batch dict with ``processedQuestion`` and ``processedAnswer``
            lists of strings.

    Returns:
        The tokenized questions with an added ``labels`` entry.
    """
    inputs = tokenizerpeg(examples['processedQuestion'], truncation=True, padding='max_length', max_length=256)
    targets = tokenizerpeg(examples['processedAnswer'], truncation=True, padding='max_length', max_length=256)
    inputs['labels'] = [
        [token_id if keep else -100 for token_id, keep in zip(ids, mask)]
        for ids, mask in zip(targets['input_ids'], targets['attention_mask'])
    ]
    return inputs

# Tokenize both splits with the Pegasus preprocess_function defined above.
tokenized_datasets = datasets.map(preprocess_function, batched=True)

# Define training arguments.
# Each model writes to its own sub-directory so the checkpoints of the five
# training runs in this notebook do not overwrite or mix with one another.
# Batch size 4 with 4 accumulation steps gives an effective batch of 16 per device.
training_args = TrainingArguments(
    output_dir='./results/pegasus-xsum',
    eval_strategy='epoch',
    learning_rate=2e-5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    gradient_accumulation_steps=4,
    num_train_epochs=3,
    weight_decay=0.01,
)

# Create Trainer instance
trainer = Trainer(
    model=modelpeg,
    args=training_args,
    train_dataset=tokenized_datasets['train'],
    eval_dataset=tokenized_datasets['validation'],
)

# Train the model
trainer.train()

3. T5 Model

In [None]:
# Load the T5-small tokenizer and the matching pretrained seq2seq model.
tokenizerT5 = AutoTokenizer.from_pretrained("t5-small")
modelT5 = AutoModelForSeq2SeqLM.from_pretrained("t5-small")

# Safety net: register a pad token (and resize the embeddings) if the
# tokenizer does not define one.
if tokenizerT5.pad_token is None:
    tokenizerT5.add_special_tokens({'pad_token': '[PAD]'})
    modelT5.resize_token_embeddings(len(tokenizerT5))

In [None]:
#tokenizing the input data
def preprocess_function(examples):
    """Tokenize a batch of question/answer pairs for T5.

    Questions become the model inputs and answers become the labels. Padding
    positions in the labels are replaced by -100 (the index the loss ignores)
    so the model is not trained to predict padding.

    Args:
        examples: Batch dict with ``processedQuestion`` and ``processedAnswer``
            lists of strings.

    Returns:
        The tokenized questions with an added ``labels`` entry.
    """
    inputs = tokenizerT5(examples['processedQuestion'], truncation=True, padding='max_length', max_length=256)
    targets = tokenizerT5(examples['processedAnswer'], truncation=True, padding='max_length', max_length=256)
    inputs['labels'] = [
        [token_id if keep else -100 for token_id, keep in zip(ids, mask)]
        for ids, mask in zip(targets['input_ids'], targets['attention_mask'])
    ]
    return inputs

# Tokenize both splits with the T5 preprocess_function defined above.
tokenized_datasets = datasets.map(preprocess_function, batched=True)

# Define training arguments.
# Each model writes to its own sub-directory so the checkpoints of the five
# training runs in this notebook do not overwrite or mix with one another.
training_args = TrainingArguments(
    output_dir='./results/t5-small',
    eval_strategy='epoch',
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
)

# Create Trainer instance
trainer = Trainer(
    model=modelT5,
    args=training_args,
    train_dataset=tokenized_datasets['train'],
    eval_dataset=tokenized_datasets['validation'],
)

# Train the model
trainer.train()

4. BART Model

In [None]:
# BART is an encoder-decoder model: load it with the seq2seq head so the
# question is read by the encoder and the answer is produced by the decoder.
# (Loading it as a causal LM keeps only the decoder and discards the encoder.)
modelBart = AutoModelForSeq2SeqLM.from_pretrained("facebook/bart-base")
tokenizerBart = AutoTokenizer.from_pretrained("facebook/bart-base")

# Safety net: register a pad token (and resize the embeddings) if the
# tokenizer does not define one.
if tokenizerBart.pad_token is None:
    tokenizerBart.add_special_tokens({'pad_token': '[PAD]'})
    modelBart.resize_token_embeddings(len(tokenizerBart))

In [None]:
#tokenizing the input data
def preprocess_function(examples):
    """Tokenize a batch of question/answer pairs for BART.

    Questions become the model inputs and answers become the labels. Padding
    positions in the labels are replaced by -100 (the index the loss ignores)
    so the model is not trained to predict padding.

    Args:
        examples: Batch dict with ``processedQuestion`` and ``processedAnswer``
            lists of strings.

    Returns:
        The tokenized questions with an added ``labels`` entry.
    """
    inputs = tokenizerBart(examples['processedQuestion'], truncation=True, padding='max_length', max_length=256)
    targets = tokenizerBart(examples['processedAnswer'], truncation=True, padding='max_length', max_length=256)
    inputs['labels'] = [
        [token_id if keep else -100 for token_id, keep in zip(ids, mask)]
        for ids, mask in zip(targets['input_ids'], targets['attention_mask'])
    ]
    return inputs

# Tokenize both splits with the BART preprocess_function defined above.
tokenized_datasets = datasets.map(preprocess_function, batched=True)

# Define training arguments.
# Each model writes to its own sub-directory so the checkpoints of the five
# training runs in this notebook do not overwrite or mix with one another.
training_args = TrainingArguments(
    output_dir='./results/bart-base',
    eval_strategy='epoch',
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
)

# Create Trainer instance
trainer = Trainer(
    model=modelBart,
    args=training_args,
    train_dataset=tokenized_datasets['train'],
    eval_dataset=tokenized_datasets['validation'],
)

# Train the model
trainer.train()

5. Apple DCLM-Baseline-7B

In [None]:
# Load the DCLM-Baseline-7B tokenizer and the matching pretrained causal language model.
tokenizerApple = AutoTokenizer.from_pretrained("apple/DCLM-Baseline-7B")
modelApple = AutoModelForCausalLM.from_pretrained("apple/DCLM-Baseline-7B")

# Fixed-length padding needs a pad token: register one when the tokenizer has
# none and grow the embedding matrix to cover the enlarged vocabulary.
if tokenizerApple.pad_token is None:
    tokenizerApple.add_special_tokens({'pad_token': '[PAD]'})
    modelApple.resize_token_embeddings(len(tokenizerApple))

In [None]:
#tokenizing the input data
def preprocess_function(examples):
    """Tokenize a batch of question/answer pairs for causal-LM fine-tuning of DCLM.

    A decoder-only model predicts the next token of its *own* input, so the
    labels must line up with ``input_ids``. Each example is therefore encoded
    as ``"<question>\\n<answer><eos>"`` and the labels are a copy of the input
    ids in which padding positions are replaced by -100 (the index the loss
    ignores).

    Args:
        examples: Batch dict with ``processedQuestion`` and ``processedAnswer``
            lists of strings.

    Returns:
        The tokenizer output with an added ``labels`` entry.
    """
    eos = tokenizerApple.eos_token or ''
    texts = [
        f"{question}\n{answer}{eos}"
        for question, answer in zip(examples['processedQuestion'], examples['processedAnswer'])
    ]
    # 512 = the previous 256-token budget for the question plus 256 for the answer.
    inputs = tokenizerApple(texts, truncation=True, padding='max_length', max_length=512)
    # Mask via attention_mask so padding never contributes to the loss.
    inputs['labels'] = [
        [token_id if keep else -100 for token_id, keep in zip(ids, mask)]
        for ids, mask in zip(inputs['input_ids'], inputs['attention_mask'])
    ]
    return inputs

# Tokenize both splits with the DCLM preprocess_function defined above.
tokenized_datasets = datasets.map(preprocess_function, batched=True)

# Define training arguments.
# Each model writes to its own sub-directory so the checkpoints of the five
# training runs in this notebook do not overwrite or mix with one another.
# NOTE(review): a 7B-parameter model at batch size 8 is very memory-hungry —
# confirm the target hardware can hold it before running this cell.
training_args = TrainingArguments(
    output_dir='./results/dclm-baseline-7b',
    eval_strategy='epoch',
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
)

# Create Trainer instance
trainer = Trainer(
    model=modelApple,
    args=training_args,
    train_dataset=tokenized_datasets['train'],
    eval_dataset=tokenized_datasets['validation'],
)

# Train the model
trainer.train()

**Evaluating Models and Visualization**

*On the basis of ROUGE, BLEU, and F1-Score*

In [None]:
#installing required libraries
!pip install rouge-score nltk matplotlib seaborn plotly

In [None]:
import numpy as np
from rouge_score import rouge_scorer
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from sklearn.metrics import f1_score

#function to calculate ROUGE score
def calculate_rouge(predictions, references):
    """Return the mean ROUGE-L F-measure over prediction/reference pairs.

    Args:
        predictions: Iterable of generated strings.
        references: Iterable of ground-truth strings, aligned with ``predictions``.

    Returns:
        The average ROUGE-L F-measure, or 0.0 when there are no pairs.
    """
    # Only ROUGE-L is reported, so there is no need to compute ROUGE-1/2 as well.
    scorer = rouge_scorer.RougeScorer(['rougeL'], use_stemmer=True)
    # RougeScorer.score expects (target, prediction) in that order.
    f_measures = [
        scorer.score(ref, pred)['rougeL'].fmeasure
        for pred, ref in zip(predictions, references)
    ]
    # np.mean([]) would return NaN with a RuntimeWarning.
    return np.mean(f_measures) if f_measures else 0.0

#function to calculate BLEU score
def calculate_bleu(predictions, references):
    """Return the mean sentence-level BLEU over prediction/reference pairs.

    Both strings are split on whitespace; each prediction is scored against
    its single reference using NLTK smoothing method 4.

    Args:
        predictions: Iterable of generated strings.
        references: Iterable of ground-truth strings, aligned with ``predictions``.

    Returns:
        The average of the per-pair BLEU scores.
    """
    smoothing = SmoothingFunction().method4
    sentence_scores = []
    for hypothesis, reference in zip(predictions, references):
        reference_tokens = reference.split()
        hypothesis_tokens = hypothesis.split()
        sentence_scores.append(
            sentence_bleu([reference_tokens], hypothesis_tokens, smoothing_function=smoothing)
        )
    return np.mean(sentence_scores)

#function to calculate F1-score
def calculate_f1(predictions, references):
    """Return the mean token-overlap F1 over prediction/reference pairs.

    Each pair is split on whitespace and scored by the harmonic mean of token
    precision and recall, counting repeated tokens with multiplicity. This
    works when a prediction and its reference have different numbers of
    tokens, which a label-wise classification F1 cannot handle.

    Args:
        predictions: Iterable of generated strings.
        references: Iterable of ground-truth strings, aligned with ``predictions``.

    Returns:
        The average per-pair F1 as a float, or 0.0 when there are no pairs.
    """
    from collections import Counter  # local import keeps this cell self-contained

    scores = []
    for prediction, reference in zip(predictions, references):
        pred_tokens = prediction.split()
        ref_tokens = reference.split()
        if not pred_tokens or not ref_tokens:
            # Two empty strings agree perfectly; one empty side is a total miss.
            scores.append(1.0 if pred_tokens == ref_tokens else 0.0)
            continue
        # Multiset intersection counts each shared token at most as often as
        # it occurs on both sides.
        overlap = sum((Counter(pred_tokens) & Counter(ref_tokens)).values())
        if overlap == 0:
            scores.append(0.0)
            continue
        precision = overlap / len(pred_tokens)
        recall = overlap / len(ref_tokens)
        scores.append(2 * precision * recall / (precision + recall))
    return sum(scores) / len(scores) if scores else 0.0

In [None]:
#function for model evaluation
def evaluate_model(model, tokenizer, test_dataset, max_length=256):
    """Generate an answer for every question and score it against the reference.

    Args:
        model: A model exposing ``generate``, ``eval``, ``device`` and ``config``.
        tokenizer: The tokenizer that matches ``model``.
        test_dataset: Iterable of examples with ``processedQuestion`` and
            ``processedAnswer`` string fields.
        max_length: Token budget used both to truncate the question and to
            bound the generated answer.

    Returns:
        A ``(rouge_score, bleu_score, f1)`` tuple computed by
        ``calculate_rouge``, ``calculate_bleu`` and ``calculate_f1``.
    """
    model.eval()  # disable dropout so generation does not run in training mode
    is_encoder_decoder = getattr(model.config, 'is_encoder_decoder', False)
    # For decoder-only models the prompt counts towards `max_length`, which can
    # leave no room for an answer; bound the newly generated tokens instead.
    if is_encoder_decoder:
        length_kwargs = {'max_length': max_length}
    else:
        length_kwargs = {'max_new_tokens': max_length}

    predictions = []
    references = []

    for example in test_dataset:
        question = example['processedQuestion']
        reference = example['processedAnswer']

        input_ids = tokenizer.encode(
            question, return_tensors="pt", truncation=True, max_length=max_length
        )
        # The model may live on an accelerator after training; inputs must
        # be on the same device.
        input_ids = input_ids.to(model.device)
        output_ids = model.generate(
            input_ids, pad_token_id=tokenizer.pad_token_id, **length_kwargs
        )

        generated_ids = output_ids[0]
        if not is_encoder_decoder:
            # Decoder-only output starts with the prompt; keep only the continuation
            # so the question is not scored as part of the answer.
            generated_ids = generated_ids[input_ids.shape[-1]:]
        prediction = tokenizer.decode(generated_ids, skip_special_tokens=True)

        predictions.append(prediction)
        references.append(reference)

    rouge_score = calculate_rouge(predictions, references)
    bleu_score = calculate_bleu(predictions, references)
    f1 = calculate_f1(predictions, references)

    return rouge_score, bleu_score, f1

In [None]:
#evaluating individual models

# evaluate_model only reads the 'processedQuestion' / 'processedAnswer' text
# columns, so score every model on the untokenized validation split instead of
# on whichever `tokenized_datasets` happened to be built last.
validation_split = datasets['validation']

gpt2_rouge, gpt2_bleu, gpt2_f1 = evaluate_model(modelGPT, tokenizerGPT, validation_split)
print(f'GPT-2 - ROUGE: {gpt2_rouge}, BLEU: {gpt2_bleu}, F1: {gpt2_f1}')

t5_rouge, t5_bleu, t5_f1 = evaluate_model(modelT5, tokenizerT5, validation_split)
print(f'T5 - ROUGE: {t5_rouge}, BLEU: {t5_bleu}, F1: {t5_f1}')

peg_rouge, peg_bleu, peg_f1 = evaluate_model(modelpeg, tokenizerpeg, validation_split)
print(f'peg - ROUGE: {peg_rouge}, BLEU: {peg_bleu}, F1: {peg_f1}')

Bart_rouge, Bart_bleu, Bart_f1 = evaluate_model(modelBart, tokenizerBart, validation_split)
print(f'Bart - ROUGE: {Bart_rouge}, BLEU: {Bart_bleu}, F1: {Bart_f1}')

Apple_rouge, Apple_bleu, Apple_f1 = evaluate_model(modelApple, tokenizerApple, validation_split)
print(f'Apple - ROUGE: {Apple_rouge}, BLEU: {Apple_bleu}, F1: {Apple_f1}')

In [None]:
#visualization of data distribution
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px

# Data Distribution
# Histogram (with a KDE overlay) of the character length of each processed question.
fig, ax = plt.subplots(figsize=(10, 6))
question_lengths = df['processedQuestion'].apply(len)
sns.histplot(question_lengths, bins=50, kde=True, ax=ax)
ax.set_title('Distribution of Question Lengths')
ax.set_xlabel('Length of Questions')
ax.set_ylabel('Frequency')
plt.show()

In [None]:
# One (label, ROUGE, BLEU, F1) row per evaluated model, in display order.
model_scores = [
    ('GPT', gpt2_rouge, gpt2_bleu, gpt2_f1),
    ('T5', t5_rouge, t5_bleu, t5_f1),
    ('Pegasus', peg_rouge, peg_bleu, peg_f1),
    ('BART', Bart_rouge, Bart_bleu, Bart_f1),
    ('DCLM-Baseline-7B', Apple_rouge, Apple_bleu, Apple_f1),
]

# Pivot the rows into the column-oriented mapping that the DataFrame is built from.
performance_data = {
    'Model': [row[0] for row in model_scores],
    'ROUGE': [row[1] for row in model_scores],
    'BLEU': [row[2] for row in model_scores],
    'F1-Score': [row[3] for row in model_scores],
}

performance_df = pd.DataFrame(performance_data)

# Grouped bars: one cluster per model, one bar per metric.
fig = px.bar(
    performance_df,
    x='Model',
    y=['ROUGE', 'BLEU', 'F1-Score'],
    barmode='group',
    title='Model Performance Comparison',
)
fig.show()