# Installations

In [None]:
# Feature Based Installations
!pip install contractions

In [None]:
# Transformer Based Installations

!pip install transformers
!pip install datasets
!pip install pytorch-lightning==1.2.7 
!pip install torchtext==0.6 torch
!pip install evaluate
!pip install bert_score

# Raw Data Collection

### Feature Extraction Imports

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.metrics import accuracy_score, confusion_matrix, precision_recall_fscore_support, f1_score, recall_score
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.multiclass import OneVsOneClassifier, OneVsRestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB

In [None]:
import re
import string
import contractions
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')

### Transformer Based Imports

In [None]:
import pytorch_lightning as pl
from torch.utils.data import  Dataset,DataLoader

### Data Reading --> Train & Test Data

In [None]:
# Read both JSON-lines splits in one pass; "validation.jsonl" serves as the test split.
train_df, test_df = (
    pd.read_json(jsonl_path, lines = True)
    for jsonl_path in ("train.jsonl", "validation.jsonl")
)

In [None]:
print(f"Train Data Shape: {train_df.shape}")
print(f"Test Data Shape: {test_df.shape}")

# Data Cleaning & Data Processing

### Feature Based Preprocessing

In [None]:
def format_data_for_feature_based_models(text):
  """Collapse a sequence of strings into a single space-separated string."""
  separator = " "
  joined = separator.join(text)
  return joined

In [None]:
def list_to_string(spoiler_type):
  """Encode the first entry of a tag list as an integer class id.

  "passage" maps to 0, "phrase" maps to 1 and any other value maps to 2.
  Despite the name, the return value is an int, not a string.
  """
  first_tag = spoiler_type[0]
  if first_tag == "passage":
    return 0
  return 1 if first_tag == "phrase" else 2

In [None]:
# Keep only the columns used by the feature-based models. The explicit .copy()
# makes these frames independent of train_df/test_df, so the column assignments
# and in-place renames in the following cells act on real frames instead of
# slices (which would raise SettingWithCopyWarning).
f_train_df = train_df[["targetTitle", "targetParagraphs", "tags"]].copy()
f_test_df = test_df[["targetTitle", "targetParagraphs", "tags"]].copy()

In [None]:
# Turn each example's list of paragraphs into one string, for both splits.
for frame in (f_train_df, f_test_df):
  frame["targetParagraphs"] = frame["targetParagraphs"].apply(format_data_for_feature_based_models)

In [None]:
# Replace each tag list with its integer class id (see list_to_string), for both splits.
for frame in (f_train_df, f_test_df):
  frame["tags"] = frame["tags"].apply(list_to_string)

In [None]:
f_train_df.head(3)

In [None]:
f_train_df.iloc[0]['targetParagraphs']

In [None]:
f_test_df.head(3)

In [None]:
f_test_df.iloc[0]['targetParagraphs']

In [None]:
# Give the feature-based frames the column names the classifiers below expect.
feature_column_names = {"targetTitle":"Post", "targetParagraphs":"Content", "tags":"Spoiler_Type"}
for frame in (f_train_df, f_test_df):
  frame.rename(columns=feature_column_names, inplace = True)

In [None]:
train_df.columns

In [None]:
test_df.columns

In [None]:
f_train_df.columns

In [None]:
f_test_df.columns

In [None]:
def get_features(train_data, test_data, model="bag_of_words"):
  """Vectorize the "Content" column of the train and test frames.

  The vectorizer is fitted on the training text only and then applied to both
  splits, so the test text never influences the vocabulary.

  Args:
    train_data: frame-like object with a "Content" column of strings.
    test_data: frame-like object with a "Content" column of strings.
    model: "bag_of_words" for raw term counts (CountVectorizer) or
      "tf_idf_model" for TF-IDF weights (English stop words removed,
      at most 10000 features).

  Returns:
    A (train_features, test_features) tuple produced by the fitted vectorizer.

  Raises:
    ValueError: if `model` is not a supported name. Previously such a call
      returned None and only failed later, when the caller unpacked the result.
  """
  if model == "bag_of_words":
    vectorizer = CountVectorizer()
  elif model == "tf_idf_model":
    vectorizer = TfidfVectorizer(stop_words = "english", max_features=10000)
  else:
    raise ValueError(
        "Unsupported feature model {!r}; expected 'bag_of_words' or 'tf_idf_model'".format(model)
    )

  train_features = vectorizer.fit_transform(train_data["Content"])
  test_features = vectorizer.transform(test_data["Content"])
  return train_features, test_features

# Classification

## Feature Based Classification

### Logistic Regression

#### Bag Of Words Model

In [None]:
# Logistic regression on bag-of-words features, trained three ways
# (multinomial, one-vs-rest, one-vs-one) and scored on f_test_df.
# f1_score / accuracy_score return fractions in [0, 1]; they are multiplied by
# 100 below so the printed number agrees with the "%" suffix.
train_x_bag, test_x_bag = get_features(f_train_df, f_test_df, "bag_of_words")

# 1. Multi-Class
model = LogisticRegression(max_iter = 1000, multi_class = "multinomial", class_weight = "balanced")
model.fit(train_x_bag, f_train_df['Spoiler_Type'])

predict_test_multi_bag = model.predict(test_x_bag)

print("Y_test shape:", f_test_df['Spoiler_Type'].shape)
print("Predict_test_multi_bag:", predict_test_multi_bag.shape)

# Evaluation Metrics:
print("\n")
print("F1-Score Multi Class")
print("F1-Score for Multi Class Test Data:{:.2f}%".format(100 * f1_score(f_test_df['Spoiler_Type'], predict_test_multi_bag, average="weighted")))
print("Accuracy-Score for Multi Class Test Data:{:.2f}%".format(100 * accuracy_score(f_test_df['Spoiler_Type'], predict_test_multi_bag)))

# 2. OneVsRest
model = OneVsRestClassifier(LogisticRegression(max_iter = 1000, class_weight = "balanced"))
model.fit(train_x_bag, f_train_df['Spoiler_Type'])

predict_test_ovr_bag = model.predict(test_x_bag)

# Evaluation Metrics:
print("\n")
print("F1-Score One Vs Rest")
print("F1-Score for One Vs Rest Test Data:{:.2f}%".format(100 * f1_score(f_test_df['Spoiler_Type'], predict_test_ovr_bag, average="weighted")))
print("Accuracy-Score for One Vs Rest Test Data:{:.2f}%".format(100 * accuracy_score(f_test_df['Spoiler_Type'], predict_test_ovr_bag)))


# 3. OneVsOne
model = OneVsOneClassifier(LogisticRegression(max_iter = 1000, class_weight = "balanced"))
model.fit(train_x_bag, f_train_df['Spoiler_Type'])

predict_test_ovo_bag = model.predict(test_x_bag)

# Evaluation Metrics:
print("\n")
print("F1-Score One Vs One")
print("F1-Score for One Vs One Test Data:{:.2f}%".format(100 * f1_score(f_test_df['Spoiler_Type'], predict_test_ovo_bag, average="weighted")))
print("Accuracy-Score for One Vs One Test Data:{:.2f}%".format(100 * accuracy_score(f_test_df['Spoiler_Type'], predict_test_ovo_bag)))


#### TF-IDF Model

In [None]:
# Logistic regression on TF-IDF features, trained three ways
# (multinomial, one-vs-rest, one-vs-one) and scored on f_test_df.
# The variables keep their "*_bag" names from the bag-of-words cell but hold
# TF-IDF features and predictions here.
# f1_score / accuracy_score return fractions in [0, 1]; they are multiplied by
# 100 below so the printed number agrees with the "%" suffix.
train_x_bag, test_x_bag = get_features(f_train_df, f_test_df, "tf_idf_model")

# 1. Multi-Class
model = LogisticRegression(max_iter = 1000, multi_class = "multinomial", class_weight = "balanced")
model.fit(train_x_bag, f_train_df['Spoiler_Type'])

predict_test_multi_bag = model.predict(test_x_bag)

print("Y_test shape:", f_test_df['Spoiler_Type'].shape)
print("Predict_test_multi_bag:", predict_test_multi_bag.shape)

# Evaluation Metrics:
print("\n")
print("F1-Score Multi Class")
print("F1-Score for Multi Class Test Data:{:.2f}%".format(100 * f1_score(f_test_df['Spoiler_Type'], predict_test_multi_bag, average="weighted")))
print("Accuracy-Score for Multi Class Test Data:{:.2f}%".format(100 * accuracy_score(f_test_df['Spoiler_Type'], predict_test_multi_bag)))

# 2. OneVsRest
model = OneVsRestClassifier(LogisticRegression(max_iter = 1000, class_weight = "balanced"))
model.fit(train_x_bag, f_train_df['Spoiler_Type'])

predict_test_ovr_bag = model.predict(test_x_bag)

# Evaluation Metrics:
print("\n")
print("F1-Score One Vs Rest")
print("F1-Score for One Vs Rest Test Data:{:.2f}%".format(100 * f1_score(f_test_df['Spoiler_Type'], predict_test_ovr_bag, average="weighted")))
print("Accuracy-Score for One Vs Rest Test Data:{:.2f}%".format(100 * accuracy_score(f_test_df['Spoiler_Type'], predict_test_ovr_bag)))


# 3. OneVsOne
model = OneVsOneClassifier(LogisticRegression(max_iter = 1000, class_weight = "balanced"))
model.fit(train_x_bag, f_train_df['Spoiler_Type'])

predict_test_ovo_bag = model.predict(test_x_bag)

# Evaluation Metrics:
print("\n")
print("F1-Score One Vs One")
print("F1-Score for One Vs One Test Data:{:.2f}%".format(100 * f1_score(f_test_df['Spoiler_Type'], predict_test_ovo_bag, average="weighted")))
print("Accuracy-Score for One Vs One Test Data:{:.2f}%".format(100 * accuracy_score(f_test_df['Spoiler_Type'], predict_test_ovo_bag)))


## Support Vector Machines

### Bag Of Words Model

In [None]:
# Support vector classifier on bag-of-words features, scored on f_test_df.
# f1_score / accuracy_score return fractions in [0, 1]; they are multiplied by
# 100 below so the printed number agrees with the "%" suffix.
# NOTE(review): per the scikit-learn SVC docs, decision_function_shape only
# changes the shape of decision_function(); training is one-vs-one either way,
# so the two runs below are expected to predict the same labels. Wrap SVC in
# OneVsRestClassifier if a true one-vs-rest comparison is wanted - confirm.
train_x_bag, test_x_bag = get_features(f_train_df, f_test_df, "bag_of_words")

# 1. decision_function_shape="ovr"
model = SVC(max_iter=1000, class_weight="balanced", decision_function_shape="ovr")
model.fit(train_x_bag, f_train_df['Spoiler_Type'])

predict_test_ovr_bag = model.predict(test_x_bag)

# Evaluation Metrics:
print("\n")
print("F1-Score One Vs Rest Class")
print("F1-Score for One Vs Rest Test Data:{:.2f}%".format(100 * f1_score(f_test_df['Spoiler_Type'], predict_test_ovr_bag, average="weighted")))
print("Accuracy-Score for One Vs Rest Test Data:{:.2f}%".format(100 * accuracy_score(f_test_df['Spoiler_Type'], predict_test_ovr_bag)))


# 2. decision_function_shape="ovo"
model = SVC(max_iter=1000, class_weight="balanced", decision_function_shape="ovo")
model.fit(train_x_bag, f_train_df['Spoiler_Type'])

predict_test_ovo_bag = model.predict(test_x_bag)

# Evaluation Metrics:
print("\n")
print("F1-Score One Vs One")
print("F1-Score for One Vs One Test Data:{:.2f}%".format(100 * f1_score(f_test_df['Spoiler_Type'], predict_test_ovo_bag, average="weighted")))
print("Accuracy-Score for One Vs One Test Data:{:.2f}%".format(100 * accuracy_score(f_test_df['Spoiler_Type'], predict_test_ovo_bag)))


### TF-IDF Model

In [None]:
# Support vector classifier on TF-IDF features, scored on f_test_df.
# The variables keep their "*_bag" names from the bag-of-words cell but hold
# TF-IDF features and predictions here.
# f1_score / accuracy_score return fractions in [0, 1]; they are multiplied by
# 100 below so the printed number agrees with the "%" suffix.
# NOTE(review): per the scikit-learn SVC docs, decision_function_shape only
# changes the shape of decision_function(); training is one-vs-one either way,
# so the two runs below are expected to predict the same labels - confirm.
train_x_bag, test_x_bag = get_features(f_train_df, f_test_df, "tf_idf_model")

# 1. decision_function_shape="ovr"
model = SVC(max_iter=1000, class_weight="balanced", decision_function_shape="ovr")
model.fit(train_x_bag, f_train_df['Spoiler_Type'])

predict_test_ovr_bag = model.predict(test_x_bag)

# Evaluation Metrics:
print("\n")
print("F1-Score One Vs Rest Class")
print("F1-Score for One Vs Rest Test Data:{:.2f}%".format(100 * f1_score(f_test_df['Spoiler_Type'], predict_test_ovr_bag, average="weighted")))
print("Accuracy-Score for One Vs Rest Test Data:{:.2f}%".format(100 * accuracy_score(f_test_df['Spoiler_Type'], predict_test_ovr_bag)))


# 2. decision_function_shape="ovo"
model = SVC(max_iter=1000, class_weight="balanced", decision_function_shape="ovo")
model.fit(train_x_bag, f_train_df['Spoiler_Type'])

predict_test_ovo_bag = model.predict(test_x_bag)

# Evaluation Metrics:
print("\n")
print("F1-Score One Vs One")
print("F1-Score for One Vs One Test Data:{:.2f}%".format(100 * f1_score(f_test_df['Spoiler_Type'], predict_test_ovo_bag, average="weighted")))
print("Accuracy-Score for One Vs One Test Data:{:.2f}%".format(100 * accuracy_score(f_test_df['Spoiler_Type'], predict_test_ovo_bag)))


## Naive Bayes

### Bag Of Words Model

In [None]:
# Multinomial naive Bayes on bag-of-words features, trained three ways
# (native multi-class, one-vs-rest, one-vs-one) and scored on f_test_df.
# f1_score / accuracy_score return fractions in [0, 1]; they are multiplied by
# 100 below so the printed number agrees with the "%" suffix.
train_x_bag, test_x_bag = get_features(f_train_df, f_test_df, "bag_of_words")

# 1. Multi-Class
model = MultinomialNB()
model.fit(train_x_bag, f_train_df['Spoiler_Type'])

predict_test_multi_bag = model.predict(test_x_bag)

# Evaluation Metrics:
print("\n")
print("F1-Score Multi Class")
print("F1-Score for Multi Class Test Data:{:.2f}%".format(100 * f1_score(f_test_df['Spoiler_Type'], predict_test_multi_bag, average="weighted")))
print("Accuracy-Score for Multi Class Test Data:{:.2f}%".format(100 * accuracy_score(f_test_df['Spoiler_Type'], predict_test_multi_bag)))

# 2. OneVsRest
model = OneVsRestClassifier(MultinomialNB())
model.fit(train_x_bag, f_train_df['Spoiler_Type'])

predict_test_ovr_bag = model.predict(test_x_bag)

# Evaluation Metrics:
print("\n")
print("F1-Score One Vs Rest")
print("F1-Score for One Vs Rest Test Data:{:.2f}%".format(100 * f1_score(f_test_df['Spoiler_Type'], predict_test_ovr_bag, average="weighted")))
print("Accuracy-Score for One Vs Rest Test Data:{:.2f}%".format(100 * accuracy_score(f_test_df['Spoiler_Type'], predict_test_ovr_bag)))


# 3. OneVsOne
model = OneVsOneClassifier(MultinomialNB())
model.fit(train_x_bag, f_train_df['Spoiler_Type'])
predict_test_ovo_bag = model.predict(test_x_bag)

# Evaluation Metrics:
print("\n")
print("F1-Score One Vs One")
print("F1-Score for One Vs One Test Data:{:.2f}%".format(100 * f1_score(f_test_df['Spoiler_Type'], predict_test_ovo_bag, average="weighted")))
print("Accuracy-Score for One Vs One Test Data:{:.2f}%".format(100 * accuracy_score(f_test_df['Spoiler_Type'], predict_test_ovo_bag)))


### TF-IDF Model

In [None]:
# Using TF-IDF Model
# Multinomial naive Bayes on TF-IDF features, trained three ways
# (native multi-class, one-vs-rest, one-vs-one) and scored on f_test_df.
# The variables keep their "*_bag" names from the bag-of-words cell but hold
# TF-IDF features and predictions here.
# f1_score / accuracy_score return fractions in [0, 1]; they are multiplied by
# 100 below so the printed number agrees with the "%" suffix.
train_x_bag, test_x_bag = get_features(f_train_df, f_test_df, "tf_idf_model")

# 1. Multi-Class
model = MultinomialNB()
model.fit(train_x_bag, f_train_df['Spoiler_Type'])

predict_test_multi_bag = model.predict(test_x_bag)

# Evaluation Metrics:
print("\n")
print("F1-Score Multi Class")
print("F1-Score for Multi Class Test Data:{:.2f}%".format(100 * f1_score(f_test_df['Spoiler_Type'], predict_test_multi_bag, average="weighted")))
print("Accuracy-Score for Multi Class Test Data:{:.2f}%".format(100 * accuracy_score(f_test_df['Spoiler_Type'], predict_test_multi_bag)))

# 2. OneVsRest
model = OneVsRestClassifier(MultinomialNB())
model.fit(train_x_bag, f_train_df['Spoiler_Type'])

predict_test_ovr_bag = model.predict(test_x_bag)

# Evaluation Metrics:
print("\n")
print("F1-Score One Vs Rest")
print("F1-Score for One Vs Rest Test Data:{:.2f}%".format(100 * f1_score(f_test_df['Spoiler_Type'], predict_test_ovr_bag, average="weighted")))
print("Accuracy-Score for One Vs Rest Test Data:{:.2f}%".format(100 * accuracy_score(f_test_df['Spoiler_Type'], predict_test_ovr_bag)))


# 3. OneVsOne
model = OneVsOneClassifier(MultinomialNB())
model.fit(train_x_bag, f_train_df['Spoiler_Type'])
predict_test_ovo_bag = model.predict(test_x_bag)

# Evaluation Metrics:
print("\n")
print("F1-Score One Vs One")
print("F1-Score for One Vs One Test Data:{:.2f}%".format(100 * f1_score(f_test_df['Spoiler_Type'], predict_test_ovo_bag, average="weighted")))
print("Accuracy-Score for One Vs One Test Data:{:.2f}%".format(100 * accuracy_score(f_test_df['Spoiler_Type'], predict_test_ovo_bag)))

## Transformer Based Classification

### Roberta Based Classification

In [None]:
!pip install transformers -U

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.29.1-py3-none-any.whl (7.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.1/7.1 MB[0m [31m107.8 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.14.1 (from transformers)
  Downloading huggingface_hub-0.14.1-py3-none-any.whl (224 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m224.5/224.5 kB[0m [31m31.5 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers)
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m122.9 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.14.1 tokenizers-0.13.3 transformers-4.29.1


In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
from nltk.corpus import wordnet
import nltk
nltk.download('wordnet')

In [None]:
import pandas as pd
train_df = pd.read_json("/content/drive/MyDrive/NLP_dataset/train.jsonl", lines = True)
test_df = pd.read_json("/content/drive/MyDrive/NLP_dataset/validation.jsonl", lines = True)

In [None]:
print("train data shape:",train_df.shape)
print("validation data shape:",test_df.shape)

In [None]:
train_df_bkp = train_df.copy()
test_df_bkp = test_df.copy()

In [None]:
# Keep only the columns the transformer pipeline needs. The explicit .copy()
# detaches the result from the full frame, so the in-place rename and the
# column assignments in the following cells act on real frames instead of
# slices (which would raise SettingWithCopyWarning).
train_df = train_df[["postText", "targetTitle", "targetParagraphs", "tags"]].copy()
test_df = test_df[["postText", "targetTitle", "targetParagraphs", "tags"]].copy()

In [None]:
# Rename the raw dataset columns to the short names used by the rest of this section.
transformer_column_names = {"postText":"Post","targetTitle":"Title", "targetParagraphs":"Content", "tags":"Spoiler_Type"}
for frame in (train_df, test_df):
  frame.rename(columns=transformer_column_names, inplace = True)

In [None]:
def numeric_labels(spoiler_type):
  """Encode the first entry of a tag list as an integer class id.

  "phrase" maps to 0, "passage" maps to 1 and any other value maps to 2.
  """
  first_tag = spoiler_type[0]
  if first_tag == "passage":
    return 1
  return 0 if first_tag == "phrase" else 2

In [None]:
def list_to_string_concat(TgtPara):
  """Join a sequence of strings into one string, separated by single spaces."""
  return ' '.join(TgtPara)

In [None]:
# Replace each tag list with its integer class id (see numeric_labels), for both splits.
for frame in (test_df, train_df):
  frame["Spoiler_Type"] = frame["Spoiler_Type"].apply(numeric_labels)

In [None]:
# Collapse the list of paragraphs in "Content" into one string per example.
for frame in (train_df, test_df):
  frame["Content"] = frame["Content"].apply(list_to_string_concat)

In [None]:
# Collapse the list stored in "Post" into one string per example.
for frame in (train_df, test_df):
  frame["Post"] = frame["Post"].apply(list_to_string_concat)

In [None]:
# Replace missing values with empty strings so the string operations below never see NaN.
train_df, test_df = train_df.fillna(''), test_df.fillna('')

In [None]:
#Lower-case:
train_df['Post'] = train_df['Post'].str.lower()
train_df['Title'] = train_df['Title'].str.lower()
train_df['Content'] = train_df['Content'].str.lower()
#Lower-case:
test_df['Post'] = test_df['Post'].str.lower()
test_df['Title'] = test_df['Title'].str.lower()
test_df['Content'] = test_df['Content'].str.lower()

In [None]:
# Work on an independent copy so filtering below leaves train_df untouched.
df_multi = train_df.copy()

In [None]:
# Keep only the rows labelled 2 (the fallback class of numeric_labels, i.e. neither "phrase" nor "passage").
df_multi = df_multi[df_multi['Spoiler_Type']==2]

In [None]:
# Renumber the remaining rows from 0 and discard the old index.
df_multi.reset_index(drop=True,inplace=True)

In [None]:
# import spacy

# nlp = spacy.load('en_core_web_sm')

# def get_synonym(word):
#     """
#     Get the most suitable synonym for a word
#     """
#     synonyms = []
#     for syn in wordnet.synsets(word):
#         for lemma in syn.lemmas():
#             synonyms.append(lemma.name())
    
#     # Calculate the similarity scores of all synonyms to the original word
#     word_similarities = []
#     for synonym in synonyms:
#         word_similarities.append((synonym, nlp(word).similarity(nlp(synonym))))
    
#     # Sort synonyms by their similarity score
#     word_similarities.sort(key=lambda x: x[1], reverse=True)
    
#     # Select the first synonym if there is at least one with a non-zero similarity score
#     for syn in word_similarities:
#         if word != syn[0] and syn[1] > 0:
#             return syn[0]
    
#     # Return the original word if there are no suitable synonyms
#     return word

In [None]:
# def change_synonyms(sentence):
#   doc = nlp(sentence)
#   new_sentence = []
#   for token in doc:
#       if token.pos_ == "NOUN":
#           new_sentence.append(get_synonym(token.text))
#       else:
#           new_sentence.append(token.text)

#   altered_sentence = ' '.join(new_sentence)
#   return altered_sentence

In [None]:
# df_multi['Post'] = df_multi['Post'].apply(change_synonyms)
# df_multi['Title'] = df_multi['Title'].apply(change_synonyms)
# df_multi['Content'] = df_multi['Content'].apply(change_synonyms)

In [None]:
# save the dataframe to a CSV file with headers
df_multi.to_csv('multi_part_train_data_new.csv', index=False, header=True)

In [None]:
#Lower-case:
df_multi['Post'] = df_multi['Post'].str.lower()
df_multi['Title'] = df_multi['Title'].str.lower()
df_multi['Content'] = df_multi['Content'].str.lower()

In [None]:
# Append the class-2 rows to the full training frame; since df_multi was taken
# from train_df, every class-2 example now appears twice (simple oversampling).
df_merged = pd.concat([train_df, df_multi], ignore_index=True, sort=False)

In [None]:
# NOTE: this overwrites the train_df_bkp taken right after loading the raw data;
# from here on the backup holds the cleaned training frame (without the extra
# class-2 rows, which only exist in df_merged).
train_df_bkp = train_df.copy()

0    1367
1    1274
2    1118
Name: Spoiler_Type, dtype: int64

In [None]:
# From here on train_df is the oversampled frame (original rows plus the duplicated class-2 rows).
train_df = df_merged.copy()

cuda


In [None]:
# Build the single model input "<post> ? <title> -- <content>" for both splits.
# Column-wise string concatenation replaces the row-by-row apply(axis=1): it
# yields the same text for string columns and avoids a Python call per row.
for frame in (train_df, test_df):
    frame["concat_input"] = frame['Post'] + ' ? ' + frame['Title'] + ' -- ' + frame['Content']

In [None]:
# The three source columns are now folded into "concat_input"; drop them from both splits.
for frame in (train_df, test_df):
    frame.drop(columns=["Post", "Content", "Title"], inplace = True)

In [None]:
train_df['Spoiler_Type'].value_counts()

In [None]:
import torch
# Use the GPU when one is visible to PyTorch, otherwise fall back to the CPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(device)

import gc
# Run a garbage-collection pass, then release PyTorch's cached GPU memory.
gc.collect()
torch.cuda.empty_cache() 

cuda


In [None]:
from transformers import RobertaTokenizer, RobertaForSequenceClassification

# Load pre-trained RoBERTa model and tokenizer
model_name = 'roberta-base'
tokenizer = RobertaTokenizer.from_pretrained(model_name)
# num_labels=3 sizes the classification head for the three spoiler classes (0, 1, 2);
# the model is moved to the selected device immediately.
model = RobertaForSequenceClassification.from_pretrained(model_name, num_labels=3).to(device)

Downloading pytorch_model.bin:   0%|          | 0.00/501M [00:00<?, ?B/s]

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForSequenceClassification: ['lm_head.dense.weight', 'lm_head.layer_norm.bias', 'lm_head.bias', 'roberta.pooler.dense.bias', 'lm_head.decoder.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.

In [None]:
# Location of the previously fine-tuned weights. It is defined here because this
# cell is the first to use it; without it the cell raises NameError on a fresh
# kernel (the only other definition sits in a later cell).
model_path = '/content/drive/MyDrive/NLP_dataset/models/roberta_classification_model.pt'

# NOTE: torch.load unpickles the file, so only load checkpoints from a trusted source.
model.load_state_dict(torch.load(model_path, map_location=device))

<All keys matched successfully>

In [None]:
from sklearn.model_selection import train_test_split
# Inputs are the concatenated text, targets are the integer spoiler classes.
# All of train_df is used for training; no validation split is carved out
# (the split code below is disabled).
X_train = list(train_df["concat_input"])
y_train = list(train_df["Spoiler_Type"])
X_test = list(test_df["concat_input"])
y_test = list(test_df["Spoiler_Type"])
# X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.1,stratify=y_train)
# X_test, X_val, y_test, y_val = train_test_split(X_val, y_val, test_size = 0.5, stratify=y_val)

In [None]:
import numpy as np

# Convert lists to numpy arrays
X_train = np.array(X_train)
# X_val = np.array(X_val)
y_train = np.array(y_train)
# y_val = np.array(y_val)

# Tokenize the training texts: sequences longer than 512 tokens are truncated
# and the batch is padded to a common length.
train_encodings = tokenizer(X_train.tolist(), truncation=True, padding=True, max_length=512)
# val_encodings = tokenizer(X_val.tolist(), truncation=True, padding=True, max_length=512)
# Label tensor on the CPU; the next cell rebuilds it on `device`.
train_labels = torch.tensor(y_train)
# val_labels = torch.tensor(y_val)

In [None]:
# Turn the tokenizer output into tensors.
train_input_ids = torch.tensor(train_encodings['input_ids'])
train_attention_mask = torch.tensor(train_encodings['attention_mask'])
train_labels = torch.tensor(y_train).to(device)

# The whole training set is placed on `device` up front; each dataset item is
# an (input_ids, attention_mask, label) triple.
train_dataset = torch.utils.data.TensorDataset(train_input_ids.to(device), train_attention_mask.to(device), train_labels)

In [None]:
# Set up optimizer and learning rate scheduler
# optimizer = torch.optim.Adam(model.parameters(), lr=1e-5)
optimizer = torch.optim.Adam(model.parameters(), lr=4e-5)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=1, gamma=0.1)

# Set up training loop
epochs = 5
for epoch in range(epochs):
    # Train model for one epoch
    model.train()
    train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=16, shuffle=True)
    for batch in train_loader:
        optimizer.zero_grad()
        inputs = {'input_ids': batch[0].to(device), 'attention_mask': batch[1].to(device), 'labels': batch[2].to(device)}
        outputs = model(**inputs)
        loss = outputs.loss
        loss.backward()
        optimizer.step()

    # # Evaluate model on validation set
    # model.eval()
    # val_loader = torch.utils.data.DataLoader(val_dataset, batch_size=16, shuffle=False)
    # with torch.no_grad():
    #     num_correct = 0
    #     num_total = 0
    #     for batch in val_loader:
    #         inputs = {'input_ids': batch[0].to(device), 'attention_mask': batch[1].to(device), 'labels': batch[2].to(device)}
    #         outputs = model(**inputs)
    #         logits = outputs.logits
    #         preds = torch.argmax(logits, dim=1)
    #         num_correct += torch.sum(preds == batch[2])
    #         num_total += len(batch[2])
    #     acc = num_correct / num_total
    #     print(f'Epoch {epoch+1} - val accuracy: {acc:.4f}')

    # Update learning rate scheduler
    scheduler.step()

In [None]:
import numpy as np
X_test = np.array(X_test)
y_test = np.array(y_test)

# Prepare test data: truncate to 512 tokens and pad to a common length.
test_encodings = tokenizer(X_test.tolist(), truncation=True, padding=True, max_length=512)
# Label tensor on the CPU; rebuilt on `device` a few lines below.
test_labels = torch.tensor(y_test)

test_input_ids = torch.tensor(test_encodings['input_ids'])
test_attention_mask = torch.tensor(test_encodings['attention_mask'])
test_labels = torch.tensor(y_test).to(device)

# The whole test set is placed on `device`; each item is an
# (input_ids, attention_mask, label) triple.
test_dataset = torch.utils.data.TensorDataset(test_input_ids.to(device), test_attention_mask.to(device), test_labels)

In [None]:
# Rebinds `device` as a torch.device object (it was a plain string before).
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Checkpoint location on the mounted Google Drive.
# NOTE(review): an earlier cell already reads `model_path` in torch.load, so on
# a fresh kernel this definition comes too late - consider moving it up.
model_path = '/content/drive/MyDrive/NLP_dataset/models/roberta_classification_model.pt'

In [None]:
# Iterate over the evaluation dataset and make predictions, then report
# accuracy and weighted F1 on the test set.
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score

model.eval()
test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=16, shuffle=False)
y_true = []
y_pred = []
num_correct = 0
num_total = 0
with torch.no_grad():
    for batch in test_loader:
        inputs = {'input_ids': batch[0].to(device), 'attention_mask': batch[1].to(device), 'labels': batch[2].to(device)}
        outputs = model(**inputs)
        logits = outputs.logits
        preds = torch.argmax(logits, dim=1)
        y_true.extend(batch[2].tolist())
        y_pred.extend(preds.tolist())
        # Compare against the labels that were moved to `device`, and use .item()
        # so the running count is a Python int. Accumulating a tensor made the
        # final division float32, which printed 0.6962 next to sklearn's 0.6963.
        num_correct += torch.sum(preds == inputs['labels']).item()
        num_total += len(batch[2])

# Calculate the accuracy
accuracy = accuracy_score(y_true, y_pred)
print(f'Accuracy: {accuracy:.4f}')
# Calculate the F1 score
f1 = f1_score(y_true, y_pred, average='weighted')
print(f'F1 score: {f1:.4f}')
# Manual accuracy, kept as a cross-check of the sklearn value
acc = num_correct / num_total
print(f'Test accuracy: {acc:.4f}')

Accuracy: 0.6963
F1 score: 0.6959
Test accuracy: 0.6962


In [None]:
# Iterate over the evaluation dataset and make predictions, then report
# accuracy and weighted F1 on the test set.
# NOTE(review): this cell repeats the evaluation cell directly above it - consider deleting one.
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score

model.eval()
test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=16, shuffle=False)
y_true = []
y_pred = []
num_correct = 0
num_total = 0
with torch.no_grad():
    for batch in test_loader:
        inputs = {'input_ids': batch[0].to(device), 'attention_mask': batch[1].to(device), 'labels': batch[2].to(device)}
        outputs = model(**inputs)
        logits = outputs.logits
        preds = torch.argmax(logits, dim=1)
        y_true.extend(batch[2].tolist())
        y_pred.extend(preds.tolist())
        # Compare against the labels that were moved to `device`, and use .item()
        # so the running count is a Python int. Accumulating a tensor made the
        # final division float32, which printed 0.6962 next to sklearn's 0.6963.
        num_correct += torch.sum(preds == inputs['labels']).item()
        num_total += len(batch[2])

# Calculate the accuracy
accuracy = accuracy_score(y_true, y_pred)
print(f'Accuracy: {accuracy:.4f}')
# Calculate the F1 score
f1 = f1_score(y_true, y_pred, average='weighted')
print(f'F1 score: {f1:.4f}')
# Manual accuracy, kept as a cross-check of the sklearn value
acc = num_correct / num_total
print(f'Test accuracy: {acc:.4f}')

Accuracy: 0.6963
F1 score: 0.6959
Test accuracy: 0.6962


# Phrase Spoiler Extraction (Question Answering)

In [None]:
import transformers
from datasets import load_dataset, load_metric
import pandas as pd
from datasets import Dataset
from datasets.load import DatasetDict
from transformers import AutoTokenizer
import torch
import gc
import collections
import evaluate
from evaluate import load
import numpy as np
from tqdm.auto import tqdm
from transformers import default_data_collator
from transformers import AutoModelForQuestionAnswering, TrainingArguments, Trainer
print(transformers.__version__)

In [None]:
# Checkpoint name passed to AutoTokenizer / AutoModelForQuestionAnswering.from_pretrained below.
model_checkpoint = "Palak/microsoft_deberta-large_squad"
# Per-device batch size used by TrainingArguments for both training and evaluation.
batch_size = 2

In [None]:
class DatasetFormatter:
  """Builds the train/validation/test `DatasetDict` for phrase-spoiler question answering.

  Only rows whose first tag is "phrase" are kept. Each row becomes a
  SQuAD-style record: the joined target paragraphs are the context, the first
  post text is the question and the spoiler is the answer.
  """

  def __init__(self, train_path="train.jsonl", test_path="validation.jsonl",
               train_end=1174, validation_start=1258) -> None:
    """Load the two JSON-lines files.

    Args:
      train_path: JSON-lines file the train and validation splits are cut from.
      test_path: JSON-lines file used as the test split.
      train_end: the first `train_end` phrase rows form the train split.
      validation_start: phrase rows from this position onward form the
        validation split. With the defaults, rows 1174..1257 belong to
        neither split.
    """
    self.train_df = pd.read_json(train_path, lines = True)
    self.test_df = pd.read_json(test_path, lines = True)
    self.train_end = train_end
    self.validation_start = validation_start

  def list_to_string(self, spoiler_type):
    """Map the first tag to a class id: phrase=0, passage=1, multi=2 (None for anything else)."""
    if spoiler_type[0] == "phrase":
      return 0
    elif spoiler_type[0] == "passage":
      return 1
    elif spoiler_type[0] == "multi":
      return 2

  def return_text(self, text):
    """Return the first element of `text` (used to unwrap the 1-tuples built by preprocess_data)."""
    return text[0]

  def preprocess_data(self, data):
    """Convert raw rows into a list of SQuAD-style dicts.

    Args:
      data: DataFrame with "uuid", "targetParagraphs", "postText" and
        "spoiler" columns.

    Returns:
      One dict per row with keys "id", "context", "question" and "answers".
      "context" and "question" are 1-tuples holding the string (kept for
      backward compatibility; get_formatted_dataset unwraps them).
      "answers" holds the spoiler list under "text" and, under "answer_start",
      the character offset of the first spoiler in the context (-1 if it does
      not occur verbatim, [] if the row has no spoiler).
    """
    formatted_data = []
    for _, series in data.iterrows():
      # Convert the row once instead of once per field.
      record = series.to_dict()
      complete_description = " ".join(record["targetParagraphs"])
      spoiler = record["spoiler"]
      # An empty spoiler list used to raise IndexError; treat it as "no answer".
      if len(spoiler) > 0:
        answer_start = [complete_description.find(spoiler[0])]
      else:
        answer_start = []
      formatted_data.append({
          "id": record["uuid"],
          "context": (complete_description,),
          "question": (record["postText"][0],),
          "answers": {
              "text": spoiler,
              "answer_start": answer_start,
          },
      })

    return formatted_data

  def get_formatted_dataset(self):
    """Return a DatasetDict with "train", "validation" and "test" splits of phrase spoilers."""
    # Work on copies so the loaded frames stay untouched and the method can be
    # called more than once (re-encoding already-encoded tags would fail).
    train_df = self.train_df.copy()
    test_df = self.test_df.copy()

    train_df["tags"] = train_df["tags"].apply(self.list_to_string)
    test_df["tags"] = test_df["tags"].apply(self.list_to_string)

    # Class id 0 is "phrase".
    train_df = train_df[train_df['tags']==0]
    test_df = test_df[test_df['tags']==0]

    # Use self rather than a module-level instance, so the class works no
    # matter what the instance is called.
    train_df = pd.DataFrame(self.preprocess_data(train_df))
    test_df = pd.DataFrame(self.preprocess_data(test_df))

    # Unwrap the 1-tuples produced by preprocess_data.
    for frame in (train_df, test_df):
      for column in ("context", "question"):
        frame[column] = frame[column].apply(self.return_text)

    validation_df = train_df.iloc[self.validation_start:]
    train_df = train_df.iloc[:self.train_end]

    datasets = DatasetDict()
    datasets["train"] = Dataset.from_pandas(train_df)
    datasets["validation"] = Dataset.from_pandas(validation_df)
    datasets["test"] = Dataset.from_pandas(test_df)

    return datasets


# Module-level formatter instance; get_formatted_dataset also refers to it by this name.
dataset_formatte_obj = DatasetFormatter()

# DatasetDict with "train", "validation" and "test" splits of phrase-spoiler examples.
datasets = dataset_formatte_obj.get_formatted_dataset()

In [None]:
class Preprocessor:
  """Tokenizes question/context pairs and labels each feature with answer token positions."""

  def __init__(self, model_checkpoint: str, max_length: int, doc_stride: int) -> None:
    """Load the tokenizer for `model_checkpoint` and store the windowing settings.

    Args:
      model_checkpoint: name or path passed to AutoTokenizer.from_pretrained.
      max_length: value passed to the tokenizer as `max_length`.
      doc_stride: value passed to the tokenizer as `stride`.
    """
    self.tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
    self.max_length = max_length
    self.doc_stride = doc_stride
    # When the tokenizer pads on the right the question goes first and the
    # context second; otherwise the order is swapped.
    self.pad_on_right = self.tokenizer.padding_side == "right"

  def prepare_train_features(self, examples):
    """Tokenize a batch and attach start/end token positions of the answer.

    Args:
      examples: batch mapping with "question", "context" and "answers" entries
        (as produced by `datasets.map(..., batched=True)`).

    Returns:
      The tokenizer output with "start_positions" and "end_positions" added.
      A feature whose span does not contain the answer (or whose example has
      no answer) points both positions at the CLS token.
    """
    # Strip leading whitespace from the questions (this rewrites the batch entry).
    examples["question"] = [q.lstrip() for q in examples["question"]]

    # Only the context side is truncated; overflowing tokens are returned as
    # additional features, so one example can yield several features.
    tokenized_examples = self.tokenizer(
        examples["question" if self.pad_on_right else "context"],
        examples["context" if self.pad_on_right else "question"],
        truncation="only_second" if self.pad_on_right else "only_first",
        max_length=self.max_length,
        stride=self.doc_stride,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length",
    )

    # Maps each feature back to the index of the example it was cut from.
    sample_mapping = tokenized_examples.pop("overflow_to_sample_mapping")

    # Per-token (start_char, end_char) offsets into the original text.
    offset_mapping = tokenized_examples.pop("offset_mapping")

    start_positions, end_positions = [], []

    for i, offsets in enumerate(offset_mapping):
        input_ids = tokenized_examples["input_ids"][i]
        cls_index = input_ids.index(self.tokenizer.cls_token_id)
        # Tells, for every token, which of the two input sequences it belongs to.
        sequence_ids = tokenized_examples.sequence_ids(i)
        sample_index = sample_mapping[i]
        answers = examples["answers"][sample_index]
        if len(answers["answer_start"]) == 0:
            # No answer recorded: label the feature with the CLS position.
            start_positions.append(cls_index)
            end_positions.append(cls_index)
        else:
            # Character span of the first answer inside the context.
            start_char = answers["answer_start"][0]
            end_char = start_char + len(answers["text"][0])
            # Find the first and last token of the context within this feature.
            token_start_index = 0
            while sequence_ids[token_start_index] != (1 if self.pad_on_right else 0):
                token_start_index += 1
            token_end_index = len(input_ids) - 1
            while sequence_ids[token_end_index] != (1 if self.pad_on_right else 0):
                token_end_index -= 1
            # Answer not fully inside this feature's context window: label with CLS.
            # A start_char of -1 also ends up here, because no offset is <= -1.
            if not (offsets[token_start_index][0] <= start_char and offsets[token_end_index][1] >= end_char):
                start_positions.append(cls_index)
                end_positions.append(cls_index)
            else:
                # Walk forward past the answer start, then step back one token.
                while token_start_index < len(offsets) and offsets[token_start_index][0] <= start_char:
                    token_start_index += 1
                start_positions.append(token_start_index - 1)
                # Walk backward past the answer end, then step forward one token.
                while offsets[token_end_index][1] >= end_char:
                    token_end_index -= 1
                end_positions.append(token_end_index + 1)
    tokenized_examples["start_positions"] = start_positions
    tokenized_examples["end_positions"] = end_positions

    return tokenized_examples

# Values handed to the tokenizer as `max_length` and `stride` respectively.
max_length = 384
doc_stride = 128
# Module-level preprocessor; FineTune.trainer_model reads its tokenizer by this name.
proprocessor_obj = Preprocessor(model_checkpoint, max_length, doc_stride)
# Tokenize every split in batches and drop the original columns, leaving only model inputs and labels.
tokenized_datasets = datasets.map(proprocessor_obj.prepare_train_features, batched=True, remove_columns=datasets["train"].column_names)
tokenized_datasets

In [None]:
class FineTune:
  """Holds a question-answering model and the ``Trainer`` used to fine-tune it.

  The trainer is not created by the constructor: call ``trainer_model()``
  first, then read it back with ``get_trainer()``.
  """

  def __init__(self, model_checkpoint) -> None:
    """Load the QA model for ``model_checkpoint`` and place it on a device."""
    self.model_checkpoint = model_checkpoint
    self.model = AutoModelForQuestionAnswering.from_pretrained(model_checkpoint)
    self.setup()
    # Filled in by trainer_model().
    self.trainer = None

  def setup(self):
    """Move the model to CUDA when available (CPU otherwise) and free cached memory."""
    if torch.cuda.is_available():
      target_device = torch.device('cuda')
    else:
      target_device = torch.device('cpu')
    self.model.to(target_device)
    print(f'Working on {target_device}')

    # Collect garbage first, then release cached CUDA memory.
    gc.collect()
    torch.cuda.empty_cache()
    print("GPU Cache removed")

  def trainer_model(self):
    """Build the ``Trainer`` and store it on ``self.trainer``.

    Reads the notebook-level names ``batch_size``, ``tokenized_datasets``,
    ``default_data_collator`` and ``proprocessor_obj``.
    """
    # Output directory is named after the last path component of the checkpoint.
    checkpoint_basename = self.model_checkpoint.split("/")[-1]
    training_args = TrainingArguments(
        f"{checkpoint_basename}-finetuned-webis",
        evaluation_strategy="epoch",
        learning_rate=2e-5,
        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=batch_size,
        num_train_epochs=1,
        weight_decay=0.01,
        push_to_hub=False,
    )

    self.trainer = Trainer(
        self.model,
        training_args,
        train_dataset=tokenized_datasets["train"],
        eval_dataset=tokenized_datasets["validation"],
        data_collator=default_data_collator,
        tokenizer=proprocessor_obj.tokenizer,
    )

  def get_trainer(self):
    """Return the trainer built by ``trainer_model()``, or ``None`` if it has not run."""
    return self.trainer


In [None]:
import locale


def getpreferredencoding(do_setlocale=True):
    """Return "UTF-8" regardless of the environment; ``do_setlocale`` is accepted and ignored."""
    forced_encoding = "UTF-8"
    return forced_encoding


# Replace the standard-library lookup so every later caller gets "UTF-8".
locale.getpreferredencoding = getpreferredencoding

In [None]:
!pip install --upgrade accelerate

In [None]:
# Load the checkpoint, build the Trainer and keep a notebook-level handle to it.
fine_tune_obj = FineTune(model_checkpoint)
fine_tune_obj.trainer_model()
trainer = fine_tune_obj.get_trainer()

In [None]:
# Run fine-tuning with the arguments configured in FineTune.trainer_model.
trainer.train()

In [None]:
# Evaluate on the dataset that was passed to the Trainer as eval_dataset.
trainer.evaluate()

In [None]:
# Keep the test split in its own DatasetDict; the post-processing and metric cells read it from here.
test_datasets = DatasetDict()
test_datasets["test"] = datasets["test"]

In [None]:
# Persist the trainer's model under "test-squad-trained".
trainer.save_model("test-squad-trained")

In [None]:
class Evaluate:
  """Prepares inference features for an extractive QA model and decodes its logits.

  The tokenizer and the windowing settings are taken from an existing
  preprocessor object (anything exposing ``tokenizer``, ``max_length`` and
  ``doc_stride``).
  """

  def __init__(self, proprocessor_obj):
    self.tokenizer = proprocessor_obj.tokenizer
    # The maximum length of a feature (question and context)
    self.max_length = proprocessor_obj.max_length
    # Passed to the tokenizer as `stride` when a long context overflows into several features
    self.doc_stride = proprocessor_obj.doc_stride
    # When the tokenizer pads on the right the question goes first and the context second
    self.pad_on_right = self.tokenizer.padding_side == "right"

  def prepare_validation_features(self, examples):
    """Tokenize a batch for inference and keep what is needed to decode answers.

    Args:
      examples: batch mapping with ``question``, ``context`` and ``id`` lists.
        The ``question`` list is replaced in place by its left-stripped version.

    Returns:
      The tokenizer output with ``overflow_to_sample_mapping`` removed, an
      ``example_id`` list (the source example id of every feature) and an
      ``offset_mapping`` in which every non-context token is ``None``.
    """
    examples["question"] = [q.lstrip() for q in examples["question"]]

    # Use the tokenizer and window settings stored on this instance instead of
    # notebook-level globals, so the object works on its own.
    tokenized_examples = self.tokenizer(
        examples["question" if self.pad_on_right else "context"],
        examples["context" if self.pad_on_right else "question"],
        truncation="only_second" if self.pad_on_right else "only_first",
        max_length=self.max_length,
        stride=self.doc_stride,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length",
    )
    # One entry per produced feature: index of the example it was cut from.
    sample_mapping = tokenized_examples.pop("overflow_to_sample_mapping")
    # Sequence id that marks context tokens (depends on the argument order above).
    context_index = 1 if self.pad_on_right else 0

    tokenized_examples["example_id"] = []
    for i in range(len(tokenized_examples["input_ids"])):
      sequence_ids = tokenized_examples.sequence_ids(i)
      tokenized_examples["example_id"].append(examples["id"][sample_mapping[i]])
      # Blank out offsets of non-context tokens so decoding can reject them.
      tokenized_examples["offset_mapping"][i] = [
          (o if sequence_ids[k] == context_index else None)
          for k, o in enumerate(tokenized_examples["offset_mapping"][i])
      ]

    return tokenized_examples

  def prepare_test_features(self, examples):
    """Build test-split features; identical processing to ``prepare_validation_features``."""
    return self.prepare_validation_features(examples)

  def postprocess_qa_predictions(self, examples, features, raw_predictions, n_best_size = 20, max_answer_length = 30):
    """Pick the best-scoring answer span for every example.

    Args:
      examples: dataset such that ``examples["id"]`` lists the ids and iterating
        yields rows with ``id`` and ``context``.
      features: sequence of feature rows with ``example_id`` and ``offset_mapping``.
      raw_predictions: pair ``(all_start_logits, all_end_logits)`` indexed by feature.
      n_best_size: number of top start and top end indices combined per feature.
      max_answer_length: maximum span length in tokens.

    Returns:
      ``collections.OrderedDict`` mapping example id to the predicted text
      (an empty string when no valid span exists).
    """
    all_start_logits, all_end_logits = raw_predictions
    example_id_to_index = {k: i for i, k in enumerate(examples["id"])}
    # Group feature indices by the example they were cut from.
    features_per_example = collections.defaultdict(list)
    for i, feature in enumerate(features):
      features_per_example[example_id_to_index[feature["example_id"]]].append(i)

    predictions = collections.OrderedDict()

    print(f"Post-processing {len(examples)} example predictions split into {len(features)} features.")

    for example_index, example in enumerate(tqdm(examples)):
      context = example["context"]
      valid_answers = []

      for feature_index in features_per_example[example_index]:
        start_logits = all_start_logits[feature_index]
        end_logits = all_end_logits[feature_index]
        offset_mapping = features[feature_index]["offset_mapping"]

        # Indices of the n_best_size highest logits, best first.
        start_indexes = np.argsort(start_logits)[-1 : -n_best_size - 1 : -1].tolist()
        end_indexes = np.argsort(end_logits)[-1 : -n_best_size - 1 : -1].tolist()
        for start_index in start_indexes:
          for end_index in end_indexes:
            # Reject indices outside the feature or on non-context tokens.
            if (
                start_index >= len(offset_mapping)
                or end_index >= len(offset_mapping)
                or offset_mapping[start_index] is None
                or offset_mapping[end_index] is None
            ):
              continue
            # Reject reversed spans and spans longer than max_answer_length tokens.
            if end_index < start_index or end_index - start_index + 1 > max_answer_length:
              continue

            start_char = offset_mapping[start_index][0]
            end_char = offset_mapping[end_index][1]
            valid_answers.append(
                {
                    "score": start_logits[start_index] + end_logits[end_index],
                    "text": context[start_char: end_char]
                }
            )

      if valid_answers:
        # max() returns the first maximal candidate, like a stable descending sort.
        best_answer = max(valid_answers, key=lambda x: x["score"])
      else:
        best_answer = {"text": "", "score": 0.0}

      predictions[example["id"]] = best_answer["text"]

    return predictions



# Evaluation helper that reuses the tokenizer and window settings of the preprocessor.
evaluate_obj = Evaluate(proprocessor_obj)

In [None]:
# Build inference features for the validation split; its original columns are removed from the output.
validation_features = datasets["validation"].map(
    evaluate_obj.prepare_validation_features,
    batched=True,
    remove_columns=datasets["validation"].column_names
)
# NOTE(review): `raw_predictions` is reassigned by the test-split cell below before anything reads this value.
raw_predictions = trainer.predict(validation_features)
# Re-apply the current format type with every feature column listed explicitly.
validation_features.set_format(type=validation_features.format["type"], columns=list(validation_features.features.keys()))

In [None]:
# Build inference features for the test split; its original columns are removed from the output.
test_features = datasets["test"].map(
    evaluate_obj.prepare_test_features,
    batched=True,
    remove_columns=datasets["test"].column_names
)
# Predictions for the test features; these are the ones post-processed below.
raw_predictions = trainer.predict(test_features)
# Re-apply the current format type with every feature column listed explicitly.
test_features.set_format(type=test_features.format["type"], columns=list(test_features.features.keys()))

In [None]:
# Decode the test-split logits into one answer string per example id.
final_predictions = evaluate_obj.postprocess_qa_predictions(test_datasets["test"], test_features, raw_predictions.predictions)

In [None]:
# NOTE(review): `load_metric` appears to be deprecated in newer `datasets` releases in favour of `evaluate.load` — confirm against the installed version.
metric = load_metric("squad")

In [None]:
# Reshape predictions and gold answers into the id-keyed records handed to metric.compute.
formatted_predictions = [{"id": k, "prediction_text": v} for k, v in final_predictions.items()]
references = [{"id": ex["id"], "answers": ex["answers"]} for ex in test_datasets["test"]]
metric.compute(predictions=formatted_predictions, references=references)

In [None]:
class EvaluationMetric:
  """Averages BLEU, METEOR and BERTScore over predicted/reference answer pairs.

  Predictions are dicts with ``id`` and ``prediction_text``; references are
  dicts with ``id`` and ``answers`` (whose ``text`` list holds the gold
  answers). Only the first gold answer of a reference is scored. Every average
  is taken over the number of predictions, so a prediction without a matching
  reference contributes zero.
  """

  def __init__(self):
    self.bleu = evaluate.load('bleu')
    self.meteor = evaluate.load('meteor')
    # Loaded through `evaluate.load` like the other metrics rather than a bare `load` name.
    self.bertscore = evaluate.load("bertscore")

  def _scoring_pairs(self, formatted_predictions, references):
    """Yield a ``(prediction, reference)`` string pair for every id match.

    Both strings are stripped and wrapped in double quotes before scoring.
    """
    # Index references by id once instead of rescanning them for every prediction.
    refs_by_id = {}
    for ref in references:
      refs_by_id.setdefault(ref["id"], []).append(ref)

    for each in formatted_predictions:
      for ref in refs_by_id.get(each["id"], ()):
        yield (
            f'"{each["prediction_text"].strip()}"',
            f'"{ref["answers"]["text"][0].strip()}"',
        )

  def get_bleu_score(self, formatted_predictions, references):
    """Return the mean per-pair BLEU score, or 0.0 when there are no predictions."""
    formatted_predictions = list(formatted_predictions)
    count = len(formatted_predictions)
    if count == 0:
      return 0.0

    results = 0
    for prediction, reference in self._scoring_pairs(formatted_predictions, references):
      results += self.bleu.compute(predictions = [prediction], references = [[reference]]).get("bleu")
    return results/count

  def get_meteor_score(self, formatted_predictions, references):
    """Return the mean per-pair METEOR score, or 0.0 when there are no predictions."""
    formatted_predictions = list(formatted_predictions)
    count = len(formatted_predictions)
    if count == 0:
      return 0.0

    results = 0
    for prediction, reference in self._scoring_pairs(formatted_predictions, references):
      results += self.meteor.compute(predictions = [prediction], references = [reference]).get("meteor")
    return results/count

  def get_bert_score(self, formatted_predictions, references):
    """Return ``(avg_precision, avg_recall, avg_f1)`` of BERTScore; zeros when there are no predictions."""
    formatted_predictions = list(formatted_predictions)
    count = len(formatted_predictions)
    if count == 0:
      return 0.0, 0.0, 0.0

    total_precision = 0
    total_recall = 0
    total_f1 = 0
    for prediction, reference in self._scoring_pairs(formatted_predictions, references):
      results = self.bertscore.compute(predictions=[prediction], references=[reference], lang="en")
      total_precision += results.get("precision")[0]
      total_recall += results.get("recall")[0]
      total_f1 += results.get("f1")[0]

    return total_precision/count, total_recall/count, total_f1/count
    

In [None]:
# Load the metrics once, then average each of them over the prediction/reference records built above.
evaluation_metric_obj = EvaluationMetric()
bleu_score = evaluation_metric_obj.get_bleu_score(formatted_predictions, references)
meteor_score = evaluation_metric_obj.get_meteor_score(formatted_predictions, references)
avg_precision, avg_recall, avg_f1 = evaluation_metric_obj.get_bert_score(formatted_predictions, references)

# The precision/recall/F1 values below are the BERTScore averages.
print("BLEU Score: ", bleu_score)
print("METEOR Score: ", meteor_score)
print("Avg. Precision value: ", avg_precision)
print("Avg. Recall value: ", avg_recall)
print("Avg. F1 value: ", avg_f1)

# Passage

In [None]:
# Checkpoint and per-device batch size used by the cells of this section.
model_checkpoint = "thatdramebaazguy/roberta-base-squad"
batch_size = 2

In [None]:
class DatasetFormatter:
  """Loads the JSONL splits and reshapes the passage-type rows into SQuAD-style QA data."""

  def __init__(self) -> None:
    self.train_df = pd.read_json("train.jsonl", lines = True)
    self.test_df = pd.read_json("validation.jsonl", lines = True)

  def list_to_string(self, spoiler_type):
    """Map the first tag to a code: phrase -> 0, passage -> 1, multi -> 2 (``None`` otherwise)."""
    if spoiler_type[0] == "phrase":
      return 0
    elif spoiler_type[0] == "passage":
      return 1
    elif spoiler_type[0] == "multi":
      return 2

  def return_text(self, text):
    """Return the first element of ``text``."""
    return text[0]

  def preprocess_data(self, data):
    """Convert raw rows into QA records.

    Args:
      data: DataFrame with ``uuid``, ``targetParagraphs``, ``postText`` and
        ``spoiler`` columns.

    Returns:
      A list of dicts with ``id``, ``context``, ``question`` and ``answers``.
      ``context`` and ``question`` are 1-tuples (kept for compatibility; unwrap
      them with ``return_text``). ``answer_start`` holds the position of the
      first spoiler inside the joined paragraphs, or -1 when it is not found.
    """
    formatted_data = []
    for index in range(data.shape[0]):
      # Convert the row once instead of once per field.
      record = data.iloc[index].to_dict()
      complete_description = " ".join(record["targetParagraphs"])
      formatted_data.append({
          "id": record["uuid"],
          "context": (complete_description,),
          "question": (record["postText"][0],),
          "answers": {
              "text": record["spoiler"],
              "answer_start": [complete_description.find(record["spoiler"][0])]
          },
      })

    return formatted_data

  def _passage_qa_frame(self, raw_df):
    """Return the passage-type rows of ``raw_df`` as a QA frame with plain-string context/question."""
    # Compute the tag codes on the side so the stored frame is not modified
    # and this method can be called repeatedly.
    tag_codes = raw_df["tags"].apply(self.list_to_string)
    # Taking only the passage rows (tag code 1)
    qa_df = pd.DataFrame(self.preprocess_data(raw_df[tag_codes == 1]))
    # Unwrap the 1-tuples produced by preprocess_data
    qa_df["context"] = qa_df["context"].apply(self.return_text)
    qa_df["question"] = qa_df["question"].apply(self.return_text)
    return qa_df

  def _split_frames(self, train_end, validation_start):
    """Return ``(train, validation, test)`` pandas frames; rows in between the two bounds are unused."""
    train_df = self._passage_qa_frame(self.train_df)
    test_df = self._passage_qa_frame(self.test_df)
    validation_df = train_df.iloc[validation_start:]
    train_df = train_df.iloc[:train_end]
    return train_df, validation_df, test_df

  def get_formatted_dataset(self, train_end=1174, validation_start=1258):
    """Build the ``DatasetDict`` with ``train``, ``validation`` and ``test`` splits.

    Args:
      train_end: the first ``train_end`` passage rows of the training file form
        the train split.
      validation_start: passage rows from this position onwards form the
        validation split.

    Returns:
      ``DatasetDict`` whose test split comes from the validation file.
    """
    train_df, validation_df, test_df = self._split_frames(train_end, validation_start)

    datasets = DatasetDict()
    datasets["train"] = Dataset.from_pandas(train_df)
    datasets["validation"] = Dataset.from_pandas(validation_df)
    datasets["test"] = Dataset.from_pandas(test_df)

    return datasets


# Read the JSONL files and build the train/validation/test DatasetDict for this section.
dataset_formatte_obj = DatasetFormatter()

datasets = dataset_formatte_obj.get_formatted_dataset()

In [None]:
class Preprocessor:
  """Tokenizes question/context batches and derives start/end token labels for QA training."""

  def __init__(self, model_checkpoint, max_length, doc_stride) -> None:
    """Load the tokenizer for ``model_checkpoint`` and store the windowing settings."""
    self.tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

    # Passed to the tokenizer as `max_length` and `stride` respectively.
    self.max_length = max_length
    self.doc_stride = doc_stride
    # Decides the argument order below: question first when the tokenizer pads on the right.
    self.pad_on_right = self.tokenizer.padding_side == "right"

  def prepare_train_features(self, examples):
    """Tokenize a batch and attach ``start_positions`` / ``end_positions`` labels.

    ``examples`` is a batch mapping with ``question``, ``context`` and
    ``answers`` lists; its ``question`` list is replaced by the left-stripped
    version. Features that do not fully contain the first answer are labelled
    with the CLS token index for both positions.
    """
    examples["question"] = [q.lstrip() for q in examples["question"]]

    # Only the context side is truncated; overflowing text is returned as
    # additional features, and character offsets are requested for labelling.
    tokenized_examples = self.tokenizer(
        examples["question" if self.pad_on_right else "context"],
        examples["context" if self.pad_on_right else "question"],
        truncation="only_second" if self.pad_on_right else "only_first",
        max_length=self.max_length,
        stride=self.doc_stride,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length",
    )

    # For every feature, the index of the example it was produced from.
    sample_mapping = tokenized_examples.pop("overflow_to_sample_mapping")

    # Per-feature character offsets; removed from the returned features.
    offset_mapping = tokenized_examples.pop("offset_mapping")


    start_positions, end_positions = [], []


    for i, offsets in enumerate(offset_mapping):
        input_ids = tokenized_examples["input_ids"][i]
        cls_index = input_ids.index(self.tokenizer.cls_token_id)

        sequence_ids = tokenized_examples.sequence_ids(i)

        sample_index = sample_mapping[i]
        answers = examples["answers"][sample_index]

        # No answer recorded: label the feature with the CLS index.
        if len(answers["answer_start"]) == 0:
            start_positions.append(cls_index)
            end_positions.append(cls_index)
        else:

            # Character span of the first answer only.
            start_char = answers["answer_start"][0]
            end_char = start_char + len(answers["text"][0])

            # Advance to the first token that belongs to the context.
            token_start_index = 0
            while sequence_ids[token_start_index] != (1 if self.pad_on_right else 0):
                token_start_index += 1
            # Walk back from the end to the last token that belongs to the context.
            token_end_index = len(input_ids) - 1
            while sequence_ids[token_end_index] != (1 if self.pad_on_right else 0):
                token_end_index -= 1

            # Answer not fully inside this feature's context window: label with CLS.
            if not (offsets[token_start_index][0] <= start_char and offsets[token_end_index][1] >= end_char):
                start_positions.append(cls_index)
                end_positions.append(cls_index)
            else:
                # Move past every token starting at or before the answer start, then step back one.
                while token_start_index < len(offsets) and offsets[token_start_index][0] <= start_char:
                    token_start_index += 1
                start_positions.append(token_start_index - 1)
                # Move back past every token ending at or after the answer end, then step forward one.
                while offsets[token_end_index][1] >= end_char:
                    token_end_index -= 1
                end_positions.append(token_end_index + 1)

    tokenized_examples["start_positions"] = start_positions
    tokenized_examples["end_positions"] = end_positions

    return tokenized_examples

# Maximum feature length and overlap stride handed to the tokenizer through Preprocessor.
max_length = 384
doc_stride = 128
proprocessor_obj = Preprocessor(model_checkpoint, max_length, doc_stride)
# Map the feature builder over every split in batches; the train split's original columns are removed from the output.
tokenized_datasets = datasets.map(proprocessor_obj.prepare_train_features, batched=True, remove_columns=datasets["train"].column_names)
# Last expression of the cell: display the mapped dataset object.
tokenized_datasets

In [None]:
class FineTune:
  """Holds a question-answering model and the ``Trainer`` used to fine-tune it.

  The trainer is not created by the constructor: call ``trainer_model()``
  first, then read it back with ``get_trainer()``.
  """

  def __init__(self, model_checkpoint) -> None:
    """Load the QA model for ``model_checkpoint`` and place it on a device."""
    self.model_checkpoint = model_checkpoint
    self.model = AutoModelForQuestionAnswering.from_pretrained(model_checkpoint)
    self.setup()
    # Filled in by trainer_model().
    self.trainer = None

  def setup(self):
    """Move the model to CUDA when available (CPU otherwise) and free cached memory."""
    if torch.cuda.is_available():
      target_device = torch.device('cuda')
    else:
      target_device = torch.device('cpu')
    self.model.to(target_device)
    print(f'Working on {target_device}')

    # Collect garbage first, then release cached CUDA memory.
    gc.collect()
    torch.cuda.empty_cache()
    print("GPU Cache removed")

  def trainer_model(self):
    """Build the ``Trainer`` and store it on ``self.trainer``.

    Reads the notebook-level names ``batch_size``, ``tokenized_datasets``,
    ``default_data_collator`` and ``proprocessor_obj``.
    """
    # Output directory is named after the last path component of the checkpoint.
    checkpoint_basename = self.model_checkpoint.split("/")[-1]
    training_args = TrainingArguments(
        f"{checkpoint_basename}-finetuned-webis",
        evaluation_strategy="epoch",
        learning_rate=2e-5,
        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=batch_size,
        num_train_epochs=1,
        weight_decay=0.01,
        push_to_hub=False,
    )

    self.trainer = Trainer(
        self.model,
        training_args,
        train_dataset=tokenized_datasets["train"],
        eval_dataset=tokenized_datasets["validation"],
        data_collator=default_data_collator,
        tokenizer=proprocessor_obj.tokenizer,
    )

  def get_trainer(self):
    """Return the trainer built by ``trainer_model()``, or ``None`` if it has not run."""
    return self.trainer


In [None]:
# Load the passage-section checkpoint, build the Trainer and keep a notebook-level handle to it.
fine_tune_obj = FineTune(model_checkpoint)
fine_tune_obj.trainer_model()
trainer = fine_tune_obj.get_trainer()

In [None]:
# Run fine-tuning with the arguments configured in FineTune.trainer_model.
trainer.train()

In [None]:
# Evaluate on the dataset that was passed to the Trainer as eval_dataset.
trainer.evaluate()

In [None]:
# Keep the test split in its own DatasetDict; the post-processing and metric cells read it from here.
test_datasets = DatasetDict()
test_datasets["test"] = datasets["test"]

In [None]:
# NOTE(review): same target name as the save cell of the previous section, so that output is presumably overwritten — confirm this is intended.
trainer.save_model("test-squad-trained")

In [None]:

class Evaluate:
  """Prepares inference features for an extractive QA model and decodes its logits.

  The tokenizer and the windowing settings are taken from an existing
  preprocessor object (anything exposing ``tokenizer``, ``max_length`` and
  ``doc_stride``).
  """

  def __init__(self, proprocessor_obj):
    self.tokenizer = proprocessor_obj.tokenizer
    # The maximum length of a feature (question and context)
    self.max_length = proprocessor_obj.max_length
    # Passed to the tokenizer as `stride` when a long context overflows into several features
    self.doc_stride = proprocessor_obj.doc_stride
    # When the tokenizer pads on the right the question goes first and the context second
    self.pad_on_right = self.tokenizer.padding_side == "right"

  def prepare_validation_features(self, examples):
    """Tokenize a batch for inference and keep what is needed to decode answers.

    Args:
      examples: batch mapping with ``question``, ``context`` and ``id`` lists.
        The ``question`` list is replaced in place by its left-stripped version.

    Returns:
      The tokenizer output with ``overflow_to_sample_mapping`` removed, an
      ``example_id`` list (the source example id of every feature) and an
      ``offset_mapping`` in which every non-context token is ``None``.
    """
    examples["question"] = [q.lstrip() for q in examples["question"]]

    # Use the tokenizer and window settings stored on this instance instead of
    # notebook-level globals, so the object works on its own.
    tokenized_examples = self.tokenizer(
        examples["question" if self.pad_on_right else "context"],
        examples["context" if self.pad_on_right else "question"],
        truncation="only_second" if self.pad_on_right else "only_first",
        max_length=self.max_length,
        stride=self.doc_stride,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length",
    )
    # One entry per produced feature: index of the example it was cut from.
    sample_mapping = tokenized_examples.pop("overflow_to_sample_mapping")
    # Sequence id that marks context tokens (depends on the argument order above).
    context_index = 1 if self.pad_on_right else 0

    tokenized_examples["example_id"] = []
    for i in range(len(tokenized_examples["input_ids"])):
      sequence_ids = tokenized_examples.sequence_ids(i)
      tokenized_examples["example_id"].append(examples["id"][sample_mapping[i]])
      # Blank out offsets of non-context tokens so decoding can reject them.
      tokenized_examples["offset_mapping"][i] = [
          (o if sequence_ids[k] == context_index else None)
          for k, o in enumerate(tokenized_examples["offset_mapping"][i])
      ]

    return tokenized_examples

  def prepare_test_features(self, examples):
    """Build test-split features; identical processing to ``prepare_validation_features``."""
    return self.prepare_validation_features(examples)

  def postprocess_qa_predictions(self, examples, features, raw_predictions, n_best_size = 20, max_answer_length = 30):
    """Pick the best-scoring answer span for every example.

    Args:
      examples: dataset such that ``examples["id"]`` lists the ids and iterating
        yields rows with ``id`` and ``context``.
      features: sequence of feature rows with ``example_id`` and ``offset_mapping``.
      raw_predictions: pair ``(all_start_logits, all_end_logits)`` indexed by feature.
      n_best_size: number of top start and top end indices combined per feature.
      max_answer_length: maximum span length in tokens.

    Returns:
      ``collections.OrderedDict`` mapping example id to the predicted text
      (an empty string when no valid span exists).
    """
    all_start_logits, all_end_logits = raw_predictions
    example_id_to_index = {k: i for i, k in enumerate(examples["id"])}
    # Group feature indices by the example they were cut from.
    features_per_example = collections.defaultdict(list)
    for i, feature in enumerate(features):
      features_per_example[example_id_to_index[feature["example_id"]]].append(i)

    predictions = collections.OrderedDict()

    print(f"Post-processing {len(examples)} example predictions split into {len(features)} features.")

    for example_index, example in enumerate(tqdm(examples)):
      context = example["context"]
      valid_answers = []

      for feature_index in features_per_example[example_index]:
        start_logits = all_start_logits[feature_index]
        end_logits = all_end_logits[feature_index]
        offset_mapping = features[feature_index]["offset_mapping"]

        # Indices of the n_best_size highest logits, best first.
        start_indexes = np.argsort(start_logits)[-1 : -n_best_size - 1 : -1].tolist()
        end_indexes = np.argsort(end_logits)[-1 : -n_best_size - 1 : -1].tolist()
        for start_index in start_indexes:
          for end_index in end_indexes:
            # Reject indices outside the feature or on non-context tokens.
            if (
                start_index >= len(offset_mapping)
                or end_index >= len(offset_mapping)
                or offset_mapping[start_index] is None
                or offset_mapping[end_index] is None
            ):
              continue
            # Reject reversed spans and spans longer than max_answer_length tokens.
            if end_index < start_index or end_index - start_index + 1 > max_answer_length:
              continue

            start_char = offset_mapping[start_index][0]
            end_char = offset_mapping[end_index][1]
            valid_answers.append(
                {
                    "score": start_logits[start_index] + end_logits[end_index],
                    "text": context[start_char: end_char]
                }
            )

      if valid_answers:
        # max() returns the first maximal candidate, like a stable descending sort.
        best_answer = max(valid_answers, key=lambda x: x["score"])
      else:
        best_answer = {"text": "", "score": 0.0}

      predictions[example["id"]] = best_answer["text"]

    return predictions



# Evaluation helper that reuses the tokenizer and window settings of the preprocessor.
evaluate_obj = Evaluate(proprocessor_obj)

In [None]:
# Build inference features for the validation split; its original columns are removed from the output.
validation_features = datasets["validation"].map(
    evaluate_obj.prepare_validation_features,
    batched=True,
    remove_columns=datasets["validation"].column_names
)
# NOTE(review): `raw_predictions` is reassigned by the test-split cell below before anything reads this value.
raw_predictions = trainer.predict(validation_features)
# Re-apply the current format type with every feature column listed explicitly.
validation_features.set_format(type=validation_features.format["type"], columns=list(validation_features.features.keys()))

In [None]:
# Build inference features for the test split; its original columns are removed from the output.
test_features = datasets["test"].map(
    evaluate_obj.prepare_test_features,
    batched=True,
    remove_columns=datasets["test"].column_names
)
# Predictions for the test features; these are the ones post-processed below.
raw_predictions = trainer.predict(test_features)
# Re-apply the current format type with every feature column listed explicitly.
test_features.set_format(type=test_features.format["type"], columns=list(test_features.features.keys()))

In [None]:
# Decode the test-split logits into one answer string per example id.
final_predictions = evaluate_obj.postprocess_qa_predictions(test_datasets["test"], test_features, raw_predictions.predictions)

In [None]:
# NOTE(review): `load_metric` appears to be deprecated in newer `datasets` releases in favour of `evaluate.load` — confirm against the installed version.
metric = load_metric("squad")

In [None]:
# Reshape predictions and gold answers into the id-keyed records handed to metric.compute.
formatted_predictions = [{"id": k, "prediction_text": v} for k, v in final_predictions.items()]
references = [{"id": ex["id"], "answers": ex["answers"]} for ex in test_datasets["test"]]
metric.compute(predictions=formatted_predictions, references=references)

In [None]:
class EvaluationMetric:
  """Averages BLEU, METEOR and BERTScore for passage answers.

  Before scoring, a predicted span is replaced by the first target paragraph of
  the same row that contains it (see ``return_target_paragraph``). Predictions
  are dicts with ``id`` and ``prediction_text``; references are dicts with
  ``id`` and ``answers``. Only the first gold answer is scored, and every
  average is taken over the number of predictions.
  """

  def __init__(self):
    self.bleu = evaluate.load('bleu')
    self.meteor = evaluate.load('meteor')
    # Loaded through `evaluate.load` like the other metrics rather than a bare `load` name.
    self.bertscore = evaluate.load("bertscore")

  def list_to_string(self, spoiler_type):
    """Map the first tag to a code: phrase -> 0, passage -> 1, multi -> 2 (``None`` otherwise)."""
    if spoiler_type[0] == "phrase":
      return 0
    elif spoiler_type[0] == "passage":
      return 1
    elif spoiler_type[0] == "multi":
      return 2

  def test_target_paragraphs(self, data):
    """Return ``(row[0], row[4])`` for every row of ``data``.

    NOTE(review): relies on column positions — presumably column 0 is ``uuid``
    and column 4 is ``targetParagraphs``; confirm against the file schema.
    """
    return [(row[0], row[4]) for row in data.values.tolist()]

  def _paragraphs_by_id(self):
    """Return a dict from row id to its paragraphs for passage rows, reading the file only once."""
    cache = getattr(self, "_paragraph_cache", None)
    if cache is None:
      test_df = pd.read_json("validation.jsonl", lines = True)
      tag_codes = test_df["tags"].apply(self.list_to_string)
      # Keep only passage rows (tag code 1).
      passage_df = test_df[tag_codes == 1]
      cache = {}
      for row_id, paragraphs in self.test_target_paragraphs(passage_df):
        cache.setdefault(row_id, []).extend(paragraphs)
      self._paragraph_cache = cache
    return cache

  def return_target_paragraph(self, id, prediction_text):
    """Return the first paragraph of row ``id`` that contains ``prediction_text``.

    Falls back to ``prediction_text`` itself when the id is unknown or no
    paragraph contains it.
    """
    for sentence in self._paragraphs_by_id().get(id, ()):
      if prediction_text in sentence:
        return sentence

    return prediction_text

  def _scoring_pairs(self, formatted_predictions, references):
    """Yield a ``(prediction, reference)`` string pair for every id match.

    Both strings are stripped and wrapped in double quotes before scoring.
    """
    # Index references by id once instead of rescanning them for every prediction.
    refs_by_id = {}
    for ref in references:
      refs_by_id.setdefault(ref["id"], []).append(ref)

    for each in formatted_predictions:
      for ref in refs_by_id.get(each["id"], ()):
        prediction = self.return_target_paragraph(each["id"], each["prediction_text"])
        yield (
            f'"{prediction.strip()}"',
            f'"{ref["answers"]["text"][0].strip()}"',
        )

  def get_bleu_score(self, formatted_predictions, references):
    """Return the mean per-pair BLEU score, or 0.0 when there are no predictions."""
    formatted_predictions = list(formatted_predictions)
    count = len(formatted_predictions)
    if count == 0:
      return 0.0

    results = 0
    for prediction, reference in self._scoring_pairs(formatted_predictions, references):
      results += self.bleu.compute(predictions = [prediction], references = [[reference]]).get("bleu")
    return results/count

  def get_meteor_score(self, formatted_predictions, references):
    """Return the mean per-pair METEOR score, or 0.0 when there are no predictions."""
    formatted_predictions = list(formatted_predictions)
    count = len(formatted_predictions)
    if count == 0:
      return 0.0

    results = 0
    for prediction, reference in self._scoring_pairs(formatted_predictions, references):
      results += self.meteor.compute(predictions = [prediction], references = [reference]).get("meteor")
    return results/count

  def get_bert_score(self, formatted_predictions, references):
    """Return ``(avg_precision, avg_recall, avg_f1)`` of BERTScore; zeros when there are no predictions."""
    formatted_predictions = list(formatted_predictions)
    count = len(formatted_predictions)
    if count == 0:
      return 0.0, 0.0, 0.0

    total_precision = 0
    total_recall = 0
    total_f1 = 0
    for prediction, reference in self._scoring_pairs(formatted_predictions, references):
      results = self.bertscore.compute(predictions=[prediction], references=[reference], lang="en")
      total_precision += results.get("precision")[0]
      total_recall += results.get("recall")[0]
      total_f1 += results.get("f1")[0]

    return total_precision/count, total_recall/count, total_f1/count

In [None]:
# Load the metrics once, then average each of them over the prediction/reference records built above.
evaluation_metric_obj = EvaluationMetric()
bleu_score = evaluation_metric_obj.get_bleu_score(formatted_predictions, references)
meteor_score = evaluation_metric_obj.get_meteor_score(formatted_predictions, references)
avg_precision, avg_recall, avg_f1 = evaluation_metric_obj.get_bert_score(formatted_predictions, references)

# The precision/recall/F1 values below are the BERTScore averages.
print("BLEU Score: ", bleu_score)
print("METEOR Score: ", meteor_score)
print("Avg. Precision value: ", avg_precision)
print("Avg. Recall value: ", avg_recall)
print("Avg. F1 value: ", avg_f1)

# Summarization

In [None]:
# !pip3 install datasets
# !pip3 install pytorch-lightning==1.2.7
# !pip3 install torchtext==0.6 torch

In [None]:
from transformers import (
    AdamW,
    T5ForConditionalGeneration,
    T5TokenizerFast as T5Tokenizer
)

from tqdm.auto import tqdm

from pylab import rcParams
import matplotlib.pyplot as plt
from matplotlib import rc
import seaborn as sns
from torch.utils.data import  Dataset,DataLoader
import pytorch_lightning as pl
import torch
import gc
import pandas as pd
import numpy as np
from datasets import load_dataset, load_metric

%matplotlib inline
%config InlineBackend.figure_format='retina'
sns.set(style="whitegrid", palette='muted', font_scale = 1.2)
rcParams['figure.figsize'] = 16,10

In [None]:
# Checkpoint name for the summarization section and the matching fast T5 tokenizer.
MODEL_NAME = "t5-base"

tokenizer = T5Tokenizer.from_pretrained(MODEL_NAME)

In [None]:
# Reload the raw JSONL files; the "test" frame is read from validation.jsonl.
train_df = pd.read_json("train.jsonl", lines = True)
test_df = pd.read_json("validation.jsonl", lines = True)

In [None]:
def list_to_string(spoiler_type):
  """Translate the first tag of ``spoiler_type`` into its integer code.

  "phrase" -> 0, "passage" -> 1, "multi" -> 2; any other tag yields ``None``.
  """
  for code, label in enumerate(("phrase", "passage", "multi")):
    if spoiler_type[0] == label:
      return code
  return None

In [None]:
# Replace the tag lists with integer codes.
# NOTE(review): overwrites the column in place, so re-running this cell on already-encoded frames will fail.
train_df["tags"] = train_df["tags"].apply(list_to_string)
test_df["tags"] = test_df["tags"].apply(list_to_string)

In [None]:
# Keep only rows with tag code 2, i.e. the "multi" spoiler type according to list_to_string.
train_df = train_df[train_df['tags']==2]
test_df = test_df[test_df['tags']==2]

In [None]:
def preprocess_data(data):
  """Turn raw rows into question/context/answer records.

  Args:
    data: DataFrame with ``uuid``, ``targetParagraphs``, ``postText`` and
      ``spoiler`` columns.

  Returns:
    List of dicts with ``id``, ``context``, ``question`` and ``answers``.
    ``context`` and ``question`` are 1-tuples; ``answers["answer_start"]`` holds
    the position of the first spoiler in the joined paragraphs (-1 if absent).
  """
  formatted_data = []
  for position in range(data.shape[0]):
    record = data.iloc[position].to_dict()
    # All paragraphs of the row joined by single spaces.
    full_text = " ".join(record["targetParagraphs"])
    formatted_data.append({
        "id": record["uuid"],
        "context": (full_text,),
        "question": (record["postText"][0],),
        "answers": {
            "text": record["spoiler"],
            "answer_start": [full_text.find(record["spoiler"][0])],
        },
    })

  return formatted_data

In [None]:
# From here on both names hold lists of record dicts rather than DataFrames.
train_df = preprocess_data(train_df)
test_df = preprocess_data(test_df)

In [None]:
# Turn the record lists back into DataFrames (one column per record key).
train_df= pd.DataFrame(train_df)
test_df = pd.DataFrame(test_df)

In [None]:
def return_text(text):
  """Return the first element of ``text`` (used to unwrap the 1-tuples from ``preprocess_data``)."""
  first_item = text[0]
  return first_item

In [None]:
# Unwrap the 1-tuples stored in the "context" column into plain strings.
train_df["context"] = train_df["context"].apply(return_text)
test_df["context"] = test_df["context"].apply(return_text)

In [None]:
# Unwrap the 1-tuples stored in the "question" column into plain strings.
train_df["question"] = train_df["question"].apply(return_text)
test_df["question"] = test_df["question"].apply(return_text)

In [None]:
# Preview the first training rows.
train_df.head()

In [None]:
# Preview the first test rows.
test_df.head()

In [None]:
def preprocess_answers(text):
  """Join every string in ``text["text"]`` into one comma-separated string (no spaces added)."""
  answer_parts = text["text"]
  separator = ","
  return separator.join(answer_parts)

In [None]:
# Replace each answers dict with its comma-joined answer text.
train_df["answers"] = train_df["answers"].apply(preprocess_answers)

In [None]:
# Replace each answers dict with its comma-joined answer text.
test_df["answers"] = test_df["answers"].apply(preprocess_answers)

In [None]:
# Keep the three columns in the order context, question, answers.
# join_context_question below reads them by position.
train_df = train_df[["context", "question", "answers"]]
test_df = test_df[["context", "question", "answers"]]

In [None]:
def join_context_question(data):
  """Merge the first two fields of every row into one model input.

  Args:
    data: iterable of rows; row[0] and row[1] are concatenated with " ? " in
      between and row[2] is kept as the target.

  Returns:
    DataFrame with the columns ``context`` (merged text) and ``answers``.
  """
  merged_rows = [[row[0] + " ? " + row[1], row[2]] for row in data]
  return pd.DataFrame(merged_rows, columns = ["context", "answers"])

In [None]:
# Collapse context and question into a single "context" column for training.
train_df = join_context_question(train_df.values.tolist())

In [None]:
# Collapse context and question into a single "context" column for testing.
test_df = join_context_question(test_df.values.tolist())

In [None]:
# Drop every non-ASCII character: encode with errors ignored, then decode back to str.
train_df["context"] = train_df["context"].str.encode("ascii", "ignore").str.decode("ascii")
train_df["answers"] = train_df["answers"].str.encode("ascii", "ignore").str.decode("ascii")

In [None]:
# Drop every non-ASCII character: encode with errors ignored, then decode back to str.
test_df["context"] = test_df["context"].str.encode("ascii", "ignore").str.decode("ascii")
test_df["answers"] = test_df["answers"].str.encode("ascii", "ignore").str.decode("ascii")

In [None]:
class ClickbaitSummaryDataset(Dataset):
  """Torch dataset that tokenizes (context, answers) rows for seq2seq training.

  Args:
    data: DataFrame with string columns ``context`` (model input) and
      ``answers`` (target text).
    tokenizer: callable tokenizer returning ``input_ids`` and
      ``attention_mask`` tensors of shape (1, max_length).
    text_max_token_len: padded/truncated length of the input sequence.
    summary_max_token_len: padded/truncated length of the target sequence.
  """

  def __init__(self,
               data,
               tokenizer,
               text_max_token_len= 512,
               summary_max_token_len= 128):
    self.tokenizer = tokenizer
    self.data = data
    self.text_max_token_len = text_max_token_len
    self.summary_max_token_len = summary_max_token_len

  def __len__(self):
    return len(self.data)

  def _encode(self, text, max_length):
    """Tokenize ``text`` padded/truncated to exactly ``max_length`` tokens."""
    return self.tokenizer(text,
                          max_length = max_length,
                          padding = "max_length",
                          truncation = True,
                          return_attention_mask = True,
                          add_special_tokens = True,
                          return_tensors = "pt")

  def __getitem__(self, index: int):
    """Return one example as a dict.

    Keys: ``text``, ``summary`` (raw strings), ``text_input_ids``,
    ``text_attention_mask`` (length ``text_max_token_len``), ``labels`` and
    ``labels_attention_mask`` (length ``summary_max_token_len``).
    """
    data_row = self.data.iloc[index]
    text = data_row["context"]
    summary = data_row["answers"]

    text_encoding = self._encode(text, self.text_max_token_len)
    # The target uses its own, shorter length budget rather than the input's.
    summary_encoding = self._encode(summary, self.summary_max_token_len)

    # Padding positions must not contribute to the loss: -100 is the index
    # ignored by the cross-entropy loss. Fall back to id 0 when the tokenizer
    # does not expose a pad token id.
    pad_token_id = getattr(self.tokenizer, "pad_token_id", None)
    if pad_token_id is None:
      pad_token_id = 0
    labels = summary_encoding["input_ids"].clone()
    labels[labels == pad_token_id] = -100

    return dict(text = text,
                summary = summary,
                text_input_ids = text_encoding["input_ids"].flatten(),
                text_attention_mask = text_encoding["attention_mask"].flatten(),
                labels = labels.flatten(),
                labels_attention_mask = summary_encoding["attention_mask"].flatten())

In [None]:
class ClickbaitSummaryDataModule(pl.LightningDataModule):
  """Lightning data module wrapping the train and test frames.

  Args:
    train_df: DataFrame used for training.
    test_df: DataFrame used for both validation and testing.
    tokenizer: tokenizer handed to ``ClickbaitSummaryDataset``.
    BATCH_SIZE: batch size for every dataloader (stored as ``batch_size``).
    text_max_token_len: token budget for the input text.
    summary_max_token_len: token budget for the target text.
  """

  # The constructor must be spelled __init__; under any other name Python
  # never calls it and none of these attributes are stored.
  def __init__(self, train_df, test_df, tokenizer, BATCH_SIZE,
               text_max_token_len = 512, summary_max_token_len = 128):
    super().__init__()
    self.train_df = train_df
    self.test_df = test_df
    self.tokenizer = tokenizer
    self.batch_size = BATCH_SIZE
    self.text_max_token_len = text_max_token_len
    self.summary_max_token_len = summary_max_token_len

  def setup(self, stage=None):
    """Build the train and test datasets from the stored frames."""
    self.train_dataset = ClickbaitSummaryDataset(
        self.train_df,
        self.tokenizer,
        self.text_max_token_len,
        self.summary_max_token_len
    )
    self.test_dataset = ClickbaitSummaryDataset(
        self.test_df,
        self.tokenizer,
        self.text_max_token_len,
        self.summary_max_token_len
    )

  def train_dataloader(self):
    return DataLoader(self.train_dataset, batch_size = self.batch_size, shuffle=True, num_workers = 2)

  # Evaluation loaders keep a fixed order so validation/test runs are reproducible.
  def test_dataloader(self):
    return DataLoader(self.test_dataset, batch_size = self.batch_size, shuffle = False, num_workers = 2)

  def val_dataloader(self):
    return DataLoader(self.test_dataset, batch_size = self.batch_size, shuffle = False, num_workers = 2)

In [None]:
class ClickbaitSummaryModel(pl.LightningModule):
  """Lightning wrapper around a pretrained T5 conditional-generation model.

  The train, validation and test hooks share one step implementation that
  runs a forward pass with labels and logs the resulting loss.
  """

  def __init__(self):
    super().__init__()
    self.model = T5ForConditionalGeneration.from_pretrained(MODEL_NAME, return_dict = True)

  def forward(self, input_ids, attention_mask, decoder_attention_mask, labels = None):
    """Run the wrapped model and return ``(loss, logits)``."""
    output = self.model(
        input_ids,
        attention_mask= attention_mask,
        labels = labels,
        decoder_attention_mask = decoder_attention_mask
    )

    return output.loss, output.logits

  def _shared_step(self, batch, stage):
    """Compute the loss for one batch and log it as ``<stage>_loss``."""
    loss, _ = self(input_ids = batch["text_input_ids"],
                   attention_mask = batch["text_attention_mask"],
                   decoder_attention_mask = batch["labels_attention_mask"],
                   labels = batch["labels"])
    self.log(f"{stage}_loss", loss, prog_bar = True, logger = True)
    return loss

  # The second hook argument is the batch index, which Lightning supplies.
  def training_step(self, batch, batch_idx):
    return self._shared_step(batch, "train")

  # validation_step and test_step must be defined at class level; nested
  # inside another method they are never registered as hooks.
  def validation_step(self, batch, batch_idx):
    return self._shared_step(batch, "val")

  def test_step(self, batch, batch_idx):
    return self._shared_step(batch, "test")

  def configure_optimizers(self):
    return AdamW(self.parameters(), lr = 0.0001)

In [None]:
# Training hyperparameters.
N_EPOCHS = 5  # passed to the Trainer as max_epochs
BATCH_SIZE = 2  # batch size used by the data module's dataloaders

In [None]:
# Build the data module, then set explicitly every attribute that its
# setup() and dataloader methods read.
data_module = ClickbaitSummaryDataModule(train_df, test_df, tokenizer, BATCH_SIZE)
data_module.train_df = train_df
data_module.test_df = test_df
data_module.tokenizer = tokenizer
data_module.text_max_token_len = 512
data_module.summary_max_token_len = 128
data_module.batch_size = BATCH_SIZE

In [None]:
# Instantiate the Lightning module; its constructor loads the pretrained
# weights named by MODEL_NAME.
model = ClickbaitSummaryModel()

In [None]:
# Request one GPU only when CUDA is actually available; otherwise train on
# CPU instead of failing at Trainer construction.
trainer = pl.Trainer(
    gpus = 1 if torch.cuda.is_available() else 0,
    max_epochs = N_EPOCHS,
    progress_bar_refresh_rate = 40
)

In [None]:
# Select the compute device: CUDA when present, CPU otherwise.
if torch.cuda.is_available():
  device = 'cuda'
else:
  device = 'cpu'
device

In [None]:
# Reclaim unreferenced Python objects and release cached CUDA memory before
# training, then place the model on the selected device.
gc.collect()
torch.cuda.empty_cache() 
model = model.to(device)

In [None]:
# Train the model on the dataloaders supplied by the data module.
trainer.fit(model, data_module)

In [None]:
# Reload the model from the checkpoint path recorded by the trainer's
# checkpoint callback.
trained_model = ClickbaitSummaryModel.load_from_checkpoint(
    trainer.checkpoint_callback.best_model_path
)

# Freeze the reloaded model; it is only used for generation from here on.
trained_model.freeze()

In [None]:
def summarizeText(text):
  """Generate the answer text for ``text`` with the fine-tuned model.

  The input is tokenized to at most 512 tokens and decoded with beam search
  (3 beams, at most 150 generated tokens).

  Args:
    text: input string (context plus question).

  Returns:
    The decoded generations concatenated into a single string.
  """
  # Inputs must live on the same device as the model weights, so look the
  # device up from the model instead of assuming CPU.
  model_device = next(trained_model.model.parameters()).device

  text_encoding = tokenizer(
      text,
      max_length = 512,
      padding = 'max_length',
      truncation = True,
      return_attention_mask = True,
      add_special_tokens = True,
      return_tensors = 'pt'
  )

  generated_ids = trained_model.model.generate(
      input_ids = text_encoding['input_ids'].to(model_device),
      attention_mask = text_encoding['attention_mask'].to(model_device),
      max_length = 150,
      num_beams = 3,
      repetition_penalty=2.5,
      length_penalty = 1.0,
      early_stopping=True
  )

  preds = [ tokenizer.decode(gen_id, skip_special_tokens=True, clean_up_tokenization_spaces=True)
            for gen_id in generated_ids
  ]

  return "".join(preds)

In [None]:
# Number of evaluation examples; used when collecting predictions and references.
test_dataset_len= len(test_df)

In [None]:
# One generated prediction per evaluation context, keyed by row position.
formatted_predictions = [
    {"id": row_idx, "prediction_text": summarizeText(passage)}
    for row_idx, passage in enumerate(test_df["context"])
]
# Gold answers wrapped in the answers dict (answer_start/text lists) passed to the metric.
references = [
    {"id": row_idx, "answers": {"answer_start":[0], "text":[gold_answer]}}
    for row_idx, gold_answer in enumerate(test_df["answers"])
]
result = metric.compute(predictions=formatted_predictions, references=references)

In [None]:
import evaluate
bleu = evaluate.load('bleu')

# Plain-string views of predictions and references, reused by the metric
# cells that follow.
prediction_test = [pred["prediction_text"].strip() for pred in formatted_predictions]
reference_test = [ref["answers"]["text"][0].strip() for ref in references]

results = bleu.compute(predictions = prediction_test, references = reference_test).get("bleu")

print(f"BLEU SCORE: {results}")

In [None]:
# METEOR over the same prediction/reference strings used for the other metrics.
meteor = evaluate.load('meteor')

results = meteor.compute(predictions = prediction_test, references = reference_test).get("meteor")

print(f"METEOR SCORE: {results}")

In [None]:
!pip install bert_score

In [None]:
# BERTScore
from statistics import mean

bert_score = evaluate.load("bertscore")

results = bert_score.compute(predictions = prediction_test, references = reference_test, lang='en')

# The metric returns a list of F1 values under "f1"; report their mean.
print("BERT F1 Score: ", mean(results.get("f1")))

In [None]:
!pip3 install rouge_score

In [None]:
# ROUGE over the same prediction/reference strings used for the other metrics.
rouge = evaluate.load('rouge')

results = rouge.compute(predictions = prediction_test, references = reference_test)


print(results.keys())

# Labels spell the metric name consistently as ROUGE.
print(f"ROUGE SCORE (Uni gram): {results['rouge1']}")
print(f"ROUGE SCORE (Bi gram): {results['rouge2']}")
print(f"ROUGE SCORE (LCS): {results['rougeL']}")
print(f"ROUGE SCORE (Lsum): {results['rougeLsum']}")