In [None]:
!pip install transformers
!pip install datasets
!pip install evaluate
!pip install bert_score

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
import requests
import json
import torch
import os
from tqdm import tqdm

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

import re
import string
import nltk
from nltk.corpus import stopwords
from datasets import load_metric
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

from google.colab import drive
drive.mount('/content/drive')


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Load both splits; each file holds one JSON object per line.
# Note that the frame named test_df is read from validation.jsonl.
train_df = pd.read_json("train.jsonl", lines=True)
test_df = pd.read_json("validation.jsonl", lines=True)

print("train data shape:", train_df.shape)
print("Test data shape:", test_df.shape)

# Columns kept for the rest of the notebook, and the shorter names they get.
KEPT_COLUMNS = ["uuid", "postText", "targetTitle", "targetParagraphs", "spoiler", "provenance", "tags", "spoilerPositions"]
SHORT_COLUMN_NAMES = {
    "postText": "clickbait",
    "targetTitle": "title",
    "targetParagraphs": "document",
    "spoilerPositions": "positions",
    "tags": "type",
}

train_df = train_df[KEPT_COLUMNS].rename(columns=SHORT_COLUMN_NAMES)
test_df = test_df[KEPT_COLUMNS].rename(columns=SHORT_COLUMN_NAMES)

# Encode the spoiler-type tag list as an integer class code.
def list_to_string(spoiler_type):
  """Return the integer code for the first tag in ``spoiler_type``.

  Despite the name, the result is an int: "phrase" -> 0, "passage" -> 1,
  "multi" -> 2. Any other first tag yields None.
  """
  first_tag = spoiler_type[0]
  for code, label in enumerate(("phrase", "passage", "multi")):
    if first_tag == label:
      return code
  return None

# Replace each row's tag list with the integer code returned by list_to_string.
train_df["type"] = train_df["type"].apply(list_to_string)
test_df["type"] = test_df["type"].apply(list_to_string)

train data shape: (3200, 14)
Test data shape: (800, 14)


In [None]:
# Keep only the rows whose type code is 1 ("passage" in list_to_string).
train_df = train_df[train_df['type']==1]
test_df = test_df[test_df['type']==1]

# Hold out the tail of the training rows for validation. One boundary is used
# for both slices so that every row lands in exactly one split; previously the
# validation slice started at 1258 while training stopped at 1174, silently
# discarding the rows in between.
TRAIN_SIZE = 1174
validation_df = train_df.iloc[TRAIN_SIZE:]
train_df = train_df.iloc[:TRAIN_SIZE]


In [None]:
def _build_qa_records(frame):
  """Convert spoiler rows into SQuAD-style records.

  Every row of ``frame`` becomes one record shaped like this example::

      {
          "human_spoiler": ...,
          "context": "Mistborn is a series of epic fantasy novels written by American author Brandon Sanderson.",
          "qas": [
              {
                  "id": "00001",
                  "is_impossible": False,
                  "question": "Who is the author of the Mistborn series?",
                  "answers": [
                      {"text": "Brandon Sanderson", "answer_start": 71}
                  ],
              }
          ],
      }

  The context is the row's paragraphs followed by its title, joined with
  single spaces. ``answer_start`` is the offset of the first spoiler string
  inside that context, or -1 when it does not occur verbatim.

  Args:
    frame: DataFrame with the columns "document", "title", "provenance",
      "uuid", "clickbait" and "spoiler".

  Returns:
    A list with one record dict per row, in row order.
  """
  records = []
  for _, row in frame.iterrows():
    # Build a fresh list instead of appending to row["document"]: the list is
    # shared with the DataFrame, so an in-place append would add the title
    # again every time this cell is re-run.
    context = ' '.join([*row["document"], row["title"]])
    spoiler_text = row["spoiler"][0]

    records.append({
        "human_spoiler": row["provenance"].get("humanSpoiler"),
        "context": context,
        "qas": [
            {
                "id": row["uuid"],
                "is_impossible": False,
                "question": row["clickbait"][0],
                "answers": [
                    {
                        "text": spoiler_text,
                        "answer_start": context.find(spoiler_text),
                    }
                ],
            }
        ]
    })
  return records


train_formatted_data = _build_qa_records(train_df)
eval_formatted_data = _build_qa_records(validation_df)
test_formatted_data = _build_qa_records(test_df)


In [None]:
# Preview a single record; displaying the whole list floods the cell output.
train_formatted_data[:1]

[{'human_spoiler': 'They Threw A Football',
  'context': 'It’ll be just like old times this weekend for Tom Brady and Wes Welker. Welker revealed Friday morning on a Miami radio station that he contacted Brady because he’ll be in town for Sunday’s game between the New England Patriots and Miami Dolphins at Gillette Stadium. It seemed like a perfect opportunity for the two to catch up. But Brady’s definition of "catching up" involves far more than just a meal. In fact, it involves some literal "catching" as the Patriots quarterback looks to stay sharp during his four-game Deflategate suspension. "I hit him up to do dinner Saturday night. He’s like, ‘I’m going to be flying in from Ann Arbor later (after the Michigan-Colorado football game), but how about that morning we go throw?’ " Welker said on WQAM, per The Boston Globe. "And I’m just sitting there, I’m like, ‘I was just thinking about dinner, but yeah, sure. I’ll get over there early and we can throw a little bit.’ " Welker was one 

In [None]:
def read_data(formatted_data):
  """Flatten SQuAD-style records into four parallel lists.

  One entry is produced per answer. The context and human spoiler of the
  enclosing passage and the question of the enclosing qa item are repeated
  for each of its answers. Answer dicts are passed through as-is (not copied).

  Returns:
    A tuple ``(contexts, questions, answers, human_spoilers)`` of lists.
  """
  columns = ([], [], [], [])

  for entry in formatted_data:
    passage_text = entry['context']
    spoiler_by_human = entry["human_spoiler"]
    for qa_item in entry['qas']:
      prompt = qa_item['question']
      for gold in qa_item['answers']:
        fields = (passage_text, prompt, gold, spoiler_by_human)
        for column, value in zip(columns, fields):
          column.append(value)

  return columns

In [None]:
# Flatten each split into parallel lists of contexts, questions, answers and human spoilers.
train_contexts, train_questions, train_answers, train_human_spoilers = read_data(train_formatted_data)
valid_contexts, valid_questions, valid_answers, valid_human_spoilers = read_data(eval_formatted_data)
test_contexts, test_questions, test_answers, test_human_spoilers = read_data(test_formatted_data)

In [None]:

def _nearest_occurrence(context, text, around):
  """Return the start of the occurrence of ``text`` in ``context`` closest to ``around``, or -1."""
  best = -1
  pos = context.find(text)
  while pos != -1:
    if best == -1 or abs(pos - around) < abs(best - around):
      best = pos
    pos = context.find(text, pos + 1)
  return best


def add_end_idx(answers, contexts):
  """Add ``'answer_end'`` to each answer dict, correcting ``'answer_start'`` if needed.

  Each answer is updated in place. The labelled start is tried first, then
  starts one and two characters earlier (labels are sometimes off by a
  character or two). If none of those align, the occurrence of the answer
  text nearest to the labelled start is used. An answer whose text does not
  occur in its context at all is left unchanged (no ``'answer_end'`` key).

  Args:
    answers: list of dicts with ``'text'`` and ``'answer_start'``.
    contexts: list of context strings, parallel to ``answers``.
  """
  for answer, context in zip(answers, contexts):
    gold_text = answer['text']
    start_idx = answer['answer_start']
    span_len = len(gold_text)

    matched_start = -1
    for shift in (0, 1, 2):
      candidate = start_idx - shift
      # A negative start would wrap around to the end of the string.
      if candidate >= 0 and context[candidate:candidate + span_len] == gold_text:
        matched_start = candidate
        break

    if matched_start == -1:
      # The label is off by more than two characters (or is -1): relocate it.
      matched_start = _nearest_occurrence(context, gold_text, start_idx)

    if matched_start == -1:
      continue

    answer['answer_start'] = matched_start
    answer['answer_end'] = matched_start + span_len

# Add the end offsets to the answers of every split (the dicts are updated in place).
for _split_answers, _split_contexts in (
    (train_answers, train_contexts),
    (valid_answers, valid_contexts),
    (test_answers, test_contexts),
):
  add_end_idx(_split_answers, _split_contexts)

In [None]:
from transformers import BertTokenizerFast, RobertaTokenizerFast
from transformers import AutoTokenizer, DebertaV2ForQuestionAnswering, DebertaTokenizerFast, DebertaTokenizer

tokenizer = RobertaTokenizerFast.from_pretrained("deepset/roberta-large-squad2")


def _encode_pairs(contexts, questions):
  """Tokenize (context, question) pairs: the context is the first sequence, the question the second."""
  return tokenizer(contexts, questions, truncation=True, padding=True)


train_encodings = _encode_pairs(train_contexts, train_questions)
valid_encodings = _encode_pairs(valid_contexts, valid_questions)
test_encodings = _encode_pairs(test_contexts, test_questions)

In [None]:
# Run the garbage collector, then ask PyTorch to release its cached CUDA memory.
import gc
gc.collect()
torch.cuda.empty_cache()

In [None]:
# Show which fields the tokenizer produced.
train_encodings.keys()

dict_keys(['input_ids', 'attention_mask'])

In [None]:
# Number of encoded training examples (one row of input_ids per example).
no_of_encodings = len(train_encodings['input_ids'])
print(f'We have {no_of_encodings} context-question pairs')

We have 1174 context-question pairs


In [None]:
def add_token_positions(encodings, answers):
  """Store token-level answer boundaries on ``encodings``.

  For example ``i`` the characters ``answer_start`` and ``answer_end - 1``
  are mapped to token indices with ``encodings.char_to_token``. When the
  mapping yields None (the original code attributes this to the answer
  having been truncated), ``tokenizer.model_max_length`` is stored instead.
  The results are written back under the keys 'start_positions' and
  'end_positions'.
  """
  start_positions, end_positions = [], []

  for sample_idx, answer in enumerate(answers):
    first_token = encodings.char_to_token(sample_idx, answer['answer_start'])
    last_token = encodings.char_to_token(sample_idx, answer['answer_end'] - 1)

    # None means the character has no token in the encoded input.
    start_positions.append(tokenizer.model_max_length if first_token is None else first_token)
    end_positions.append(tokenizer.model_max_length if last_token is None else last_token)

  encodings.update({'start_positions': start_positions, 'end_positions': end_positions})

# Attach start/end token positions to the encodings of every split.
for _split_encodings, _split_answers in (
    (train_encodings, train_answers),
    (valid_encodings, valid_answers),
    (test_encodings, test_answers),
):
  add_token_positions(_split_encodings, _split_answers)

In [None]:
class Dataset(torch.utils.data.Dataset):
  """Torch dataset over an encodings mapping.

  ``encodings`` must provide ``.items()`` (field name -> per-example values)
  and an ``input_ids`` attribute whose length is the number of examples.
  """

  def __init__(self, encodings):
    self.encodings = encodings

  def __len__(self):
    return len(self.encodings.input_ids)

  def __getitem__(self, idx):
    # Build one tensor per field for the requested example.
    sample = {}
    for field, values in self.encodings.items():
      sample[field] = torch.tensor(values[idx])
    return sample

In [None]:
# Wrap the encodings of each split in a torch Dataset.
train_dataset, valid_dataset, test_dataset = (
    Dataset(split_encodings)
    for split_encodings in (train_encodings, valid_encodings, test_encodings)
)

In [None]:
from torch.utils.data import DataLoader

# Define the dataloaders: every loader uses batches of 3; only the training loader shuffles.
train_loader = DataLoader(train_dataset, batch_size=3, shuffle=True)
valid_loader = DataLoader(valid_dataset, batch_size=3)
test_loader = DataLoader(test_dataset, batch_size=3)

In [None]:
from transformers import BertForQuestionAnswering, RobertaForQuestionAnswering, DebertaForQuestionAnswering

# Load the question-answering model from the same checkpoint name used for the tokenizer.
model = RobertaForQuestionAnswering.from_pretrained("deepset/roberta-large-squad2")

In [None]:
# Prefer the GPU when CUDA is available; otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f'Working on {device}')

Working on cuda


In [None]:
# Run the garbage collector, then ask PyTorch to release its cached CUDA memory.
import gc
gc.collect()
torch.cuda.empty_cache()

In [None]:
N_EPOCHS = 3
LEARNING_RATE = 2e-5

# torch.optim.AdamW replaces the deprecated transformers.AdamW. eps and
# weight_decay are set explicitly to the defaults of the transformers version
# so the optimisation itself is unchanged.
optim = torch.optim.AdamW(model.parameters(), lr=LEARNING_RATE, eps=1e-6, weight_decay=0.0)

model.to(device)
model.train()

for epoch in range(N_EPOCHS):
    epoch_loss = 0.0
    epoch_acc = 0.0
    counted_batches = 0   # batches that contributed to the running averages
    skipped_batches = 0   # batches dropped because their loss was NaN/inf

    loop = tqdm(train_loader, leave=True)
    loop.set_description(f'Epoch {epoch+1}')
    for batch in loop:
        optim.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        start_positions = batch['start_positions'].to(device)
        end_positions = batch['end_positions'].to(device)
        outputs = model(input_ids, attention_mask=attention_mask, start_positions=start_positions, end_positions=end_positions)
        loss = outputs.loss

        # A non-finite loss would turn the whole epoch average into NaN (the
        # recorded run shows "Train Loss:nan"). Presumably it arises when every
        # answer in the batch was truncated away and its positions were set to
        # tokenizer.model_max_length -- TODO confirm. Skip such batches.
        if not torch.isfinite(loss):
            skipped_batches += 1
            continue

        loss.backward()
        optim.step()

        epoch_loss += loss.item()

        start_pred = torch.argmax(outputs.start_logits, dim=1)
        end_pred = torch.argmax(outputs.end_logits, dim=1)

        # Fraction of exactly predicted start and end token indices in this batch.
        acc = ((start_pred == start_positions).sum() + (end_pred == end_positions).sum()) / (2 * len(start_positions))
        epoch_acc += acc.item()
        counted_batches += 1

        loop.set_postfix(loss=loss.item(), acc=acc.item())

    # Average over the batches that were actually used.
    epoch_loss /= max(counted_batches, 1)
    epoch_acc /= max(counted_batches, 1)
    print(f'Train Loss:{epoch_loss}, Traning accuracy: {epoch_acc}')
    if skipped_batches:
        print(f'Skipped {skipped_batches} batches with a non-finite loss')

Epoch 1: 100%|██████████| 392/392 [07:03<00:00,  1.08s/it, acc=0, loss=nan]


Train Loss:nan, Traning accuracy: 0.15051020792096245


Epoch 2: 100%|██████████| 392/392 [07:02<00:00,  1.08s/it, acc=0, loss=6.25]


Train Loss:nan, Traning accuracy: 0.001275510242094799


Epoch 3: 100%|██████████| 392/392 [07:02<00:00,  1.08s/it, acc=0, loss=6.21]

Train Loss:nan, Traning accuracy: 0.0017006803227930653





In [None]:
# Save the model and the tokenizer side by side in one directory on the mounted Drive.
model_path = '/content/drive/MyDrive/Ajaykumar_NLP/RoBERTa_passage'
model.save_pretrained(model_path)
tokenizer.save_pretrained(model_path)

('/content/drive/MyDrive/Ajaykumar_NLP/RoBERTa_passage/tokenizer_config.json',
 '/content/drive/MyDrive/Ajaykumar_NLP/RoBERTa_passage/special_tokens_map.json',
 '/content/drive/MyDrive/Ajaykumar_NLP/RoBERTa_passage/vocab.json',
 '/content/drive/MyDrive/Ajaykumar_NLP/RoBERTa_passage/merges.txt',
 '/content/drive/MyDrive/Ajaykumar_NLP/RoBERTa_passage/added_tokens.json',
 '/content/drive/MyDrive/Ajaykumar_NLP/RoBERTa_passage/tokenizer.json')

In [None]:
from transformers import RobertaForQuestionAnswering, RobertaTokenizerFast

# Reload the model and tokenizer from the directory they were saved to.
model_path = '/content/drive/MyDrive/Ajaykumar_NLP/RoBERTa_passage'
model = RobertaForQuestionAnswering.from_pretrained(model_path)
tokenizer = RobertaTokenizerFast.from_pretrained(model_path)

# Prefer the GPU when CUDA is available; otherwise use the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
print(f'Working on {device}')

model = model.to(device)

Working on cuda


In [None]:
model.eval()

val_loss = 0.0
val_acc = 0
total_samples = 0
loss_samples = 0  # samples whose batch produced a finite loss

with torch.no_grad():
  for batch in tqdm(valid_loader):
    input_ids = batch['input_ids'].to(device)
    attention_mask = batch['attention_mask'].to(device)
    start_true = batch['start_positions'].to(device)
    end_true = batch['end_positions'].to(device)

    batch_size = input_ids.size(0)
    total_samples += batch_size

    outputs = model(input_ids, attention_mask=attention_mask, start_positions=start_true, end_positions=end_true)

    loss = outputs.loss
    start_pred = torch.argmax(outputs.start_logits, dim=1)
    end_pred = torch.argmax(outputs.end_logits, dim=1)

    # Weight the batch loss by its size; leave out non-finite losses so a
    # single NaN batch does not turn the whole average into NaN.
    if torch.isfinite(loss):
      val_loss += loss.item() * batch_size
      loss_samples += batch_size

    # Count exactly matching start and end token indices.
    val_acc += ((start_pred == start_true).sum() + (end_pred == end_true).sum()).item()

val_loss = val_loss / loss_samples if loss_samples else float("nan")
val_acc = val_acc / (2 * total_samples) if total_samples else 0.0

print(f"Validation Loss: {val_loss:.4f}, Validation Accuracy: {val_acc:.4f}")


100%|██████████| 6/6 [00:01<00:00,  3.38it/s]

Validation Loss: 6.2383, Validation Accuracy: 0.0000





In [None]:
def get_prediction(context, question):
  """Return the answer span the model extracts for ``question`` from ``context``.

  The pair is encoded as (context, question), the same segment order that was
  used to build the training encodings, so that inference sees inputs laid out
  like the ones the model was fine-tuned on. The span runs from the arg-max of
  the start logits to the arg-max of the end logits (inclusive); it is the
  empty string when the predicted end lies before the predicted start.
  """
  inputs = tokenizer.encode_plus(context, question, return_tensors='pt', max_length=512, padding=True, truncation=True, add_special_tokens=True).to(device)

  # Inference only: no gradients are needed.
  with torch.no_grad():
    outputs = model(**inputs)

  answer_start = torch.argmax(outputs.start_logits)
  answer_end = torch.argmax(outputs.end_logits) + 1

  answer_ids = inputs['input_ids'][0][answer_start:answer_end]
  answer = tokenizer.convert_tokens_to_string(tokenizer.convert_ids_to_tokens(answer_ids))
  return answer

def question_answer(context, question,answer):
  """Return the model prediction for ``question``; ``answer`` is accepted but not used."""
  prediction = get_prediction(context,question)
  return prediction
     

In [None]:
# Print the inputs, reference answers and model prediction for the first test example only.
for context, question, answer, human_spoiler in zip(test_contexts, test_questions, test_answers, test_human_spoilers):
  print("context: ", context)
  print("question: ", question)
  print("answer: ", answer)
  print("human spoiler: ", human_spoiler)
  print("prediction: ", question_answer(context, question, answer).strip())
  break  # stop after the first example

context:  Five Nights at Freddy’s creator Scott Cawthon takes to Steam to tease a possible delay for Five Nights at Freddy’s: Sister Location, the fifth game in the series. For the past couple of years, horror gaming fans have been able to look forward to one new entry in the Five Nights at Freddy’s series after another, with four core games, one RPG spinoff, and a novel released so far. The next game in the franchise, Five Nights at Freddy’s: Sister Location, was scheduled to release this coming Friday, October 7th, but if developer Scott Cawthon is to be believed, the project has been delayed by a few months. According to a post by Cawthon on the Five Nights at Freddy’s: Sister Location Steam page, the game is being delayed because it’s too dark. Cawthon said that some of the plot elements are so disturbing that they are making him feel sick, and so he is thinking about delaying the game so that he can rework it entirely "into something kid-friendly." Delays happen in the gaming indu

In [None]:
# BLEU score: BLEU-2 computed per example, then averaged over the scored examples
import evaluate
bleu = evaluate.load('bleu')
results = 0
scored_examples = 0  # examples that have a human-written spoiler

for context, question, answer, human_spoiler in zip(test_contexts, test_questions, test_answers, test_human_spoilers):
  # Only examples with a non-empty human spoiler can be scored.
  reference = human_spoiler.strip() if human_spoiler else ""
  if not reference:
    continue

  # Score the raw strings. Wrapping both sides in literal quote characters added
  # tokens that always match, and made the empty-prediction fallback unreachable.
  # BLEU is undefined for an empty hypothesis, hence the placeholder.
  prediction = question_answer(context, question, answer).strip() or "None"

  results += bleu.compute(predictions=[prediction], references=[[reference]], max_order = 2).get("bleu")
  scored_examples += 1

# Average over the examples that were scored, not over the whole test set.
bleu_score = results/scored_examples if scored_examples else 0.0
print("BLEU Score: ", bleu_score)

Downloading builder script:   0%|          | 0.00/5.94k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/1.55k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/3.34k [00:00<?, ?B/s]

Question: Five Nights at Freddy’s Sequel Delayed for Weird Reason
Prediction: �s creator Scott Cawthon takes to Steam to tease a possible delay for Five Nights at Freddy’s: Sister Location, the fifth game in the series. For the past couple of years, horror gaming fans have been able to look forward to one new entry in the Five Nights at Freddy’s series after another, with four core games, one RPG spinoff, and a novel released so far. The next game in the franchise, Five Nights at Freddy’s: Sister Location, was scheduled to release this coming Friday, October 7th, but if developer Scott Cawthon is to be believed, the project has been delayed by a few months. According to a post by Cawthon on the Five Nights at Freddy’
True Answer: {'text': 'some of the plot elements are so disturbing that they are making him feel sick', 'answer_start': 767, 'answer_end': 845}
Question: A man swallowed a microSD card and you won't believe what happened next!
Prediction:  remain
True Answer: {'text': 'a m

In [None]:
# METEOR Score: computed per example, then averaged over the scored examples

meteor = evaluate.load('meteor')

results = 0
scored_examples = 0  # examples that have a human-written spoiler

for context, question, answer, human_spoiler in zip(test_contexts, test_questions, test_answers, test_human_spoilers):
  # Only examples with a non-empty human spoiler can be scored.
  reference = human_spoiler.strip() if human_spoiler else ""
  if not reference:
    continue

  # Score the raw strings. Wrapping both sides in literal quote characters added
  # tokens that always match, and made the empty-prediction fallback unreachable.
  prediction = question_answer(context, question, answer).strip() or "None"

  results += meteor.compute(predictions=[prediction], references=[reference]).get("meteor")
  scored_examples += 1

# Average over the examples that were scored, not over the whole test set.
meteor_score = results/scored_examples if scored_examples else 0.0
print("METEOR Score: ", meteor_score)


Downloading builder script:   0%|          | 0.00/6.81k [00:00<?, ?B/s]

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


Question: Five Nights at Freddy’s Sequel Delayed for Weird Reason
Prediction: �s creator Scott Cawthon takes to Steam to tease a possible delay for Five Nights at Freddy’s: Sister Location, the fifth game in the series. For the past couple of years, horror gaming fans have been able to look forward to one new entry in the Five Nights at Freddy’s series after another, with four core games, one RPG spinoff, and a novel released so far. The next game in the franchise, Five Nights at Freddy’s: Sister Location, was scheduled to release this coming Friday, October 7th, but if developer Scott Cawthon is to be believed, the project has been delayed by a few months. According to a post by Cawthon on the Five Nights at Freddy’
True Answer: {'text': 'some of the plot elements are so disturbing that they are making him feel sick', 'answer_start': 767, 'answer_end': 845}
Question: A man swallowed a microSD card and you won't believe what happened next!
Prediction:  remain
True Answer: {'text': 'a m

In [None]:
# BERTScore: precision / recall / F1 averaged over the scored examples

from evaluate import load
bertscore = load("bertscore")

predictions = []
references = []

for context, question, answer, human_spoiler in zip(test_contexts, test_questions, test_answers, test_human_spoilers):
  # Only examples with a non-empty human spoiler can be scored.
  reference = human_spoiler.strip() if human_spoiler else ""
  if not reference:
    continue

  # Score the raw strings. Wrapping both sides in literal quote characters
  # made the empty-prediction fallback unreachable and altered the scored text.
  predictions.append(question_answer(context, question, answer).strip() or "None")
  references.append(reference)

count = len(predictions)

if count:
  # One call over all pairs instead of one call per example.
  results = bertscore.compute(predictions=predictions, references=references, lang="en")
  total_precision = sum(results.get("precision"))
  total_recall = sum(results.get("recall"))
  total_f1 = sum(results.get("f1"))
else:
  results = {}
  total_precision = total_recall = total_f1 = 0

# Average over the examples that were scored, not over the whole test set.
avg_precision = total_precision/count if count else 0.0
avg_recall = total_recall/count if count else 0.0
avg_f1 = total_f1/count if count else 0.0

print("Avg. Precision value: ", avg_precision)
print("Avg. Recall value: ", avg_recall)
print("Avg. F1 value: ", avg_f1)


Downloading builder script:   0%|          | 0.00/7.95k [00:00<?, ?B/s]

Question: Five Nights at Freddy’s Sequel Delayed for Weird Reason
Prediction: �s creator Scott Cawthon takes to Steam to tease a possible delay for Five Nights at Freddy’s: Sister Location, the fifth game in the series. For the past couple of years, horror gaming fans have been able to look forward to one new entry in the Five Nights at Freddy’s series after another, with four core games, one RPG spinoff, and a novel released so far. The next game in the franchise, Five Nights at Freddy’s: Sister Location, was scheduled to release this coming Friday, October 7th, but if developer Scott Cawthon is to be believed, the project has been delayed by a few months. According to a post by Cawthon on the Five Nights at Freddy’
True Answer: {'text': 'some of the plot elements are so disturbing that they are making him feel sick', 'answer_start': 767, 'answer_end': 845}


Downloading (…)lve/main/config.json:   0%|          | 0.00/482 [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/1.43G [00:00<?, ?B/s]

Question: A man swallowed a microSD card and you won't believe what happened next!
Prediction:  remain
True Answer: {'text': 'a man who swallowed a 64GB microSD card and then pooped it into a strainer', 'answer_start': 244, 'answer_end': 318}
Question: You won't believe this stunning "Harry Potter" revelation about Professor McGonagall
Prediction: s
True Answer: {'text': 'McGonagall was appointed as Dumbledore’s assistant in 1956, not as his replacement.', 'answer_start': 1728, 'answer_end': 1811}
Question: J.J. Abrams has an answer on if there will be a post-credits scene in the new 'Star Wars'
Prediction:  for the "Star Wars" series as well... but now J.J. Abrams has definitely put an end to that conversation. The subject came up when Abrams was on stage for a "Star Wars: The Force Awakens" press conference held today in Los Angeles. The filmmaker was directly asked if fans can expect to see a post-credits scene or Easter Egg at the end of the sequel, and he quickly dismissed the ide