In [None]:
!pip install datasets 
!pip install evaluate
!pip install bert_score
!pip install transformers==4.28.0

In [None]:
import transformers
from datasets import load_dataset, load_metric
import pandas as pd
from datasets import Dataset
from datasets.load import DatasetDict
from transformers import AutoTokenizer
import torch
import gc
import collections
import evaluate
from evaluate import load
import numpy as np
from tqdm.auto import tqdm
from transformers import default_data_collator
from transformers import AutoModelForQuestionAnswering, TrainingArguments, Trainer
# Show which transformers version was actually imported by this kernel.
print(transformers.__version__)

In [None]:
# Checkpoint name handed to AutoTokenizer / AutoModelForQuestionAnswering below.
model_checkpoint = "Palak/microsoft_deberta-large_squad"
# Per-device batch size used for both training and evaluation in TrainingArguments.
batch_size = 2

In [None]:
class DatasetFormatter:
  """Loads the spoiler JSONL files and reshapes them into SQuAD-style splits.

  Each source row is expected to carry the columns read below: "uuid",
  "postText", "targetParagraphs", "spoiler" and "tags".
  """

  # Integer code assigned to the first entry of a row's "tags" list.
  _TAG_CODES = {"phrase": 0, "passage": 1, "multi": 2}

  def __init__(self, test_df_arg=None, train_path="train.jsonl", test_path="validation.jsonl") -> None:
    """Read the training and test JSONL files into data frames.

    Args:
      test_df_arg: Unused; kept (now optional) so existing positional calls
        keep working and a bare ``DatasetFormatter()`` no longer raises.
      train_path: JSON-lines file loaded into ``self.train_df``.
      test_path: JSON-lines file loaded into ``self.test_df``.
    """
    self.train_df = pd.read_json(train_path, lines=True)
    self.test_df = pd.read_json(test_path, lines=True)

  def list_to_string(self, spoiler_type):
    """Map a tags list to an integer code based on its first entry.

    Despite the name, the result is an int: 0 for "phrase", 1 for "passage",
    2 for "multi", and None for anything else.
    """
    return self._TAG_CODES.get(spoiler_type[0])

  def return_text(self, text):
    """Return the first element of ``text`` (unwraps a one-element container)."""
    return text[0]

  def preprocess_data(self, data):
    """Convert every row of ``data`` into a SQuAD-style record.

    Args:
      data: DataFrame with "uuid", "postText", "targetParagraphs" and
        "spoiler" columns.

    Returns:
      A list of dicts with keys "id", "context", "question" and "answers".
      "context" and "question" are one-element tuples (callers unwrap them
      with ``return_text``). "answers" holds the spoiler list under "text"
      and, under "answer_start", the character offset of the first spoiler
      inside the joined paragraphs (-1 when ``str.find`` does not locate it).
    """
    formatted_data = []
    for record in data.to_dict("records"):
      complete_description = " ".join(record["targetParagraphs"])
      formatted_data.append({
          "id": record["uuid"],
          # One-element tuples are kept on purpose: this is the shape the
          # method has always returned and get_formatted_dataset unwraps it.
          "context": (complete_description,),
          "question": (record["postText"][0],),
          "answers": {
              "text": record["spoiler"],
              "answer_start": [complete_description.find(record["spoiler"][0])],
          },
      })

    return formatted_data

  def get_formatted_dataset(self, train_end=1174, validation_start=1258):
    """Build the train / validation / test DatasetDict for "phrase" spoilers.

    Args:
      train_end: Rows ``[:train_end]`` of the formatted training data become
        the "train" split.
      validation_start: Rows ``[validation_start:]`` of the formatted training
        data become the "validation" split.

    Returns:
      A DatasetDict with "train", "validation" and "test" entries.

    NOTE(review): with the default bounds, rows train_end..validation_start-1
    end up in neither split - confirm that gap is intended.
    """
    # Work on copies so self.train_df / self.test_df keep their raw "tags"
    # lists and this method can safely be called more than once.
    train_df = self.train_df.assign(tags=self.train_df["tags"].apply(self.list_to_string))
    test_df = self.test_df.assign(tags=self.test_df["tags"].apply(self.list_to_string))

    # Keep only the "phrase" rows
    phrase_code = self._TAG_CODES["phrase"]
    train_df = train_df[train_df["tags"] == phrase_code]
    test_df = test_df[test_df["tags"] == phrase_code]

    train_df = pd.DataFrame(self.preprocess_data(train_df))
    test_df = pd.DataFrame(self.preprocess_data(test_df))

    # Unwrap the one-element tuples produced by preprocess_data
    for column in ("context", "question"):
      train_df[column] = train_df[column].apply(self.return_text)
      test_df[column] = test_df[column].apply(self.return_text)

    # Split the formatted training data into train and validation parts
    validation_df = train_df.iloc[validation_start:]
    train_df = train_df.iloc[:train_end]

    datasets = DatasetDict()
    datasets["train"] = Dataset.from_pandas(train_df)
    datasets["validation"] = Dataset.from_pandas(validation_df)
    datasets["test"] = Dataset.from_pandas(test_df)

    return datasets

# DatasetFormatter.__init__ declares a test_df_arg parameter that it never
# reads; pass None explicitly so the call cannot fail on a missing argument.
dataset_formatte_obj = DatasetFormatter(test_df_arg=None)

# DatasetDict holding the "train", "validation" and "test" splits.
datasets = dataset_formatte_obj.get_formatted_dataset()

In [None]:
import pandas as pd

### Preprocessing the training data

In [None]:
class Preprocessor:
  """Tokenizes question/context pairs and labels each feature with an answer span."""

  def __init__(self, model_checkpoint, max_length, doc_stride) -> None:
    """Load the tokenizer for ``model_checkpoint`` and store the tokenization settings.

    Args:
      model_checkpoint: Name or path given to ``AutoTokenizer.from_pretrained``.
      max_length: Maximum length of one feature (question and context together).
      doc_stride: Value passed to the tokenizer as ``stride``.
    """
    tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
    self.tokenizer = tokenizer
    self.max_length = max_length
    self.doc_stride = doc_stride
    # Decides whether the question or the context is passed first to the tokenizer.
    self.pad_on_right = tokenizer.padding_side == "right"

  def prepare_train_features(self, examples):
    """Tokenize a batch and attach ``start_positions`` / ``end_positions``.

    Args:
      examples: Batch mapping with "question", "context" and "answers" lists.
        Its "question" entry is replaced by the left-stripped questions.

    Returns:
      The tokenizer output without "overflow_to_sample_mapping" and
      "offset_mapping", extended with one start and one end token index per
      feature. Features without an answer, or whose context window does not
      fully contain the answer, are labelled with the CLS token index.
    """
    examples["question"] = [question.lstrip() for question in examples["question"]]

    # Pick argument order, truncation mode and the sequence id of the context
    # once, instead of repeating the pad_on_right test everywhere.
    if self.pad_on_right:
      first_key, second_key, truncation, context_id = "question", "context", "only_second", 1
    else:
      first_key, second_key, truncation, context_id = "context", "question", "only_first", 0

    encoded = self.tokenizer(
        examples[first_key],
        examples[second_key],
        truncation=truncation,
        max_length=self.max_length,
        stride=self.doc_stride,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length",
    )

    # Feature index -> index of the example it was cut from.
    feature_to_example = encoded.pop("overflow_to_sample_mapping")
    all_offsets = encoded.pop("offset_mapping")

    starts = []
    ends = []
    for feature_idx, offsets in enumerate(all_offsets):
      input_ids = encoded["input_ids"][feature_idx]
      cls_index = input_ids.index(self.tokenizer.cls_token_id)
      sequence_ids = encoded.sequence_ids(feature_idx)
      answers = examples["answers"][feature_to_example[feature_idx]]

      # Default label: the CLS token.
      span = (cls_index, cls_index)

      if len(answers["answer_start"]) != 0:
        start_char = answers["answer_start"][0]
        end_char = start_char + len(answers["text"][0])

        # First and last token of the context inside this feature.
        first = 0
        while sequence_ids[first] != context_id:
          first += 1
        last = len(input_ids) - 1
        while sequence_ids[last] != context_id:
          last -= 1

        answer_in_window = offsets[first][0] <= start_char and offsets[last][1] >= end_char
        if answer_in_window:
          # Step past the answer boundaries, then take one step back.
          while first < len(offsets) and offsets[first][0] <= start_char:
            first += 1
          while offsets[last][1] >= end_char:
            last -= 1
          span = (first - 1, last + 1)

      starts.append(span[0])
      ends.append(span[1])

    encoded["start_positions"] = starts
    encoded["end_positions"] = ends

    return encoded

# Maximum length of one tokenized feature (question and context together).
max_length = 384
# Passed to the tokenizer as `stride` when a context is split into several features.
doc_stride = 128
proprocessor_obj = Preprocessor(model_checkpoint, max_length, doc_stride)
# Tokenize every split in batches; the columns named after the train split's
# columns are removed from the result.
tokenized_datasets = datasets.map(proprocessor_obj.prepare_train_features, batched=True, remove_columns=datasets["train"].column_names)
# Display the tokenized DatasetDict.
tokenized_datasets

In [None]:
class FineTune:
  """Loads the question-answering model and builds the Trainer that fine-tunes it."""

  def __init__(self, model_checkpoint) -> None:
    """Load the model for ``model_checkpoint`` and move it to the available device."""
    # No Trainer exists until trainer_model() is called.
    self.trainer = None
    self.model_checkpoint = model_checkpoint
    self.model = AutoModelForQuestionAnswering.from_pretrained(model_checkpoint)
    self.setup()

  def setup(self):
    """Move the model to CUDA when available (CPU otherwise) and free cached memory."""
    target = 'cuda' if torch.cuda.is_available() else 'cpu'
    device = torch.device(target)
    self.model.to(device)
    print(f'Working on {device}')

    # Drop unreachable Python objects first, then release cached CUDA memory.
    gc.collect()
    torch.cuda.empty_cache()
    print("GPU Cache removed")

  def trainer_model(self):
    """Create the Trainer and store it on ``self.trainer``.

    Reads the notebook-level ``batch_size``, ``tokenized_datasets`` and
    ``proprocessor_obj`` variables.
    """
    # Output directory is named after the last path component of the checkpoint.
    output_dir = self.model_checkpoint.rsplit("/", 1)[-1] + "-finetuned-webis"
    training_args = TrainingArguments(
        output_dir=output_dir,
        evaluation_strategy="epoch",
        learning_rate=2e-5,
        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=batch_size,
        num_train_epochs=0.1,
        weight_decay=0.01,
        push_to_hub=False,
    )

    self.trainer = Trainer(
        model=self.model,
        args=training_args,
        train_dataset=tokenized_datasets["train"],
        eval_dataset=tokenized_datasets["validation"],
        data_collator=default_data_collator,
        tokenizer=proprocessor_obj.tokenizer,
    )

  def get_trainer(self):
    """Return the Trainer built by ``trainer_model`` (None before that call)."""
    return self.trainer


In [None]:
# Load the model, build the Trainer, and expose it under a notebook-level name.
fine_tune_obj = FineTune(model_checkpoint)
fine_tune_obj.trainer_model()
trainer = fine_tune_obj.get_trainer()

In [None]:
# Run fine-tuning with the arguments configured in FineTune.trainer_model.
trainer.train()

In [None]:
# Evaluate on the eval_dataset given to the Trainer (the tokenized validation split).
trainer.evaluate()

In [None]:
# Separate DatasetDict that holds only the untokenized test split.
test_datasets = DatasetDict()
test_datasets["test"] = datasets["test"]

In [None]:
# Persist the fine-tuned model under the "test-squad-trained" directory.
trainer.save_model("test-squad-trained")

In [None]:

class Evaluate:
  """Builds inference features and turns start/end logits into answer strings."""

  def __init__(self, proprocessor_obj):
    """Copy the tokenizer and tokenization settings from a Preprocessor-like object.

    Args:
      proprocessor_obj: Object exposing ``tokenizer``, ``max_length`` and
        ``doc_stride`` attributes.
    """
    self.tokenizer = proprocessor_obj.tokenizer
    self.max_length = proprocessor_obj.max_length
    self.doc_stride = proprocessor_obj.doc_stride
    self.pad_on_right = self.tokenizer.padding_side == "right"

  def _prepare_eval_features(self, examples):
    """Tokenize a batch for inference, keeping what post-processing needs.

    Shared implementation of prepare_validation_features and
    prepare_test_features. Uses only the settings stored on this instance
    (no notebook-level globals).
    """
    examples["question"] = [q.lstrip() for q in examples["question"]]

    tokenized_examples = self.tokenizer(
        examples["question" if self.pad_on_right else "context"],
        examples["context" if self.pad_on_right else "question"],
        truncation="only_second" if self.pad_on_right else "only_first",
        max_length=self.max_length,
        stride=self.doc_stride,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length",
    )

    # Feature index -> index of the example it was cut from.
    sample_mapping = tokenized_examples.pop("overflow_to_sample_mapping")
    # Sequence id that marks context tokens inside a feature.
    context_index = 1 if self.pad_on_right else 0

    example_ids = []
    for i in range(len(tokenized_examples["input_ids"])):
        sequence_ids = tokenized_examples.sequence_ids(i)
        example_ids.append(examples["id"][sample_mapping[i]])

        # Blank out offsets of non-context tokens so post-processing can tell
        # whether a predicted position lies inside the context.
        tokenized_examples["offset_mapping"][i] = [
            (o if sequence_ids[k] == context_index else None)
            for k, o in enumerate(tokenized_examples["offset_mapping"][i])
        ]

    tokenized_examples["example_id"] = example_ids
    return tokenized_examples

  def prepare_validation_features(self, examples):
    """Tokenize a validation batch for inference.

    Args:
      examples: Batch mapping with "id", "question" and "context" lists. Its
        "question" entry is replaced by the left-stripped questions.

    Returns:
      The tokenizer output without "overflow_to_sample_mapping", with an added
      "example_id" list and with "offset_mapping" entries set to None for
      tokens that are not part of the context.
    """
    return self._prepare_eval_features(examples)

  def prepare_test_features(self, examples):
    """Tokenize a test batch for inference; same contract as prepare_validation_features."""
    return self._prepare_eval_features(examples)

  def postprocess_qa_predictions(self, examples, features, raw_predictions, n_best_size = 20, max_answer_length = 30):
    """Pick the best-scoring answer text for every example.

    Args:
      examples: Untokenized examples; ``examples["id"]`` must list all ids and
        iterating must yield rows with "id" and "context".
      features: Tokenized features with "example_id" and "offset_mapping".
      raw_predictions: Pair ``(all_start_logits, all_end_logits)`` indexed by feature.
      n_best_size: Number of top start and top end positions that are combined.
      max_answer_length: Maximum answer length in tokens.

    Returns:
      An OrderedDict mapping example id to the predicted answer text ("" when
      no valid span was found).
    """
    all_start_logits, all_end_logits = raw_predictions

    # Group feature indices by the position of their source example.
    example_id_to_index = {k: i for i, k in enumerate(examples["id"])}
    features_per_example = collections.defaultdict(list)
    for i, feature in enumerate(features):
        features_per_example[example_id_to_index[feature["example_id"]]].append(i)

    predictions = collections.OrderedDict()

    print(f"Post-processing {len(examples)} example predictions split into {len(features)} features.")

    for example_index, example in enumerate(tqdm(examples)):
        context = example["context"]
        valid_answers = []

        for feature_index in features_per_example[example_index]:
            start_logits = all_start_logits[feature_index]
            end_logits = all_end_logits[feature_index]
            offset_mapping = features[feature_index]["offset_mapping"]

            # Indices of the n_best_size highest logits, best first.
            start_indexes = np.argsort(start_logits)[-1 : -n_best_size - 1 : -1].tolist()
            end_indexes = np.argsort(end_logits)[-1 : -n_best_size - 1 : -1].tolist()
            for start_index in start_indexes:
                for end_index in end_indexes:
                    # Skip positions outside the feature or outside the context.
                    if (
                        start_index >= len(offset_mapping)
                        or end_index >= len(offset_mapping)
                        or offset_mapping[start_index] is None
                        or offset_mapping[end_index] is None
                    ):
                        continue
                    # Skip reversed spans and spans longer than allowed.
                    if end_index < start_index or end_index - start_index + 1 > max_answer_length:
                        continue

                    start_char = offset_mapping[start_index][0]
                    end_char = offset_mapping[end_index][1]
                    valid_answers.append(
                        {
                            "score": start_logits[start_index] + end_logits[end_index],
                            "text": context[start_char: end_char]
                        }
                    )

        if valid_answers:
            # max() keeps the first of equally scored candidates, like a stable sort.
            best_answer = max(valid_answers, key=lambda answer: answer["score"])
        else:
            best_answer = {"text": "", "score": 0.0}

        predictions[example["id"]] = best_answer["text"]

    return predictions

# Reuse the tokenizer and tokenization settings of the training preprocessor.
evaluate_obj = Evaluate(proprocessor_obj)

In [None]:
# Tokenize the untokenized validation split for inference, dropping its original columns.
validation_features = datasets["validation"].map(
    evaluate_obj.prepare_validation_features,
    batched=True,
    remove_columns=datasets["validation"].column_names
)
# NOTE(review): this value is overwritten by the test-split predictions in a
# later cell before anything reads it.
raw_predictions = trainer.predict(validation_features)
# Re-select every column of the dataset under its current format type
# (presumably to undo column hiding done during predict - confirm).
validation_features.set_format(type=validation_features.format["type"], columns=list(validation_features.features.keys()))

In [None]:
# Tokenize the untokenized test split for inference, dropping its original columns.
test_features = datasets["test"].map(
    evaluate_obj.prepare_test_features,
    batched=True,
    remove_columns=datasets["test"].column_names
)
# Start/end logits for every test feature; consumed by postprocess_qa_predictions.
raw_predictions = trainer.predict(test_features)
# Re-select every column of the dataset under its current format type
# (presumably to undo column hiding done during predict - confirm).
test_features.set_format(type=test_features.format["type"], columns=list(test_features.features.keys()))

In [None]:
# Map every test example id to its best predicted answer text.
final_predictions = evaluate_obj.postprocess_qa_predictions(test_datasets["test"], test_features, raw_predictions.predictions)

In [None]:
# Load the SQuAD metric through the `evaluate` library (already imported above);
# `datasets.load_metric` is the deprecated predecessor of this call.
metric = evaluate.load("squad")

In [None]:
# Reshape predictions and gold answers into the id-keyed records passed to the metric.
formatted_predictions = [{"id": k, "prediction_text": v} for k, v in final_predictions.items()]
references = [{"id": ex["id"], "answers": ex["answers"]} for ex in test_datasets["test"]]
# Last expression of the cell: displays the computed SQuAD metric result.
metric.compute(predictions=formatted_predictions, references=references)

In [None]:
from bert_score import score
from evaluate import load
# Notebook-level handle to the BERTScore metric from the `evaluate` library.
bertscore = load("bertscore")

class EvaluationMetric:
  """Computes BLEU, METEOR and BERTScore for predicted answers against references."""

  def __init__(self):
    """Load the three metrics and start with an empty target-paragraph cache."""
    self.bleu = evaluate.load('bleu')
    self.meteor = evaluate.load('meteor')
    self.bertscore = load("bertscore")
    # Filled lazily by _load_target_paragraphs on first use.
    self._target_paragraphs = None

  def _load_target_paragraphs(self):
    """Return the (id, paragraphs) pairs from validation.jsonl, reading the file once.

    NOTE(review): only rows whose tag code is 1 ("passage") are kept, while
    DatasetFormatter in this notebook keeps code 0 ("phrase") - confirm this
    filter is intended.
    """
    cached = getattr(self, "_target_paragraphs", None)
    if cached is None:
      test_df = pd.read_json("validation.jsonl", lines = True)
      test_df["tags"] = test_df["tags"].apply(self.list_to_string)
      test_df = test_df[test_df["tags"] == 1]
      cached = self.test_target_paragraphs(test_df)
      self._target_paragraphs = cached
    return cached

  def return_target_paragraph(self, id, prediction_text):
    """Expand a predicted text to the stored paragraph that contains it.

    Args:
      id: Example id to look up.
      prediction_text: Predicted answer text.

    Returns:
      The first paragraph stored for ``id`` that contains ``prediction_text``,
      or ``prediction_text`` unchanged when there is no such paragraph.
    """
    for uuid, paragraphs in self._load_target_paragraphs():
      if uuid == id:
        for sentence in paragraphs:
          if prediction_text in sentence:
            return sentence

    return prediction_text

  def test_target_paragraphs(self, data):
    """Return ``(row[0], row[3])`` for every row of ``data``.

    Columns are addressed by position; this assumes column 0 holds the id and
    column 3 the list of paragraphs - TODO confirm against the JSONL schema.
    """
    return [(row[0], row[3]) for row in data.values.tolist()]

  def list_to_string(self, spoiler_type):
    """Map a tags list to 0 ("phrase"), 1 ("passage"), 2 ("multi") or None."""
    if spoiler_type[0] == "phrase":
      return 0
    elif spoiler_type[0] == "passage":
      return 1
    elif spoiler_type[0] == "multi":
      return 2

  def _paired_texts(self, formatted_predictions, references):
    """Pair every prediction with the references that share its id.

    Returns:
      Two parallel lists: the (paragraph-expanded) prediction texts and the
      stripped first reference answer of each matching reference.
    """
    # Index references by id once instead of rescanning them per prediction;
    # lists keep the original order when several references share an id.
    refs_by_id = collections.defaultdict(list)
    for ref in references:
      refs_by_id[ref["id"]].append(ref)

    predictions_test = []
    references_test = []
    for each in formatted_predictions:
      for ref in refs_by_id.get(each["id"], ()):
        predictions_test.append(
            self.return_target_paragraph(each["id"], each["prediction_text"].strip())
        )
        references_test.append(ref["answers"]["text"][0].strip())

    return predictions_test, references_test

  def get_bleu_score(self, formatted_predictions, references):
    """Return the corpus BLEU score and print up to ten prediction/reference pairs.

    Args:
      formatted_predictions: Dicts with "id" and "prediction_text".
      references: Dicts with "id" and "answers" (whose "text" is a list).
    """
    predictions_test, reference_texts = self._paired_texts(formatted_predictions, references)
    # Every prediction gets its own list of references here.
    references_test = [[text] for text in reference_texts]

    # Bounded by the number of pairs so short inputs do not raise IndexError.
    for i in range(min(10, len(predictions_test))):
      print("Prediction comparision: ", i)
      print("Model prediction --> ", predictions_test[i])
      print("Reference --> ", references_test[i])
      print('\n')

    return self.bleu.compute(predictions = predictions_test, references = references_test).get("bleu")

  def get_meteor_score(self, formatted_predictions, references):
    """Return the METEOR score for the paired predictions and references."""
    predictions_test, references_test = self._paired_texts(formatted_predictions, references)
    return self.meteor.compute(predictions = predictions_test, references = references_test).get("meteor")

  def get_bert_score(self, formatted_predictions, references):
    """Return the full BERTScore result for the paired predictions and references.

    Uses the metric stored on this instance rather than a notebook-level global.
    """
    predictions_test, references_test = self._paired_texts(formatted_predictions, references)
    return self.bertscore.compute(predictions=predictions_test, references=references_test, lang = "en")
    

In [None]:
from statistics import mean

# Score the test-split predictions with the three text-overlap metrics.
evaluation_metric_obj = EvaluationMetric()
bleu_score = evaluation_metric_obj.get_bleu_score(formatted_predictions, references)
meteor_score = evaluation_metric_obj.get_meteor_score(formatted_predictions, references)
bert_score = evaluation_metric_obj.get_bert_score(formatted_predictions, references)

print("BLEU Score: ", bleu_score)
print("METEOR Score: ", meteor_score)
# The BERTScore result carries one F1 value per pair under "f1"; report their mean.
print("BERT Score: ", mean(bert_score.get("f1")))