In [1]:
!pip install datasets
!pip install evaluate
!pip install bert_score
!pip install transformers==4.28.0

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting datasets
  Downloading datasets-2.12.0-py3-none-any.whl (474 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m474.6/474.6 kB[0m [31m9.2 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.7,>=0.3.0 (from datasets)
  Downloading dill-0.3.6-py3-none-any.whl (110 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m110.5/110.5 kB[0m [31m14.7 MB/s[0m eta [36m0:00:00[0m
Collecting xxhash (from datasets)
  Downloading xxhash-3.2.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (212 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m212.5/212.5 kB[0m [31m24.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting multiprocess (from datasets)
  Downloading multiprocess-0.70.14-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.3/134.3 kB[0m [31m17.4 MB/s[0m eta [36m0:00:00[0m
Collec

In [2]:
import transformers
from datasets import load_dataset, load_metric
import pandas as pd
from datasets import Dataset
from datasets.load import DatasetDict
from transformers import AutoTokenizer
import torch
import gc
import collections
import evaluate
from evaluate import load
import numpy as np
from tqdm.auto import tqdm
from transformers import default_data_collator
from transformers import AutoModelForQuestionAnswering, TrainingArguments, Trainer

print(transformers.__version__)

4.28.0


In [3]:
# Hugging Face Hub checkpoint name; later passed to AutoTokenizer.from_pretrained and
# AutoModelForQuestionAnswering.from_pretrained.
model_checkpoint = "Palak/microsoft_deberta-large_squad"
# Used as both per_device_train_batch_size and per_device_eval_batch_size in FineTune.trainer_model.
batch_size = 2

In [26]:
class DatasetFormatter:
  """Load the spoiler JSONL files and reshape them into SQuAD-style QA records.

  ``train.jsonl`` and ``validation.jsonl`` are read from the working directory.
  The rows are expected to carry the columns ``uuid``, ``postText``,
  ``targetParagraphs``, ``spoiler`` and ``tags`` (those are the keys this class reads).
  """

  # Integer code for the first entry of a row's ``tags`` list.
  TAG_CODES = {"phrase": 0, "passage": 1, "multi": 2}
  # Keys (and column order) of the records built by the preprocess_* methods.
  QA_COLUMNS = ["id", "context", "question", "answers"]

  def __init__(self) -> None:
    self.train_df = pd.read_json("train.jsonl", lines = True)
    self.test_df = pd.read_json("validation.jsonl", lines = True)

  def list_to_string(self, spoiler_type):
    """Map a ``tags`` list to its integer code (0 phrase, 1 passage, 2 multi).

    Only the first element is inspected; an unknown tag yields ``None``.
    """
    return self.TAG_CODES.get(spoiler_type[0])

  def return_text(self, text):
    """Return the first element of ``text`` (used to unwrap the 1-tuples built below)."""
    return text[0]

  def _to_qa_rows(self, data, separator, unwrap_question):
    """Build one QA record per row of ``data``.

    ``context`` and ``question`` are deliberately stored as 1-tuples: that is
    the shape the public preprocess_* methods have always returned, and
    ``return_text`` unwraps them afterwards.
    """
    rows = []
    # One dict per row, built once, instead of re-materialising the row for every field.
    for record in data.to_dict("records"):
      context = separator.join(record["targetParagraphs"])
      question = record["postText"][0] if unwrap_question else record["postText"]
      rows.append({
          "id": record["uuid"],
          "context": (context,),
          "question": (question,),
          "answers": {
              "text": record["spoiler"],
              # str.find returns -1 when the first spoiler is not a substring of the context.
              "answer_start": [context.find(record["spoiler"][0])],
          },
      })
    return rows

  def preprocess_data(self, data):
    """Convert ``data`` to QA records; paragraphs are concatenated with no separator
    and the question is the first entry of ``postText``."""
    return self._to_qa_rows(data, separator="", unwrap_question=True)

  def preprocess_data_test(self, data):
    """Variant of ``preprocess_data`` that joins paragraphs with a single space and
    keeps the whole ``postText`` list as the question."""
    return self._to_qa_rows(data, separator=" ", unwrap_question=False)

  def _to_flat_frame(self, data):
    """Run ``preprocess_data`` and unwrap the tuple-valued text columns into plain values."""
    # Explicit columns keep the frame well-formed even when ``data`` has no rows.
    frame = pd.DataFrame(self.preprocess_data(data), columns=self.QA_COLUMNS)
    for column in ("context", "question"):
      frame[column] = frame[column].apply(self.return_text)
    return frame

  def _build_split_frames(self, train_end, validation_start):
    """Return ``(train, validation, test)`` pandas frames without mutating the loaded data."""
    # Tag codes are computed on the side so ``self.train_df`` keeps its original
    # ``tags`` lists and this method can be called repeatedly.
    tag_codes = self.train_df["tags"].apply(self.list_to_string)

    # Taking only the phrase dataset for training; the test set is left unfiltered.
    phrase_rows = self.train_df[tag_codes == 0]

    train_df = self._to_flat_frame(phrase_rows)
    test_df = self._to_flat_frame(self.test_df)

    # Splitting the train and validation set from the training dataset.
    return train_df.iloc[:train_end], train_df.iloc[validation_start:], test_df

  def get_formatted_dataset(self, train_end=1174, validation_start=1258):
    """Build the ``DatasetDict`` with ``train``, ``validation`` and ``test`` splits.

    Args:
      train_end: phrase rows ``[0, train_end)`` form the training split.
      validation_start: phrase rows from ``validation_start`` onward form the
        validation split.

    With the defaults, phrase rows in ``[1174, 1258)`` belong to neither split.
    NOTE(review): confirm that gap is intentional.
    """
    train_df, validation_df, test_df = self._build_split_frames(train_end, validation_start)

    splits = DatasetDict()
    splits["train"] = Dataset.from_pandas(train_df)
    splits["validation"] = Dataset.from_pandas(validation_df)
    splits["test"] = Dataset.from_pandas(test_df)

    return splits

# Instantiating the formatter reads train.jsonl and validation.jsonl from the working directory.
dataset_formatte_obj = DatasetFormatter()

# DatasetDict with "train", "validation" and "test" splits in question/context/answers form.
datasets = dataset_formatte_obj.get_formatted_dataset()

### Preprocessing the training data

In [27]:
class Preprocessor:
  """Tokenize question/context pairs into fixed-length training features with answer-span labels.

  The tokenizer is called with ``return_overflowing_tokens=True`` and
  ``stride=doc_stride``, so a single example can produce several features.
  """

  def __init__(self, model_checkpoint, max_length, doc_stride) -> None:
    """Load the tokenizer for ``model_checkpoint`` and remember the windowing settings.

    Args:
      model_checkpoint: name or path handed to ``AutoTokenizer.from_pretrained``.
      max_length: passed to the tokenizer as ``max_length``.
      doc_stride: passed to the tokenizer as ``stride``.
    """
    self.tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
    self.max_length = max_length
    self.doc_stride = doc_stride
    # When the tokenizer pads on the right the question goes first and the context second;
    # otherwise the order (and the truncated side) is swapped.
    self.pad_on_right = self.tokenizer.padding_side == "right"

  def prepare_train_features(self, examples):
    """Tokenize a batch and attach ``start_positions`` / ``end_positions`` token labels.

    Args:
      examples: batch with list-valued ``question``, ``context`` and ``answers``
        entries; each answer has ``text`` and ``answer_start`` lists.
        ``examples["question"]`` is replaced in place by its left-stripped version.

    Returns:
      The tokenizer output with ``overflow_to_sample_mapping`` and
      ``offset_mapping`` removed and the two label lists added. A feature is
      labelled with the CLS token index when the example has no answer start
      or the answer does not lie entirely inside that feature's context span.
    """
    examples["question"] = [q.lstrip() for q in examples["question"]]

    tokenized_examples = self.tokenizer(
        examples["question" if self.pad_on_right else "context"],
        examples["context" if self.pad_on_right else "question"],
        truncation="only_second" if self.pad_on_right else "only_first",
        max_length=self.max_length,
        stride=self.doc_stride,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length",
    )

    # Maps each produced feature back to the index of the example it came from.
    sample_mapping = tokenized_examples.pop("overflow_to_sample_mapping")

    # Per-token (start_char, end_char) offsets, used to convert character positions to token indices.
    offset_mapping = tokenized_examples.pop("offset_mapping")

    start_positions, end_positions = [], []

    for i, offsets in enumerate(offset_mapping):
        input_ids = tokenized_examples["input_ids"][i]
        # Index of the CLS token; used as the label for features without an answer.
        cls_index = input_ids.index(self.tokenizer.cls_token_id)

        # Marks which input sequence (first or second) each token belongs to.
        sequence_ids = tokenized_examples.sequence_ids(i)

        sample_index = sample_mapping[i]

        answers = examples["answers"][sample_index]

        if len(answers["answer_start"]) == 0:
            start_positions.append(cls_index)
            end_positions.append(cls_index)
        else:
            # Character span of the first answer within the context.
            start_char = answers["answer_start"][0]
            end_char = start_char + len(answers["text"][0])

            # First token of the context part of this feature.
            token_start_index = 0
            while sequence_ids[token_start_index] != (1 if self.pad_on_right else 0):
                token_start_index += 1
            # Last token of the context part of this feature.
            token_end_index = len(input_ids) - 1
            while sequence_ids[token_end_index] != (1 if self.pad_on_right else 0):
                token_end_index -= 1

            # Answer not fully contained in this feature's context span: label with CLS.
            # An answer_start of -1 also ends up here, because no offset is <= -1.
            if not (offsets[token_start_index][0] <= start_char and offsets[token_end_index][1] >= end_char):
                start_positions.append(cls_index)
                end_positions.append(cls_index)
            else:
                # Advance past the last token starting at or before start_char, then step back one.
                while token_start_index < len(offsets) and offsets[token_start_index][0] <= start_char:
                    token_start_index += 1
                start_positions.append(token_start_index - 1)
                # Retreat past the last token ending at or after end_char, then step forward one.
                while offsets[token_end_index][1] >= end_char:
                    token_end_index -= 1
                end_positions.append(token_end_index + 1)

    tokenized_examples["start_positions"] = start_positions
    tokenized_examples["end_positions"] = end_positions

    return tokenized_examples

# Passed to the tokenizer as max_length.
max_length = 384
# Passed to the tokenizer as stride.
doc_stride = 128
proprocessor_obj = Preprocessor(model_checkpoint, max_length, doc_stride)
# Tokenize every split in batches; the original columns are dropped so only model inputs and labels remain.
tokenized_datasets = datasets.map(proprocessor_obj.prepare_train_features, batched=True, remove_columns=datasets["train"].column_names)
tokenized_datasets

Map:   0%|          | 0/1174 [00:00<?, ? examples/s]

Map:   0%|          | 0/109 [00:00<?, ? examples/s]

Map:   0%|          | 0/800 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'start_positions', 'end_positions'],
        num_rows: 2731
    })
    validation: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'start_positions', 'end_positions'],
        num_rows: 242
    })
    test: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'start_positions', 'end_positions'],
        num_rows: 2325
    })
})

In [7]:
class FineTune:
  """Wrap a question-answering checkpoint and build a ``Trainer`` to fine-tune it."""

  def __init__(self, model_checkpoint) -> None:
    """Load the QA model for ``model_checkpoint`` and move it to the available device."""
    self.model_checkpoint = model_checkpoint
    self.model = AutoModelForQuestionAnswering.from_pretrained(self.model_checkpoint)
    # Populated by trainer_model(); get_trainer() returns None until then.
    self.trainer = None
    self.setup()

  def setup(self):
    """Move the model to CUDA when available (CPU otherwise) and release cached memory."""
    device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
    self.model.to(device)
    print(f'Working on {device}')

    gc.collect()
    torch.cuda.empty_cache()
    print("GPU Cache removed")

  def trainer_model(self, train_dataset=None, eval_dataset=None, tokenizer=None,
                    per_device_batch_size=None, num_train_epochs=0.1):
    """Create the ``Trainer`` and store it on ``self.trainer``.

    Every argument is optional. When one is omitted the method falls back to
    the notebook-level name it previously read implicitly, so existing
    zero-argument calls behave as before.

    Args:
      train_dataset: training features; defaults to ``tokenized_datasets["train"]``.
      eval_dataset: evaluation features; defaults to ``tokenized_datasets["validation"]``.
      tokenizer: tokenizer handed to the ``Trainer``; defaults to ``proprocessor_obj.tokenizer``.
      per_device_batch_size: train and eval batch size per device; defaults to ``batch_size``.
      num_train_epochs: number of epochs (0.1 trains on a tenth of one epoch).
    """
    # Fallbacks are resolved lazily so explicit arguments never touch the globals.
    if train_dataset is None:
      train_dataset = tokenized_datasets["train"]
    if eval_dataset is None:
      eval_dataset = tokenized_datasets["validation"]
    if tokenizer is None:
      tokenizer = proprocessor_obj.tokenizer
    if per_device_batch_size is None:
      per_device_batch_size = batch_size

    # Output directory is derived from the last path component of the checkpoint name.
    model_name = self.model_checkpoint.split("/")[-1]
    training_args = TrainingArguments(
        f"{model_name}-finetuned-webis",
        evaluation_strategy = "epoch",
        learning_rate=2e-5,
        per_device_train_batch_size=per_device_batch_size,
        per_device_eval_batch_size=per_device_batch_size,
        num_train_epochs=num_train_epochs,
        weight_decay=0.01,
        push_to_hub=False,
    )

    self.trainer = Trainer(
        self.model,
        training_args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        data_collator=default_data_collator,
        tokenizer=tokenizer,
    )

  def get_trainer(self):
    """Return the ``Trainer`` built by ``trainer_model`` (``None`` if it has not been called)."""
    return self.trainer


In [8]:
!pip install --upgrade accelerate

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting accelerate
  Downloading accelerate-0.19.0-py3-none-any.whl (219 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m219.1/219.1 kB[0m [31m5.6 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: accelerate
Successfully installed accelerate-0.19.0


In [9]:
# Loads the checkpoint (downloading it if necessary) and moves the model to the available device.
fine_tune_obj = FineTune(model_checkpoint)
# Builds the Trainer and stores it on the object; get_trainer() then returns it.
fine_tune_obj.trainer_model()
trainer = fine_tune_obj.get_trainer()

Downloading (…)lve/main/config.json:   0%|          | 0.00/785 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/1.62G [00:00<?, ?B/s]

Working on cuda
GPU Cache removed


In [10]:
# Fine-tune on the tokenized train split with the arguments set in FineTune.trainer_model.
trainer.train()



Epoch,Training Loss,Validation Loss
0,No log,1.37046


TrainOutput(global_step=137, training_loss=1.6912549986456433, metrics={'train_runtime': 147.0789, 'train_samples_per_second': 1.857, 'train_steps_per_second': 0.931, 'total_flos': 222623454938112.0, 'train_loss': 1.6912549986456433, 'epoch': 0.1})

In [11]:
# Report the evaluation loss on the tokenized validation split given to the Trainer.
trainer.evaluate()

{'eval_loss': 1.3704603910446167,
 'eval_runtime': 29.1269,
 'eval_samples_per_second': 8.308,
 'eval_steps_per_second': 4.154,
 'epoch': 0.1}

In [28]:
# Separate DatasetDict holding only the untokenized test split (ids, contexts, answers),
# used later for post-processing and as the metric references.
test_datasets = DatasetDict()
test_datasets["test"] = datasets["test"]

In [29]:
# Persist the fine-tuned model to the local directory "test-squad-trained".
trainer.save_model("test-squad-trained")

In [30]:

class Evaluate:
  """Build inference features and turn start/end logits into answer strings."""

  def __init__(self, proprocessor_obj):
    """Reuse the tokenizer and windowing settings of an existing preprocessor.

    Args:
      proprocessor_obj: object exposing ``tokenizer``, ``max_length`` and ``doc_stride``.
    """
    self.tokenizer = proprocessor_obj.tokenizer
    # The maximum length of a feature (question and context)
    self.max_length = proprocessor_obj.max_length
    self.doc_stride = proprocessor_obj.doc_stride
    self.pad_on_right = self.tokenizer.padding_side == "right"

  def _prepare_eval_features(self, examples):
    """Tokenize a batch for inference.

    Uses only the instance's own tokenizer and settings, so it no longer
    depends on notebook globals. ``examples["question"]`` is replaced in place
    by its left-stripped version.

    Returns:
      The tokenizer output with ``overflow_to_sample_mapping`` removed, an
      ``example_id`` list naming the source example of each feature, and
      ``offset_mapping`` entries set to ``None`` for tokens outside the context.
    """
    examples["question"] = [q.lstrip() for q in examples["question"]]

    tokenized_examples = self.tokenizer(
        examples["question" if self.pad_on_right else "context"],
        examples["context" if self.pad_on_right else "question"],
        truncation="only_second" if self.pad_on_right else "only_first",
        max_length=self.max_length,
        stride=self.doc_stride,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length",
    )

    # Maps each produced feature back to the index of the example it came from.
    sample_mapping = tokenized_examples.pop("overflow_to_sample_mapping")
    # Sequence id carried by context tokens (second sequence when padding on the right).
    context_index = 1 if self.pad_on_right else 0

    tokenized_examples["example_id"] = []

    for i in range(len(tokenized_examples["input_ids"])):
      sequence_ids = tokenized_examples.sequence_ids(i)

      sample_index = sample_mapping[i]
      tokenized_examples["example_id"].append(examples["id"][sample_index])

      # Blank out offsets of non-context tokens so post-processing can skip them.
      tokenized_examples["offset_mapping"][i] = [
          (o if sequence_ids[k] == context_index else None)
          for k, o in enumerate(tokenized_examples["offset_mapping"][i])
      ]

    return tokenized_examples

  def prepare_validation_features(self, examples):
    """Tokenize a validation batch; see ``_prepare_eval_features``."""
    return self._prepare_eval_features(examples)

  def prepare_test_features(self, examples):
    """Tokenize a test batch; identical processing to ``prepare_validation_features``."""
    return self._prepare_eval_features(examples)

  def postprocess_qa_predictions(self, examples, features, raw_predictions, n_best_size = 20, max_answer_length = 30):
    """Pick the best-scoring answer span for every example.

    Args:
      examples: untokenized examples; must support ``examples["id"]`` and
        iteration over rows with ``id`` and ``context`` keys.
      features: tokenized features with ``example_id`` and ``offset_mapping``.
      raw_predictions: pair ``(all_start_logits, all_end_logits)`` indexed by feature.
      n_best_size: number of top start and top end indices combined per feature.
      max_answer_length: maximum span length in tokens.

    Returns:
      ``OrderedDict`` mapping example id to the predicted text, or ``""`` when
      no valid span exists for that example.
    """
    all_start_logits, all_end_logits = raw_predictions
    example_id_to_index = {k: i for i, k in enumerate(examples["id"])}
    features_per_example = collections.defaultdict(list)
    for i, feature in enumerate(features):
      features_per_example[example_id_to_index[feature["example_id"]]].append(i)

    predictions = collections.OrderedDict()

    # Logging.
    print(f"Post-processing {len(examples)} example predictions split into {len(features)} features.")

    for example_index, example in enumerate(tqdm(examples)):
      context = example["context"]
      valid_answers = []

      for feature_index in features_per_example[example_index]:
        start_logits = all_start_logits[feature_index]
        end_logits = all_end_logits[feature_index]
        offset_mapping = features[feature_index]["offset_mapping"]

        # Indices of the n_best_size largest logits, best first.
        start_indexes = np.argsort(start_logits)[-1 : -n_best_size - 1 : -1].tolist()
        end_indexes = np.argsort(end_logits)[-1 : -n_best_size - 1 : -1].tolist()
        for start_index in start_indexes:
          for end_index in end_indexes:
            # Skip spans that touch tokens outside the context.
            if (
                start_index >= len(offset_mapping)
                or end_index >= len(offset_mapping)
                or offset_mapping[start_index] is None
                or offset_mapping[end_index] is None
            ):
              continue
            # Skip inverted or over-long spans.
            if end_index < start_index or end_index - start_index + 1 > max_answer_length:
              continue

            start_char = offset_mapping[start_index][0]
            end_char = offset_mapping[end_index][1]
            valid_answers.append(
                {
                    "score": start_logits[start_index] + end_logits[end_index],
                    "text": context[start_char: end_char]
                }
            )

      if valid_answers:
        # max() returns the first of equally scored candidates, like the stable sort it replaces.
        best_answer = max(valid_answers, key=lambda x: x["score"])
      else:
        best_answer = {"text": "", "score": 0.0}

      predictions[example["id"]] = best_answer["text"]

    return predictions



# Shares the tokenizer, max_length and doc_stride of the training preprocessor.
evaluate_obj = Evaluate(proprocessor_obj)

In [31]:
# Tokenize the validation split for inference, keeping example ids and context offsets.
validation_features = datasets["validation"].map(
    evaluate_obj.prepare_validation_features,
    batched=True,
    remove_columns=datasets["validation"].column_names
)
# NOTE: raw_predictions is reassigned by the test-set prediction cell that follows,
# so these validation predictions are not used by later cells.
raw_predictions = trainer.predict(validation_features)
# Re-select every column after prediction.
# NOTE(review): presumably needed because predict() narrows the visible columns - confirm.
validation_features.set_format(type=validation_features.format["type"], columns=list(validation_features.features.keys()))

Map:   0%|          | 0/109 [00:00<?, ? examples/s]

In [32]:
# Tokenize the test split for inference, keeping example ids and context offsets.
test_features = datasets["test"].map(
    evaluate_obj.prepare_test_features,
    batched=True,
    remove_columns=datasets["test"].column_names
)
# Start/end logits for every test feature; this overwrites the earlier validation predictions.
raw_predictions = trainer.predict(test_features)
# Re-select every column after prediction.
# NOTE(review): presumably needed because predict() narrows the visible columns - confirm.
test_features.set_format(type=test_features.format["type"], columns=list(test_features.features.keys()))

Map:   0%|          | 0/800 [00:00<?, ? examples/s]

In [17]:
# Map example id -> predicted answer text, built from the test-set logits.
final_predictions = evaluate_obj.postprocess_qa_predictions(test_datasets["test"], test_features, raw_predictions.predictions)

Post-processing 800 example predictions split into 2325 features.


  0%|          | 0/800 [00:00<?, ?it/s]

In [34]:
# SQuAD exact-match / F1 metric. Loaded through the `evaluate` library, which
# replaces the deprecated `datasets.load_metric`; the compute() call is unchanged.
metric = evaluate.load("squad")

In [35]:
# Reshape predictions and gold answers into the id-keyed records passed to the metric.
formatted_predictions = [{"id": k, "prediction_text": v} for k, v in final_predictions.items()]
references = [{"id": ex["id"], "answers": ex["answers"]} for ex in test_datasets["test"]]
# Displays the exact_match and f1 scores on the test split.
metric.compute(predictions=formatted_predictions, references=references)

{'exact_match': 31.75, 'f1': 47.08840862770544}

In [36]:
from bert_score import score
from evaluate import load
# Module-level BERTScore metric handle, loaded through evaluate.load.
bertscore = load("bertscore")

class EvaluationMetric:
  """Compute BLEU, METEOR and BERTScore between predicted and reference answers."""

  # Maximum number of prediction/reference pairs echoed by get_bleu_score.
  PREVIEW_LIMIT = 30

  def __init__(self):
    self.bleu = evaluate.load('bleu')
    self.meteor = evaluate.load('meteor')
    self.bertscore = load("bertscore")

  def _paired_texts(self, formatted_predictions, references):
    """Align predictions with references by ``id``.

    Returns two parallel lists of stripped strings: the prediction text and the
    first reference answer text. Pairs are emitted in prediction order, and in
    reference order for references sharing an id; predictions without a
    matching reference are dropped.
    """
    # Index references once instead of rescanning the whole list per prediction.
    refs_by_id = {}
    for ref in references:
      refs_by_id.setdefault(ref["id"], []).append(ref)

    predictions_text, references_text = [], []
    for each in formatted_predictions:
      for ref in refs_by_id.get(each["id"], ()):
        predictions_text.append(each["prediction_text"].strip())
        references_text.append(ref["answers"]["text"][0].strip())
    return predictions_text, references_text

  def get_bleu_score(self, formatted_predictions, references):
    """Return the corpus BLEU score and print up to ``PREVIEW_LIMIT`` sample pairs.

    Args:
      formatted_predictions: records with ``id`` and ``prediction_text``.
      references: records with ``id`` and ``answers`` (``answers["text"]`` list).
    """
    predictions_test, reference_texts = self._paired_texts(formatted_predictions, references)
    # BLEU takes a list of acceptable references per prediction.
    references_test = [[text] for text in reference_texts]

    # Bounded by the number of pairs so short inputs no longer raise IndexError.
    for i in range(min(self.PREVIEW_LIMIT, len(predictions_test))):
      print("Prediction comparision: ", i)
      print("Model prediction --> ", predictions_test[i])
      print("Reference --> ", references_test[i])
      print('\n')

    return self.bleu.compute(predictions = predictions_test, references = references_test).get("bleu")

  def get_meteor_score(self, formatted_predictions, references):
    """Return the METEOR score for the aligned prediction/reference pairs."""
    predictions_test, references_test = self._paired_texts(formatted_predictions, references)
    return self.meteor.compute(predictions = predictions_test, references = references_test).get("meteor")

  def get_bert_score(self, formatted_predictions, references):
    """Return the raw BERTScore result for the aligned pairs.

    The result is whatever the metric's ``compute`` returns; callers read its
    ``"f1"`` entry.
    """
    predictions_test, references_test = self._paired_texts(formatted_predictions, references)
    # Use the instance's own metric handle rather than a module-level global.
    return self.bertscore.compute(predictions=predictions_test, references=references_test, lang = "en")
    

In [21]:
# Tabular copy of the predictions (one row per example id) for export.
formatted_predictions_df = pd.DataFrame(data = formatted_predictions, columns = ['id','prediction_text'])

In [22]:
# Write the predictions as CSV text.
# NOTE(review): the file name has no ".csv" extension, unlike "phrase_references.csv" - confirm intended.
formatted_predictions_df.to_csv('phrase_formatted_predictions',header=True, index=False)

In [23]:
# Tabular copy of the gold answers (one row per example id) for export.
references_df = pd.DataFrame(data = references, columns = ['id','answers'])

In [24]:
# Write the references to "phrase_references.csv" without the index column.
references_df.to_csv('phrase_references.csv',index=False,header=True)

In [3]:
from statistics import mean

# Instantiating loads the BLEU, METEOR and BERTScore metrics.
evaluation_metric_obj = EvaluationMetric()
bleu_score = evaluation_metric_obj.get_bleu_score(formatted_predictions, references)
meteor_score = evaluation_metric_obj.get_meteor_score(formatted_predictions, references)
# Raw BERTScore result; its "f1" entry is averaged below.
bert_score = evaluation_metric_obj.get_bert_score(formatted_predictions, references)


print("BLEU Score: ", bleu_score)
print("METEOR Score: ", meteor_score)
# Reported BERT score is the mean of the per-pair F1 values.
print("BERT Score: ", mean(bert_score.get("f1")))

BLEU Score: 0.2301933199388655
METEOR Score: 0.4978373170106201
BERT Score: 0.8937415886670351
