In [None]:
# Dataset locations: every JSONL split lives in one folder under the mounted drive.
root_url = '/content/drive/MyDrive/DL_NLP/data'
train_path = f'{root_url}/train_complete.jsonl'
extra_train_path = f'{root_url}/cleaned_ARC_CommonSense.jsonl'
dev_path = f'{root_url}/dev_complete.jsonl'
test_path = f'{root_url}/test_complete.jsonl'

In [None]:
# Mount Google Drive at /content/drive; the dataset paths configured in this
# notebook all point below that directory.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
!pip install datasets



In [None]:
!pip install transformers



In [None]:
from datasets import load_dataset, load_metric, ClassLabel
from transformers import AutoTokenizer
from transformers import AutoModelForMultipleChoice, TrainingArguments, Trainer
from dataclasses import dataclass
from transformers.tokenization_utils_base import PreTrainedTokenizerBase, PaddingStrategy
from typing import Optional, Union
import transformers
import random
import pandas as pd

from pprint import pprint

import torch
import math
import time
import sys
import json
import numpy as np

In [None]:
# Answer-option labels; the data collator turns a label into its position in
# this list, and preprocessing builds one candidate sentence per entry.
ending_names = ['A', 'B', 'C', 'D']
# Checkpoint name shared by the tokenizer and the model loaded below.
model_chkpt = "bert-base-uncased"
# NOTE(review): not referenced by any of the visible cells — confirm whether it is still needed.
fake_sentence = 'asdkjaslda asdkajasd ds'
# Fast tokenizer matching the checkpoint.
tokenizer  = AutoTokenizer.from_pretrained(model_chkpt, use_fast=True)
# Multiple-choice model built from the same checkpoint (the cell output reports
# that some of its weights are newly initialized rather than loaded).
model = AutoModelForMultipleChoice.from_pretrained(model_chkpt)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMultipleChoice: ['cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMultipleChoice from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMultipleChoice from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForMultipleChoice were not initialized from the model checkpoint at bert-base-uncased and are newly

In [None]:
def compute_metrics(eval_predictions):
    """Compute accuracy for a batch of evaluation predictions.

    `eval_predictions` unpacks into ``(predictions, label_ids)``. The predicted
    class of each row is the index of its largest score along axis 1; the
    result is ``{"accuracy": <fraction of rows whose prediction equals the label>}``
    as a plain Python float.
    """
    scores, gold = eval_predictions
    predicted = np.argmax(scores, axis=1)
    hits = (predicted == gold).astype(np.float32)
    return {"accuracy": hits.mean().item()}

In [None]:
@dataclass
class DataCollatorForMultipleChoice:
    """
    Data collator that will dynamically pad the inputs for multiple choice received.

    Each incoming feature is a dict whose values (other than the label) are
    lists with one entry per answer choice. Calling the collator returns a dict
    of tensors viewed as (batch_size, num_choices, -1) plus a "labels" tensor
    holding, for every example, the position of its label in `ending_names`.

    The input feature dicts are left untouched, so the same list of features
    can be collated more than once.
    """
    # Tokenizer whose `pad` method is used to pad the flattened choices.
    tokenizer: PreTrainedTokenizerBase
    # The three settings below are forwarded unchanged to `tokenizer.pad`.
    padding: Union[bool, str, PaddingStrategy] = True
    max_length: Optional[int] = None
    pad_to_multiple_of: Optional[int] = None

    def __call__(self, features):
        # The label column is looked up under "label" first, otherwise "labels".
        label_name = "label" if "label" in features[0].keys() else "labels"
        # Read (rather than pop) the label so the caller's dicts are not mutated.
        labels = [ending_names.index(feature[label_name]) for feature in features]
        batch_size = len(features)
        num_choices = len(features[0]["input_ids"])
        # One flat dict per (example, choice) pair, built in a single linear pass;
        # the label is excluded because it is not a per-choice list.
        flattened_features = [
            {k: v[i] for k, v in feature.items() if k != label_name}
            for feature in features
            for i in range(num_choices)
        ]

        batch = self.tokenizer.pad(
            flattened_features,
            padding=self.padding,
            max_length=self.max_length,
            pad_to_multiple_of=self.pad_to_multiple_of,
            return_tensors="pt",
        )
        # Un-flatten
        batch = {k: v.view(batch_size, num_choices, -1) for k, v in batch.items()}
        # Add back labels
        batch["labels"] = torch.tensor(labels, dtype=torch.int64)
        return batch

In [None]:
def choices(example):
    """Spread the answer options of one example into top-level keys.

    Every entry of ``example['question.choices']`` contributes one key named
    after the entry's 'label' whose value is the entry's 'text'. The
    'question.choices' key is then removed. The example is modified in place
    and returned; 'question.stem' is deliberately left in place.
    """
    option_texts = {option['label']: option['text'] for option in example['question.choices']}
    example.update(option_texts)
    example.pop('question.choices', None)
    return example

In [None]:
def show_random_elements(dataset, num_examples=10):
    """Pretty-print an HTML table of `num_examples` distinct random rows of `dataset`.

    The dataset is indexed with the list of picked row numbers and the result
    is loaded into a DataFrame. Columns whose feature type is a `ClassLabel`
    are converted from integer ids to their names before printing.

    Raises:
        AssertionError: if `num_examples` exceeds the dataset length.
    """
    assert num_examples <= len(dataset), "Can't pick more elements than there are in the dataset."
    # random.sample draws distinct indices directly, replacing the retry loop
    # that re-drew on every collision and scanned a list for membership.
    picks = random.sample(range(len(dataset)), num_examples)

    df = pd.DataFrame(dataset[picks])
    for column, typ in dataset.features.items():
        if isinstance(typ, ClassLabel):
            df[column] = df[column].transform(lambda i: typ.names[i])
    pprint(df.to_html())

In [None]:
def show_one(example):
    """Print one example in readable form.

    Output is the 'fact1' context, then one line per option A-D made of the
    question stem followed by that option's text, then the ground-truth label.
    """
    print(f"Context: {example['fact1']}")
    for letter in ('A', 'B', 'C', 'D'):
        print(f"  {letter} - {example['question.stem']} {example[letter]}")
    print(f"\nGround truth: option {example['label']}")

In [None]:
def preprocess_function(examples):
    """Tokenize a batch of examples as (context, question + option) sentence pairs.

    `examples` is a batch in column form: it is read through the keys "fact1",
    "question.stem" and one key per entry of `ending_names`. For every example
    the context is paired with each option, all pairs are tokenized in one call
    (with truncation), and each tokenizer output is regrouped into one list per
    example containing one entry per option.
    """
    # Derive the group size from ending_names so the repeated contexts, the
    # candidate sentences and the regrouping below can never disagree.
    num_choices = len(ending_names)

    # Repeat each context once per option (already flat, built in linear time).
    first_sentences = [context for context in examples["fact1"] for _ in range(num_choices)]
    # Question stem followed by each option's text, in the same example-major order.
    second_sentences = [
        f"{header} {examples[end][i]}"
        for i, header in enumerate(examples["question.stem"])
        for end in ending_names
    ]

    # Tokenize
    tokenized_examples = tokenizer(first_sentences, second_sentences, truncation=True)
    # Un-flatten
    return {
        k: [v[i:i + num_choices] for i in range(0, len(v), num_choices)]
        for k, v in tokenized_examples.items()
    }

In [None]:
# Experiment switch: 1 keeps every record's 'fact1' text, 0 replaces it with ''.
facts = 1

# Rewritten copies of the splits go to the working directory; the file suffix
# is 'd' when the facts are dropped and 'e' when they are kept.
suffix = 'd' if facts == 0 else 'e'
input_files = [extra_train_path, train_path, test_path, dev_path]
output_files = [f'{split}_complete_{suffix}.jsonl' for split in ('extra_train', 'train', 'test', 'dev')]

for src_path, dst_path in zip(input_files, output_files):
    with open(src_path) as json_file:
        # Skip blank lines so a stray empty line cannot crash json.loads.
        records = [json.loads(line) for line in json_file if line.strip()]
    if facts == 0:
        for record in records:
            record['fact1'] = ''
    # `with` guarantees the output file is closed even if a write fails.
    with open(dst_path, 'wt') as fout:
        for record in records:
            fout.write('%s\n' % json.dumps(record))

batch_size = 16
extra_train_file, train_file, test_file, dev_file = output_files
openbookQA = load_dataset('json', data_files={'extra_train': extra_train_file,
                                              'train': train_file,
                                              'validation': dev_file,
                                              'test': test_file})

# Flatten nested fields into dotted column names, spread the answer options
# into their own columns and expose the answer key under the name 'label'.
flatten = openbookQA.flatten()
updated = flatten.map(choices)
updated = updated.rename_column('answerKey', 'label')

show_one(updated['train'][0])

# Sanity check: tokenize a handful of examples and compare the decoded inputs
# of one of them with the readable form of the same example.
examples = updated['train'][:5]
preview_features = preprocess_function(examples)

idx = 3
# Printed explicitly: a bare expression in the middle of a cell is discarded.
pprint([tokenizer.decode(preview_features["input_ids"][idx][i]) for i in range(len(ending_names))])
show_one(updated['train'][idx])

encoded_datasets = updated.map(preprocess_function, batched=True)

model_name = model_chkpt.split("/")[-1]
args = TrainingArguments(f"{model_name}-finetuned-swag",
                         evaluation_strategy="epoch",
                         learning_rate=5e-5,
                         per_device_train_batch_size=batch_size,
                         num_train_epochs=3,
                         weight_decay=0.01)

# Sanity check of the collator on the first ten training examples.
accepted_keys = ["input_ids", "attention_mask", "label"]
features = [{k: v for k, v in encoded_datasets["train"][i].items() if k in accepted_keys} for i in range(10)]
batch = DataCollatorForMultipleChoice(tokenizer)(features)

pprint([tokenizer.decode(batch["input_ids"][8][i].tolist()) for i in range(len(ending_names))])
show_one(updated["train"][8])

# Start from the pretrained checkpoint so this run's metrics do not depend on
# training that earlier executions of a training cell already applied to
# `model`; seeding first makes the newly initialized weights repeatable.
transformers.set_seed(args.seed)
model = AutoModelForMultipleChoice.from_pretrained(model_chkpt)

# Two training stages on the same model: first the extra training split, then
# the main training split. `trainer` ends up bound to the second stage.
for stage_split in ("extra_train", "train"):
    trainer = Trainer(model,
                      args,
                      train_dataset=encoded_datasets[stage_split],
                      eval_dataset=encoded_datasets["validation"],
                      tokenizer=tokenizer,
                      data_collator=DataCollatorForMultipleChoice(tokenizer),
                      compute_metrics=compute_metrics)
    trainer.train()

print('\n\n\n\n')
print('test set:')
print('\n\n\n\n')
final_eval = trainer.evaluate(eval_dataset=encoded_datasets['test'])
print(final_eval)

Using custom data configuration default-82bd53a6868a1a2f


Downloading and preparing dataset json/default to /root/.cache/huggingface/datasets/json/default-82bd53a6868a1a2f/0.0.0/ac0ca5f5289a6cf108e706efcf040422dbbfa8e658dee6a819f20d76bb84d26b...


Downloading data files:   0%|          | 0/4 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/4 [00:00<?, ?it/s]

Dataset json downloaded and prepared to /root/.cache/huggingface/datasets/json/default-82bd53a6868a1a2f/0.0.0/ac0ca5f5289a6cf108e706efcf040422dbbfa8e658dee6a819f20d76bb84d26b. Subsequent calls will reuse this data.


  0%|          | 0/4 [00:00<?, ?it/s]

  0%|          | 0/13106 [00:00<?, ?ex/s]

  0%|          | 0/4957 [00:00<?, ?ex/s]

  0%|          | 0/500 [00:00<?, ?ex/s]

  0%|          | 0/500 [00:00<?, ?ex/s]

Context: the sun is the source of energy for physical cycles on Earth
  A - The sun is responsible for puppies learning new tricks
  B - The sun is responsible for children growing up and getting old
  C - The sun is responsible for flowers wilting in a vase
  D - The sun is responsible for plants sprouting, blooming and wilting

Ground truth: option D
Context: a star is made of gases
  A - Stars are warm lights that float
  B - Stars are made out of nitrate
  C - Stars are great balls of gas burning billions of miles away
  D - Stars are lights in the sky

Ground truth: option C


  0%|          | 0/14 [00:00<?, ?ba/s]

  0%|          | 0/5 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

Context: as a source of light becomes closer , that source will appear brighter
  A - As a car approaches you in the night the headlights become more intense
  B - As a car approaches you in the night the headlights recede into the dark
  C - As a car approaches you in the night the headlights remain at a constant
  D - As a car approaches you in the night the headlights turn off

Ground truth: option A


The following columns in the training set  don't have a corresponding argument in `BertForMultipleChoice.forward` and have been ignored: id, B, D, fact1, humanScore, question.stem, turkIdAnonymized, C, clarity, A. If id, B, D, fact1, humanScore, question.stem, turkIdAnonymized, C, clarity, A are not expected by `BertForMultipleChoice.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 13106
  Num Epochs = 3
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 2460


Epoch,Training Loss,Validation Loss,Accuracy
1,1.2146,1.299708,0.406
2,0.6719,1.381968,0.414
3,0.4165,1.681445,0.434


Saving model checkpoint to bert-base-uncased-finetuned-swag/checkpoint-500
Configuration saved in bert-base-uncased-finetuned-swag/checkpoint-500/config.json
Model weights saved in bert-base-uncased-finetuned-swag/checkpoint-500/pytorch_model.bin
tokenizer config file saved in bert-base-uncased-finetuned-swag/checkpoint-500/tokenizer_config.json
Special tokens file saved in bert-base-uncased-finetuned-swag/checkpoint-500/special_tokens_map.json
The following columns in the evaluation set  don't have a corresponding argument in `BertForMultipleChoice.forward` and have been ignored: id, B, D, fact1, humanScore, question.stem, turkIdAnonymized, C, clarity, A. If id, B, D, fact1, humanScore, question.stem, turkIdAnonymized, C, clarity, A are not expected by `BertForMultipleChoice.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 500
  Batch size = 8
Saving model checkpoint to bert-base-uncased-finetuned-swag/checkpoint-1000
Configuration saved i

Epoch,Training Loss,Validation Loss,Accuracy
1,No log,0.800814,0.698
2,0.679900,0.956226,0.72
3,0.679900,1.208918,0.718


The following columns in the evaluation set  don't have a corresponding argument in `BertForMultipleChoice.forward` and have been ignored: id, B, D, fact1, humanScore, question.stem, turkIdAnonymized, C, clarity, A. If id, B, D, fact1, humanScore, question.stem, turkIdAnonymized, C, clarity, A are not expected by `BertForMultipleChoice.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 500
  Batch size = 8
Saving model checkpoint to bert-base-uncased-finetuned-swag/checkpoint-500
Configuration saved in bert-base-uncased-finetuned-swag/checkpoint-500/config.json
Model weights saved in bert-base-uncased-finetuned-swag/checkpoint-500/pytorch_model.bin
tokenizer config file saved in bert-base-uncased-finetuned-swag/checkpoint-500/tokenizer_config.json
Special tokens file saved in bert-base-uncased-finetuned-swag/checkpoint-500/special_tokens_map.json
The following columns in the evaluation set  don't have a corresponding argument in `BertForMulti






test set:







{'eval_loss': 1.2618921995162964, 'eval_accuracy': 0.699999988079071, 'eval_runtime': 5.9629, 'eval_samples_per_second': 83.851, 'eval_steps_per_second': 10.565, 'epoch': 3.0}


In [None]:
# Experiment switch: 1 keeps every record's 'fact1' text, 0 replaces it with ''.
facts = 0

# Rewritten copies of the splits go to the working directory; the file suffix
# is 'd' when the facts are dropped and 'e' when they are kept.
suffix = 'd' if facts == 0 else 'e'
input_files = [extra_train_path, train_path, test_path, dev_path]
output_files = [f'{split}_complete_{suffix}.jsonl' for split in ('extra_train', 'train', 'test', 'dev')]

for src_path, dst_path in zip(input_files, output_files):
    with open(src_path) as json_file:
        # Skip blank lines so a stray empty line cannot crash json.loads.
        records = [json.loads(line) for line in json_file if line.strip()]
    if facts == 0:
        for record in records:
            record['fact1'] = ''
    # `with` guarantees the output file is closed even if a write fails.
    with open(dst_path, 'wt') as fout:
        for record in records:
            fout.write('%s\n' % json.dumps(record))

batch_size = 16
extra_train_file, train_file, test_file, dev_file = output_files
openbookQA = load_dataset('json', data_files={'extra_train': extra_train_file,
                                              'train': train_file,
                                              'validation': dev_file,
                                              'test': test_file})

# Flatten nested fields into dotted column names, spread the answer options
# into their own columns and expose the answer key under the name 'label'.
flatten = openbookQA.flatten()
updated = flatten.map(choices)
updated = updated.rename_column('answerKey', 'label')

show_one(updated['train'][0])

# Sanity check: tokenize a handful of examples and compare the decoded inputs
# of one of them with the readable form of the same example.
examples = updated['train'][:5]
preview_features = preprocess_function(examples)

idx = 3
# Printed explicitly: a bare expression in the middle of a cell is discarded.
pprint([tokenizer.decode(preview_features["input_ids"][idx][i]) for i in range(len(ending_names))])
show_one(updated['train'][idx])

encoded_datasets = updated.map(preprocess_function, batched=True)

model_name = model_chkpt.split("/")[-1]
args = TrainingArguments(f"{model_name}-finetuned-swag",
                         evaluation_strategy="epoch",
                         learning_rate=5e-5,
                         per_device_train_batch_size=batch_size,
                         num_train_epochs=3,
                         weight_decay=0.01)

# Sanity check of the collator on the first ten training examples.
accepted_keys = ["input_ids", "attention_mask", "label"]
features = [{k: v for k, v in encoded_datasets["train"][i].items() if k in accepted_keys} for i in range(10)]
batch = DataCollatorForMultipleChoice(tokenizer)(features)

pprint([tokenizer.decode(batch["input_ids"][8][i].tolist()) for i in range(len(ending_names))])
show_one(updated["train"][8])

# Start from the pretrained checkpoint. Without this reload the no-facts run
# would continue from whatever weights `model` holds after earlier training
# cells (e.g. a run fine-tuned with the facts present), which would
# contaminate the comparison. Seeding first makes the newly initialized
# weights repeatable.
transformers.set_seed(args.seed)
model = AutoModelForMultipleChoice.from_pretrained(model_chkpt)

# Two training stages on the same model: first the extra training split, then
# the main training split. `trainer` ends up bound to the second stage.
for stage_split in ("extra_train", "train"):
    trainer = Trainer(model,
                      args,
                      train_dataset=encoded_datasets[stage_split],
                      eval_dataset=encoded_datasets["validation"],
                      tokenizer=tokenizer,
                      data_collator=DataCollatorForMultipleChoice(tokenizer),
                      compute_metrics=compute_metrics)
    trainer.train()

print('\n\n\n\n')
print('test set:')
print('\n\n\n\n')
final_eval = trainer.evaluate(eval_dataset=encoded_datasets['test'])
print(final_eval)