# Installs & Imports

In [None]:
import numpy as np
import pandas as pd
import json
import re
import csv

In [None]:
# Install the third-party packages imported further down and download spaCy's
# small English pipeline, which is loaded later via spacy.load("en_core_web_sm").
# NOTE(review): no versions are pinned, so a fresh run may resolve different
# releases than the ones shown in this cell's captured output.
!pip install datasets
!pip install transformers
!python -m spacy download en_core_web_sm

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting datasets
  Downloading datasets-2.12.0-py3-none-any.whl (474 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m474.6/474.6 kB[0m [31m14.1 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.7,>=0.3.0
  Downloading dill-0.3.6-py3-none-any.whl (110 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m110.5/110.5 kB[0m [31m14.4 MB/s[0m eta [36m0:00:00[0m
Collecting xxhash
  Downloading xxhash-3.2.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (212 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m212.5/212.5 kB[0m [31m19.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting aiohttp
  Downloading aiohttp-3.8.4-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.0/1.0 MB[0m [31m49.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting multiproces

# Importing Model & Tokenizer

In [None]:
from transformers import AutoTokenizer, RobertaModel
import torch

# The tokenizer files are taken from the "deepset/roberta-large-squad2" checkpoint,
# while the model weights below are loaded from the plain "roberta-large" checkpoint.
# NOTE(review): presumably both checkpoints share the same vocabulary — confirm.
tokenizer = AutoTokenizer.from_pretrained("deepset/roberta-large-squad2")
# tokenizer = AutoTokenizer.from_pretrained("roberta-large")
# RobertaModel does not use the checkpoint's lm_head.* weights; the warning in this
# cell's output lists them as unused.
model = RobertaModel.from_pretrained("roberta-large")

Downloading (…)okenizer_config.json:   0%|          | 0.00/1.19k [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/696 [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/482 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/1.43G [00:00<?, ?B/s]

Some weights of the model checkpoint at roberta-large were not used when initializing RobertaModel: ['lm_head.layer_norm.bias', 'lm_head.layer_norm.weight', 'lm_head.dense.weight', 'lm_head.bias', 'lm_head.dense.bias', 'lm_head.decoder.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


# Dataset Preprocessing

### SQuAD Preprocessing Methods

In [None]:
## Flatten a SQuAD-format JSON file into one record per question.

def generate_examples(filepath):
    """Yield one flat record for every question in a SQuAD-format JSON file.

    Args:
        filepath: Path to a UTF-8 JSON file whose top-level "data" list holds
            articles; each article has "paragraphs", and each paragraph has a
            "context" string and a "qas" list.

    Yields:
        dict: A record with the keys "id", "title", "context", "question" and
        "answers". "answers" is {"answer_start": [...], "text": [...]}, two
        parallel lists built from the question's "answers" entries. Both lists
        are empty when the question has no answers. "title" falls back to ""
        when the article has none.
    """
    # Parse the whole file first so the handle is closed before any record is
    # yielded, rather than staying open for as long as the consumer iterates.
    with open(filepath, encoding="utf-8") as f:
        squad = json.load(f)

    for article in squad["data"]:
        title = article.get("title", "")
        for paragraph in article["paragraphs"]:
            context = paragraph["context"]
            for qa in paragraph["qas"]:
                answer_entries = qa["answers"]
                yield {
                    "id": qa["id"],
                    "title": title,
                    "context": context,
                    "question": qa["question"],
                    "answers": {
                        "answer_start": [entry["answer_start"] for entry in answer_entries],
                        "text": [entry["text"] for entry in answer_entries],
                    },
                }

In [None]:
## Tokenize question/context pairs and derive answer-span token labels.

def preprocess_function(examples):
    """Tokenize a batch of SQuAD examples and label each with answer token positions.

    Relies on the module-level ``tokenizer``. Each question is stripped and
    paired with its context, padded/truncated to 384 tokens (only the context
    may be truncated), and the character span of the FIRST listed answer is
    converted into start/end token indices using the tokenizer's offset mapping.

    Args:
        examples: Batch mapping with "question", "context" and "answers"
            entries; each answers item is a dict with parallel "answer_start"
            and "text" lists.

    Returns:
        The tokenizer output, with "offset_mapping" removed and
        "start_positions" / "end_positions" lists added. An example is labelled
        (0, 0) when it has no answer or when its first answer is not completely
        inside the context tokens that survived truncation.
    """
    questions = [q.strip() for q in examples["question"]]
    inputs = tokenizer(
        questions,
        examples["context"],
        max_length=384,
        truncation="only_second",
        return_offsets_mapping=True,
        padding="max_length",
    )

    offset_mapping = inputs.pop("offset_mapping")
    answers = examples["answers"]
    start_positions = []
    end_positions = []

    for i, offset in enumerate(offset_mapping):
        answer = answers[i]

        # No answer recorded for this question: label it (0, 0).
        if len(answer["answer_start"]) == 0:
            start_positions.append(0)
            end_positions.append(0)
            continue

        # Character span [start_char, end_char) of the first answer in the context.
        start_char = answer["answer_start"][0]
        end_char = start_char + len(answer["text"][0])
        sequence_ids = inputs.sequence_ids(i)

        # Find the first and last token that belong to the context (sequence id 1).
        idx = 0
        while sequence_ids[idx] != 1:
            idx += 1
        context_start = idx
        while idx < len(sequence_ids) and sequence_ids[idx] == 1:
            idx += 1
        context_end = idx - 1

        # If the answer is not fully inside the context, label it (0, 0).
        # The previous test (context start > end_char / context end < start_char)
        # only caught answers lying entirely outside the window, so a partially
        # truncated answer received an end label one past the last context token.
        if offset[context_start][0] > start_char or offset[context_end][1] < end_char:
            start_positions.append(0)
            end_positions.append(0)
            continue

        # Start label: the last context token that begins at or before start_char.
        idx = context_start
        while idx <= context_end and offset[idx][0] <= start_char:
            idx += 1
        start_positions.append(idx - 1)

        # End label: the first context token that ends at or after end_char.
        idx = context_end
        while idx >= context_start and offset[idx][1] >= end_char:
            idx -= 1
        end_positions.append(idx + 1)

    inputs["start_positions"] = start_positions
    inputs["end_positions"] = end_positions
    return inputs

In [None]:
from datasets import Dataset

def preprocess_squad(filepath):
    """Build a tokenized, span-labelled dataset from a SQuAD-format JSON file.

    The records produced by generate_examples are materialised into a Dataset,
    then preprocess_function is mapped over it in batches. All of the original
    columns are removed from the mapped result.
    """
    records = list(generate_examples(filepath))
    raw_dataset = Dataset.from_list(records)
    original_columns = raw_dataset.column_names
    return raw_dataset.map(
        preprocess_function,
        batched=True,
        remove_columns=original_columns,
    )

### SQuAD Dev Set

In [None]:
# Tokenize and label the dev file; the bare name on the last line displays the
# resulting Dataset summary as the cell output.
# NOTE(review): the relative path assumes "dev-v2.0.json" is in the working directory — confirm.
processed_squad_dev = preprocess_squad("dev-v2.0.json")
processed_squad_dev

Map:   0%|          | 0/11873 [00:00<?, ? examples/s]

Dataset({
    features: ['input_ids', 'attention_mask', 'start_positions', 'end_positions'],
    num_rows: 11873
})

### SQuAD Train Set

In [None]:
# Tokenize and label the training file; the bare name on the last line displays
# the resulting Dataset summary as the cell output.
# NOTE(review): "train-v2.0 (1).json" looks like a browser-renamed duplicate
# download — confirm the file name before re-running in a fresh environment.
processed_squad_train = preprocess_squad("train-v2.0 (1).json")
processed_squad_train

Map:   0%|          | 0/130319 [00:00<?, ? examples/s]

Dataset({
    features: ['input_ids', 'attention_mask', 'start_positions', 'end_positions'],
    num_rows: 130319
})

# Data Analysis

### SQuAD Train Analysis

In [None]:
def generate_examples_analysis(filepath):
    """Yield one flat, analysis-friendly record per question of a SQuAD-format file.

    Args:
        filepath: Path to a UTF-8 JSON file whose top-level "data" list holds
            articles; each article has "paragraphs", and each paragraph has a
            "context" string and a "qas" list.

    Yields:
        dict: A record with the keys "id", "title", "context", "question" and
        "answers". Here "answers" is a single value: the text of the first
        listed answer, or None when the question has no answers. "title" falls
        back to "" when the article has none.
    """
    # Parse the whole file first so the handle is closed before any record is
    # yielded, rather than staying open for as long as the consumer iterates.
    with open(filepath, encoding="utf-8") as f:
        squad = json.load(f)

    for article in squad["data"]:
        title = article.get("title", "")
        for paragraph in article["paragraphs"]:
            context = paragraph["context"]
            for qa in paragraph["qas"]:
                answer_texts = [answer["text"] for answer in qa["answers"]]
                yield {
                    "id": qa["id"],
                    "title": title,
                    "context": context,
                    "question": qa["question"],
                    # Only the first answer text is kept; None marks "no answer".
                    "answers": answer_texts[0] if answer_texts else None,
                }

In [None]:
# One row per question in the training file. The "answers" column holds the first
# answer text, or None when the question has no answers (see generate_examples_analysis).
train_df = pd.DataFrame(generate_examples_analysis("train-v2.0 (1).json"),columns=['id','title','context','question','answers'])
# Display the first rows as the cell output.
train_df.head()

Unnamed: 0,id,title,context,question,answers
0,56be85543aeaaa14008c9063,Beyoncé,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,When did Beyonce start becoming popular?,in the late 1990s
1,56be85543aeaaa14008c9065,Beyoncé,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,What areas did Beyonce compete in when she was...,singing and dancing
2,56be85543aeaaa14008c9066,Beyoncé,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,When did Beyonce leave Destiny's Child and bec...,2003
3,56bf6b0f3aeaaa14008c9601,Beyoncé,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,In what city and state did Beyonce grow up?,"Houston, Texas"
4,56bf6b0f3aeaaa14008c9602,Beyoncé,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,In which decade did Beyonce become famous?,late 1990s


How many question answer pairs are in the dataset?

In [None]:
train_df.shape[0]

130319

How many articles are the questions and contexts based on?

In [None]:
train_df["title"].nunique()

442

How many unique contexts does the dataset contain?

In [None]:
train_df["context"].nunique()

19029

How many answerable & unanswerable questions are in the dataset? What are their respective percentages?

In [None]:
# Questions without any answer have a missing value in the "answers" column,
# so counting the missing values gives the number of unanswerable questions.
number_of_rows = train_df.shape[0]
unanswerable_count = train_df["answers"].isna().sum()
answerable_count = number_of_rows - unanswerable_count

print(f"unanswerable questions count = {unanswerable_count!s}")
print(f"unanswerable questions percentage = {(unanswerable_count/number_of_rows)*100!s}")
print(f"answerable questions count = {answerable_count!s}")
print(f"answerable questions percentage = {(answerable_count/number_of_rows)*100!s}")

unanswerable questions count = 43498
unanswerable questions percentage = 33.37809528925176
answerable questions count = 86821
answerable questions percentage = 66.62190471074824


What is the total vocab size? What is the vocab size without stop words?

In [None]:
import spacy
# Load the small English pipeline with the listed components disabled; the
# vocabulary cells only read token.text and token.is_stop from the result.
nlp = spacy.load("en_core_web_sm", disable=[ "tagger", "parser", "senter", "attribute_ruler", "lemmatizer","ner"])

In [None]:
unique_contexts = train_df["context"].unique()
questions = train_df["question"]

# `vocab` keeps each distinct alphabetic token in first-seen order; it is written
# to disk and counted by later cells.
# NOTE: despite its name, `vocab_no_stop_words` collects the STOP WORDS found in
# `vocab`. The summary cell subtracts its length from the vocab size, so the name
# is kept for compatibility.
vocab = []
vocab_no_stop_words = []

# The set mirrors `vocab` so the "already seen?" test is a hash lookup. Scanning
# the list for every token made the pass quadratic in the vocabulary size.
seen_words = set()
# Only tokens made up entirely of ASCII letters count as words. Matching is
# case-sensitive, so "The" and "the" are separate entries.
word_pattern = re.compile("^[a-zA-Z]+$")

# Contexts are de-duplicated above because many questions share one paragraph;
# every question is tokenized as well.
for texts in (unique_contexts, questions):
    for text in texts:
        for token in nlp(text):
            word = token.text
            if word in seen_words or not word_pattern.match(word):
                continue
            seen_words.add(word)
            vocab.append(word)
            if token.is_stop:
                vocab_no_stop_words.append(word)

# Show a short preview only; printing the full vocabulary floods the cell output.
print(vocab[:20])



In [None]:
# Write the training vocabulary to vocab.csv, one word per line.
# `vocab` is one-dimensional, so every row has a single column and the
# delimiter never appears in the file.
np.savetxt("vocab.csv", 
           vocab,
           delimiter =", ", 
           fmt ='% s')

In [None]:
vocab_size = len(vocab)
# NOTE: `vocab_no_stop_words` is filled only for tokens where token.is_stop is
# true, i.e. it holds the stop words themselves. Its length is therefore the
# number of stop words, and subtracting it gives the non-stop-word vocab size.
vocab_no_stop_words_size = len(vocab_no_stop_words)
print("vocab size = " + str(vocab_size))
print("vocab size without stop words = " + str(vocab_size - vocab_no_stop_words_size))

vocab size = 89982
vocab size without stop words = 89359


In [None]:
import os

# SECURITY: a literal API key used to be hardcoded on this line. Treat that key
# as compromised: revoke/rotate it with the provider and remove it from the
# notebook's history. Credentials are now supplied through the environment so
# they never end up in the saved notebook.
# NOTE(review): "API_KEY" is a placeholder variable name — rename it to match the
# service this key belongs to. An empty string means no key was provided.
api_key = os.environ.get("API_KEY", "")

### SQuAD Dev Analysis

In [None]:
# One row per question in the dev file. The "answers" column holds the first
# answer text, or None when the question has no answers (see generate_examples_analysis).
dev_df = pd.DataFrame(generate_examples_analysis("dev-v2.0.json"),columns=['id','title','context','question','answers'])
# Display the first rows as the cell output.
dev_df.head()

Unnamed: 0,id,title,context,question,answers
0,56ddde6b9a695914005b9628,Normans,The Normans (Norman: Nourmands; French: Norman...,In what country is Normandy located?,France
1,56ddde6b9a695914005b9629,Normans,The Normans (Norman: Nourmands; French: Norman...,When were the Normans in Normandy?,10th and 11th centuries
2,56ddde6b9a695914005b962a,Normans,The Normans (Norman: Nourmands; French: Norman...,From which countries did the Norse originate?,"Denmark, Iceland and Norway"
3,56ddde6b9a695914005b962b,Normans,The Normans (Norman: Nourmands; French: Norman...,Who was the Norse leader?,Rollo
4,56ddde6b9a695914005b962c,Normans,The Normans (Norman: Nourmands; French: Norman...,What century did the Normans first gain their ...,10th century


How many question answer pairs are in the dataset?

In [None]:
dev_df.shape[0]

11873

How many articles are the questions and contexts based on?

In [None]:
dev_df["title"].nunique()

35

How many unique contexts does the dataset contain?

In [None]:
dev_df["context"].nunique()

1204

How many answerable & unanswerable questions are in the dataset? What are their respective percentages?

In [None]:
# Questions without any answer have a missing value in the "answers" column,
# so counting the missing values gives the number of unanswerable questions.
number_of_rows_dev = dev_df.shape[0]
unanswerable_count_dev = dev_df["answers"].isna().sum()
answerable_count_dev = number_of_rows_dev - unanswerable_count_dev

print(f"unanswerable questions count = {unanswerable_count_dev!s}")
print(f"unanswerable questions percentage = {(unanswerable_count_dev/number_of_rows_dev)*100!s}")
print(f"answerable questions count = {answerable_count_dev!s}")
print(f"answerable questions percentage = {(answerable_count_dev/number_of_rows_dev)*100!s}")

unanswerable questions count = 5945
unanswerable questions percentage = 50.07159100480081
answerable questions count = 5928
answerable questions percentage = 49.92840899519919


What is the total vocab size? What is the vocab size without stop words?

In [None]:
unique_contexts = dev_df["context"].unique()
questions = dev_df["question"]

# `vocab_dev` keeps each distinct alphabetic token in first-seen order;
# `vocab_stop_words_dev` keeps the subset of those tokens that spaCy flags as
# stop words. Both are read by later cells.
vocab_dev = []
vocab_stop_words_dev = []

# The set mirrors `vocab_dev` so the "already seen?" test is a hash lookup.
# Scanning the list for every token made the pass quadratic in the vocabulary size.
seen_words_dev = set()
# Only tokens made up entirely of ASCII letters count as words. Matching is
# case-sensitive, so "The" and "the" are separate entries.
word_pattern_dev = re.compile("^[a-zA-Z]+$")

# Contexts are de-duplicated above because many questions share one paragraph;
# every question is tokenized as well.
for texts in (unique_contexts, questions):
    for text in texts:
        for token in nlp(text):
            word = token.text
            if word in seen_words_dev or not word_pattern_dev.match(word):
                continue
            seen_words_dev.add(word)
            vocab_dev.append(word)
            if token.is_stop:
                vocab_stop_words_dev.append(word)

# Show a short preview only; printing the full vocabulary floods the cell output.
print(vocab_dev[:20])



In [None]:
# Write the dev vocabulary to vocabDev.csv, one word per line.
# `vocab_dev` is one-dimensional, so every row has a single column and the
# delimiter never appears in the file.
np.savetxt("vocabDev.csv", 
           vocab_dev,
           delimiter =", ", 
           fmt ='% s')

In [None]:
vocab_size_dev = len(vocab_dev)
# `vocab_stop_words_dev` holds the vocabulary entries flagged by token.is_stop,
# so subtracting its length gives the non-stop-word vocab size.
vocab_stop_words_size_dev = len(vocab_stop_words_dev)
print("Dev vocab size = " + str(vocab_size_dev))
print("Dev vocab size without stop words = " + str(vocab_size_dev - vocab_stop_words_size_dev))

Dev vocab size = 18770
Dev vocab size without stop words = 18309
