In [239]:
## Train a parametric model

## Import contractions
!pip install contractions

## Import simpletransformers
!pip install simpletransformers




In [241]:
# Import necessary libraries
import pandas as pd  # pandas for data manipulation
import numpy as np  # numpy for numerical operations
import re  # re for regular expressions
import contractions  # contractions for expanding contractions
import csv  # csv for working with CSV files
import pickle  # pickle for serializing and deserializing Python objects

from simpletransformers.question_answering import QuestionAnsweringModel  # simpletransformers for QA model

In [197]:
## Read the CSV
import pandas as pd
import csv

## This is the crude dataset with ~85K rows. For the sake of resource constraints, we are usinf the subset
## of this dataset with train rows 1000 and test rows = 300.
## Also to have consistency with contextual model on model evaluation, the sampled train data and test data
## is used here (train_df and test_df)

## We are importing df_crude because we need the context text for every question to model the parametric model.
df_crude = pd.read_csv("v10-simplified_simplified-nq-train_closed_book_baseline_train_split.csv")

In [198]:
## Sampled train dataset from contextual model generation team
train_df = pd.read_csv('https://raw.githubusercontent.com/ShreyAgarwal11/NLP_Project/main/Data/train_data.csv')

train_df

Unnamed: 0.1,Unnamed: 0,question_text,question_tokens,answer
0,0,when is the last episode of season 8 of the wa...,"['when', 'is', 'the', 'last', 'episode', 'of',...","March 18 , 2018"
1,1,in greek mythology who was the goddess of spri...,"['in', 'greek', 'mythology', 'who', 'was', 'th...",Persephone ( / pərˈsɛfəni / ; Greek : Περσεφό...
2,6,what is the name of the most important jewish ...,"['what', 'is', 'the', 'name', 'of', 'the', 'mo...",the Shulchan Aruch
3,8,what is the name of spain's most famous soccer...,"['what', 'is', 'the', 'name', 'of', 'spain', ""...",Real Madrid
4,9,when was the first robot used in surgery,"['when', 'was', 'the', 'first', 'robot', 'used...",1983
...,...,...,...,...
12723,6120,who won nba finals mvp on a losing team,"['who', 'won', 'nba', 'finals', 'mvp', 'on', '...",Jerry West
12724,6130,who is responsible for enforcing the data prot...,"['who', 'is', 'responsible', 'for', 'enforcing...",the Information Commissioner 's Office
12725,6131,what is the name of beethoven's only opera,"['what', 'is', 'the', 'name', 'of', 'beethoven...",Fidelio
12726,6135,all standard classes of java are included with...,"['all', 'standard', 'classes', 'of', 'java', '...",java. util


In [199]:
## Sampled test dataset from contextual model generation team
## Now we have 1000 rows but we will just select first 300 later for evaluation
test_df = pd.read_csv('https://raw.githubusercontent.com/ShreyAgarwal11/NLP_Project/main/Data/test_data.csv')
test_df

Unnamed: 0.1,Unnamed: 0,question_text,question_tokens,answer
0,0,which is the most common use of opt-in e-mail ...,"['which', 'is', 'the', 'most', 'common', 'use'...",a newsletter sent to an advertising firm 's c...
1,1,how i.met your mother who is the mother,"['how', 'i.met', 'your', 'mother', 'who', 'is'...",Tracy McConnell
2,3,who had the most wins in the nfl,"['who', 'had', 'the', 'most', 'wins', 'in', 't...",Tom Brady
3,6,who played mantis guardians of the galaxy 2,"['who', 'played', 'mantis', 'guardians', 'of',...",Pom Klementieff
4,15,the nashville sound brought a polished and cos...,"['the', 'nashville', 'sound', 'brought', 'a', ...",the use of lush string arrangements with a re...
...,...,...,...,...
4341,6052,what kind of dog did they use in game of thrones,"['what', 'kind', 'of', 'dog', 'did', 'they', '...",Northern Inuit dogs
4342,6053,when does the war of the planet of the apes start,"['when', 'does', 'the', 'war', 'of', 'the', 'p...","July 14 , 2017"
4343,6056,when is the next episode of george gently bein...,"['when', 'is', 'the', 'next', 'episode', 'of',...",2017
4344,6059,who wrote the song you take my breath away,"['who', 'wrote', 'the', 'song', 'you', 'take',...",Giorgio Moroder


In [200]:
## Concatenate train and test dataset to perform perprocessing altogether at once. (later we will split again)
train_test_df =   pd.concat([train_df, test_df], ignore_index=True)

train_test_df

Unnamed: 0.1,Unnamed: 0,question_text,question_tokens,answer
0,0,when is the last episode of season 8 of the wa...,"['when', 'is', 'the', 'last', 'episode', 'of',...","March 18 , 2018"
1,1,in greek mythology who was the goddess of spri...,"['in', 'greek', 'mythology', 'who', 'was', 'th...",Persephone ( / pərˈsɛfəni / ; Greek : Περσεφό...
2,6,what is the name of the most important jewish ...,"['what', 'is', 'the', 'name', 'of', 'the', 'mo...",the Shulchan Aruch
3,8,what is the name of spain's most famous soccer...,"['what', 'is', 'the', 'name', 'of', 'spain', ""...",Real Madrid
4,9,when was the first robot used in surgery,"['when', 'was', 'the', 'first', 'robot', 'used...",1983
...,...,...,...,...
17069,6052,what kind of dog did they use in game of thrones,"['what', 'kind', 'of', 'dog', 'did', 'they', '...",Northern Inuit dogs
17070,6053,when does the war of the planet of the apes start,"['when', 'does', 'the', 'war', 'of', 'the', 'p...","July 14 , 2017"
17071,6056,when is the next episode of george gently bein...,"['when', 'is', 'the', 'next', 'episode', 'of',...",2017
17072,6059,who wrote the song you take my breath away,"['who', 'wrote', 'the', 'song', 'you', 'take',...",Giorgio Moroder


In [201]:
#Chaning the feature name
train_test_df = train_test_df.rename(columns = {
    "question_text": "question"

})

In [202]:
#Merging the original data and concatenated train and test dataset
df = pd.merge(df_crude, train_test_df, on ="question", how = "inner")

df

Unnamed: 0,Unnamed: 0.1,Unnamed: 0_x,question,context,parametric_answer,contextual_answer,answerable,type,input,output,Unnamed: 0_y,question_tokens,answer
0,53,51846,1 foot is equal to how much metre,<P> The foot ( pl . feet ; abbreviation : ft ;...,0.3048,unanswerable,False,closed_book,question: 1 foot is equal to how much metre?\n...,parametric: 0.3048,4884,"['1', 'foot', 'is', 'equal', 'to', 'how', 'muc...",0.3048
1,72,65188,1 kunta is equal to how many square feet,<Li> 1 Guntha ( R ) = 33 ft ( 10 m ) . x 33 ft...,"1,089 sq ft",unanswerable,False,closed_book,question: 1 kunta is equal to how many square ...,"parametric: 1,089 sq ft",3635,"['1', 'kunta', 'is', 'equal', 'to', 'how', 'ma...","1,089 sq ft"
2,85,65792,1 million dollar winner on who wants to be a m...,<Table> <Tr> <Th> Edition </Th> <Th> Winner </...,John Carpenter,unanswerable,False,closed_book,question: 1 million dollar winner on who wants...,parametric: John Carpenter,1655,"['1', 'million', 'dollar', 'winner', 'on', 'wh...",John Carpenter
3,101,5724,1 week is equal to how many days,<P> A week is a time unit equal to seven days ...,seven,unanswerable,False,closed_book,question: 1 week is equal to how many days?\nc...,parametric: seven,88,"['1', 'week', 'is', 'equal', 'to', 'how', 'man...",seven
4,108,815,1. what city was built as the new capital for ...,<P> As the power shifted from the Umayyads to ...,Baghdad,unanswerable,False,closed_book,question: 1. what city was built as the new ca...,parametric: Baghdad,3774,"['1', '.', 'what', 'city', 'was', 'built', 'as...",Baghdad
...,...,...,...,...,...,...,...,...,...,...,...,...,...
13718,287179,30238,youtubers that have been on the amazing race,<Ul> <Li> YouTubers and friends Tyler Oakley a...,Tyler Oakley,unanswerable,False,closed_book,question: youtubers that have been on the amaz...,parametric: Tyler Oakley,4478,"['youtubers', 'that', 'have', 'been', 'on', 't...",Tyler Oakley
13719,287192,85040,zend avesta is the holy book of which religion,<P> The Avesta / əˈvɛstə / is the primary coll...,Zoroastrianism,unanswerable,False,closed_book,question: zend avesta is the holy book of whic...,parametric: Zoroastrianism,3286,"['zend', 'avesta', 'is', 'the', 'holy', 'book'...",Zoroastrianism
13720,287201,7619,zones of upwelling near the shores of land mas...,"<P> Deep waters are rich in nutrients , includ...","are rich in nutrients , including nitrate , ph...",unanswerable,False,closed_book,question: zones of upwelling near the shores o...,"parametric: are rich in nutrients , including ...",2669,"['zones', 'of', 'upwelling', 'near', 'the', 's...","are rich in nutrients , including nitrate , p..."
13721,287207,76384,​ stress that is converted to positive energy ...,<P> Solids include complex crystalline materia...,elastic energy,unanswerable,False,closed_book,question: ​ stress that is converted to positi...,parametric: elastic energy,2173,"['\u200b', 'stress', 'that', 'is', 'converted'...",elastic energy


In [203]:
df['index'] = df.index
#Changing the features names
df = df.rename(columns =
               {'context_y': 'context',
                'output_y': 'output'})
df = df[['question', 'context', 'output', 'question_tokens']]
def _remove_parameteric_tag(row):
  return row[10:]

df['output'] = df['output'].apply(lambda x: x[11:]) ## Remove "Parametric:" prefix before each of the answers
df['context'] = df['context'].apply(lambda x: x[3:len(x)-4]) ## Remoce <P></P> tags in the context

In [204]:
## Preprocess all three columns
cols = ['question', 'context', 'output']

def _clean_data(df, col):

    ## Convert all reviews into lowercase
    df[col] = str(df[col]).lower()

    ## remove all HTML and URLs from the reviews
    review_cleaned = re.sub(r"http?://\S+|www\.\S+",'', str(df[col]), flags=re.MULTILINE)

    ## Perform contractions on the reviews
    review_cleaned = contractions.fix(review_cleaned)

    ## Remove alphanumeric characters
    review_cleaned = re.sub(r'[^a-zA-Z\s]', ' ', review_cleaned)

    ## remove extra spaces
    review_cleaned = re.sub(r'\s+', ' ', review_cleaned).strip()


    return review_cleaned

In [205]:
## Preprocess the text
df['question'] = df.apply(_clean_data, args = ['question'], axis = 1)
df['context'] = df.apply(_clean_data, args = ['context'], axis = 1)
df['output'] = df.apply(_clean_data, args = ['output'], axis = 1)


In [206]:
df

Unnamed: 0,question,context,output,question_tokens
0,foot is equal to how much metre,the foot pl feet abbreviation ft symbol the pr...,,"['1', 'foot', 'is', 'equal', 'to', 'how', 'muc..."
1,kunta is equal to how many square feet,guntha are ft m x ft m sq ft m,sq ft,"['1', 'kunta', 'is', 'equal', 'to', 'how', 'ma..."
2,million dollar winner on who wants to be a mil...,ble tr th edition th th winner th th broadcast...,john carpenter,"['1', 'million', 'dollar', 'winner', 'on', 'wh..."
3,week is equal to how many days,a week is a time unit equal to seven days it i...,seven,"['1', 'week', 'is', 'equal', 'to', 'how', 'man..."
4,what city was built as the new capital for the...,as the power shifted from the umayyads to the ...,baghdad,"['1', '.', 'what', 'city', 'was', 'built', 'as..."
...,...,...,...,...
13718,youtubers that have been on the amazing race,li youtubers and friends tyler oakley and kore...,tyler oakley,"['youtubers', 'that', 'have', 'been', 'on', 't..."
13719,zend avesta is the holy book of which religion,the avesta v st is the primary collection of r...,zoroastrianism,"['zend', 'avesta', 'is', 'the', 'holy', 'book'..."
13720,zones of upwelling near the shores of land mas...,deep waters are rich in nutrients including ni...,are rich in nutrients including nitrate phosph...,"['zones', 'of', 'upwelling', 'near', 'the', 's..."
13721,stress that is converted to positive energy is...,solids include complex crystalline materials w...,elastic energy,"['\u200b', 'stress', 'that', 'is', 'converted'..."


In [207]:
## Create a column "answer_start"" in order ot match the input format of Simpletransformers pretrained model.

## Resource: https://simpletransformers.ai/docs/qa-model/#training-a-questionansweringmodel

def _find_answer_start(row):
  words = row['output'].split()
  if len(words)>0:
    index = row['context'].find(words[0])
  else:
    index = -1

  return index

df['answer_start'] = df.apply(_find_answer_start, axis = 1)

In [208]:
### **Important Step* ###
## Remove the rows that has no matching answer in the context (in order to comply with the simple transformers' model requisite)
def _find_answer_mismatch(row):

  answer_len = len(row['output'])
  if row['context'][row['answer_start']:row['answer_start']+answer_len]!=row['output']:
    return -1

  else:
    return row['answer_start']

df['answer_start'] = df.apply(_find_answer_mismatch, axis=1)

In [209]:
# create a Boolean mask for the rows to remove
mask = df['answer_start'] == -1

# select all rows except
df = df[~mask]


In [210]:
#Final dataframe with contains all the data that has answers from the content
df = df.reset_index()
df.to_csv("test2.csv")

In [211]:
df

Unnamed: 0,index,question,context,output,question_tokens,answer_start
0,1,kunta is equal to how many square feet,guntha are ft m x ft m sq ft m,sq ft,"['1', 'kunta', 'is', 'equal', 'to', 'how', 'ma...",23
1,2,million dollar winner on who wants to be a mil...,ble tr th edition th th winner th th broadcast...,john carpenter,"['1', 'million', 'dollar', 'winner', 'on', 'wh...",24145
2,3,week is equal to how many days,a week is a time unit equal to seven days it i...,seven,"['1', 'week', 'is', 'equal', 'to', 'how', 'man...",31
3,4,what city was built as the new capital for the...,as the power shifted from the umayyads to the ...,baghdad,"['1', '.', 'what', 'city', 'was', 'built', 'as...",385
4,5,acres is equal to how many hectares,the hectare h kt are t are si symbol ha is an ...,square hectometre hm,"['100', 'acres', 'is', 'equal', 'to', 'how', '...",104
...,...,...,...,...,...,...
10471,13718,youtubers that have been on the amazing race,li youtubers and friends tyler oakley and kore...,tyler oakley,"['youtubers', 'that', 'have', 'been', 'on', 't...",25
10472,13719,zend avesta is the holy book of which religion,the avesta v st is the primary collection of r...,zoroastrianism,"['zend', 'avesta', 'is', 'the', 'holy', 'book'...",64
10473,13720,zones of upwelling near the shores of land mas...,deep waters are rich in nutrients including ni...,are rich in nutrients including nitrate phosph...,"['zones', 'of', 'upwelling', 'near', 'the', 's...",12
10474,13721,stress that is converted to positive energy is...,solids include complex crystalline materials w...,elastic energy,"['\u200b', 'stress', 'that', 'is', 'converted'...",178


In [212]:
import sys

# Specify the path to the CSV file
csvFilePath = "test2.csv"

# Set the maximum field size to avoid field size limitations
csv.field_size_limit(sys.maxsize)

# Initialize an empty list to store the converted data
result = []

with open(csvFilePath) as csvFile:
  # Create a CSV reader object
  csvReader = csv.DictReader(csvFile)

  # Iterate through each row in the CSV file
  for rows in csvReader:
    # Append a formatted dictionary representing each row to the result list
    result.append({
        "context": rows["context"],
        "qas": [{
            "id": str(rows["index"]),
            "is_impossible": False,
            "question": rows["question"],
            "answers": [
                {
                "text": rows["output"],
                "answer_start": int(rows["answer_start"]),
                }
            ],
        },
      ],
    })

# The 'result' list now contains a formatted representation of the data in the CSV file
# You can use this 'result' list for further processing or conversion to other formats

In [213]:
## Train Test Split
# res_train = result[:int(len(result)*0.7)]
# res_test = result[:len(result)-int(len(result)*0.7)]

res_train = result[:6500]
res_test = result[6500:]

In [214]:
## Define the model
model = QuestionAnsweringModel(
    "roberta", "roberta-base",use_cuda=True)

Some weights of RobertaForQuestionAnswering were not initialized from the model checkpoint at roberta-base and are newly initialized: ['qa_outputs.weight', 'qa_outputs.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [215]:
## Clear the previous model weights
!rm -rf outputs

In [216]:
## Train the model
model.train_model(res_train)

convert squad examples to features: 100%|██████████| 6500/6500 [01:01<00:00, 105.90it/s]
add example index and unique id: 100%|██████████| 6500/6500 [00:00<00:00, 571167.69it/s]


Epoch:   0%|          | 0/1 [00:00<?, ?it/s]

Running Epoch 0 of 1:   0%|          | 0/1106 [00:00<?, ?it/s]

(1106, 1.6879434048806135)

In [242]:
with open('parametric_model.pkl', 'wb') as f:
    pickle.dump(model, f)

In [218]:
## Predict an output for a given input


context_text = "Mistborn is a series of epic fantasy novels written by American author Brandon Sanderson."

predictions, raw_outputs = model.predict(
    [
        {
            "context": context_text,
            "qas": [
                {
                    "question": "Who was the author of Mistborn?",
                    "id": "0",
                }
            ],
        }
    ]
)


convert squad examples to features: 100%|██████████| 1/1 [00:00<00:00, 89.32it/s]
add example index and unique id: 100%|██████████| 1/1 [00:00<00:00, 11459.85it/s]


Running Prediction:   0%|          | 0/1 [00:00<?, ?it/s]

In [219]:
raw_outputs

[{'id': '0',
  'probability': [0.8626894160834769,
   0.11140583864386308,
   0.00661895531533323,
   0.0041745227907142085,
   0.0036162781538321438,
   0.002745751306862628,
   0.00241602843666126,
   0.00196613632897442,
   0.0008547575223521708,
   0.0008544967108562078,
   0.0008136265060083619,
   0.0006081890662639285,
   0.0004491078869537216,
   0.00030151677154248153,
   0.00010667247317147837,
   0.00010288690025280621,
   8.912818742476523e-05,
   8.340175827918389e-05,
   5.395349741792996e-05,
   4.845815498063205e-05]}]

In [220]:
predictions

[{'id': '0',
  'answer': ['Brandon Sanderson.',
   'American author Brandon Sanderson.',
   'Brandon',
   'written by American author Brandon Sanderson.',
   'author Brandon Sanderson.',
   'American author',
   'Sanderson.',
   'by American author Brandon Sanderson.',
   'American author Brandon',
   'epic fantasy novels written by American author Brandon Sanderson.',
   'Mistborn is a series of epic fantasy novels written by American author Brandon Sanderson.',
   'erson',
   'American',
   'series of epic fantasy novels written by American author Brandon Sanderson.',
   'a series of epic fantasy novels written by American author Brandon Sanderson.',
   'written by American author',
   'author',
   'fantasy novels written by American author Brandon Sanderson.',
   'novels written by American author Brandon Sanderson.',
   'by American author']}]

In [223]:
result, model_outputs= model.eval_model(res_test)

convert squad examples to features: 100%|██████████| 3976/3976 [00:21<00:00, 186.14it/s]
add example index and unique id: 100%|██████████| 3976/3976 [00:00<00:00, 658111.79it/s]


Running Evaluation:   0%|          | 0/616 [00:00<?, ?it/s]

In [224]:
# Extract the number of correct, similar, and incorrect predictions from the 'result' dictionary
correct_predictions = result['correct']
similar_predictions = result['similar']
incorrect_predictions = result['incorrect']

# Calculate the total number of predictions
total_predictions = correct_predictions + similar_predictions + incorrect_predictions

# Calculate accuracy
accuracy = (correct_predictions + similar_predictions) / total_predictions

# Print the calculated accuracy as a percentage with two decimal places
print(f"Accuracy: {accuracy * 100:.2f}%")

Accuracy: 76.51%


In [225]:
# Initialize empty lists to store questions and predicted parametric values
questions = []
predicted_parametric = []

# Iterate over items in the 'model_outputs' dictionary
for key, value in model_outputs.items():
    # Iterate over items in the inner dictionary
    for key_i, value_i in value.items():
        # Check if the value is a string (for certain cases)
        if isinstance(value_i, str):
            # Convert the key to an integer to get the index
            idx = int(key_i)

            # Retrieve the corresponding question from the DataFrame based on the index
            temp = list(df[df['index'] == idx]['question'])
            letters = temp[0]
            sentence = "".join(letters)

            # Append the question and predicted parametric value to the lists
            questions.append(sentence)
            predicted_parametric.append(value_i)

        # Check if the value is a dictionary (for certain cases)
        if isinstance(value_i, dict):
            # Append the question and predicted parametric value to the lists
            questions.append(value_i['question'])
            predicted_parametric.append(value_i['predicted'])


In [226]:
temp_test_df =  pd.DataFrame({'question_text': questions, 'predicted_parametric_answers': predicted_parametric})

In [227]:
temp_test_df = temp_test_df.rename(columns = {
    'question_text': 'question'
})

In [228]:
test_df = test_df.rename(columns = {
    'question_text': 'question'
})

In [229]:
final_df = pd.merge(df,temp_test_df,on='question', how = "inner")

In [230]:
test_df_2 = pd.merge(final_df, test_df, on =['question'], how = "inner")

In [232]:
test_df_2 = test_df_2.rename(columns = {
    "question": "question_text",
    "question_tokens_y": "question_tokens"
})

In [233]:
test_df_2.columns

Index(['index', 'question_text', 'context', 'output', 'question_tokens_x',
       'answer_start', 'predicted_parametric_answers', 'Unnamed: 0',
       'question_tokens', 'answer'],
      dtype='object')

In [234]:
final_final = test_df_2[['question_text', "question_tokens", "predicted_parametric_answers", "answer", "Unnamed: 0"]]
#final_final.to_csv("parametric_answers.csv", index = False)