# **PREPROCESS THE DATASET FOR MODEL 1**

## Installed Libraries

In [None]:
!pip install --quiet datasets==1.0.2
!pip install --quiet tqdm==4.55.1
!pip install --quiet sense2vec

[K     |████████████████████████████████| 1.8 MB 14.4 MB/s 
[K     |████████████████████████████████| 243 kB 79.7 MB/s 
[K     |████████████████████████████████| 68 kB 4.2 MB/s 
[K     |████████████████████████████████| 6.0 MB 11.8 MB/s 
[K     |████████████████████████████████| 451 kB 74.1 MB/s 
[K     |████████████████████████████████| 10.1 MB 60.9 MB/s 
[K     |████████████████████████████████| 181 kB 67.8 MB/s 
[K     |████████████████████████████████| 628 kB 70.2 MB/s 
[K     |████████████████████████████████| 42 kB 1.1 MB/s 
[?25h

In [None]:
# connect your personal google drive to store the preprocessed datasets as CSV files later
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [None]:
!ls '/content/gdrive/My Drive'

'Colab Notebooks'   DISSERTATION   Other   Uni


In [None]:
# change into the project directory so that the "s2v_old" vectors can be found and loaded
%cd /content/gdrive/My Drive/DISSERTATION/

/content/gdrive/My Drive/DISSERTATION


In [None]:
# SQuAD dataset
from datasets import load_dataset

# sense2vec
from sense2vec import Sense2Vec
from collections import OrderedDict
s2v = Sense2Vec().from_disk("s2v_old")

# hide warnings
import warnings
warnings.filterwarnings('ignore')

from pprint import pprint 
import pandas as pd
from tqdm.notebook import tqdm
from sklearn.utils import shuffle

## Download and split the SQuAD dataset into training and validation sets

In [None]:
# Load the 'train' split of the 'squad' dataset.
train_dataset = load_dataset('squad', split='train')

# Load the 'validation' split of the same dataset.
valid_dataset = load_dataset('squad', split='validation')

Downloading:   0%|          | 0.00/1.95k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/955 [00:00<?, ?B/s]

Downloading and preparing dataset squad/plain_text (download: 33.51 MiB, generated: 85.75 MiB, post-processed: Unknown size, total: 119.27 MiB) to /root/.cache/huggingface/datasets/squad/plain_text/1.0.0/1244d044b266a5e4dbd4174d23cb995eead372fbca31a03edc3f8a132787af41...


Downloading:   0%|          | 0.00/8.12M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.05M [00:00<?, ?B/s]

|          | 0/0 [00:00<?, ? examples/s]

|          | 0/0 [00:00<?, ? examples/s]

Dataset squad downloaded and prepared to /root/.cache/huggingface/datasets/squad/plain_text/1.0.0/1244d044b266a5e4dbd4174d23cb995eead372fbca31a03edc3f8a132787af41. Subsequent calls will reuse this data.


Reusing dataset squad (/root/.cache/huggingface/datasets/squad/plain_text/1.0.0/1244d044b266a5e4dbd4174d23cb995eead372fbca31a03edc3f8a132787af41)


## Create two empty pandas DataFrames for storing the questions, their corresponding right answers, and their distractors in three separate columns

In [None]:
# Show the full contents of every cell when a DataFrame is displayed.
# `None` means "no truncation"; the old `-1` sentinel has been deprecated
# since pandas 1.0 in favour of `None`.
pd.set_option("display.max_colwidth", None)

# Empty frames holding the three output columns: the question, its right
# answer and the list of distractors.
df_train = pd.DataFrame( columns = ['question', 'right answer','distractors'])
df_validation = pd.DataFrame( columns = ['question', 'right answer','distractors'])

print (df_train)
print (df_validation)

Empty DataFrame
Columns: [question, right answer, distractors]
Index: []
Empty DataFrame
Columns: [question, right answer, distractors]
Index: []


## Sense2Vec function

In [None]:
def sense2vec_get_words(word, n_distractors=3, model=None):
    """Return up to ``n_distractors`` distractors for ``word`` using sense2vec.

    The word is lower-cased and its spaces are replaced by underscores before
    the lookup. The 20 most similar entries of its best sense are then scanned
    in order. Each entry's text (the part before ``|``) is normalised to lower
    case with spaces instead of underscores; entries equal to the queried word
    or to an already collected distractor are ignored, so duplicates (e.g. the
    same word under two sense tags) no longer use up one of the slots.

    Args:
        word: Answer text to find distractors for.
        n_distractors: Maximum number of distractors to return (default 3).
        model: Object exposing ``get_best_sense`` and ``most_similar``.
            Defaults to the notebook-level ``s2v`` instance.

    Returns:
        ``None`` when no sense is found for the word; otherwise a list of
        title-cased distractors. The list may be shorter than
        ``n_distractors`` if the 20 candidates do not yield enough distinct
        words.
    """
    if model is None:
        model = s2v  # notebook-level Sense2Vec instance

    query = word.lower().replace(" ", "_")

    sense = model.get_best_sense(query)
    if not sense:  # the word has no sense => nothing to suggest
        return None

    # Compare candidates against the space-separated form of the query so
    # that multi-word queries are filtered out as well.
    seen = {query.replace("_", " ")}
    distractors = []

    for entry in model.most_similar(sense, n=20):
        if len(distractors) >= n_distractors:
            break

        candidate = entry[0].split("|")[0].replace("_", " ").lower()
        if candidate in seen:
            continue

        seen.add(candidate)
        distractors.append(candidate.title())

    return distractors

## Function that keeps only the single-word answers for which the Sense2Vec function returns exactly three distractors

In [None]:
def find_distractors(dataset_df, distractor_fn=None, show_progress=True):
    """Build a DataFrame of questions, right answers and distractors.

    Every example's first answer is inspected. Only answers consisting of
    exactly one word for which the distractor function returns exactly three
    distractors are kept; everything else is counted as skipped. Two summary
    lines with the skipped and kept totals are printed at the end.

    Args:
        dataset_df: Iterable of examples, each providing ``['question']`` and
            ``['answers']['text']`` (a list of answer strings).
        distractor_fn: Callable mapping an answer string to a list of
            distractors or ``None``. Defaults to ``sense2vec_get_words``.
        show_progress: Wrap the iteration in a ``tqdm`` progress bar.

    Returns:
        A new DataFrame with the columns ``'question'``, ``'right answer'``
        and ``'distractors'``, indexed 0..n-1 in dataset order.
    """
    if distractor_fn is None:
        distractor_fn = sense2vec_get_words

    skipped = 0  # multi-word/empty answers and answers without 3 distractors
    kept = 0     # answers stored together with their distractors
    rows = []    # collected first, turned into a DataFrame once at the end

    examples = tqdm(dataset_df) if show_progress else dataset_df

    for val in examples:
        answers = val['answers']['text']

        # no answer at all => nothing to generate distractors for
        if not answers:
            skipped += 1
            continue

        answer = answers[0]

        # only single-word answers are used
        if len(answer.split()) != 1:
            skipped += 1
            continue

        distractors = distractor_fn(answer)

        # no sense found, or fewer/more than the 3 required distractors
        if distractors is None or len(distractors) != 3:
            skipped += 1
            continue

        rows.append([val['question'], answer, distractors])
        kept += 1

    print(f"The dataset  has '{skipped}' answers that either contain more than 1 word or the 1 word answer has no distractors")
    print (f"The dataset has  '{kept}' answers that have distractors")

    return pd.DataFrame(rows, columns=['question', 'right answer', 'distractors'])

## Find distractors for the train dataset and display the dataframe

In [None]:
df_train = find_distractors(train_dataset)

  0%|          | 0/87599 [00:00<?, ?it/s]

The dataset  has '22564' answers that either contain more than 1 word or the 1 word answer has no distractors
The dataset has  '22567' answers that have distractors


In [None]:
df_train

Unnamed: 0,question,right answer,distractors
0,How often is Notre Dame's the Juggler published?,twice,"[Two Or Three Times, Once, 4 Or 5 Times]"
1,How many student news papers are found at Notre Dame?,three,"[Four, Two, Five]"
2,In what year did the student paper Common Sense begin publication at Notre Dame?,1987,"[1988, 1985, 1994]"
3,How many BS level degrees are offered in the College of Engineering at Notre Dame?,eight,"[Seven, Nine, Four]"
4,In what year was the College of Engineering at Notre Dame formed?,1920,"[1930, 1950, 1910]"
...,...,...,...
22562,From what city does Arkefly offer nonstop flights to Kathmandu?,Amsterdam,"[Prague, Copenhagen, Dublin]"
22563,In what US state did Kathmandu first establish an international relationship?,Oregon,"[Alabama, Tennessee, Wisconsin]"
22564,What was Yangon previously known as?,Rangoon,"[Hanoi, Saigon, Cottage]"
22565,With what Belorussian city does Kathmandu have a relationship?,Minsk,"[Moscow, Kiev, Kyiv]"


## Find distractors for the validation dataset and display the dataframe

In [None]:
df_validation = find_distractors(valid_dataset)

  0%|          | 0/10570 [00:00<?, ?it/s]

The dataset  has '2556' answers that either contain more than 1 word or the 1 word answer has no distractors
The dataset has  '2555' answers that have distractors


In [None]:
df_validation

Unnamed: 0,question,right answer,distractors
0,What color was used to emphasize the 50th anniversary of the Super Bowl?,gold,"[More Gold, Only Gold, Then Gold]"
1,Super Bowl 50 decided the NFL champion for what season?,2015,"[2014, 2020, 2018]"
2,What year did the Denver Broncos secure a Super Bowl title for the third time?,2015,"[2014, 2020, 2018]"
3,What year was Super Bowl 50?,2015,"[2014, 2020, 2018]"
4,Super Bowl 50 determined the NFL champion for what season?,2015,"[2014, 2020, 2018]"
...,...,...,...
2550,What is the force called rgarding a potential field between two locations?,artifact,"[Artifacts, Enchantment, Manifest]"
2551,What is sometimes impossible to model?,forces,"[Opposing Forces, Other Forces, Opposing Force]"
2552,What do electrostatic gradiient potentials create?,friction,"[More Friction, Enough Friction, Increased Friction]"
2553,What is a very seldom used unit of mass in the metric system?,slug,"[Slugs, Pellet, Buckshot]"


## Find the shapes of the two dataframes

In [None]:
print (df_train.shape)
print (df_validation.shape)

(22567, 3)
(2555, 3)


## Shuffle the two dataframes

In [None]:
# Fix the seed so that the row order (and therefore the exported CSV files)
# is the same every time the notebook is re-run.
df_train_shuffled = shuffle(df_train, random_state=42)

df_validation_shuffled = shuffle(df_validation, random_state=42)

In [None]:
df_train_shuffled.head()

Unnamed: 0,question,right answer,distractors
16667,Where do scholars believe the name Adonai came from?,Aten,"[Yahweh, Yhwh, Mithras]"
17309,How many members are in the Metropolitan Council?,210,"[215, 230, 220]"
17537,What overhelming percent of Australians voted for the 1967 Referendum?,90%,"[95%, 99%, 80%]"
5154,When was the University of Kansas School of Business established?,1924,"[1921, 1936, 1923]"
1324,How many cubic meters of oil is supposed to be in Newtown Creek?,110000,"[120,000, 24,000, 27,000]"


## Export the two shuffled dataframes as CSV files

In [None]:
import os

# NOTE: the 'distractors' column holds Python lists; in the CSV they are stored
# as their string form, so they have to be parsed back into lists when the
# files are read again.

# file path for saving the preprocessed shuffled training dataframe without indexes
train_save_path = '/content/gdrive/My Drive/DISSERTATION/MODEL 1/t5/dataset/squad_t5_train.csv'
# to_csv does not create missing folders, so make sure the target folder exists
os.makedirs(os.path.dirname(train_save_path), exist_ok=True)
df_train_shuffled.to_csv(train_save_path, index = False)



# file path for saving the preprocessed shuffled validation dataframe without indexes
validation_save_path = '/content/gdrive/My Drive/DISSERTATION/MODEL 1/t5/dataset/squad_t5_validation.csv'
os.makedirs(os.path.dirname(validation_save_path), exist_ok=True)
df_validation_shuffled.to_csv(validation_save_path, index = False)