# **PREPROCESS THE DATASET FOR MODEL 2**

## Installed Libraries

In [None]:
!pip install --quiet datasets==1.0.2
!pip install --quiet tqdm==4.55.1

[K     |████████████████████████████████| 1.8 MB 4.7 MB/s 
[K     |████████████████████████████████| 243 kB 65.8 MB/s 
[K     |████████████████████████████████| 68 kB 763 kB/s 
[?25h

In [None]:
# Mount the user's Google Drive inside the Colab runtime; the preprocessed
# CSV files are written to it at the end of the notebook.
from google.colab import drive

gdrive_mount_point = '/content/gdrive'
drive.mount(gdrive_mount_point)

Mounted at /content/gdrive


In [None]:
# SQuAD dataset
from datasets import load_dataset

# hide warnings
import warnings
warnings.filterwarnings('ignore')

from pprint import pprint 
import pandas as pd
from tqdm.notebook import tqdm
from sklearn.utils import shuffle

## Download the SQuAD dataset and load its predefined training and validation splits

In [None]:
# Load the two predefined SQuAD splits through the `datasets` library
# (the data is downloaded once and reused from the local cache afterwards).
train_dataset, valid_dataset = (
    load_dataset('squad', split=split_name)
    for split_name in ('train', 'validation')
)

Downloading:   0%|          | 0.00/1.95k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/955 [00:00<?, ?B/s]

Downloading and preparing dataset squad/plain_text (download: 33.51 MiB, generated: 85.75 MiB, post-processed: Unknown size, total: 119.27 MiB) to /root/.cache/huggingface/datasets/squad/plain_text/1.0.0/1244d044b266a5e4dbd4174d23cb995eead372fbca31a03edc3f8a132787af41...


Downloading:   0%|          | 0.00/8.12M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.05M [00:00<?, ?B/s]

|          | 0/0 [00:00<?, ? examples/s]

|          | 0/0 [00:00<?, ? examples/s]

Dataset squad downloaded and prepared to /root/.cache/huggingface/datasets/squad/plain_text/1.0.0/1244d044b266a5e4dbd4174d23cb995eead372fbca31a03edc3f8a132787af41. Subsequent calls will reuse this data.


Reusing dataset squad (/root/.cache/huggingface/datasets/squad/plain_text/1.0.0/1244d044b266a5e4dbd4174d23cb995eead372fbca31a03edc3f8a132787af41)


## Create two empty Pandas dataframes for storing the context, question and the right answer in three different columns 

In [None]:
# Show the full content of every cell when a dataframe is printed.
# `None` means "no limit"; the former `-1` sentinel has been deprecated since
# pandas 1.0 and is rejected by newer pandas releases.
pd.set_option("display.max_colwidth", None)

# Both frames share the same three text columns.
df_train = pd.DataFrame(columns=['context', 'answer', 'question'])
df_validation = pd.DataFrame(columns=['context', 'answer', 'question'])

print(df_train)
print(df_validation)

Empty DataFrame
Columns: [context, answer, question]
Index: []
Empty DataFrame
Columns: [context, answer, question]
Index: []


## Fill the train dataframe by iterating through the training set

#### If the answer has 7 or more words, skip the sample (do not include it in the dataframe)

In [None]:
# Samples whose answer has this many words or more are left out.
ANSWER_WORD_LIMIT = 7

# counter of the samples that will be skipped
count_long = 0

# Rows are collected in a plain list and converted to a dataframe once at the
# end: inserting with `df.loc[i] = ...` inside the loop enlarges the frame on
# every iteration, which is very slow for tens of thousands of rows.
train_rows = []

# iterate through the train dataset
for val in tqdm(train_dataset):
    context = val['context']  # get the context
    question = val['question']  # get the question
    answer = val['answers']['text'][0]  # get the first listed answer

    # number of whitespace-separated words in the answer
    no_of_words = len(answer.split())

    # if the answer is equal to or more than ANSWER_WORD_LIMIT words => skip
    if no_of_words >= ANSWER_WORD_LIMIT:
        count_long += 1
        continue

    train_rows.append((context, answer, question))

# build the dataframe in one step, with the same column order as before
df_train = pd.DataFrame(train_rows, columns=['context', 'answer', 'question'])

# counter of the samples that have been included in the dataframe
count_short = len(train_rows)

# print the number of samples that have been skipped
print ("count_long train dataset: ",count_long)

# print the number of samples that have been included in the dataframe
print ("count_short train dataset: ",count_short)

  0%|          | 0/87599 [00:00<?, ?it/s]

count_long train dataset:  8935
count_short train dataset:  78664


## Fill the validation dataframe by iterating through the validation dataset

#### If the answer has 7 or more words, skip the sample (do not include it in the dataframe)

In [None]:
# Samples whose answer has this many words or more are left out.
ANSWER_WORD_LIMIT = 7

# counter of the samples that will be skipped
count_long = 0

# Rows are collected in a plain list and converted to a dataframe once at the
# end: inserting with `df.loc[i] = ...` inside the loop enlarges the frame on
# every iteration, which is very slow for thousands of rows.
validation_rows = []

# iterate through the validation dataset
for val in tqdm(valid_dataset):
    context = val['context']  # get the context
    question = val['question']  # get the question
    answer = val['answers']['text'][0]  # get the first listed answer

    # number of whitespace-separated words in the answer
    no_of_words = len(answer.split())

    # if the answer is equal to or more than ANSWER_WORD_LIMIT words => skip
    if no_of_words >= ANSWER_WORD_LIMIT:
        count_long += 1
        continue

    validation_rows.append((context, answer, question))

# build the dataframe in one step, with the same column order as before
df_validation = pd.DataFrame(validation_rows, columns=['context', 'answer', 'question'])

# counter of the samples that have been included in the dataframe
count_short = len(validation_rows)

# print the number of samples that have been skipped
print ("count_long validation dataset: ",count_long)

# print the number of samples that have been included in the dataframe
print ("count_short validation dataset: ",count_short)

  0%|          | 0/10570 [00:00<?, ?it/s]

count_long validation dataset:  918
count_short validation dataset:  9652


## Find the shapes of the two dataframes

In [None]:
# Report (rows, columns) of the training frame, then of the validation frame.
for frame in (df_train, df_validation):
    print(frame.shape)

(78664, 3)
(9652, 3)


## Shuffle the two dataframes

In [None]:
# Fixed seed so the row order of the shuffled frames (and therefore of the
# exported CSV files) is the same on every run; without `random_state` each
# execution produces a different order.
SHUFFLE_SEED = 42

df_train_shuffled = shuffle(df_train, random_state=SHUFFLE_SEED)

df_validation_shuffled = shuffle(df_validation, random_state=SHUFFLE_SEED)

## Export the two shuffled dataframes as CSV files

In [None]:
import os

# Google Drive folder that receives both CSV files.
dataset_dir = '/content/gdrive/My Drive/DISSERTATION/MODEL 2/t5/dataset'

# Create the folder (and any missing parents) first, so that `to_csv` does not
# fail on a Drive where the folder does not exist yet.
os.makedirs(dataset_dir, exist_ok=True)

# file path for saving the preprocessed shuffled training dataframe without indexes
train_save_path = os.path.join(dataset_dir, 'squad_t5_train.csv')
df_train_shuffled.to_csv(train_save_path, index=False)

# file path for saving the preprocessed shuffled validation dataframe without indexes
validation_save_path = os.path.join(dataset_dir, 'squad_t5_validation.csv')
df_validation_shuffled.to_csv(validation_save_path, index=False)