In [4]:
import os
import re
import sys
import urllib.request
import json
import pandas as pd
import numpy as np

from yaml import load

class Preprocessing:
    def __init__(self):
        self.filename = 'train-v1.1.json'
        self.directory = 'squad1.1'
        self.url = "https://rajpurkar.github.io/SQuAD-explorer/dataset/"
    
    def start(self):
        # if dataset doesnt exit then download it from the github repo
        self.download_dataset(self.url, self.filename , self.directory)
        print("Dataset is downloaded")

        # load dataset into a file
        train_data = self.load_data(self.filename, self.directory)
        print("Json file succesfully loaded")

        # prepare a csv file
        self.prepare_csvfile(train_data)
    
    def prepare_csvfile(self, train_data):
        context_list = []
        question_list = []
        answer_text_list = []
        answer_start = []
        context_data = []
        question_data = []
        answer_start_data = []
        answer_length_data = []

        print("Enter prepare csv file")
        for id in range(len(train_data["data"])):
            list_para = train_data["data"][id]["paragraphs"]
            for para in list_para:
                context = para["context"]
                qas = para["qas"]
                q = []
                a = []
                for question in qas:
                    questions = question["question"]
                    answer_text = question["answers"][0]["text"]
                    answer_start = question["answers"][0]["answer_start"]
                    
                    start = 0
                    for c in range(answer_start):
                      if(context[c] == ' '):
                        start += 1
                    answer_length = len(list(answer_text.split()))
                    if(answer_length == 1):
                      context_data.append(context)
                      question_data.append(questions)
                      answer_start_data.append(start)
                      answer_length_data.append(answer_length)
                    q.append(questions)
                    a.append(answer_text)

            context_list.append(context)
            question_list.append(q)
            answer_text_list.append(a)
        print(len(context_list))
        print(len(question_list))
        print(len(answer_text_list))
        # make a list for all and then put it    
        dict = {'context': context_list , 'question': question_list , 'answer': answer_text_list}
        dict2 = {'context' : context_data, 'output' : question_data, 'answer_start':answer_start_data, 'answer_length':answer_length_data}
        df2 = pd.DataFrame.from_dict(dict2)
        df2.to_csv("new1.csv")
        new = pd.DataFrame.from_dict(dict)
        new.to_csv("output.csv")

    def load_data(self, filename , directory):
        try:
            json_path = os.path.join(directory, filename)
            print(json_path)
            file = open(json_path)
            data = json.load(file)
            file.close()
            return data
        except:
            print("unable to load a json file")
    def download_dataset(self, url , filename, directory):
        try:
            save_path = os.path.join(directory,filename)
            
            # if not present then download
            if not os.path.exists(save_path):
                # if folder doesnt exist

                url = os.path.join(url,filename)
                urllib.request.urlretrieve(url,save_path)

        except:
            print("some error occured")

    
preprocessing = Preprocessing()
preprocessing.start()

Dataset is downloaded
squad1.1/train-v1.1.json
Json file succesfully loaded
Enter prepare csv file
442
442
442


In [5]:
df = pd.read_csv('new1.csv')

In [8]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30267 entries, 0 to 30266
Data columns (total 5 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   Unnamed: 0     30267 non-null  int64 
 1   context        30267 non-null  object
 2   output         30267 non-null  object
 3   answer_start   30267 non-null  int64 
 4   answer_length  30267 non-null  int64 
dtypes: int64(3), object(2)
memory usage: 1.2+ MB


In [9]:
import tensorflow as tf

In [10]:
tokenizer = tf.keras.preprocessing.text.Tokenizer()
tokenizer.fit_on_texts(np.array(df.context[0:10000]))
tokenized_input = tokenizer.texts_to_sequences(np.array(df.context[0:10000]))

max_input_len = max([len(i) for i in tokenized_input])

padded_lines = tf.keras.utils.pad_sequences(tokenized_input, padding='post')

num_input_tokens = len(tokenizer.word_index) + 1

input_word_dict = tokenizer.word_index

In [11]:
tokenizer2 = tf.keras.preprocessing.text.Tokenizer()
tokenizer2.fit_on_texts(df.output[0:10000])
tokenized_output = tokenizer2.texts_to_sequences(np.array(df.output[0:10000]))

padded_output_lines = tf.keras.utils.pad_sequences(tokenized_output, padding='post')

output_word_dict = tokenizer.word_index

max_output_len = max([len(i) for i in tokenized_output])

num_out_tokens = len(output_word_dict) + 1

In [12]:
print(max_input_len, max_output_len, num_input_tokens, num_out_tokens)

518 31 33879 33879


In [13]:
answer_start = np.array(df.answer_start[0:10000], dtype=int)
answer_length = np.array(df.answer_length[0:10000], dtype=int)
onehot_start = tf.keras.utils.to_categorical(answer_start, max_input_len)
onehot_length = tf.keras.utils.to_categorical(answer_length, max_input_len)

In [17]:
decoder_target_data = np.array(onehot_start)

In [18]:
decoder_target_data.shape

(10000, 518)

In [19]:
dimensionality = 256
from tensorflow.keras.layers import Input, LSTM
from tensorflow.keras.models import Model, load_model
from tensorflow.keras.layers import Embedding
from tensorflow.keras import preprocessing, utils, layers, activations, models

In [20]:
encoder_inputs = Input(shape=(None, ))
encoder_embedding = Embedding(num_input_tokens, dimensionality, mask_zero=True)(encoder_inputs)
encoder_outputs, state_h, state_c = LSTM(dimensionality, return_state=True, recurrent_dropout=0.2,
                                         dropout=0.2)(encoder_embedding)
encoder_states = [state_h, state_c]

decoder_inputs = Input(shape=(None,))
decoder_embedding = Embedding(num_out_tokens, dimensionality, mask_zero=True)(decoder_inputs)
decoder_lstm = LSTM(dimensionality, return_state=True, recurrent_dropout=0.2, dropout=0.2)
decoder_outputs, _, _ = decoder_lstm(decoder_embedding, initial_state=encoder_states)
decoder_dense = tf.keras.layers.Dense(dimensionality, activation=tf.keras.activations.relu)
output = decoder_dense(decoder_outputs)
decoder_dense1 = tf.keras.layers.Dense(max_input_len, activation=tf.keras.activations.softmax)
output1 = decoder_dense1(output)



In [21]:
model = tf.keras.models.Model([encoder_inputs, decoder_inputs], output1)
model.compile(optimizer=tf.keras.optimizers.Adam(), loss='categorical_crossentropy', metrics=['accuracy'])

model.summary()


history = model.fit([padded_lines, padded_output_lines],
                    np.array(onehot_start),
                    validation_split=0.33,
                    batch_size=32,
                    epochs=10,
                    shuffle=True)

model.save('model.h5')

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_1 (InputLayer)           [(None, None)]       0           []                               
                                                                                                  
 input_2 (InputLayer)           [(None, None)]       0           []                               
                                                                                                  
 embedding (Embedding)          (None, None, 256)    8673024     ['input_1[0][0]']                
                                                                                                  
 embedding_1 (Embedding)        (None, None, 256)    8673024     ['input_2[0][0]']                
                                                                                              

In [22]:
from google.colab import files
files.download('model.h5') 

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>