# Working with SQUAD Dataset

In [1]:
import pandas as pd
from Utils import get_features
import numpy as np
from tqdm.notebook import tqdm
from sklearn.model_selection import train_test_split
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

In [2]:
df = pd.read_json('train-v2.0.json')

In [3]:
def parse_title(df,title_index):
    """Function to parse the SQUAD data structure for one article and return the full text, Questions and Answers.

    Args:
        df (dict): SQUAD data set structure
        title_index (int): Index number for the article you would like to parse

    Returns:
        text (str): The full text of the article (paragraphs are separated by \n)
        Questions (list): A list of all of the questions (marked as possible)
        Answers (list):  A list of all the answers corresponding to the Question list 
    """

    text = '\n'.join([df['data'][title_index]['paragraphs'][i]['context'] for i in range(len(df['data'][title_index]['paragraphs']))])

    Questions = []
    Answers = []

    for i in range(len(df['data'][title_index]['paragraphs'])):
        for j in range(len(df['data'][title_index]['paragraphs'][i]['qas'])):
            if df['data'][title_index]['paragraphs'][i]['qas'][j]['is_impossible'] == True:
                continue
            else:
                Questions.append(df['data'][title_index]['paragraphs'][i]['qas'][j]['question'])
                Answers.append([x['text'] for x in df['data'][title_index]['paragraphs'][i]['qas'][j]['answers']])
    
    return text, Questions, Answers

In [None]:
vecs = get_features(text=df['data'][0]['paragraphs'][0]['context'],question=df['data'][0]['paragraphs'][0]['qas'][0]['question'],num_rel_sentences=3)

In [None]:
vecs

In [None]:
all_vectors = []

for vec in vecs:
    for vector in list(vec.values()):
        all_vectors.append(vector)

In [None]:
len(all_vectors)

In [None]:
for j in tqdm(range(len(df['data']))):
    #text, Questions, Answers = parse_title(df,j)
    for P in df['data'][j]['paragraphs']:
        for i, QA in enumerate(P['qas']):
            if QA['is_impossible']:
                continue

            try: # TODO 
                vecs = get_features(text=P['context'],question=QA['question'],num_rel_sentences=rel_sentences)

In [4]:
import random 
import spacy
import QAfeatures
nlp = spacy.load('en_core_web_md')



def random_index(n, end, start = 0):
    return list(range(start, n)) + list(range(n+1, end))

def filter_questions_find_len(vecs,Answers):
    """First figure out if the answer was found in the feature vectors.  Next find and return the answer feature vector and a randomly selected incorrect answer feature vector.
    

    Args:
        vecs (list): list of n feature vectors (from n sentences choosen previously)
        Answers (list): list of answers given by SQUAD

    Returns:
        found_answer (bool): Was the answer found or not?
        correct_feature_vector (array): Feaure vector of the correct answer
        incorrect_feature_vector (array): Feature vector of the incorrect answer
    """

    # Did we find the correct answer?
    check = []
    for vec in vecs:
        check.append(any(item in Answers for item in [str(i) for i in vec.keys()]))
    
    if any(check):
        break_out_flag = False
        for vec in vecs:
            for Answer in set(Answers):
                vec_list = [str(i) for i in vec.keys()]
                if Answer in vec_list:
                    correct_index = vec_list.index(Answer)
                    correct_feature_vector = list(vec.values())[correct_index]
                    # print(
                    #incorrect_feature_vectors = [x for i,x in enumerate(list(vec.values())) if i!=correct_index]
                    incorrect_feature_vector = list(vec.values())[random.choice(random_index(correct_index,len(list(vec.values()))))]
                    count = 1
                    while all(incorrect_feature_vector==correct_feature_vector):
                        incorrect_feature_vector = list(vec.values())[random.choice(random_index(correct_index,len(list(vec.values()))))]
                        count+=1
                        if count==10:
                            correct_feature_vector = np.array([])
                            incorrect_feature_vector= np.array([])
                            found_answer = False
                            break
                    break_out_flag = True
                    found_answer = True
                    break
            if break_out_flag:
                break
    else:
        correct_feature_vector = np.array([])
        incorrect_feature_vector = np.array([])
        found_answer = False


    return found_answer, correct_feature_vector, incorrect_feature_vector

In [7]:
import random 

def random_index(n, end, start = 0):
    return list(range(start, n)) + list(range(n+1, end))

def filter_questions(vecs,Answers):
    """First figure out if the answer was found in the feature vectors.  Next find and return the answer feature vector and a randomly selected incorrect answer feature vector.
    

    Args:
        vecs (list): list of n feature vectors (from n sentences choosen previously)
        Answers (list): list of answers given by SQUAD

    Returns:
        found_answer (bool): Was the answer found or not?
        correct_feature_vector (array): Feaure vector of the correct answer
        incorrect_feature_vector (array): Feature vector of the incorrect answer
    """

    # Did we find the correct answer?
    check = []
    for vec in vecs:
        check.append(any(item in Answers for item in [str(i) for i in vec.keys()]))
    
    if any(check):
        break_out_flag = False
        for vec in vecs:
            for Answer in set(Answers):
                vec_list = [str(i) for i in vec.keys()]
                if Answer in vec_list:
                    correct_index = vec_list.index(Answer)
                    correct_feature_vector = list(vec.values())[correct_index]
                    # print(
                    #incorrect_feature_vectors = [x for i,x in enumerate(list(vec.values())) if i!=correct_index]
                    incorrect_feature_vector = list(vec.values())[random.choice(random_index(correct_index,len(list(vec.values()))))]
                    count = 1
                    while all(incorrect_feature_vector==correct_feature_vector):
                        incorrect_feature_vector = list(vec.values())[random.choice(random_index(correct_index,len(list(vec.values()))))]
                        count+=1
                        if count==10:
                            correct_feature_vector = np.array([])
                            incorrect_feature_vector = np.array([])
                            found_answer = False
                            break
                    break_out_flag = True
                    found_answer = True
                    break
            if break_out_flag:
                break
    else:
        correct_feature_vector = np.array([])
        incorrect_feature_vector = np.array([])
        found_answer = False


    return found_answer, correct_feature_vector, incorrect_feature_vector

In [5]:
def find_len(df,rel_sentences=3,max_len_start=0):
    ######## THIS IS THE SECTION THAT TAKES FOREVER ##########
    feature_vec = []
    output_vec = []
    max_len = max_len_start
    for j in tqdm(range(len(df['data']))):
        #text, Questions, Answers = parse_title(df,j)
        for P in df['data'][j]['paragraphs']:
            for i, QA in enumerate(P['qas']):
                if QA['is_impossible']:
                    continue

                #try: # TODO 
                QS = QAfeatures.QuestionSense(QA['question'])
                vecs = get_features(text=nlp(P['context']),question=QS,num_rel_sentences=rel_sentences)
                    
                # except:
                #     continue

                try:
                    found, correct, incorrect = filter_questions(vecs,[x['text'] for x in QA['answers']])
                except:
                    continue
                #print('True')
                if found:
                    print('found')
                    all_vectors = []
                    for vec in vecs:
                        for vector in list(vec.values()):
                            all_vectors.append(vector)
                            
                    vec_len = len(all_vectors)
                    #print(vec_len)
                    if vec_len > 50:
                        print(vecs)
                        print()
                        print(QA['question'])
                        print()
                        print(P['context'])
                        print()
                        break
                    if vec_len > max_len:
                        max_len = vec_len
                        
    return max_len

In [6]:
max_len = find_len(df,3,0)

  0%|          | 0/442 [00:00<?, ?it/s]

TypeError: 'QuestionSense' object is not iterable

In [7]:
max_len

0

In [10]:
max_len = find_len(df,3,0)

  0%|          | 0/442 [00:00<?, ?it/s]

  v1 = candidate.similarity(QS.descriptors)
  v2 = A_verbParent.similarity(Q_verbParent)


Could not resolve this question as a binary comparison
Could not resolve this question as a binary comparison
Could not resolve this question as a binary comparison


  sim = sentence_nostop.similarity(question_nostop)


Could not resolve this question as a binary comparison
Could not resolve this question as a binary comparison
Could not resolve this question as a binary comparison
Could not resolve this question as a binary comparison
Could not resolve this question as a binary comparison
Could not resolve this question as a binary comparison
Could not resolve this question as a binary comparison
Could not resolve this question as a binary comparison
Could not resolve this question as a binary comparison
Could not resolve this question as a binary comparison
Could not resolve this question as a binary comparison
Could not resolve this question as a binary comparison
Could not resolve this question as a binary comparison
Could not resolve this question as a binary comparison
Could not resolve this question as a binary comparison
Could not resolve this question as a binary comparison
Could not resolve this question as a binary comparison
Could not resolve this question as a binary comparison
Could not 

In [34]:
def get_dataset(df,rel_sentences=3):
    ######## THIS IS THE SECTION THAT TAKES FOREVER ##########
    feature_vec = []
    output_vec = []

    for j in tqdm(range(len(df['data']))):
        #text, Questions, Answers = parse_title(df,j)
        for P in df['data'][j]['paragraphs']:
            for i, QA in enumerate(P['qas']):
                if QA['is_impossible']:
                    continue

                try: # TODO 
                    vecs = get_features(text=P['context'],question=QA['question'],num_rel_sentences=rel_sentences)
                except:
                    continue
                # print(vecs)
                # print()
                # print(QA['answers'])
                try:
                    found, correct, incorrect = filter_questions(vecs,[x['text'] for x in QA['answers']])
                except:
                    continue

                if found:
                    feature_vec.append(np.concatenate((correct,incorrect),axis=1))
                    output_vec.append(np.array([1,0]))

                    # feature_vec.append(incorrect)
                    # output_vec.append(0)

    ############################################################
    a = np.array(feature_vec)
    b = np.array(output_vec)

    indices = np.arange(a.shape[0])
    np.random.shuffle(indices)

    feature_vec = a[indices]
    output_vec = b[indices]
    X_train, X_test, y_train, y_test = train_test_split(feature_vec, output_vec, test_size=0.20, random_state=42)
    return X_train, X_test, y_train, y_test

In [None]:
X_train, X_test, y_train, y_test = get_dataset(df,rel_sentences=3)

  0%|          | 0/442 [00:00<?, ?it/s]

  v1 = candidate.similarity(QS.descriptors)
  v2 = A_verbParent.similarity(Q_verbParent)


Could not resolve this question as a binary comparison
Could not resolve this question as a binary comparison
Could not resolve this question as a binary comparison


In [7]:
idxs = []
for i, array in enumerate(X_train):
    if len(array) != 7:
        idxs.append(i)

X_train , y_train = np.delete(X_train,idxs),np.delete(y_train,idxs)

In [10]:
idxs = []
for i, array in enumerate(X_test):
    if len(array) != 7:
        idxs.append(i)

X_test , y_test = np.delete(X_test,idxs),np.delete(y_test,idxs)

In [9]:
class CustomDataset(Dataset):
    def __init__(self, X, Y):
        self.feature_vec = X
        self.output_vec = Y
        
    def __len__(self):
        return len(self.output_vec)

    def __getitem__(self, idx):
        x = torch.tensor(self.feature_vec[idx])
        y = torch.tensor(self.output_vec[idx])
        return x , y

In [10]:
train_set = CustomDataset(X_train, y_train)

In [11]:
print(train_set.__getitem__(0))

(tensor([0.2261, 0.7028, 1.0000, 0.0000, 7.0000, 0.0000, 1.0000],
       dtype=torch.float64), tensor(1))


Initializing the Dataloader

In [28]:
train_loader = DataLoader(train_set, batch_size=25, shuffle=True, num_workers=1, drop_last=True)

Defining the Model

In [29]:
class Model(nn.Module):

    def __init__(self,in_features=7,h1=2048,h2=2048,h3=1024*2,h4=1024,h5=900,h6=900,h7=800,
                 h8=800,h9 = 800,h10=800,h11=800,h12=800,h13=800,h14=800,h15=800,out_features=1):
        
        # How many layers?
        # Input layer (# of features) --> hidden layer 1 (number of neurons N) --> h2 (N) --> output (346 of classes)
        super().__init__()
        self.fc1 = nn.Linear(in_features,h1)
        self.bn1 = nn.BatchNorm1d(num_features=h1,momentum=0.01)
        self.fc2 = nn.Linear(h1,h2)
        self.d2 =  nn.Dropout(0.25)
        self.bn2 = nn.BatchNorm1d(num_features=h2,momentum=0.01)
        self.fc3 = nn.Linear(h2,h3)
        self.bn3 = nn.BatchNorm1d(num_features=h3,momentum=0.01)
        self.d3 = nn.Dropout(0.4)
        self.fc4 = nn.Linear(h3,h4)
        self.bn4 = nn.BatchNorm1d(num_features=h4,momentum=0.01)
        self.d4 = nn.Dropout(0.3)
        self.fc5 = nn.Linear(h4,h5)
        self.bn5 = nn.BatchNorm1d(num_features=h5,momentum=0.01)
        self.d5 = nn.Dropout(0.25)
        self.fc6 = nn.Linear(h5,h6)
        self.bn6 = nn.BatchNorm1d(num_features=h6,momentum=0.01)
        self.d6 = nn.Dropout(0.35)
        
#         self.fc7 = nn.Linear(h6,h7)
#         self.bn7 = nn.BatchNorm1d(num_features=h7,momentum=0.01)
#         self.d7 = nn.Dropout(0.4)
        
#         self.fc8 = nn.Linear(h7,h8)
#         self.bn8 = nn.BatchNorm1d(num_features=h8,momentum=0.01)
#         self.d8 = nn.Dropout(0.35)
        
#         self.fc9 = nn.Linear(h8,h9)
#         self.bn9 = nn.BatchNorm1d(num_features=h9,momentum=0.01)
#         self.d9 = nn.Dropout(0.2)
        
#         self.fc10 = nn.Linear(h9,h10)
#         self.bn10 = nn.BatchNorm1d(num_features=h10,momentum=0.01)
#         self.d10 = nn.Dropout(0.25)
        
#         self.fc11 = nn.Linear(h10,h11)
#         self.bn11 = nn.BatchNorm1d(num_features=h11,momentum=0.01)
#         self.d11 = nn.Dropout(0.2)

#         self.fc12 = nn.Linear(h11,h12)
#         self.bn12 = nn.BatchNorm1d(num_features=h12,momentum=0.01)
#         self.d12 = nn.Dropout(0.2)

#         self.fc13 = nn.Linear(h12,h13)
#         self.bn13 = nn.BatchNorm1d(num_features=h13,momentum=0.01)
#         self.d13 = nn.Dropout(0.2)

#         self.fc14 = nn.Linear(h13,h14)
#         self.bn14 = nn.BatchNorm1d(num_features=h14,momentum=0.01)
#         self.d14 = nn.Dropout(0.2)

        self.fc15 = nn.Linear(h6,h15)
        self.bn15 = nn.BatchNorm1d(num_features=h15,momentum=0.01)
        self.d15 = nn.Dropout(0.2)

        self.out = nn.Linear(h15,out_features)
  
    def forward(self,x):
        x = F.relu(self.bn1(self.fc1(x)))
        x = F.relu(self.bn2(self.d2(self.fc2(x))))
        x = F.relu(self.bn3(self.d3(self.fc3(x))))
        x = F.relu(self.bn4(self.d4(self.fc4(x))))
        x = F.relu(self.bn5(self.d5(self.fc5(x))))
        x = F.relu(self.bn6(self.d6(self.fc6(x))))
        # x = F.relu(self.bn7(self.d7(self.fc7(x))))
        # x = F.relu(self.bn8(self.d8(self.fc8(x))))
        # x = F.relu(self.bn9(self.d9(self.fc9(x))))
        # x = F.relu(self.bn10(self.d10(self.fc10(x))))
        # x = F.relu(self.bn11(self.d11(self.fc11(x))))
        # x = F.relu(self.bn12(self.d12(self.fc12(x))))
        # x = F.relu(self.bn13(self.d13(self.fc13(x))))
        # x = F.relu(self.bn14(self.d14(self.fc14(x))))
        x = F.relu(self.bn15(self.d15(self.fc15(x))))

        x = self.out(x)
        return x

In [30]:
seq_model = Model()

In [31]:
# device = torch.device("cuda")
device = torch.device("cpu")
seq_model.to(device)

Model(
  (fc1): Linear(in_features=7, out_features=2048, bias=True)
  (bn1): BatchNorm1d(2048, eps=1e-05, momentum=0.01, affine=True, track_running_stats=True)
  (fc2): Linear(in_features=2048, out_features=2048, bias=True)
  (d2): Dropout(p=0.25, inplace=False)
  (bn2): BatchNorm1d(2048, eps=1e-05, momentum=0.01, affine=True, track_running_stats=True)
  (fc3): Linear(in_features=2048, out_features=2048, bias=True)
  (bn3): BatchNorm1d(2048, eps=1e-05, momentum=0.01, affine=True, track_running_stats=True)
  (d3): Dropout(p=0.4, inplace=False)
  (fc4): Linear(in_features=2048, out_features=1024, bias=True)
  (bn4): BatchNorm1d(1024, eps=1e-05, momentum=0.01, affine=True, track_running_stats=True)
  (d4): Dropout(p=0.3, inplace=False)
  (fc5): Linear(in_features=1024, out_features=900, bias=True)
  (bn5): BatchNorm1d(900, eps=1e-05, momentum=0.01, affine=True, track_running_stats=True)
  (d5): Dropout(p=0.25, inplace=False)
  (fc6): Linear(in_features=900, out_features=900, bias=True)
  

In [32]:
seq_model.train()

Model(
  (fc1): Linear(in_features=7, out_features=2048, bias=True)
  (bn1): BatchNorm1d(2048, eps=1e-05, momentum=0.01, affine=True, track_running_stats=True)
  (fc2): Linear(in_features=2048, out_features=2048, bias=True)
  (d2): Dropout(p=0.25, inplace=False)
  (bn2): BatchNorm1d(2048, eps=1e-05, momentum=0.01, affine=True, track_running_stats=True)
  (fc3): Linear(in_features=2048, out_features=2048, bias=True)
  (bn3): BatchNorm1d(2048, eps=1e-05, momentum=0.01, affine=True, track_running_stats=True)
  (d3): Dropout(p=0.4, inplace=False)
  (fc4): Linear(in_features=2048, out_features=1024, bias=True)
  (bn4): BatchNorm1d(1024, eps=1e-05, momentum=0.01, affine=True, track_running_stats=True)
  (d4): Dropout(p=0.3, inplace=False)
  (fc5): Linear(in_features=1024, out_features=900, bias=True)
  (bn5): BatchNorm1d(900, eps=1e-05, momentum=0.01, affine=True, track_running_stats=True)
  (d5): Dropout(p=0.25, inplace=False)
  (fc6): Linear(in_features=900, out_features=900, bias=True)
  

In [33]:
criterion = torch.nn.BCELoss()

In [34]:
optimizer = torch.optim.Adam(seq_model.parameters(),lr=1e-4)

In [35]:
%%time
# EPOCHS 
seq_model.train()
epochs = 5
losses = []
loss = []
val_error = []
validation_losses = []
for i in range(epochs):
    losses.append(loss)
    validation_losses.append(val_error)
    seq_model.train()
    print(f'Epoch {i}, loss {loss}, validation loss: {val_error}')
    for (xi,yi) in train_loader:
        optimizer.zero_grad()
        xi = xi.to(device)
        yi = yi.to(device)
        output = seq_model(xi)
        loss = criterion(torch.squeeze(output),yi)
        loss.backward()
        optimizer.step()

    # if (i+1) % 5 == 0:
    #   Testing_Model(seq_model,test_loader,i)
    # val_error = validation_error(seq_model,criterion,val_loader)

Epoch 0, loss [], validation loss: []


RuntimeError: DataLoader worker (pid(s) 12426) exited unexpectedly

In [37]:
len(X_train)

1892

In [38]:
X_train

array([array([0.22608298, 0.70279014, 1.        , 0.        , 7.        ,
       0.        , 1.        ]),
       array([0.75444967, 0.41330665, 1.        , 0.        , 3.        ,
       0.        , 2.        ]),
       array([0, 0, 1, 0, 3, 0, 1]), ...,
       array([0.        , 0.50098819, 1.        , 0.        , 2.        ,
       1.        , 1.        ]),
       array([0, 0, 1, 0, 8, 1, 1]),
       array([0.28018862, 0.45312098, 1.        , 1.        , 8.        ,
       0.        , 1.        ])], dtype=object)

In [14]:
np.save('X_train_full',X_train)

In [13]:
np.save('y_train_full',y_train)

In [12]:
np.save('X_test_full',X_test)

In [11]:
np.save('y_test_full',y_test)