In [1]:
import pandas as pd
from Utils import get_features
import numpy as np
from tqdm.notebook import tqdm
from sklearn.model_selection import train_test_split
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from testingHelpers import NBcount,questionHasAnswer

In [2]:
# Read the SQuAD v2.0 training JSON; the cells below access articles as df['data'][i]['paragraphs'].
df = pd.read_json('train-v2.0.json')

In [3]:
def parse_title(df,title_index):
    """Collect the text and the answerable question/answer pairs of one SQUAD article.

    Args:
        df (dict): SQUAD data set structure
        title_index (int): Position of the article inside df['data']

    Returns:
        text (str): Every paragraph context of the article, joined with newlines
        Questions (list): The questions of the article that are not flagged as impossible
        Answers (list): For each kept question, the list of its answer texts
    """
    paragraphs = df['data'][title_index]['paragraphs']

    text = '\n'.join(paragraph['context'] for paragraph in paragraphs)

    # Equality against True (rather than plain truthiness) is kept on purpose so
    # that exactly the same entries are dropped as before.
    answerable = [
        qa
        for paragraph in paragraphs
        for qa in paragraph['qas']
        if qa['is_impossible'] != True
    ]

    Questions = [qa['question'] for qa in answerable]
    Answers = [[answer['text'] for answer in qa['answers']] for qa in answerable]

    return text, Questions, Answers

In [4]:
import random 
import spacy
import QAfeatures
# Load the `en_core_web_md` spaCy pipeline once; get_dataset() below calls `nlp` on each paragraph context.
nlp = spacy.load('en_core_web_md')

def random_index(n, end, start = 0):
    """Return the integers of range(start, n) followed by those of range(n+1, end).

    For start <= n < end this is every index in [start, end) except n.
    Despite the name, nothing here is random: the result is always in ascending order.
    """
    indices = list(range(start, n))
    indices.extend(range(n + 1, end))
    return indices

def filter_questions(vecs,Answers,max_len=30):
    """Build one fixed-size, shuffled training example from the candidate feature vectors.

    A candidate counts as the correct one when ``str(key)`` of its entry in ``vecs``
    equals one of the strings in ``Answers``. The gold answers are tried in the order
    given, so the choice is reproducible (iterating a ``set`` is not).

    The example is rejected (``found_answer`` False) when
      * ``vecs`` is empty,
      * no candidate string matches any gold answer, or
      * another candidate has a feature vector identical to the correct one, which
        would make the label ambiguous.

    Otherwise the candidates are padded with rows of -1 up to ``max_len`` rows. If
    there are more than ``max_len`` candidates, the correct candidate plus the first
    ``max_len - 1`` other candidates are kept, so the labelled row can never be
    truncated away. Rows are then shuffled with ``np.random.shuffle`` (global NumPy
    RNG state; seed it for reproducible output).

    Args:
        vecs (dict): maps each candidate answer (anything whose ``str()`` is the answer
            text) to its feature vector; all vectors are expected to have equal length
        Answers (list): list of answer strings given by SQUAD
        max_len (int): number of rows in the returned arrays

    Returns:
        found_answer (bool): True when a usable, unambiguous correct candidate exists
        feature_vectors (array): shape (max_len, n_features) when found, else an empty array
        output_vector (array): shape (max_len,), 1 at the correct row and 0 elsewhere
            when found, else an empty array
    """

    def _not_found():
        return False, np.array([]), np.array([])

    # Nothing to choose from (previously an IndexError on the first-vector lookup).
    if not vecs:
        return _not_found()

    candidate_list = [str(key) for key in vecs.keys()]
    feature_vectors = list(vecs.values())

    # First gold answer, in the order supplied, that appears among the candidates.
    correct_index = None
    for answer in dict.fromkeys(Answers):
        if answer in candidate_list:
            correct_index = candidate_list.index(answer)
            break
    if correct_index is None:
        return _not_found()

    # A second candidate with the very same features makes the label ambiguous.
    correct_vector = feature_vectors[correct_index]
    identical = sum(1 for vec in feature_vectors if np.array_equal(vec, correct_vector))
    if identical > 1:
        return _not_found()

    # Keep the correct candidate plus the first (max_len - 1) others, in original order.
    # Whenever the correct candidate is already among the first max_len entries this
    # selects exactly those first max_len entries.
    others = [idx for idx in range(len(feature_vectors)) if idx != correct_index]
    kept = sorted(others[:max(max_len - 1, 0)] + [correct_index])
    selected = [feature_vectors[idx] for idx in kept]

    if len(selected) < max_len: # We must pad!
        PAD = np.ones(len(feature_vectors[0]))*-1
        selected.extend(PAD for _ in range(max_len - len(selected)))

    output_vector = np.zeros(len(selected))
    output_vector[kept.index(correct_index)] = 1

    # Shuffle rows and labels with the same permutation.
    indices = np.arange(max_len)
    np.random.shuffle(indices)

    return True, np.array(selected)[indices], output_vector[indices]




def get_dataset(df,rel_sentences=3):
    """Scan every answerable SQUAD question and collect those whose answers are reported missing.

    For each question that is not flagged ``is_impossible`` the function
      1. builds ``QAfeatures.QuestionSense`` for the question,
      2. runs ``get_features`` on the spaCy-parsed paragraph context,
      3. asks ``questionHasAnswer`` whether any gold answer text is found in the context.
    Questions for which step 1 or step 2 raises an ordinary exception are skipped
    silently. Questions for which step 3 returns False are recorded.

    Args:
        df: SQUAD data set structure, indexable as df['data'][i]['paragraphs']
        rel_sentences (int): forwarded to get_features as ``num_rel_sentences``

    Returns:
        error_question (list): question text of every recorded question
        error_answer (list): first gold answer text of every recorded question
        error_context (list): paragraph context of every recorded question
    """
    ######## THIS IS THE SECTION THAT TAKES FOREVER ##########
    error_answer = []
    error_question = []
    error_context = []

    for article in tqdm(df['data']):
        for P in article['paragraphs']:
            for QA in P['qas']:
                if QA['is_impossible']:
                    continue

                # `except Exception` instead of a bare `except:` so that
                # KeyboardInterrupt / SystemExit can still stop this long loop.
                try:
                    QS = QAfeatures.QuestionSense(QA['question'])
                except Exception:
                    continue
                try: # TODO
                    # The returned features are not used here; the call is kept because
                    # questions on which feature extraction fails are skipped.
                    # NOTE(review): the context is re-parsed for every question of the
                    # paragraph; parsing once per paragraph would be faster if
                    # get_features does not modify the Doc - confirm before hoisting.
                    get_features(text=nlp(P['context']),QS=QS,num_rel_sentences=rel_sentences)
                except Exception:
                    continue

                answers = [x['text'] for x in QA['answers']]
                found = questionHasAnswer(answers,P['context'])
                if found==False:
                    error_question.append(QA['question'])
                    error_answer.append(answers[0])
                    error_context.append(P['context'])

    return error_question,error_answer,error_context

In [None]:
# Run the scan with rel_sentences=3; the three lists describe the questions for which questionHasAnswer() returned False.
error_question,error_answer,error_context = get_dataset(df,3)

  0%|          | 0/442 [00:00<?, ?it/s]

  return comp1.similarity(comp2)
  return comp1.similarity(comp2)
  v2 = A_verbParent.similarity(Q_verbParent)
  otherVerbSim = [(t.similarity(Q_verbParent) if \
  otherQverbs = [(A_verbParent.similarity(t) if \


Could not resolve this question as a binary comparison
Could not resolve this question as a binary comparison
Could not resolve this question as a binary comparison


  sim = sentence_nostop.similarity(question_nostop)


Could not resolve this question as a binary comparison
Could not resolve this question as a binary comparison
Could not resolve this question as a binary comparison
Could not resolve this question as a binary comparison
Could not resolve this question as a binary comparison
Could not resolve this question as a binary comparison
Could not resolve this question as a binary comparison
Could not resolve this question as a binary comparison
Could not resolve this question as a binary comparison
Could not resolve this question as a binary comparison
Could not resolve this question as a binary comparison
Could not resolve this question as a binary comparison
Could not resolve this question as a binary comparison
Could not resolve this question as a binary comparison
Could not resolve this question as a binary comparison
Could not resolve this question as a binary comparison
Could not resolve this question as a binary comparison
Could not resolve this question as a binary comparison
Could not 