In [1]:
'''This is strictly for the 100 percent data set questions
Reason: The LAL-parser crashes at around 39% when tagging the questions on the whole train set when the questions have been initialized with a '?' symbol at the end. To fix this, we remove the '?' symbol from the questions and then tag them. Afterwards, we add the 'punct' token to the dependency labels and the index of the root node to the dependency heads'''
import json

'''First read the Questions'''
with open('../embeddings/questions/100/questions.txt', 'r') as f:
    questions = f.readlines()
    questions = [q.strip() for q in questions] # remove the newline character

# show the first 5 questions
questions[:5]

['what color is the middle vase?',
 'are there people going into the buildings?',
 'is the person walking on a sidewalk?',
 'what is wallpaper like?',
 'does the fluid cover the bananas?']

In [2]:
'''Here we read the Dependency Heads'''
from pathlib import Path
import ast

with open('../embeddings/questions/100/dependency_heads.txt', 'r') as f:
    dep_head_list = f.readlines()
    dep_head_list = [element.strip() for element in dep_head_list]
    dep_head_list = [ast.literal_eval(element) for element in dep_head_list]

dep_head_list[:5]


[[2, 6, 6, 6, 6, 0],
 [4, 4, 4, 0, 4, 7, 5],
 [4, 3, 4, 0, 4, 7, 5],
 [0, 4, 4, 1],
 [4, 3, 4, 0, 6, 4]]

In [3]:
'''Similarly, we read the Dependency Labels'''
with open('../embeddings/questions/100/dependency_labels.txt', 'r') as f:
    dep_label_list = f.readlines()
    dep_label_list = [element.strip() for element in dep_label_list]
    dep_label_list = [ast.literal_eval(element) for element in dep_label_list]
    # add the element 'punct' to the each list of dependency labels
    dep_label_list = [dep_label + ['punct'] for dep_label in dep_label_list]

dep_label_list[:5]

[['det', 'dep', 'cop', 'det', 'amod', 'nsubj', 'punct'],
 ['aux', 'nsubj', 'nsubj', 'root', 'prep', 'det', 'pobj', 'punct'],
 ['aux', 'det', 'nsubj', 'root', 'prep', 'det', 'pobj', 'punct'],
 ['dep', 'cop', 'nsubj', 'prep', 'punct'],
 ['aux', 'det', 'nsubj', 'root', 'det', 'dobj', 'punct']]

In [4]:
import stanza
from tqdm import tqdm

# initialize the stanza pipeline
nlp = stanza.Pipeline(lang='en', processors='tokenize, pos, lemma, depparse', verbose=False, use_gpu=True, pos_batch_size=10, lemma_batch_size=10, depparse_batch_size=10)

'''Find the index of the root of a sentence using the Stanza parser'''
def find_root_index(sentence):
    doc = nlp(sentence)
    for sent in doc.sentences:
        for word in sent.words:
            if word.deprel == 'root':
                return word.id


# add the root index to the dependency heads
dep_head_list = [dep_head + [find_root_index(question)]
                 for dep_head, question in zip(tqdm(dep_head_list), questions)]
dep_head_list[:5]


100%|██████████| 376082/376082 [3:28:43<00:00, 30.03it/s]  


[[2, 6, 6, 6, 6, 0, 2],
 [4, 4, 4, 0, 4, 7, 5, 1],
 [4, 3, 4, 0, 4, 7, 5, 4],
 [0, 4, 4, 1, 1],
 [4, 3, 4, 0, 6, 4, 4]]

In [7]:
# save the new dependency heads and the dependency labels to a text file
with open('../embeddings/questions/100/dependency_labels.txt', mode='wt', encoding='utf-8') as myfile:
    for dep_label in dep_label_list:
        myfile.write(str(dep_label) + '\n')
#    myfile.write('\n'.join(dep_label_list))

with open('../embeddings/questions/100/dependency_heads.txt', mode='wt', encoding='utf-8') as myfile:
    for dep_head in dep_head_list:
        myfile.write(str(dep_head) + '\n')
#    myfile.write('\n'.join(dep_head_list))