In [1]:
from tqdm import tqdm

In [2]:
import os
import pandas as pd
#import numpy as np
import spacy


from io import StringIO
from spacy.tokens import Doc
from csv import QUOTE_NONE

CONLL_COLUMN_NAMES = ["idx", "token", "lemma", "upos", "xpos", "morph", "head", "dep", "enhanced", "misc" ]

class CoNLLReader(object):

    def __init__(self, path):
        self.conll_file = open(path, "r")
        self.nlp = spacy.load("en_core_web_sm")


    def __iter__(self):
        return self

    def __next__(self):
        with StringIO() as buffer:
            metadata = {}
            initial = True
            for line in self.conll_file:
                if line == "\n":
                    if initial:
                        continue
                    else:
                        break

                initial = False

                if line[0] == "#":
                    key, val = line[2:].split("=", 1)
                    key = key.strip()
                    val = val.strip()
                    metadata[key] = val

                else:
                    buffer.write(line)

            buffer.seek(0)
            d = pd.read_csv(buffer, sep="\t", names=CONLL_COLUMN_NAMES, quoting=QUOTE_NONE, dtype="str", keep_default_na=False, na_values=[])
            if len(d) < 1:
                raise StopIteration
            d = d.astype({"head": "int32"})

        try:
            doc = Doc(self.nlp.vocab, words=list(d["token"]))
        except:
            print(d.dtypes)
            print(d)
            print(d["token"])
            raise StopIteration

        for token, lemma in zip(doc, d["lemma"]):
            token.lemma_ = lemma


        for token, tag in zip(doc, d["xpos"]):
            token.tag_ = tag

        for token, dep in zip(doc, d["dep"]):
            token.dep_ = dep

        for token, head in zip(doc, d["head"]):
            if head > 0:
                token.head = doc[head-1]

        return doc, metadata

In [29]:
from change_of_state_better import find_change_of_state
from continuation_of_state_better import find_continuation_of_state

In [30]:
from comparatives import check_comparative
from clefts import check_cleft

In [28]:
from embedded_question import check_sentence_for_embedded_question 
from factives import check_sentence_for_factives, check_sentence_for_quote

In [6]:
from lib.conll_reader import CoNLLReader

In [23]:
dataset_path = "../datasets/unanswerable_questions_dev.jsonl"    


In [33]:
import json 
import pandas as pd

In [34]:
qs = pd.read_csv('/Users/nehasrikanth/Documents/question-gen/seed_annotation/round_2/annotator_questions_masterlist.csv')

In [197]:
with open(dataset_path) as f:
    for line in f: 
        text = line.rstrip("\n").split(",")[0]
        if text.split()[0] not in set(['who', 'what', 'when', 'where', 'why', 'how', 'which']):
            continue
            
        nlp_text = nlp(text)
        print(nlp_text)
#         if 'POS' in [t.tag_ for t in nlp_text]:
#             print(text)
        
#         if check_sentence_for_embedded_question(nlp_text)[0]:
#             print(text)

when was the writ watch invented by who
when do new chapters of owari no seraph come out
where does the last name painter come from
what are the challenges faced by the cotton textile industry of ahmedabad
which of the below feature makes the ram not suitable for the persistent storage
what is the life cycle of a centipede
which is the nearest country north of egypt
what is the un declaration on the rights of indigenous peoples
what is selena gomez most popular tv show
when do they announce the next winter olympics
when was each region added to the united states
where is cars 3 supposed to take place
what is the origin of the name dara
who is the national artist for visual who was conferred and died of the same year 1999
where was the movie coming to america shot
what happened to the son in war of the worlds
who played scotty in the new star trek movie
who played the agent in i can only imagine
who played the ice cream man in this is us
who is playing in the western conference finals
w

who was rated number 1 on the 2015 fortune 500's 40 under 40
what term is used to describe inflammation of the lung
when was spruce grove composite high school built
who became the world heavyweight champion in november 2015
when does the thinning new world order come out
when was the big mac released in the uk
who is this doing this synthetic type of alpha beta psychedelic funkin' lyrics
who has a green white and red flag
what cathedrals stand on the bank of the mersey
when did tom brady play in the pro bowl
how much does a pret a manger franchise cost
who is the main character in the book drama
what is meant by a bread line and direct relief
what types of writing did the romantics create
what sports does jamaica play in the olympics
how long is ex on the beach on for
who wrote star of the story by heatwave
who is credited with describing the proper form for greek tragedy
how did the warlord era contribute to the start of the chinese civil war
who has a net worth of 500 million
what i

where does the national security council get their information
how do you get a black belt in jiu jitsu
what was the girl poisoned with in sixth sense
who can be buried at fort indiantown gap
who held political power in the southern colonies
which of the following was a significant innovation of lowell mills
which of the following are si units of young's modulus
where is carbohydrate converted to glucose through the process of digestion
which element is not a core component of the iso 27002 standard
what does canada and us have in common
which of the following events led most directly to the end of world war ll in europe
who covered stuck in the middle with you
who plays jekyll in once upon a time
who sings the song how do you like your love
which two choices describe the purpose of the hsi filter
who was the king of england at the time the house of the seven gables was built
what is the name of the oldest part of the university of paris
who plays sheldon's mother on the big bang theor

who fought in the french and indin war
what percentage of eagle scouts join the military
what is jvm and explain me the java memory allocation
when was the last time the jacksonville jaguars were in the superbowl
who read the file stored on web server
who did dr armstrong kill in and then there were none
how did india's nationalist movement change over time
who is running for us senate in arizona
how layers of sound occur in a piece of music
why are arizona new mexico oklahoma and texas a region
what rights did everyone have according to the declaration of the rights of man
when does the new series of scorpion start
when is project mc2 season 7 coming out on netflix
who played the grandmother in while you were sleeping
what powers did the federal government gained under the constitution
who am i (whats my name)
what channel is cbs on cable in arizona
what was the last hurricane to hit washington dc
when did baseball become america's national pastime
how long is the new star waes movie


where was the pirates of the caribbean filmed
who do the los angeles rams play sunday
what does the color federal blue look like
what harry potter movie came out in 2008
who were the leaders in the american revolution
what age does high school start in australia
what is the difference between webdings and wingdings
who am i what am i doing here
how long will it take me to get to portland oregon
what was the elevation of the land where coronado crossed the rio grande
when was a cure for bubonic plague found
who was vice president and president without being elected by the electoral college
when do liquor stores close on sunday in minnesota
what were the battles of the war of 1812
which of the following is a cluster b personality disorder in the dsm-5
who founded each of the new england colonies
what factors contributed to allied success in africa and italy during the second world war
what is the difference between nagar palika and nagar nigam
when does first quarter end in middle school

who maintained law and order throughout japan and negotiated peace
who is the guy that does the capital one commercial
who were the jesuits and what role did they play in the new world in the 16th and 17th centuries
what is the difference between italian beef and french dip
what are the goals of the us chamber of commerce
what did the pretty little liars do to sarah
what is the primary subsistence crop in latin america
when is ori and the will of the wisps coming out
where does the saying kick the bucket come from
how did arts change as a result of the enlightenment
where do you go when the stars go blue
what happened to claire on days of our lives
how hot are coals in a wood fire
what jobs can you get with a police caution
where did the phrase not my first rodeo come from
how did the estates general illustrate the inequality of the french government
how many crackers can you eat in 1 minute
when do annie and liam start hanging out
where has gold been found in the united states
what ar

who played the doctor in band of brothers
who used to be on good morning america
where is the world series being held at this year
when does stefan turn his humanity back on in season 8
what is the stock symbol for mars candy
what were two consequences of the french colonization of haiti
when do other cultures celebrate the new year
which of the following was not one of the roman senates powers in the early days
why did european countries give up their colonies in southeast asia
what does the song i built a friend mean
when was english ivy introduced to the us
when was the last time easter was on april 17
who has become the first woman chairman of indian bank association
which statement provides the most accurate description of u.s foreign policy in the 1800s
who was considered one of the greatest sailors during the history of early china
how far is skipper's virginia from here
what is the next comet to pass earth
what kind of vw jetta do i have
when do sandhill cranes lay eggs in flor

when did the calendar change from 360 days to 365 days
what were the 3 major native american civilizations in north america
which of the following areas did not have a history of serious colonial conflicts after world war ii
what is the legal age for a job
when was the last time costa rica had a hurricane
who plays alex's dad on grey's anatomy
what kind of clothes did the maya wear
what did australia provide for the british empire
who was the oldest man ever to have been appointed chief justice
what does it mean when find my iphone is off
who is generally regarded as the father of american psychology
who had participated in telangana movement during 2009
how many games did elise stacy play for the hockeyroos
when a cork is added to a glass of water
how many recursive calls can a recursive method have
who said the only thing certain in life is death and taxes
which of these is not an external force that affects business cycles
what song does sam sing in ted 2
who is the highest paid foo

In [156]:
pos = nlp("mary's wanting to go out was invalid")
for i in pos:
    print("%s | POS: %s | Head: %s | Subtree: %s" % (i, i.tag_, i.head, list(i.subtree)))
#     if i.tag_ == 'POS':
#         h = i.head
#         m = list(h.head.subtree)
#         tags = [t.tag_ for t in m]
#         print(m[:tags.index('POS')], m[tags.index('POS') + 1: ])

mary | POS: NNP | Head: wanting | Subtree: [mary]
's | POS: POS | Head: wanting | Subtree: ['s]
wanting | POS: VBG | Head: was | Subtree: [mary, 's, wanting, to, go, out]
to | POS: TO | Head: go | Subtree: [to]
go | POS: VB | Head: wanting | Subtree: [to, go, out]
out | POS: RP | Head: go | Subtree: [out]
was | POS: VBD | Head: was | Subtree: [mary, 's, wanting, to, go, out, was, invalid]
invalid | POS: JJ | Head: was | Subtree: [invalid]


In [181]:
from spacy import displacy


displacy.serve(nlp("where do harry potter’s aunt and uncle live"), style="dep")




OSError: [Errno 48] Address already in use

In [None]:
s = "it was rupak that ate the cookies"

In [None]:
import numpy as np 

In [None]:
s1 = "it was neha that ate the cookies"

In [None]:
check_cleft(nlp(s1)) 

In [None]:
tokens = list(nlp(s1))
#print(tokens)
tokens_str = [str(token) for token in tokens]
#print(tokens_str)
cleft_word = np.intersect1d(tokens_str, ["it"])
#print(cleft_word)
index = tokens_str.index(cleft_word)

cop = tokens[index + 1]
obj = None 
for child in cop.children: 
    #print(child, child.dep_ )
    
    if child.dep_ in ['attr', 'dobj']: 
        obj = child
        break 

In [None]:
list(obj.children)  

In [None]:
check_cleft(nlp(s1) ) 

In [None]:
comp_sent = "Clifford is a bigger dog than Cujo."  # should evaluate to true



In [None]:
# test sentences
comp_sent = "Is Clifford a bigger dog than Cujo?"  # should evaluate to true
comp_sent2 = "Clifford is a more giant dog than Cujo."  # should evaluate to true
non_comp_sent = "Clifford is a bigger dog than I thought,"  # should evaluate to false
other_comp = "Clifford is bigger of a dog than Cujo."  # should evaluate to true
other_comp2 = "Clifford is bigger a dog than Cujo."  # should evaluate to true, currently doesn't

# displacy.serve(nlp(other_comp2),style='dep')


In [None]:
check_comparative(nlp(comp_sent))

In [None]:
check_sentence_for_embedded_question(nlp("I fail to see how you can rationalize rewarding illegality.")) 

In [None]:
s10 = "a microsecond later, images from his exterior sensors didn't snap into focus"

find_change_of_state(nlp(s10))

In [None]:
s9 = "Lisa stopped petting Tom's cat"

q = "both kids smiled"
q1 = "all three cat owners that julia spoke to want another cat"
q2 = "both of our fathers were thin"

In [None]:
q3 = "do you see any problem at all because folks on both sides of the aisle have tremendous respect from him?"

In [None]:
s4 = "meaning behind the song whiter shade of pale"

In [None]:
find_change_of_state(nlp(s9))

In [None]:
find_continuation_of_state(nlp(s9))

In [158]:
def find_numeric_determiners(sentence):

    numeric_predeterminers = set(["all"])
    special_determiners = set(["both"])
    all_determiners = numeric_predeterminers | special_determiners


    # extract words as list of strings
    words = [t.text.lower() for t in sentence]
    
    #print(words)
    if set(words) & all_determiners:
        for token in sentence:
            include = False
            if token.text.lower() in special_determiners:
                if token.tag_ == "DT" and token.dep_ == "det":
                    include = True
                    return True  
                    
                if not include:
                    for child in token.children:
                        if child.dep_ == "prep" and child.text.lower() == "of":
                            include = True
                            return True 


            if not include and token.text.lower() in numeric_predeterminers:
                        
                if token.tag_ == "DT" and token.dep_ == "det":
                    head_token = token.head

                    if head_token.tag_ == "CD":
                        include = True # all three of the children
                        return True  
                        
                    else :
                        
                        for child in head_token.children:
                            ##print("\t", child, child.dep_, child.tag_)
                            if child.tag_ == "CD" and child.dep_ == "nummod":  ## changed token.dep_ to child.dep_ 
                                #print("here")
                                ##print(sentence)
                                include = True # all three children
                                #print(include)
                                return True 
                
                elif token.tag_ == "DT":
                    for child in token.children:
                        if child.dep_ == "prep" and child.text.lower() == "of":
                            for child_child in child.children:
                                if child_child.dep_ == "pobj":
                                    has_det = False
                                    has_nummod = False
                                    for child_child_child in child_child.children:
                                        if (child_child_child.tag_ == "CD"
                                            and child_child_child.dep_ == "nummod"):
                                            has_nummod = True # all of the three children
                                        if (child_child_child.tag_ == "DT"
                                            and child_child_child.dep_ == "det"):
                                            has_det = True # all of the three children

                                    include = has_det and has_nummod
                                    if include:
                                        #print(sentence)
                                        return True 
                            if include:
                                return True 


    return False




In [160]:
find_numeric_determiners(nlp("all three deny the allegation."))

True

In [22]:
s5 = "both channels is administered by the u.s. department of justice"

In [66]:
from re_verbs_better import find_re_verbs 

In [95]:
re_verbs_corpus = "wordlists/re_verbs_updated.txt"

def find_re_verbs(sentence, cos_predicate_path = re_verbs_corpus):

    with open(cos_predicate_path, 'r') as f:
        re_verbs = set([l.strip() for l in f.readlines()])

    # extract words as list of strings
    words = [t.text for t in sentence]
    
    if set(words) & re_verbs:
        
        for token in sentence:
            #print(token, token.tag_, token.dep_)
            if (str(token) in re_verbs
                and token.tag_[0] == "V"
                and token.dep_ == "ROOT"):  # why?? 
                
                if len(list(token.children))  > 0: 
                    return True 
                    
    return False 

 

In [98]:
s6 = "holly decided to reconnect the charger?"

In [99]:
find_re_verbs(nlp(s6))

False

In [56]:
with open(re_verbs_corpus, 'r') as f:
        re_verbs = set([l.strip() for l in f.readlines()])

In [59]:
"reenter" in re_verbs

False

In [100]:
from re_verbs_better import find_re_verbs

In [104]:
from spacy import displacy
from spacy.matcher import PhraseMatcher

implicative_path = "wordlists/implicative_predicates.txt"

def find_implicatives(sentence, impl_pred_path=implicative_path):
    
    nlp = spacy.load("en_core_web_sm")
    matcher = PhraseMatcher(nlp.vocab)
    
    with open(impl_pred_path, 'r') as f:
        implicatives = [l.strip() for l in f]
        implicative_pairs = [item.split(":") for item in implicatives] #seperate predicates and inferences
        implicative_predicates = [pair[0] for pair in implicative_pairs]
        implicative_inferences_dict = {pair[0]:pair[1] for pair in implicative_pairs} #dictionary mapping predicates to inferences
        implicative_verbs = set([p.split()[0] for p in implicative_predicates])
        # add implicative predicates to matcher
        patterns = [nlp.make_doc(text) for text in implicative_predicates]
        matcher.add("IMPL_PRED_LIST", None, *patterns)
        

    #words = [t.text for t in sentence]
    # find matches of implicative predicates in the sentence
    matches = matcher(sentence)

    if len(matches) > 0:
        for match_id, start, end in matches:
            impl_phrase = sentence[start:end]
            #print(str(impl_phrase))
        # match found, proceed to further checking
        for token in sentence:
            if (str(token) in implicative_verbs
                and token.tag_[0] == "V"
                and token.dep_ == "ROOT"):
                #include = False
                for child in token.children:
                    if child.dep_ == "prep":
                        for childs in child.children:
                            if childs.dep_ == "pcomp" and childs.pos_ == "VERB":
                                return True 
                                #include = True
                                #break
                    else:
                        if child.dep_ == "xcomp" and child.pos_ == "VERB":
                            return True 
                            #include = True
                            #break
    return False 

In [107]:
s = "survivors managed to scramble out through the tiny gap in the rocks"
s1 = "survivors scrambled through the tiny gaps on the rocks"

In [109]:
from implicative_verbs_better import find_implicatives

In [16]:
temporal_prepositions = ['before', 'after', 'while', 'since', 'because']
accepted_head_tags = {
    'VBG': 'gerund',
    'VBN': 'past-participle',
    'VBD': 'past',
    'VBP': 'non-3sg-present',
    'VBZ': '3sg-present',
    'VB': 'base',
}

def find_temporal_adverbs(sentence):
    words = [t.text for t in sentence]
    preps_in_sentence = [word for word in sentence if word.lemma_ in temporal_prepositions]
    print(preps_in_sentence)
    if len(preps_in_sentence) > 0:
        for prep in preps_in_sentence:
            prep_children = [child for child in prep.children]
            # get the tag(s) of the immediate child
            prep_tags = [child.tag_ for child in prep_children]
            # check if tag is accepted (must be a verbal category)
            accepted_prep_tags = list(set(prep_tags) & set(accepted_head_tags.keys()))
            if len(prep_children) > 0 and len(accepted_prep_tags) > 0:
                trigger_specific_fields = {
                    "preposition": prep.text,
                    "embedded_clause_head_tag": prep_tags,
                }
                return trigger_specific_fields
    return None




In [17]:
find_temporal_adverbs(
    nlp("he took them to the NL Championship Series last year before being swept by the Atlanta Braves.")
)

[before]


{'preposition': 'before', 'embedded_clause_head_tag': ['VBN']}

In [40]:
check_sentence_for_embedded_question(nlp("Who is the current monarch of France?"))

(False, None, None)

In [39]:
check_sentence_for_embedded_question(nlp("who played scotty in the new star trek movie"))

(False, None, None)

In [42]:
from glob import glob

In [43]:
csvs = glob("annotation/round2/*")

In [45]:
import pandas as pd
PRESUPPOSITIONAL_WH_WORDS = ["why", "how", "where", "when", "who", "what", "which"]

In [None]:
import nump

In [60]:
for file in csvs: 
    df = pd.read_csv(file)
    for index, row in df.iterrows(): 
        beginning_token = row['sentence'].lower().split()[0]
        if beginning_token in PRESUPPOSITIONAL_WH_WORDS and not pd.isnull(row['presupposition']): 
            print(row['trigger'], '|', row['sentence'], '|', row['presupposition'], '|', row['trigger_data'],  "\n")
            
            
        

comparatives | When it comes to mental toughness , there is no better team than this one . | this one is a team with mental toughness. | {'adjective': 'better', 'noun': 'team'} 

re_verbs | When AngloGold extracts the last ounces of gold from the mountain it has destroyed to make money , geophysical forces will recreate it , shoving more rocks to the surface . | The mountain was created once before | {'predicate': 'recreate'} 

continuation_of_state | How soon after the visit to ABC News did the child begin to show some symptoms , and what were the symptoms exactly ? | The child didn't show any symptoms before the visit to ABC News. | {'predicate': 'begin'} 

factives | What he 's doing is totally and completely normal : He 's discovering his body , and discovering that rubbing his penis feels good . | Rubbing his penis feels good. | {'factive': 'discovering', 'embedded_clause': "What he 's doing is totally and completely normal"} 

change_of_state | When she 'd settled her cousin in t

In [61]:
x = nlp("Which means , throw something up and you know exactly where it will fall to moon again.")

In [62]:
check_sentence_for_embedded_question(x)

(True, know, 'exactly where it will fall to moon again')

In [58]:
??check_sentence_for_embedded_question

In [64]:
for file in csvs: 
    df = pd.read_csv(file)
    for index, row in df.iterrows(): 
        beginning_token = row['sentence'].lower().split()[0]
        if beginning_token in PRESUPPOSITIONAL_WH_WORDS and row['sentence'][-1] == '?': 
            print(row['trigger'], '|', row['sentence'], '|', row['presupposition'], '|', row['trigger_data'],  "\n")
            
            
        

implicative_predicates | Why , then , would anyone bother to try to expand the genetic alphabet ? | nan | {'predicate': 'bother', 'inference': 'the action requires small effforts'} 

factives | How can you tell whether you 're paying a fair price or getting what you 're paying for ? | nan | {'factive': 'tell', 'embedded_clause': "whether you 're paying a fair price or getting what you 're paying for"} 

change_of_state | Who could stay underwater like Toci , when bearded CaUigarichichichich attempted to drown him with ten , twenty dunkings , one after the other ? | nan | {'predicate': 'stay'} 

continuation_of_state | How soon after the visit to ABC News did the child begin to show some symptoms , and what were the symptoms exactly ? | The child didn't show any symptoms before the visit to ABC News. | {'predicate': 'begin'} 

implicative_predicates | How has he managed to rise so high while maintaining the passion and commitment that distinguished him as chairman of the Student Nonviol

In [208]:
for i in nlp("which philosopher advocated the idea of return to nature"):
    print(i, i.tag_, i.head)

how WRB orchestra
did VBD orchestra
orchestra VB orchestra
change NN orchestra
in IN change
the DT period
romantic JJ period
period NN in


In [209]:
displacy.serve( nlp("which philosopher advocated the idea of return to nature"))

OSError: [Errno 48] Address already in use

In [178]:
check_comparative(nlp('my pyramid is a larger pyramid than the one in giza'))

(True, 'larger', 'pyramid')

In [236]:
for t in nlp("I would have been happier if I had a dog"):
    #print(t, t.dep_, t.tag_)
    
    if t.tag_ == 'IN' and str(t).lower() == 'if':
        for tok in t.head.subtree:
            print(tok.tag_, tok.lemma_)


IN if
PRP I
VBD have
DT a
NN dog


In [223]:
for t in nlp("when was the jury system abolished in india"):
    print(t, t.dep_, t.tag_)
    print()


when advmod WRB

was aux VBD

the det DT

jury compound NN

system nsubj NN

abolished ROOT VBN

in prep IN

india pobj NNP

