In [1]:
from tqdm import tqdm

In [2]:
import os
import pandas as pd
#import numpy as np
import spacy


from io import StringIO
from spacy.tokens import Doc
from csv import QUOTE_NONE

CONLL_COLUMN_NAMES = ["idx", "token", "lemma", "upos", "xpos", "morph", "head", "dep", "enhanced", "misc" ]

class CoNLLReader(object):
    """Iterate over a CoNLL-style file one sentence at a time.

    Sentences are separated by blank lines. Every iteration step returns a
    ``(doc, metadata)`` pair:

    * ``doc`` is a spaCy ``Doc`` built from the ``token`` column, with
      ``lemma_``, ``tag_`` (from ``xpos``), ``dep_`` and ``head`` filled in
      from the corresponding columns.
    * ``metadata`` is a dict built from the ``# key = value`` comment lines
      that precede the sentence.

    The reader owns the underlying file handle. It is closed automatically
    when iteration ends, and the reader can also be used as a context
    manager or closed explicitly with :meth:`close`.
    """

    def __init__(self, path):
        """Open ``path`` for reading and load the spaCy model whose vocab backs the docs."""
        # CoNLL-U files are UTF-8 by specification; do not depend on the locale default.
        self.conll_file = open(path, "r", encoding="utf-8")
        self.nlp = spacy.load("en_core_web_sm")

    def close(self):
        """Close the underlying file; safe to call more than once."""
        self.conll_file.close()

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self.close()

    def __iter__(self):
        return self

    def __next__(self):
        """Read the next sentence block and return ``(doc, metadata)``.

        Raises:
            StopIteration: when no further token rows are available, or when
                the spaCy ``Doc`` cannot be built from the rows just read.
        """
        # Once exhausted, keep signalling the end instead of touching a closed file.
        if self.conll_file.closed:
            raise StopIteration

        with StringIO() as buffer:
            metadata = {}
            initial = True
            has_rows = False
            for line in self.conll_file:
                if line == "\n":
                    if initial:
                        # Skip blank lines in front of the sentence.
                        continue
                    else:
                        # A blank line after content terminates the sentence.
                        break

                initial = False

                if line[0] == "#":
                    # "# key = value" metadata. Comment lines without "=" are
                    # ignored rather than crashing the tuple unpacking.
                    key, sep, val = line[1:].partition("=")
                    if sep:
                        metadata[key.strip()] = val.strip()
                else:
                    buffer.write(line)
                    has_rows = True

            if not has_rows:
                # Nothing left to parse: release the handle and stop.
                self.close()
                raise StopIteration

            buffer.seek(0)
            d = pd.read_csv(buffer, sep="\t", names=CONLL_COLUMN_NAMES, quoting=QUOTE_NONE, dtype="str", keep_default_na=False, na_values=[])
            if len(d) < 1:
                self.close()
                raise StopIteration
            d = d.astype({"head": "int32"})

        try:
            doc = Doc(self.nlp.vocab, words=list(d["token"]))
        except Exception as err:
            # Dump the offending rows for debugging, then end iteration as before.
            # Only Exception is caught so Ctrl-C / SystemExit still propagate.
            print(d.dtypes)
            print(d)
            print(d["token"])
            self.close()
            raise StopIteration from err

        for token, lemma in zip(doc, d["lemma"]):
            token.lemma_ = lemma

        for token, tag in zip(doc, d["xpos"]):
            token.tag_ = tag

        for token, dep in zip(doc, d["dep"]):
            token.dep_ = dep

        for token, head in zip(doc, d["head"]):
            # The head column is 1-based; values <= 0 leave the token's head untouched.
            if head > 0:
                token.head = doc[head-1]

        return doc, metadata

In [2]:
from change_of_state_better import find_change_of_state
from continuation_of_state_better import find_continuation_of_state

In [3]:
from comparatives import check_comparative
from clefts import check_cleft

In [4]:
from embedded_question import check_sentence_for_embedded_question 
from factives import check_sentence_for_factives, check_sentence_for_quote

In [6]:
from lib.conll_reader import CoNLLReader

In [7]:
dataset_path = "../../datasets/unanswerable_questions_dev.jsonl"    


In [5]:
import spacy
nlp = spacy.load("en_core_web_sm")

In [7]:
import json 
import pandas as pd

In [11]:
# Annotator question master list; the `question` column is iterated in the next cell.
# NOTE(review): absolute path into one user's home directory - this cell only
# runs on that machine; consider a path relative to a configurable data dir.
qs = pd.read_csv('/Users/nehasrikanth/Documents/question-gen/seed_annotation/round_2/annotator_questions_masterlist.csv')

In [16]:
# Smoke test: run the embedded-question detector on the first question only
# (the unconditional `break` stops after one iteration).
for q in qs.question:
    spacy_q = nlp(q)

    # Reuse the parse from above instead of running the pipeline a second time.
    # NOTE(review): the commented-out code in the dataset loop further down
    # indexes this detector's result with [0]; if it returns a tuple, this bare
    # truth test passes for every question - confirm the return type.
    if check_sentence_for_embedded_question(spacy_q):
        print(q)
    break
         

When do babies go from babies to toddlers?


In [82]:
# Scan the dataset line by line and print every text for which the currently
# enabled detector fires. The commented-out blocks are alternative detectors
# that can be swapped in.
# NOTE(review): `find_re_verbs` is only defined/imported in later cells of this
# notebook, so this cell fails on a fresh kernel run from top to bottom.
with open(dataset_path) as f:
    for line in f: 
        # Only the part of the line before the first comma is analysed.
        # NOTE(review): the path ends in .jsonl but the line is comma-split,
        # not JSON-decoded - confirm the actual file format.
        text = line.rstrip("\n").split(",")[0]
        nlp_text = nlp(text)
        
#         if check_sentence_for_embedded_question(nlp(text))[0]: 
#             print(text)
         
#         if check_sentence_for_factives(nlp_text)[0]: 
#             print(text)
            
#         if check_cleft(nlp_text): 
#             print(text)
     
#         if check_comparative(nlp_text)[0]: 
#             print(text)
            
    
        if find_re_verbs(nlp_text): 
            print(text)

who warned that a president who could be reelected again and again may easily become a king


In [None]:
s = "it was rupak that ate the cookies"

In [None]:
import numpy as np 

In [None]:
s1 = "it was neha that ate the cookies"

In [None]:
check_cleft(nlp(s1)) 

In [None]:
# Manual walk-through of a cleft check on `s1`: locate "it", take the token
# right after it, and look for an attr/dobj child of that token.
tokens = list(nlp(s1))
#print(tokens)
tokens_str = [str(token) for token in tokens]
#print(tokens_str)
# intersect1d returns an array of the values present in both inputs.
cleft_word = np.intersect1d(tokens_str, ["it"])
#print(cleft_word)
# NOTE(review): list.index() is handed a numpy array here; this appears to rely
# on the array holding exactly one element (fails if "it" is absent) - confirm.
index = tokens_str.index(cleft_word)

# Token immediately following "it"; the name suggests it is expected to be the copula.
cop = tokens[index + 1]
obj = None 
for child in cop.children: 
    #print(child, child.dep_ )
    
    # Keep the first child attached as 'attr' or 'dobj'; obj stays None otherwise.
    if child.dep_ in ['attr', 'dobj']: 
        obj = child
        break 

In [None]:
list(obj.children)  

In [None]:
check_cleft(nlp(s1) ) 

In [None]:
comp_sent = "Clifford is a bigger dog than Cujo."  # should evaluate to true



In [None]:
# test sentences
comp_sent = "Is Clifford a bigger dog than Cujo?"  # should evaluate to true
comp_sent2 = "Clifford is a more giant dog than Cujo."  # should evaluate to true
non_comp_sent = "Clifford is a bigger dog than I thought,"  # should evaluate to false
other_comp = "Clifford is bigger of a dog than Cujo."  # should evaluate to true
other_comp2 = "Clifford is bigger a dog than Cujo."  # should evaluate to true, currently doesn't

# displacy.serve(nlp(other_comp2),style='dep')


In [None]:
check_comparative(nlp(comp_sent))

In [None]:
check_sentence_for_embedded_question(nlp("I fail to see how you can rationalize rewarding illegality.")) 

In [None]:
s10 = "a microsecond later, images from his exterior sensors didn't snap into focus"

find_change_of_state(nlp(s10))

In [None]:
s9 = "Lisa stopped petting Tom's cat"

q = "both kids smiled"
q1 = "all three cat owners that julia spoke to want another cat"
q2 = "both of our fathers were thin"

In [None]:
q3 = "do you see any problem at all because folks on both sides of the aisle have tremendous respect from him?"

In [None]:
s4 = "meaning behind the song whiter shade of pale"

In [None]:
find_change_of_state(nlp(s9))

In [None]:
find_continuation_of_state(nlp(s9))

In [21]:
def find_numeric_determiners(sentence):
    """Report whether ``sentence`` uses "both" or "all" as a numeric determiner.

    ``sentence`` is an iterable of parsed tokens exposing ``text``, ``tag_``,
    ``dep_``, ``head`` and ``children``. Matching is case-insensitive.

    Returns True for any of these configurations, otherwise False:

    * "both" tagged ``DT`` with dependency ``det``, or "both" with an "of"
      preposition child;
    * "all" as ``DT``/``det`` whose head is a numeral (``CD``) or whose head
      has a ``CD``/``nummod`` child ("all three children");
    * "all" tagged ``DT`` (but not ``det``) with an "of" preposition whose
      ``pobj`` carries both a ``DT``/``det`` child and a ``CD``/``nummod``
      child ("all of the three children").
    """

    def is_plain_determiner(tok):
        return tok.tag_ == "DT" and tok.dep_ == "det"

    def is_numeral_modifier(tok):
        return tok.tag_ == "CD" and tok.dep_ == "nummod"

    def of_phrases(tok):
        # Preposition children whose surface form is "of".
        return [kid for kid in tok.children
                if kid.dep_ == "prep" and kid.text.lower() == "of"]

    for token in sentence:
        lowered = token.text.lower()

        if lowered == "both":
            if is_plain_determiner(token) or of_phrases(token):
                return True
            continue

        # From here on only "all" tagged as a determiner is of interest.
        if lowered != "all" or token.tag_ != "DT":
            continue

        if token.dep_ == "det":
            governor = token.head
            if governor.tag_ == "CD":
                return True
            if any(is_numeral_modifier(kid) for kid in governor.children):
                return True
        else:
            for of_prep in of_phrases(token):
                for obj in of_prep.children:
                    if obj.dep_ != "pobj":
                        continue
                    modifiers = list(obj.children)
                    if (any(is_plain_determiner(m) for m in modifiers)
                            and any(is_numeral_modifier(m) for m in modifiers)):
                        return True

    return False




In [23]:
# NOTE: `s5` is assigned in the following cell, which was executed first
# (In [22] before In [23]); on a fresh top-to-bottom run this raises NameError.
find_numeric_determiners(nlp(s5))

True

In [22]:
s5 = "both channels is administered by the u.s. department of justice"

In [32]:
#from re_verbs_better import find_re_verbs 

In [95]:
re_verbs_corpus = "wordlists/re_verbs_updated.txt"

def find_re_verbs(sentence, cos_predicate_path = re_verbs_corpus):
    """Return True when the root verb of ``sentence`` is a listed "re-" verb.

    Args:
        sentence: iterable of parsed tokens exposing ``tag_``, ``dep_`` and
            ``children``; ``str(token)`` must give the surface form.
        cos_predicate_path: text file with one verb per line. It is re-read on
            every call, so edits to the list take effect immediately.

    Returns:
        True if some token is in the list, has a tag starting with "V", is
        the ``ROOT`` of the parse and has at least one child; False otherwise.

    Only the ROOT token is considered, so a listed verb in an embedded clause
    is not matched - presumably why "holly decided to reconnect the charger?"
    yields False in this notebook.
    """
    with open(cos_predicate_path, 'r') as f:
        # Blank lines in the word list are not verbs; drop them.
        re_verbs = {entry.strip() for entry in f if entry.strip()}

    for token in sentence:
        # startswith() instead of tag_[0]: untagged tokens have an empty tag_
        # and would raise IndexError when indexed.
        if (str(token) in re_verbs
                and token.tag_.startswith("V")
                and token.dep_ == "ROOT"
                and any(True for _ in token.children)):
            return True

    return False

 

In [98]:
s6 = "holly decided to reconnect the charger?"

In [99]:
find_re_verbs(nlp(s6))

False

In [56]:
# Load the re-verb word list into a set for ad-hoc membership checks below.
with open(re_verbs_corpus, 'r') as f:
    re_verbs = {entry.strip() for entry in f}

In [59]:
"reenter" in re_verbs

False

In [100]:
from re_verbs_better import find_re_verbs

In [104]:
from functools import lru_cache

from spacy import displacy
from spacy.matcher import PhraseMatcher

implicative_path = "wordlists/implicative_predicates.txt"


@lru_cache(maxsize=1)
def _implicative_pipeline():
    """Load the spaCy model once and reuse it across calls."""
    return spacy.load("en_core_web_sm")


def find_implicatives(sentence, impl_pred_path=implicative_path):
    """Return True when ``sentence`` contains an implicative construction.

    Args:
        sentence: a parsed spaCy ``Doc``.
        impl_pred_path: text file with one ``predicate:inference`` entry per
            line. Only the predicate part is used here. The file is re-read on
            every call.

    Returns:
        True if a listed predicate phrase occurs in the sentence AND the ROOT
        token is a verb that is the first word of some listed predicate and
        governs either a verbal ``xcomp`` or a preposition with a verbal
        ``pcomp``; False otherwise.
    """
    # Reuse one loaded pipeline; loading the model on every call is slow.
    nlp = _implicative_pipeline()
    matcher = PhraseMatcher(nlp.vocab)

    with open(impl_pred_path, 'r') as f:
        implicative_predicates = []
        for entry in f:
            # Keep the text before the first ":"; entries without a ":" are
            # treated as a bare predicate, and blank entries are skipped
            # (both used to raise IndexError).
            predicate = entry.strip().split(":")[0]
            if predicate.strip():
                implicative_predicates.append(predicate)

    # First word of every predicate, e.g. "manage" for "manage to".
    implicative_verbs = set([p.split()[0] for p in implicative_predicates])

    # add implicative predicates to matcher
    # NOTE(review): this is the spaCy v2 call signature; spaCy v3 expects
    # matcher.add("IMPL_PRED_LIST", patterns) - confirm the installed version.
    patterns = [nlp.make_doc(text) for text in implicative_predicates]
    matcher.add("IMPL_PRED_LIST", None, *patterns)

    # No listed predicate phrase in the sentence: nothing further to check.
    if len(matcher(sentence)) == 0:
        return False

    # match found, proceed to further checking
    for token in sentence:
        # startswith() instead of tag_[0] so untagged tokens cannot raise IndexError.
        if (str(token) in implicative_verbs
                and token.tag_.startswith("V")
                and token.dep_ == "ROOT"):
            for child in token.children:
                if child.dep_ == "prep":
                    for childs in child.children:
                        if childs.dep_ == "pcomp" and childs.pos_ == "VERB":
                            return True
                elif child.dep_ == "xcomp" and child.pos_ == "VERB":
                    return True
    return False

In [107]:
s = "survivors managed to scramble out through the tiny gap in the rocks"
s1 = "survivors scrambled through the tiny gaps on the rocks"

In [109]:
from implicative_verbs_better import find_implicatives