In [5]:
import nltk
from nltk import sent_tokenize
from get_SAO_en import *
import json


# Prepositions recognised by get_noun_chunks when it joins two noun chunks
# that are separated by exactly one preposition token.
# (The former `global preposition_list` statement was dropped: at module level
# it has no effect, every top-level assignment is already global.)
preposition_list = [
    'about', 'above', 'across', 'after', 'against', 'along', 'among',
    'around', 'at', 'before', 'behind', 'between', 'beyond', 'but', 'by',
    'despite', 'down', 'during', 'except', 'for', 'from', 'in', 'into',
    'like', 'near', 'of', 'off', 'on', 'onto', 'out', 'over', 'past', 'plus',
    'since', 'throughout', 'to', 'towards', 'under', 'until', 'up', 'upon',
    'with', 'within', 'without',
]

def extractTriplets(claims):
    """Return the SAO result of the first sentence in ``claims`` that has one.

    The text is split into sentences with NLTK's ``sent_tokenize`` and each
    sentence is handed to ``get_SAO_en``. The first truthy result is returned
    unchanged; sentences after it are not examined.

    Args:
        claims: Claim text to analyse.

    Returns:
        The first truthy value produced by ``get_SAO_en``, or ``None`` when no
        sentence yields one (including when ``claims`` has no sentences).
        Callers must handle ``None``.
    """
    for sent in sent_tokenize(claims, language="english"):
        saos = get_SAO_en(sent)
        # A truthy result can never be None, so the former nested `!= None`
        # test and its `return []` branch could never run; they were removed.
        if saos:
            return saos
    # Explicit now: the original fell off the end and returned None implicitly.
    return None
            
def get_noun_chunks(claims, model=None):
    """Collect candidate noun-phrase spans from ``claims``.

    Steps, applied to every noun chunk of the parsed text:

    1. Strip leading tokens that are determiners, stopwords (by text or
       lemma) or that contain a comma; drop the chunk if nothing is left.
    2. A one-token chunk is kept when its lemma is not a stopword.
    3. For longer chunks: keep the root token (unless its lemma is a
       stopword), and keep the chunk itself unless it contains
       "plurality of", a numeric token, "e.g.", or is longer than 8 tokens
       and contains a comma.
    4. For chunks longer than 3 tokens: also keep the span running from the
       root's compound children through the root when it differs from the
       chunk, and, if the chunk is longer than 8 tokens and contains a comma,
       the comma-separated pieces of the chunk.

    Finally, for each pair of neighbouring collected entries, if the token
    right after the first entry is in ``preposition_list`` and the second
    entry starts immediately after that token, the merged span covering both
    is appended as well.

    Args:
        claims: Text to parse.
        model: Callable that parses text into a document (a spaCy pipeline).
            Defaults to the module-level ``nlp``, looked up at call time.

    Returns:
        A list of spans of the parsed document, in the order they were
        collected (which is not document order).
    """
    if model is None:
        # Resolve the default at call time: the function can then be defined
        # before `nlp` is assigned and always uses the currently loaded
        # pipeline, exactly as the original body (which read the global) did.
        model = nlp

    res = []
    # The original called the global `nlp` here and silently ignored `model`.
    doc = model(claims)

    for chunk in doc.noun_chunks:
        # remove det and stop words as root
        try:
            while chunk[0].pos_=="DET" or chunk[0].text in stopwords or chunk[0].lemma_ in stopwords or "," in chunk[0].text:
                chunk = chunk[1:]
        except IndexError:
            # Every token was stripped, so chunk[0] failed on an empty span.
            continue

        if len(chunk) == 1:
            if chunk[0].lemma_ not in stopwords:
                res.append(chunk)
        else:
            # Head token of the chunk (a Token here; converted to a Span below).
            if not (chunk.root.lemma_ in stopwords):
                res.append(chunk.root)

            # Keep the whole chunk unless it looks like a quantity, an
            # example list, or a long comma-separated enumeration.
            if not ("plurality of" in chunk.text.lower() or any(t.replace('.','',1).isdigit() for t in chunk.text.split()) or ("e.g." in chunk.text) or (len(chunk) > 8 and "," in chunk.text)): 
                res.append(chunk)

            if len(chunk) > 3:
                # Span from the outermost compound modifier of the root to
                # the root itself.
                children_compound = [child.i for child in chunk.root.children if child.dep_ == "compound"]
                if children_compound:
                    idx = children_compound + [chunk.root.i]
                    span = doc[min(idx): max(idx)+1]

                    if span != chunk:
                        res.append(span)

                # to split some very long noun chunk which includes "," in it
                if len(chunk) > 8 and "," in chunk.text:

                    start_index = chunk[0].i
                    curr_index = start_index + 1
                    while curr_index < chunk[-1].i:
                        if doc[curr_index].text == ",":
                            # Emit the piece before the comma and restart
                            # right after it.
                            res.append(doc[start_index: curr_index])
                            start_index = curr_index + 1
                            curr_index = start_index + 1
                        else:
                            curr_index += 1
                    # Trailing piece after the last comma.
                    res.append(doc[start_index:chunk[-1].i+1])

    # convert all tokens into span type        
    res = [(doc[term.i: term.i+1] if type(term)==type(doc[0]) else term) for term in res]

    # Join neighbouring entries that are separated by a single preposition.
    # The range is fixed before the loop, so spans appended here are not
    # themselves considered for further joining.
    for i in range(len(res)-1):
        curr_term = res[i]
        next_term = res[i+1]
        try:
            if doc[curr_term[-1].i+1].text in preposition_list and next_term[0].i == (curr_term[-1].i+2):
                res.append(doc[curr_term[0].i: next_term[-1].i+1])
        except IndexError:
            # curr_term ends at the last token of the document.
            continue

    return res

In [6]:
# Read the whole corpus and split it on blank lines; each piece is treated as
# one patent record by the cells below. The `with` block closes the file
# handle, which the original bare open() never did.
with open('texts_raw_2018.txt') as patents_file:
    listOfPatents = patents_file.read().split('\n\n')

In [7]:
# Keep the text after the '_____c:' marker for every record in which the
# marker occurs exactly once; records without it, or with several, are skipped.
listOfClaims = []
for patent_text in listOfPatents:
    pieces = patent_text.split('_____c:')
    if len(pieces) != 2:
        continue
    listOfClaims.append(pieces[1])

In [8]:
# Report how many claim sections were extracted.
print(f"{len(listOfClaims)} claims")

129891 claims


In [None]:
# Spot-check: show the two extracted claim texts at indices 999 and 1000.
print(listOfClaims[999:1001])

In [9]:
# Load the annotated entries and flatten them into a single string in which
# every entry is preceded by one space (" a b c"). Iterating `data` yields its
# keys when the JSON document is an object.
# The `with` block closes the file even if json.load raises, and the join
# replaces repeated string concatenation while producing the same string.
# NOTE(review): because this is a str, `x in listOfVerbsAnnotated` is a
# substring test, not an exact-entry test.
with open('verb_rel.json') as f:
    data = json.load(f)
listOfVerbsAnnotated = "".join(" " + i for i in data)

In [12]:
import nltk
from nltk import sent_tokenize
import json
import spacy,re

# Load the spaCy pipeline used as the default model by the functions below.
nlp = spacy.load('en_core_web_md')
# This binds the pipeline's default stop-word set itself, not a copy.
stopwords = nlp.Defaults.stop_words

# Extra stopwords, one per line of the file.
with open("US_stopwords.txt") as inf:
    stopwords_to_append = inf.read().splitlines()
# In-place update: the set referenced by nlp.Defaults.stop_words grows too.
stopwords.update(stopwords_to_append)

def cleaned_noun_chunk(noun):
    """Strip a trailing " comprising" from ``noun``.

    Only an occurrence at the very end of the string (or just before a final
    newline) is removed; the leading space is removed with it. Any other text
    is returned unchanged.

    Args:
        noun: Noun-chunk text.

    Returns:
        ``noun`` without the trailing " comprising".
    """
    trailing_comprising = re.compile("( comprising$)")
    return trailing_comprising.sub("", noun)


def get_passive_verbs(sentence, model=nlp):
    """Return the text of every passive-voice verb in ``sentence``.

    A token counts as a passive verb when its part of speech is ``VERB`` and
    at least one of its children has the dependency label ``nsubjpass``.

    Args:
        sentence: Sentence text to parse.
        model: Callable that parses text into a document (a spaCy pipeline).

    Returns:
        A list with the surface text of each passive verb, in document order;
        empty when there is none.
    """
    # Swap the standalone word "said" for "the" before parsing (presumably
    # because claims use "said" as a determiner). Whole-word matching keeps
    # words that merely contain "said", such as "aforesaid", intact; the
    # original substring replace turned that into "aforethe".
    sentence = re.sub(r"\bsaid\b", "the", sentence)

    doc = model(sentence)

    # for passive form
    return [
        token.text
        for token in doc
        if token.pos_ == "VERB"
        and any(child.dep_ == "nsubjpass" for child in token.children)
    ]


def extractPassiveVerbs(claims):
    """Return the passive verbs of the first sentence in ``claims`` that has any.

    The text is split into sentences with NLTK's ``sent_tokenize`` and each
    sentence is passed to ``get_passive_verbs``. The first non-empty result is
    returned; sentences after it are not examined.

    Args:
        claims: Claim text to analyse.

    Returns:
        A non-empty list of verb strings, or ``None`` when no sentence
        contains a passive verb. ``None`` (not ``[]``) is kept on purpose, so
        callers that index ``[0]`` on any non-``None`` result stay safe.
    """
    for sent in sent_tokenize(claims, language="english"):
        passive = get_passive_verbs(sent)
        # A truthy result can never be None, so the former nested `!= None`
        # test and its `return []` branch could never run; they were removed.
        if passive:
            return passive
    # Explicit now: the original fell off the end and returned None implicitly.
    return None

In [13]:
from collections import Counter

# For every claim text, record the first passive verb reported by
# extractPassiveVerbs (claims without any are skipped), then tally them.
listOfPassiveVerbs = []
for claim_text in listOfClaims:
    found_verbs = extractPassiveVerbs(claim_text)
    if found_verbs is None:
        continue
    listOfPassiveVerbs.append(found_verbs[0])

Counter(listOfPassiveVerbs)

KeyboardInterrupt: 

In [None]:
# Tally element 1 of every extracted triplet (presumably the verb/action)
# that is not found in listOfVerbsAnnotated.
# NOTE(review): if listOfVerbsAnnotated is a plain string, the `in` test below
# is a substring match - confirm that is intended.
verbsDict = {}
for claims in listOfClaims:
    try:
        triplets = extractTriplets(claims)
        if triplets is None:
            continue
        for triplet in triplets:
            verb = triplet[1]
            if verb in listOfVerbsAnnotated:
                continue
            verbsDict[verb] = verbsDict.get(verb, 0) + 1
    except IndexError:
        # An IndexError anywhere while handling this claim drops the rest of
        # the claim and moves on to the next one.
        continue

In [None]:
# Print one "verb: count" line per tallied entry, in insertion order.
for verb, count in verbsDict.items():
    print(f"{verb}: {count}")