In [2]:
# Import the below packages.

import spacy
import textacy
from textacy.extract import subject_verb_object_triples
from bs4 import BeautifulSoup
import requests
import re
import os

# Loading the input data

In [3]:
# Execute the commented line below if running locally, if running on Google Colab use the uncommented statement 
# data_dir stores the directory where we store the PreprocessedDataset (i.e., text files after using stanford-corenlp-python)
# We read the text present in every file and save it in a list called TEXTS

# data_dir = os.getcwd()+'/ActorsPreprocessedDataset/'
data_dir ='/home/nb01/C_Drive/Knowledge_Graph_Creation/Dataset/PetroleumPreprocessedDataset/' 
TEXTS = [open(data_dir+f).read() for f in os.listdir(data_dir)]

In [4]:
os.listdir(data_dir)

['Alkane.txt', 'Alaska.txt']

In [5]:
len(TEXTS)

2

# Subject Verb Object Generation using Spacy's subject_verb_object_triples method

In [6]:
# Download and load SpaCy's en_core_web_sm model to do the find the named entities 
# Store the entities and its type/annotations in a dictionary for using it in KG construction
# Extract the SVO (Subject Verb Object) triples suing dependency parsing and save it in the svos list of tuples
# Also save the labels of the SVO as a separate list

nlp = spacy.load('en_core_web_sm')
final_svos = []
final_text_svos = []
entity_dict = {}
svo_labels = []
for i, text in enumerate(TEXTS):
    doc = nlp(text)
    for ent in doc.ents:
        if ent not in entity_dict.keys():
            entity_dict[str(ent)] = ent.label_       
    svos = list(subject_verb_object_triples(doc))
    svos_text = [(str(x[0]).strip(), str(x[1]).strip(), str(x[2]).strip()) for x in svos]
    final_svos = final_svos + svos
    final_text_svos = final_text_svos + svos_text

for svo in final_text_svos:
    tup = ['Object', 'Object']
    if(svo[0] in entity_dict.keys()):
        tup[0] = entity_dict[svo[0]]
    
    if(svo[2] in entity_dict.keys()):
        tup[1] = entity_dict[svo[2]]
    svo_labels.append(tuple(tup))

In [7]:
final_text_svos

[('alkane paraffin acyclic', 'saturated', 'hydrocarbon'),
 ('carbon atoms', 'arranged', 'structure carbon'),
 ('alkanes', 'range', 'case'),
 ('alkanes', 'range', 'molecules'),
 ('IUPAC defines', 'alkanes', 'carbon atoms'),
 ('acyclic', 'branched', 'formula'),
 ('sources', 'use', 'term denote'),
 ('sources', 'use', 'hydrocarbon'),
 ('bonds hydrogen atom', 'joined', 'carbon atoms'),
 ('series', 'linked', 'molecule'),
 ('number carbon atoms', 'may considered', 'size alkane'),
 ('compounds members', 'differ', 'mass multiples'),
 ('They', 'viewed', 'trees'),
 ('They', 'abbreviated', 'R'),
 ('Alk', 'used', 'symbolize'),
 ('group', 'called', 'cycloalkanes'),
 ('hydrocarbons', 'combine', 'formula'),
 ('Alkanes', 'arranged', 'ways'),
 ('carbon atoms', 'arranged', 'chain branches'),
 ('isomer', 'called', 'nisomer'),
 ('chain carbon atoms', 'branched', 'points'),
 ('isomers', 'increases', 'number carbon atoms'),
 ('alkane isomers chain carbon atoms', 'may form', 'loops'),
 ('compounds', 'called',

In [8]:
# Write all the SVOs as a CSV file

import csv

with open('svos.csv', 'w') as csvFile:
    writer = csv.writer(csvFile)
    writer.writerows(final_text_svos)

csvFile.close()

In [9]:
len(final_text_svos)

630

In [10]:
svo_labels

[('Object', 'Object'),
 ('Object', 'Object'),
 ('Object', 'Object'),
 ('Object', 'Object'),
 ('Object', 'Object'),
 ('Object', 'Object'),
 ('Object', 'Object'),
 ('Object', 'Object'),
 ('Object', 'Object'),
 ('Object', 'Object'),
 ('Object', 'Object'),
 ('Object', 'Object'),
 ('Object', 'Object'),
 ('Object', 'Object'),
 ('ORG', 'Object'),
 ('Object', 'Object'),
 ('Object', 'Object'),
 ('NORP', 'Object'),
 ('Object', 'Object'),
 ('Object', 'Object'),
 ('Object', 'Object'),
 ('Object', 'Object'),
 ('Object', 'Object'),
 ('Object', 'Object'),
 ('Object', 'Object'),
 ('Object', 'Object'),
 ('Object', 'Object'),
 ('Object', 'Object'),
 ('Object', 'Object'),
 ('Object', 'Object'),
 ('Object', 'Object'),
 ('Object', 'Object'),
 ('Object', 'Object'),
 ('Object', 'Object'),
 ('Object', 'Object'),
 ('Object', 'Object'),
 ('Object', 'Object'),
 ('Object', 'Object'),
 ('Object', 'Object'),
 ('Object', 'Object'),
 ('Object', 'Object'),
 ('Object', 'Object'),
 ('Object', 'Object'),
 ('Object', 'Obj

In [11]:
# Save the entity type dictionary using pickle

import pickle
with open('entity_dict.pickle', 'wb') as handle:
    pickle.dump(entity_dict, handle, protocol=pickle.HIGHEST_PROTOCOL)

##Experimental section:
Other techniques we tried out. 

### Knowledge Graph visualization

In [12]:
# Visualize the KG using Graphviz

def generate_graphviz_graph(entity_relations, name, verbose=True):
    """digraph G {
    # a -> b [ label="a to b" ];
    # b -> c [ label="another label"];
    }"""
    graph = list()
    graph.append('digraph {')
    for er in entity_relations:
        graph.append('"{}" -> "{}" [ label="{}" ];'.format(er[0], er[2], er[1]))
    graph.append('}')

    out_dot = name + '.dot'
    with open(out_dot, 'w') as output_file:
        output_file.writelines(graph)

    out_png = name + '.png'
    DOT_BIN_PATH = 'dot'
    command = "dot -Tpng {} -o {}".format(out_dot, out_png)
 
    os.system(command)

    print('Wrote graph to {} and {}'.format(out_dot, out_png))

### Subject Verb Object using Spacy's Github code

In [13]:
# Define the dependencies that are applicable to subjects and Objects

from nltk.stem.wordnet import WordNetLemmatizer
import spacy
from spacy.lang.en import English

SUBJECTS = ["nsubj", "nsubjpass", "csubj", "csubjpass", "agent", "expl"]
OBJECTS = ["dobj", "dative", "attr", "oprd"]

In [14]:
#Every function has a set of rules to extract Subjects and Objects based on the rules 

def getSubsFromConjunctions(subs):
    moreSubs = []
    for sub in subs:
        # rights is a generator
        rights = list(sub.rights)
        rightDeps = {tok.lower_ for tok in rights}
        if "and" in rightDeps:
            moreSubs.extend([tok for tok in rights if tok.dep_ in SUBJECTS or tok.pos_ == "NOUN"])
            if len(moreSubs) > 0:
                moreSubs.extend(getSubsFromConjunctions(moreSubs))
    return moreSubs

def getObjsFromConjunctions(objs):
    moreObjs = []
    for obj in objs:
        # rights is a generator
        rights = list(obj.rights)
        rightDeps = {tok.lower_ for tok in rights}
        if "and" in rightDeps:
            moreObjs.extend([tok for tok in rights if tok.dep_ in OBJECTS or tok.pos_ == "NOUN"])
            if len(moreObjs) > 0:
                moreObjs.extend(getObjsFromConjunctions(moreObjs))
    return moreObjs

def getVerbsFromConjunctions(verbs):
    moreVerbs = []
    for verb in verbs:
        rightDeps = {tok.lower_ for tok in verb.rights}
        if "and" in rightDeps:
            moreVerbs.extend([tok for tok in verb.rights if tok.pos_ == "VERB"])
            if len(moreVerbs) > 0:
                moreVerbs.extend(getVerbsFromConjunctions(moreVerbs))
    return moreVerbs

def findSubs(tok):
    head = tok.head
    while head.pos_ != "VERB" and head.pos_ != "NOUN" and head.head != head:
        head = head.head
    if head.pos_ == "VERB":
        subs = [tok for tok in head.lefts if tok.dep_ == "SUB"]
        if len(subs) > 0:
            verbNegated = isNegated(head)
            subs.extend(getSubsFromConjunctions(subs))
            return subs, verbNegated
        elif head.head != head:
            return findSubs(head)
    elif head.pos_ == "NOUN":
        return [head], isNegated(tok)
    return [], False

def isNegated(tok):
    negations = {"no", "not", "n't", "never", "none"}
    for dep in list(tok.lefts) + list(tok.rights):
        if dep.lower_ in negations:
            return True
    return False

def findSVs(tokens):
    svs = []
    verbs = [tok for tok in tokens if tok.pos_ == "VERB"]
    for v in verbs:
        subs, verbNegated = getAllSubs(v)
        if len(subs) > 0:
            for sub in subs:
                svs.append((sub.orth_, "!" + v.orth_ if verbNegated else v.orth_))
    return svs

def getObjsFromPrepositions(deps):
    objs = []
    for dep in deps:
        if dep.pos_ == "ADP" and dep.dep_ == "prep":
            objs.extend([tok for tok in dep.rights if tok.dep_  in OBJECTS or (tok.pos_ == "PRON" and tok.lower_ == "me")])
    return objs

def getObjsFromAttrs(deps):
    for dep in deps:
        if dep.pos_ == "NOUN" and dep.dep_ == "attr":
            verbs = [tok for tok in dep.rights if tok.pos_ == "VERB"]
            if len(verbs) > 0:
                for v in verbs:
                    rights = list(v.rights)
                    objs = [tok for tok in rights if tok.dep_ in OBJECTS]
                    objs.extend(getObjsFromPrepositions(rights))
                    if len(objs) > 0:
                        return v, objs
    return None, None

def getObjFromXComp(deps):
    for dep in deps:
        if dep.pos_ == "VERB" and dep.dep_ == "xcomp":
            v = dep
            rights = list(v.rights)
            objs = [tok for tok in rights if tok.dep_ in OBJECTS]
            objs.extend(getObjsFromPrepositions(rights))
            if len(objs) > 0:
                return v, objs
    return None, None

def getAllSubs(v):
    verbNegated = isNegated(v)
    subs = [tok for tok in v.lefts if tok.dep_ in SUBJECTS and tok.pos_ != "DET"]
    if len(subs) > 0:
        subs.extend(getSubsFromConjunctions(subs))
    else:
        foundSubs, verbNegated = findSubs(v)
        subs.extend(foundSubs)
    return subs, verbNegated

def getAllObjs(v):
    # rights is a generator
    rights = list(v.rights)
    objs = [tok for tok in rights if tok.dep_ in OBJECTS]
    objs.extend(getObjsFromPrepositions(rights)
    potentialNewVerb, potentialNewObjs = getObjFromXComp(rights)
    if potentialNewVerb is not None and potentialNewObjs is not None and len(potentialNewObjs) > 0:
        objs.extend(potentialNewObjs)
        v = potentialNewVerb
    if len(objs) > 0:
        objs.extend(getObjsFromConjunctions(objs))
    return v, objs

def findSVOs(tokens):
    svos = []
    verbs = [tok for tok in tokens if tok.pos_ == "VERB" and tok.dep_ != "aux"]
    for v in verbs:
        subs, verbNegated = getAllSubs(v)
        # hopefully there are subs, if not, don't examine this verb any longer
        if len(subs) > 0:
            v, objs = getAllObjs(v)
            for sub in subs:
                for obj in objs:
                    objNegated = isNegated(obj)
                    svos.append((sub.lower_, "!" + v.lower_ if verbNegated or objNegated else v.lower_, obj.lower_))
    return svos
def printDeps(toks):
    for tok in toks:
        print(tok.orth_, tok.dep_, tok.pos_, tok.head.orth_, [t.orth_ for t in tok.lefts], [t.orth_ for t in tok.rights])

SyntaxError: invalid syntax (<ipython-input-14-5eccff644eb5>, line 116)

In [17]:
# Example SVOs
for sent in nltk.sent_tokenize(doc):
    svos = findSVOs(sent)
    print(svos)

NameError: name 'nltk' is not defined

In [16]:
print(doc)

Coordinates 64°N 150°W﻿ ﻿64°N 150°W﻿ 64 150 Alaska Aleut Alax̂sxax̂ Inupiaq Alaasikaq Alutiiq Alaskaaq Tlingit Anáaski Russian Аляска romanized Alyaska state located northwest extremity United States West Coast across Bering Strait Asia . An exclave U.S. borders Canadian province British Columbia territory Yukon east southeast maritime border Russia s Chukotka Autonomous Okrug west . To north Chukchi Beaufort seas Arctic Ocean Pacific Ocean lies south southwest . Alaska largest U.S. state area seventh largest subnational division world . It third least populous sparsely populated state far continent s populous territory located mostly north 60th parallel estimated population 738432 2015—more quadruple combined populations Northern Canada Greenland . Approximately half Alaska s residents live within Anchorage metropolitan area . The state capital Juneau second largest city United States area comprising territory states Rhode Island Delaware . Alaska occupied various indigenous peoples t