In [1]:
# Import the below packages.

import spacy
from spacy.lang.en import English
nlp = spacy.load("en_core_web_sm")
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize
#from nltk.stem.wordnet import WordNetLemmatizer
#import textacy
#from textacy.extract import subject_verb_object_triples
#from bs4 import BeautifulSoup
#import requests
import re
import os

In [2]:
data_dir ='/home/nb01/C_Drive/Knowledge_Graph_Creation/Dataset/PetroleumPreprocessedDataset/' 
TEXTS = [open(data_dir+f).read() for f in os.listdir(data_dir)]

In [3]:
os.listdir(data_dir)

['Alkane.txt', 'Alaska.txt']

In [4]:
len(TEXTS)

2

In [5]:
SUBJECTS = ["nsubj", "nsubjpass", "csubj", "csubjpass", "agent", "expl"]
OBJECTS = ["dobj", "dative", "attr", "oprd"]

In [6]:
#Every function has a set of rules to extract Subjects and Objects based on the rules 

def getSubsFromConjunctions(subs):
    moreSubs = []
    for sub in subs:
        # rights is a generator
        rights = list(sub.rights)
        rightDeps = {tok.lower_ for tok in rights}
        if "and" in rightDeps:
            moreSubs.extend([tok for tok in rights if tok.dep_ in SUBJECTS or tok.pos_ == "NOUN"])
            if len(moreSubs) > 0:
                moreSubs.extend(getSubsFromConjunctions(moreSubs))
    return moreSubs

def getObjsFromConjunctions(objs):
    moreObjs = []
    for obj in objs:
        # rights is a generator
        rights = list(obj.rights)
        rightDeps = {tok.lower_ for tok in rights}
        if "and" in rightDeps:
            moreObjs.extend([tok for tok in rights if tok.dep_ in OBJECTS or tok.pos_ == "NOUN"])
            if len(moreObjs) > 0:
                moreObjs.extend(getObjsFromConjunctions(moreObjs))
    return moreObjs

def getVerbsFromConjunctions(verbs):
    moreVerbs = []
    for verb in verbs:
        rightDeps = {tok.lower_ for tok in verb.rights}
        if "and" in rightDeps:
            moreVerbs.extend([tok for tok in verb.rights if tok.pos_ == "VERB"])
            if len(moreVerbs) > 0:
                moreVerbs.extend(getVerbsFromConjunctions(moreVerbs))
    return moreVerbs

def findSubs(tok):
    head = tok.head
    while head.pos_ != "VERB" and head.pos_ != "NOUN" and head.head != head:
        head = head.head
    if head.pos_ == "VERB":
        subs = [tok for tok in head.lefts if tok.dep_ == "SUB"]
        if len(subs) > 0:
            verbNegated = isNegated(head)
            subs.extend(getSubsFromConjunctions(subs))
            return subs, verbNegated
        elif head.head != head:
            return findSubs(head)
    elif head.pos_ == "NOUN":
        return [head], isNegated(tok)
    return [], False

def isNegated(tok):
    negations = {"no", "not", "n't", "never", "none"}
    for dep in list(tok.lefts) + list(tok.rights):
        if dep.lower_ in negations:
            return True
    return False

def findSVs(tokens):
    svs = []
    verbs = [tok for tok in tokens if tok.pos_ == "VERB"]
    for v in verbs:
        subs, verbNegated = getAllSubs(v)
        if len(subs) > 0:
            for sub in subs:
                svs.append((sub.orth_, "!" + v.orth_ if verbNegated else v.orth_))
    return svs

def getObjsFromPrepositions(deps):
    objs = []
    for dep in deps:
        if dep.pos_ == "ADP" and dep.dep_ == "prep":
            objs.extend([tok for tok in dep.rights if tok.dep_  in OBJECTS or (tok.pos_ == "PRON" and tok.lower_ == "me")])
    return objs

def getObjsFromAttrs(deps):
    for dep in deps:
        if dep.pos_ == "NOUN" and dep.dep_ == "attr":
            verbs = [tok for tok in dep.rights if tok.pos_ == "VERB"]
            if len(verbs) > 0:
                for v in verbs:
                    rights = list(v.rights)
                    objs = [tok for tok in rights if tok.dep_ in OBJECTS]
                    objs.extend(getObjsFromPrepositions(rights))
                    if len(objs) > 0:
                        return v, objs
    return None, None

def getObjFromXComp(deps):
    for dep in deps:
        if dep.pos_ == "VERB" and dep.dep_ == "xcomp":
            v = dep
            rights = list(v.rights)
            objs = [tok for tok in rights if tok.dep_ in OBJECTS]
            objs.extend(getObjsFromPrepositions(rights))
            if len(objs) > 0:
                return v, objs
    return None, None

def getAllSubs(v):
    verbNegated = isNegated(v)
    subs = [tok for tok in v.lefts if tok.dep_ in SUBJECTS and tok.pos_ != "DET"]
    if len(subs) > 0:
        subs.extend(getSubsFromConjunctions(subs))
    else:
        foundSubs, verbNegated = findSubs(v)
        subs.extend(foundSubs)
    return subs, verbNegated

def getAllObjs(v):
    # rights is a generator
    rights = list(v.rights)
    objs = [tok for tok in rights if tok.dep_ in OBJECTS]
    objs.extend(getObjsFromPrepositions(rights)
    potentialNewVerb, potentialNewObjs = getObjFromXComp(rights)
    if potentialNewVerb is not None and potentialNewObjs is not None and len(potentialNewObjs) > 0:
        objs.extend(potentialNewObjs)
        v = potentialNewVerb
    if len(objs) > 0:
        objs.extend(getObjsFromConjunctions(objs))
    return v, objs

def findSVOs(tokens):
    svos = []
    verbs = [tok for tok in tokens if tok.pos_ == "VERB" and tok.dep_ != "aux"]
    for v in verbs:
        subs, verbNegated = getAllSubs(v)
        # hopefully there are subs, if not, don't examine this verb any longer
        if len(subs) > 0:
            v, objs = getAllObjs(v)
            for sub in subs:
                for obj in objs:
                    objNegated = isNegated(obj)
                    svos.append((sub.lower_, "!" + v.lower_ if verbNegated or objNegated else v.lower_, obj.lower_))
    return svos
def printDeps(toks):
    for tok in toks:
        print(tok.orth_, tok.dep_, tok.pos_, tok.head.orth_, [t.orth_ for t in tok.lefts], [t.orth_ for t in tok.rights])

SyntaxError: invalid syntax (<ipython-input-6-5eccff644eb5>, line 116)

In [None]:
for i,text in enumerate(TEXTS):
    for sent in sent_tokenize(text):
        #svos = findSVOs(sent)
        print(sent)
        #print(svos)

In [14]:
for i,text in enumerate(TEXTS):
    #count = 0
    for sentence in nltk.sent_tokenize(text):
        doc = nlp(sentence)
        #count += 1
        #while count < 2:
        print(doc)
        print('\n')

In organic chemistry alkane paraffin acyclic saturated hydrocarbon .


In words alkane consists hydrogen carbon atoms arranged tree structure carbon–carbon bonds single .


Alkanes general chemical formula CnH2n2 .


The alkanes range complexity simplest case methane n 1 arbitrarily large complex molecules like pentacontane 6ethyl2methyl5 octane isomer tetradecane .


IUPAC defines alkanes acyclic branched unbranched hydrocarbons general formula CnH2n2 therefore consisting entirely hydrogen atoms saturated carbon atoms .


However sources use term denote saturated hydrocarbon including either monocyclic polycyclic despite different general formula .


In alkane carbon atom sp3hybridized 4 sigma bonds hydrogen atom joined one carbon atoms .


The longest series linked carbon atoms molecule known carbon skeleton carbon backbone .


The number carbon atoms may considered size alkane .


One group higher alkanes waxes solids standard ambient temperature pressure number carbon atoms carbon 

Structural formulae represent bonds right angles one another common useful correspond reality .


The structural formula bond angles usually sufficient completely describe geometry molecule .


There degree freedom carbon–carbon bond torsion angle atoms groups bound atoms end bond .


The spatial arrangement described torsion angles molecule known conformation .


Ethane forms simplest case studying conformation alkanes one C–C bond .


If one looks axis C–C bond one see socalled Newman projection .


The hydrogen atoms front rear carbon atoms angle 120° resulting projection base tetrahedron onto flat plane .


However torsion angle given hydrogen atom attached front carbon given hydrogen atom attached rear carbon vary freely 0° 360° .


This consequence free rotation carbon–carbon single bond .


Despite apparent freedom two limiting conformations important eclipsed conformation staggered conformation .


The two conformations differ energy staggered conformation 12.6 kJmol lower ener

Chemical analysis showed abundances ethane methane roughly equal thought imply ices formed interstellar space away Sun would evaporated volatile molecules .


Alkanes also detected meteorites carbonaceous chondrites .


Traces methane gas occur Earth s atmosphere produced primarily methanogenic microorganisms Archaea gut ruminants .


The important commercial sources alkanes natural gas oil .


Natural gas contains primarily methane ethane propane butane oil mixture liquid alkanes hydrocarbons .


These hydrocarbons formed marine animals plants died sank bottom ancient seas covered sediments anoxic environment converted many millions years high temperatures high pressure current form .


Natural gas resulted thereby example following reaction These hydrocarbon deposits collected porous rocks trapped beneath impermeable cap rocks comprise commercial oil fields .


They formed millions years exhausted readily replaced .


The depletion hydrocarbons reserves basis known energy crisis .




.


Coordinates 64°N 150°W﻿ ﻿64°N 150°W﻿ 64 150 Alaska Aleut Alax̂sxax̂ Inupiaq Alaasikaq Alutiiq Alaskaaq Tlingit Anáaski Russian Аляска romanized Alyaska state located northwest extremity United States West Coast across Bering Strait Asia .


An exclave U.S. borders Canadian province British Columbia territory Yukon east southeast maritime border Russia s Chukotka Autonomous Okrug west .


To north Chukchi Beaufort seas Arctic Ocean Pacific Ocean lies south southwest .


Alaska largest U.S. state area seventh largest subnational division world .


It third least populous sparsely populated state far continent s populous territory located mostly north 60th parallel estimated population 738432 2015—more quadruple combined populations Northern Canada Greenland .


Approximately half Alaska s residents live within Anchorage metropolitan area .


The state capital Juneau second largest city United States area comprising territory states Rhode Island Delaware .


Alaska occupied various in

The lowest official Alaska temperature −80 °F Prospect Creek January 23 1971 one degree lowest temperature recorded continental North America .


The climate extreme north Alaska Arctic long cold winters short cool summers .


Even July average low temperature Utqiagvik 34 °F .


Precipitation light part Alaska many places averaging less 10 per year mostly snow stays ground almost entire year .


Numerous indigenous peoples occupied Alaska thousands years arrival European peoples area .


Linguistic DNA studies done provided evidence settlement North America way Bering land bridge .


At Upward Sun River site Tanana River Valley Alaska remains sixweekold infant found .


The baby s DNA showed belonged population genetically separate native groups present elsewhere New World end Pleistocene .


Ben Potter University Alaska Fairbanks archaeologist unearthed remains Upward River Sun site 2013 named new group Ancient Beringians .


The Tlingit people developed society matrilineal kinship s

According 2011 American Community Survey 83.4 people age five spoke English home .


About 3.5 spoke Spanish home 2.2 spoke another IndoEuropean language 4.3 spoke Asian language 5.3 spoke languages home .


The Alaska Native Language Center University Alaska Fairbanks claims least 20 Alaskan native languages exist also languages different dialects .


Most Alaska s native languages belong either Eskimo–Aleut NaDene language families however languages thought isolates yet classified .


As 2014 nearly Alaska s native languages classified either threatened shifting moribund nearly extinct dormant languages .


A total 5.2 Alaskans speak one state s 20 indigenous languages known locally native languages .


In October 2014 governor Alaska signed bill declaring state s 20 indigenous languages official status .


This bill gave symbolic recognition official languages though adopted official use within government .


The 20 languages included bill According statistics collected Association 

Hunting subsistence primarily caribou moose Dall sheep still common state particularly remote Bush communities .


An example traditional native food Akutaq Eskimo ice cream consist reindeer fat seal oil dried fish meat local berries .


Alaska s reindeer herding concentrated Seward Peninsula wild caribou prevented mingling migrating domesticated reindeer .


Most food Alaska transported state Outside shipping costs make food cities relatively expensive .


In rural areas subsistence hunting gathering essential activity imported food prohibitively expensive .


Although small towns villages Alaska lie along coastline cost importing food remote villages high terrain difficult road conditions change dramatically due varying climate precipitation changes .


The cost transport reach high 50¢ per pound remote areas difficult times locations reached inclement weather terrain conditions .


The cost delivering 1 US gallon milk 3.50 many villages per capita income 20000 less .


Fuel cost per

He officially rejoined Republican party 1994 .


Alaska s voter initiative making marijuana legal took effect February 24 2015 placing Alaska alongside Colorado Washington first three U.S. states recreational marijuana legal .


The new law means people 21 consume small amounts pot—if find .


The first legal marijuana store opened Valdez October 2016 .


To finance state government operations Alaska depends primarily petroleum revenues federal subsidies .


This allows lowest individual tax burden United States .


It one five states sales tax one seven states individual income tax state neither .


The Department Revenue Tax Division reports regularly state s revenue sources .


The Department also issues annual summary operations including new state laws directly affect tax division .


While Alaska state sales tax 89 municipalities collect local sales tax 1.0–7.5 typically 3–5 .


Other local taxes levied include raw fish taxes hotel motel bedandbreakfast bed taxes severance taxes 

Upon arriving Alaska set Camp Hollywood Northwest Alaska lived duration filming .


Louis B. Mayer spared expense spite remote location going far hire chef Hotel Roosevelt Hollywood prepare meals .


When Eskimo premiered Astor Theatre New York City studio received largest amount feedback history .


Eskimo critically acclaimed released worldwide result Mala became international movie star .


Eskimo first Oscar Best Film Editing Academy Awards showcased preserved aspects Inupiat culture film .


The 1983 Disney movie Never Cry Wolf least partially shot Alaska .


The 1991 film White Fang based Jack London s novel starring Ethan Hawke filmed around Haines .


Steven Seagal s 1994 On Deadly Ground starring Michael Caine filmed part Worthington Glacier near Valdez .


The 1999 John Sayles film Limbo starring David Strathairn Mary Elizabeth Mastrantonio Kris Kristofferson filmed Juneau .


The psychological thriller Insomnia starring Al Pacino Robin Williams shot Canada set Alaska .


The

In [15]:
sent = "IUPAC defines alkanes acyclic branched unbranched hydrocarbons general formula CnH2n2 therefore consisting entirely hydrogen atoms saturated carbon atoms."
doc = nlp(sent)
doc

IUPAC defines alkanes acyclic branched unbranched hydrocarbons general formula CnH2n2 therefore consisting entirely hydrogen atoms saturated carbon atoms.

In [43]:
for token in doc:
    print(f"{token.text:{15}} {token.tag_:{5}} {spacy.explain(token.tag_):{40}} {token.pos_:{10}} {token.dep_:{20}}")


IUPAC           NN    noun, singular or mass                   NOUN       compound            
defines         NNS   noun, plural                             NOUN       nsubj               
alkanes         VBZ   verb, 3rd person singular present        VERB       ROOT                
acyclic         NNP   noun, proper singular                    PROPN      nsubj               
branched        VBD   verb, past tense                         VERB       ccomp               
unbranched      JJ    adjective                                ADJ        amod                
hydrocarbons    NNS   noun, plural                             NOUN       nmod                
general         JJ    adjective                                ADJ        amod                
formula         NN    noun, singular or mass                   NOUN       dobj                
CnH2n2          NNP   noun, proper singular                    PROPN      punct               
therefore       RB    adverb                      

In [45]:
for ent in doc.ents:
    print(f"{ent.text:{10}} {ent.label_:{10}} {str(spacy.explain(ent.label_)):{50}}")

In [46]:
for chunk in doc.noun_chunks:
    print(chunk)

IUPAC defines
acyclic
unbranched hydrocarbons general formula
hydrogen atoms
