In [4]:
import spacy
# Load the small English pipeline once; `nlp` is reused by the later cells.
nlp = spacy.load("en_core_web_sm")
doc = nlp("Apple is looking at buying U.K. startup for $1 billion")
# One line per token: surface text, coarse part-of-speech tag, dependency label.
for tok in doc:
    print(tok.text, tok.pos_, tok.dep_)

Apple PROPN nsubj
is AUX aux
looking VERB ROOT
at ADP prep
buying VERB pcomp
U.K. PROPN compound
startup NOUN dobj
for ADP prep
$ SYM quantmod
1 NUM compound
billion NUM pobj


In [5]:
# One line per named entity: text, start/end character offsets, entity label.
for entity in doc.ents:
    print(entity.text, entity.start_char, entity.end_char, entity.label_)

Apple 0 5 ORG
U.K. 27 31 GPE
$1 billion 44 54 MONEY


In [7]:
# Sample text that is parsed into `doc` in the next cell. The embedded line
# breaks (and the final newline) are part of the string, so they count toward
# the character offsets printed for each entity.
sent = '''Professor Tan Eng Chye, NUS Deputy President and Provost, and Professor 
Menahem Ben-Sasson, President of HUJ signed the joint degree agreement at NUS, 
in the presence of Ambassador of Israel to Singapore Her Excellency Amira Arnon 
and about 30 invited guests, on July 03, 2013.
'''

In [9]:
# Re-parse with the longer sample text; `doc` now refers to this new parse.
doc = nlp(sent)
# One line per named entity: text, start/end character offsets, entity label.
for entity in doc.ents:
    print(entity.text, entity.start_char, entity.end_char, entity.label_)

Tan Eng Chye 10 22 PERSON
NUS 24 27 ORG
Provost 49 56 ORG
Menahem Ben-Sasson 73 91 PERSON
HUJ 106 109 ORG
NUS 147 150 ORG
Israel 186 192 GPE
Singapore 196 205 GPE
Amira Arnon 221 232 PERSON
about 30 238 246 CARDINAL
July 03, 2013 266 279 DATE


POS Tag followed by NE Chunk

In [10]:
# One line per noun chunk: chunk text, its root token, the root's dependency
# label, and the text of the token the root attaches to.
for noun_chunk in doc.noun_chunks:
    root_token = noun_chunk.root
    print(noun_chunk.text, root_token.text, root_token.dep_, root_token.head.text)

Professor Tan Eng Chye Chye nsubj signed
NUS Deputy President President appos Chye
Provost Provost conj President
Professor Professor conj Chye
Menahem Ben-Sasson Sasson conj Professor
President President appos Sasson
HUJ HUJ pobj of
the joint degree agreement agreement dobj signed
NUS NUS pobj at
the presence presence pobj in
Ambassador Ambassador pobj of
Israel Israel pobj of
Her Excellency Excellency dobj Singapore
Amira Arnon Arnon appos Excellency
about 30 invited guests guests dobj Singapore
July July pobj on


In [13]:
# Dependency labels that mark a token as (part of) an object.
OBJECT_DEPS = {
    "dobj",
    "dative",
    "attr",
    "oprd",
}
# Dependency labels that mark a token as (part of) a subject.
SUBJECT_DEPS = {
    "nsubj",
    "nsubjpass",
    "csubj",
    "agent",
    "expl",
}
# Fine-grained tags that determine whether a word is a wh- word.
WH_WORDS = {
    "WP",
    "WP$",
    "WRB",
}

def extract_svo(doc):
    """Collect subject, verb and object words from a parsed sentence.

    Tokens are visited once, in document order:

    * a token whose coarse tag is ``VERB`` counts as a verb;
    * a token counts as object material when its own dependency label, or
      the label of its syntactic head, is in ``OBJECT_DEPS``;
    * the same rule with ``SUBJECT_DEPS`` selects subject material.

    A token may land in more than one group.

    Args:
        doc: iterable of tokens exposing ``text``, ``pos_``, ``dep_`` and
            ``head`` (e.g. a spaCy ``Doc``).

    Returns:
        A ``(subject, verb, object)`` tuple of strings; each is the matching
        token texts joined by single spaces, stripped and lower-cased
        (``""`` when nothing matched).
    """
    subjects, verbs, objects = [], [], []

    def _in_role(tok, labels):
        # A token belongs to a role if it, or the token it attaches to,
        # carries one of the role's dependency labels.
        return tok.dep_ in labels or tok.head.dep_ in labels

    for tok in doc:
        if tok.pos_ == "VERB":
            verbs.append(tok.text)
        if _in_role(tok, OBJECT_DEPS):
            objects.append(tok.text)
        if _in_role(tok, SUBJECT_DEPS):
            subjects.append(tok.text)

    return tuple(
        " ".join(words).strip().lower()
        for words in (subjects, verbs, objects)
    )

def is_question(doc):
    """Heuristically decide whether a parsed sentence is a question.

    Two cues are checked, in order:

    1. The first token is a verb or an auxiliary ("Is ...", "Do ...",
       "Can ..."). Fronted auxiliaries are tagged ``AUX`` rather than
       ``VERB`` by the tagger, so both coarse tags are accepted; checking
       ``VERB`` alone misses most yes/no questions.
    2. Any token carries one of the wh- fine-grained tags in ``WH_WORDS``.

    This is a heuristic: verb-initial imperatives and wh- words used in
    relative clauses also match.

    Args:
        doc: indexable, sized sequence of tokens exposing ``pos_``, ``tag_``
            and ``text`` (e.g. a spaCy ``Doc``).

    Returns:
        ``(True, "")`` when the sentence opens with a verb/auxiliary,
        ``(True, wh_word)`` with the lower-cased text of the first wh- token
        when one is found, and ``(False, "")`` otherwise (including for an
        empty ``doc``).
    """
    # Coarse tags that can open a yes/no question.
    question_opener_pos = ("VERB", "AUX")
    if len(doc) > 0 and doc[0].pos_ in question_opener_pos:
        return True, ""
    # Otherwise look for the first wh- word anywhere in the sentence.
    for token in doc:
        if token.tag_ in WH_WORDS:
            return True, token.text.lower()
    return False, ""

In [14]:
# Run the extractor on the parsed sample text; the three results are
# lower-cased, space-joined strings (subject words, verbs, object words).
subject, verb, attribute = extract_svo(doc)

In [15]:
# Subject words: tokens with a subject dependency label, plus their direct dependents.
subject

'professor tan eng chye , president , and professor'

In [16]:
# Verbs: every token whose coarse part-of-speech tag is VERB.
verb

'signed invited'

In [17]:
# Object words: tokens with an object dependency label, plus their direct dependents.
attribute

'the degree agreement her excellency arnon 30 invited guests'