# Extracting simple event data
### Useful code sources:
https://github.com/majiga/Text2Event

https://andrewhalterman.com/post/event-data-in-30-lines-of-python/
### NLP tool: https://spacy.io/

In [1]:
import spacy
# Load the small English spaCy pipeline; the later cells reuse `nlp`.
nlp = spacy.load("en_core_web_sm")

text = (
    "Premier Daniel Andrews says Victoria has recorded "
    "723 more cases and 13 deaths from coronavirus."
)

# Run the pipeline on the single sentence.
# NOTE(review): for a batch of texts, `nlp.pipe(list_of_texts)` is presumably
# the intended alternative -- confirm against the spaCy docs before using it.
doc = nlp(text)

# Header row of the per-token table; widths match the data rows below.
print(
    "WORD".ljust(20),
    "LEMMA".ljust(20),
    "POS".ljust(10),
    "ENTITY".ljust(10),
    "DEPENDENCY".ljust(12),
    "INDEX".ljust(5),
)

# One row per token: text columns are left-aligned, the index right-aligned.
for token in doc:
    print(
        token.text.ljust(20),
        token.lemma_.ljust(20),
        token.pos_.ljust(10),
        token.ent_type_.ljust(10),
        token.dep_.ljust(12),
        str(token.i).rjust(5),
    )

WORD                 LEMMA                POS        ENTITY     DEPENDENCY   INDEX
Premier              Premier              PROPN                 compound         0
Daniel               Daniel               PROPN      PERSON     compound         1
Andrews              Andrews              PROPN      PERSON     nsubj            2
says                 say                  VERB                  ROOT             3
Victoria             Victoria             PROPN      GPE        nsubj            4
has                  have                 AUX                   aux              5
recorded             record               VERB                  ccomp            6
723                  723                  NUM        CARDINAL   nummod           7
more                 more                 ADJ                   amod             8
cases                case                 NOUN                  dobj             9
and                  and                  CCONJ                 cc              10
13  

In [2]:
from spacy import displacy
# Visualize the named entities of `doc` using displaCy's "ent" style.
displacy.render(doc, style="ent")

In [3]:
# Visualize the dependency parse of `doc` using displaCy's "dep" style.
# Disabled alternative left by the author: displacy.serve(doc, style="dep")
displacy.render(doc, style="dep")

In [4]:
def actor_extractor(root):
    """Return the subject phrase attached directly to ``root``.

    Looks through the direct children of ``root`` for tokens whose
    dependency label is ``"nsubj"`` and joins every token of that child's
    subtree (using ``text_with_ws``) into one whitespace-stripped string.

    Args:
        root: Token-like object exposing ``children``. Each child must expose
            ``dep_`` and ``subtree``; subtree tokens must expose
            ``text_with_ws``.

    Returns:
        The subject phrase as a string, or ``""`` when ``root`` has no
        ``nsubj`` child. If several children are labelled ``nsubj``, the
        last one encountered is returned.
    """
    # Start from an empty result so a root without a subject yields a falsy
    # value instead of raising UnboundLocalError at the return statement.
    subject_phrase = ""
    for child in root.children:
        if child.dep_ == "nsubj":
            subject_phrase = "".join(w.text_with_ws for w in child.subtree).strip()
    return subject_phrase

def object_extractor(root):
    """Collect the object phrases attached directly to ``root``.

    A direct child counts as an object when its dependency label is
    ``"dobj"`` or ``"conj"``. For each such child, the tokens of its subtree
    are joined via ``text_with_ws`` and the result is stripped.

    Args:
        root: Token-like object exposing ``children``.

    Returns:
        A list of phrase strings in the order the children are iterated;
        empty when no child carries one of the two labels.
    """
    object_labels = ["dobj", "conj"]
    return [
        "".join(tok.text_with_ws for tok in child.subtree).strip()
        for child in root.children
        if child.dep_ in object_labels
    ]

def detect_event(doc, trigger_words):
    """Find trigger tokens in ``doc`` and pair them with actors and objects.

    A token is a trigger when it is the dependency ROOT or a VERB and its
    lemma appears in ``trigger_words``. For each trigger, the subject phrase
    from ``actor_extractor`` and the object phrases from ``object_extractor``
    are printed and recorded.

    Args:
        doc: Iterable of token-like objects exposing ``dep_``, ``pos_`` and
            ``lemma_``.
        trigger_words: Container of lemmas that mark an event.

    Returns:
        A list of 2-tuples: ``(actor_phrase, token)`` for a found subject and
        ``(token, object_phrase)`` for every found object, in document order.
    """
    found = []
    for token in doc:
        is_root_or_verb = token.dep_ == "ROOT" or token.pos_ == "VERB"
        if not (is_root_or_verb and token.lemma_ in trigger_words):
            continue

        print('\n- Trigger word: ', token.lemma_)

        # Subject of the trigger, recorded only when one was found.
        actor = actor_extractor(token)
        if actor:
            found.append((actor, token))
            print('- Actor: ', actor)

        # Every object phrase hanging off the trigger.
        for phrase in object_extractor(token):
            found.append((token, phrase))
            print('- Objects: ', phrase)
    return found

In [5]:
# Detect events in the sentence parsed earlier (`doc`, built from `text`).

# Lemmas that mark a token as an event trigger.
trigger_words = ["record", "begin", "say", "start", 'drill']
print('Text: ', text)

relations = detect_event(doc, trigger_words)

# Announce an event only when at least one relation was extracted; the
# original printed "An event found" even for an empty result.
if relations:
    print('\nAn event found. Details:', *relations, sep='\n')
else:
    print('\nNo event found.')

Text:  Premier Daniel Andrews says Victoria has recorded 723 more cases and 13 deaths from coronavirus.

- Trigger word:  say
- Actor:  Premier Daniel Andrews

- Trigger word:  record
- Actor:  Victoria
- Objects:  723 more cases and 13 deaths from coronavirus

An event found. Details:
('Premier Daniel Andrews', says)
('Victoria', recorded)
(recorded, '723 more cases and 13 deaths from coronavirus')


In [8]:
# Detecting entities in general-domain text is straightforward with spaCy.

text = ("When Sebastian Thrun started working on self-driving cars at "
        "Google in 2007, few people outside of the company took him "
        "seriously. “I can tell you very senior CEOs of major American "
        "car companies would shake my hand and turn away because I wasn’t "
        "worth talking to,” said Thrun, in an interview with Recode earlier "
        "this week.")
doc3 = nlp(text)

# Syntax overview: the noun chunks, then the lemma of every VERB token.
print("Noun phrases:", [span.text for span in doc3.noun_chunks])
print(
    "\nVerbs:",
    [tok.lemma_ for tok in doc3 if tok.pos_ == "VERB"],
    '\n\nEntities:\n',
)

# Named entities: surface text followed by the predicted label.
for ent in doc3.ents:
    print(ent.text, ent.label_)

# Visualize the same entities with displaCy's "ent" style.
displacy.render(doc3, style="ent")

Noun phrases: ['Sebastian Thrun', 'self-driving cars', 'Google', 'few people', 'the company', 'him', 'I', 'you', 'very senior CEOs', 'major American car companies', 'my hand', 'I', 'Thrun', 'an interview', 'Recode']

Verbs: ['start', 'work', 'drive', 'take', 'can', 'tell', 'would', 'shake', 'turn', 'talk', 'say'] 

Entities:

Sebastian NORP
Google ORG
2007 DATE
American NORP
Recode ORG
earlier this week DATE


In [9]:
# Entities in geological text need more work: the domain differs from the
# general domain, so important entities are missed.

doc2 = nlp(
    "An area of Morrissey Metamorphics on the Mt Phillips Sheet was thought to be gold deposits for Broken Hill."
)

# Syntax overview: the noun chunks, then the lemma of every VERB token.
print("Noun phrases:", [span.text for span in doc2.noun_chunks])
print(
    "\nVerbs:",
    [tok.lemma_ for tok in doc2 if tok.pos_ == "VERB"],
    '\n\nEntities:\n',
)

# Named entities: surface text followed by the predicted label.
for ent in doc2.ents:
    print(ent.text, ent.label_)

# Visualize the same entities with displaCy's "ent" style.
displacy.render(doc2, style="ent")

Noun phrases: ['An area', 'Morrissey Metamorphics', 'the Mt Phillips Sheet', 'gold deposits', 'Broken Hill']

Verbs: ['think'] 

Entities:

Morrissey Metamorphics PERSON
Broken Hill ORG
