In [1]:
import spacy
import nltk
from nltk.stem import WordNetLemmatizer

In [8]:
with open('die') as data_file:
        sentences = data_file.readlines()

In [3]:
nlp = spacy.load('en_core_web_lg')
wordnet_lemmatizer = WordNetLemmatizer() 

In [66]:
SPACY_DEP_ROOT = "ROOT"
SPACY_DEP_NSUBJ = "nsubj"
SPACY_DEP_NSUBJ_PASS = "nsubjpass"
SPACY_DEP_DOBJ = "dobj"
SPACY_DEP_PREP = "prep"
SPACY_DEP_PREP_OBJ = "pobj"
SPACY_DEP_CCOMP = "ccomp"
SPACY_DEP_IND_OBJ_1 = "dative"
SPACY_DEP_IND_OBJ_2 = "iobj"

SPACY_TAG_PREP = "IN"
SPACY_TAG_NN = "NN"
SPACY_TAG_NNS = "NNS"
SPACY_TAG_NNP = "NNP"
SPACY_TAG_NNPS = "NNPS"
SPACY_TAG_JJ = "JJ"
SPACY_TAG_JJR = "JJR"
SPACY_TAG_JJS = "JJS"
SPACY_TAG_VB = "VB"
SPACY_TAG_VBD = "VBD"
SPACY_TAG_VBG = "VBG"
SPACY_TAG_VBN = "VBN"
SPACY_TAG_VBP = "VBP"
SPACY_TAG_VBZ = "VBZ"

PREP_FOR = "for"
PREP_TO = "to"
PREP_FROM = "from"
PREP_OVER = "over"
PREP_UNTIL = "until"
PREP_OF = "of"
PREP_AT = "at"
PREP_IN = "in"
PREP_ON = "on"
PREP_NEAR = "near"

def getSubTreeString(token):
    if not token:
        return ""
    return ' '.join([str(token1.text) for token1 in list(token.subtree)])

def getHeadOfSentence(sentence):

    doc = nlp(str(sentence))
    head = None
    for token in doc:
        if token.dep_ == SPACY_DEP_ROOT:
            #print ("head",sentence, token.text)
            head = token
            
    return head  

def printTree(sentence):
    doc = nlp(str(sentence))
 
    for token in doc:
        print("{5}: {0}/{1} <--{2}-- {3}/{4} > {6};  type {7}".format(
           token.text, token.tag_, token.dep_, token.head.text, token.head.tag_,token.i,token.head.i, token.ent_id_))
        
        print(list(token.subtree))
        print ('**********')
        
def printEntities(sentence):
    doc = nlp(str(sentence))
    for ent in doc.ents:
        print(ent.text, ent.label_)
        
def findSubjectOfToken(sentence, targetToken):
    doc = nlp(sentence)
    for token in doc:
        if token.head.i == targetToken.i and  token.dep_ in [SPACY_DEP_NSUBJ]:
            return token
    
    return None
        
def findPrepsAttachedToToken(sentence, targetToken):
    doc = nlp(sentence)
    associatedPrepositionIds = {}
    for token in doc:
        if token.head.i == targetToken.i and token.tag_ == SPACY_TAG_PREP:
            associatedPrepositionIds[token.i] = token
    
    return associatedPrepositionIds

In [72]:
def parse(sentence):
    rootWord = getHeadOfSentence(sentence)
    whoDied = None
    whereDied = None
    whenDied = None
    age = None
    reason = None
    
    whoDied = getSubTreeString(findSubjectOfToken(sentence, rootWord))
    preps = findPrepsAttachedToToken(sentence, rootWord)
    
    for prepId, prepToken in preps.items():
        if prepToken.text == PREP_OF:
            reason = getSubTreeString(prepToken)
        
        elif prepToken.text in [PREP_AT, PREP_NEAR, PREP_IN]:
            subTreeText = getSubTreeString(prepToken)
            if 'age'  not in subTreeText:
                whereDied = subTreeText
            else:
                age = subTreeText
        elif prepToken.text == PREP_ON:
            whenDied = getSubTreeString(prepToken)
    print(sentence)            
    print ("Who: " ,whoDied, "where: ", whereDied, "whenDIed: ", whenDied, "age: ", age, "reason: ", reason)
    print("\n")
        

In [70]:
for sentence in sentences:
    sentence.replace('\n','')
    parse(sentence)

49-year-old boxer dies days after knockout in title fight

Who:  49-year - old boxer where:  in title whenDIed:  None age:  None reason:  None


Miss being there, kill it: Hardik Pandya on brother Krunal's debut

Who:   where:  None whenDIed:  None age:  None reason:  None


Leicester City's billionaire Thai owner dies in helicopter crash

Who:  Leicester City 's billionaire Thai owner where:  in helicopter crash 
 whenDIed:  None age:  None reason:  None


Former boxing champion dies after being hit by car in Italy

Who:  Former boxing champion where:  None whenDIed:  None age:  None reason:  None


You die of shame: Coach shouted at Tajinderpal before gold win

Who:  Coach where:  at Tajinderpal whenDIed:  None age:  None reason:  None


E-sports gamer kills self after killing 2 at gaming event

Who:  E - sports gamer where:  None whenDIed:  None age:  None reason:  None


Nepalese worker dies at 2022 World Cup stadium site in Qatar

Who:  Nepalese worker where:  at 2022 World Cup st

In [71]:
sentence = "49 year old boxer dies days after knockout in title fight"
sentence = "50 year old Cricket player John died of tumor in London"
#sentence = "John dies of Tumor in London on saturday at an age of 50 "
printTree(sentence)
printEntities(sentence)
parse(sentence)

0: 50/CD <--nummod-- year/NN > 1;  type 
[50]
**********
1: year/NN <--npadvmod-- old/JJ > 2;  type 
[50, year]
**********
2: old/JJ <--amod-- player/NN > 4;  type 
[50, year, old]
**********
3: Cricket/NNP <--compound-- player/NN > 4;  type 
[Cricket]
**********
4: player/NN <--nsubj-- died/VBD > 6;  type 
[50, year, old, Cricket, player, John]
**********
5: John/NNP <--appos-- player/NN > 4;  type 
[John]
**********
6: died/VBD <--ROOT-- died/VBD > 6;  type 
[50, year, old, Cricket, player, John, died, of, tumor, in, London]
**********
7: of/IN <--prep-- died/VBD > 6;  type 
[of, tumor]
**********
8: tumor/NN <--pobj-- of/IN > 7;  type 
[tumor]
**********
9: in/IN <--prep-- died/VBD > 6;  type 
[in, London]
**********
10: London/NNP <--pobj-- in/IN > 9;  type 
[London]
**********
50 year old DATE
John PERSON
London GPE
50 year old Cricket player John died of tumor in London
Who:  50 year old Cricket player John where:  in London whenDIed:  None age:  None reason:  of tumor




In [None]:
#sentence = "49 year old boxer dies days after knockout in title fight"
#sentence = "50 year old Cricket player John died of tumor at St Marks Hospital"
#sentence = "John dies of Tumor in London on saturday at an age of 50 "

# Problem:  in <car crash>.  in <hospital> -- Differnetiate??