## Approach (based on Eichler et al. 2008)
* Find at leas two NEs in a sentence
* Extract simplified dependency tree (skeleton) from each NE element to the root element (by default the VERB)
* Extract information based on the dependency types

In [184]:
import spacy
import nltk
import re
import pandas as pd

from spacy import displacy
from spacy.matcher import Matcher
from spacy.symbols import nsubj, pobj, obj, VERB, PERSON

In [254]:
#nlp = spacy.load('en')
nlp = spacy.load('de')
#text = u'''Meine kleine Enkelin Lisa und mein Enkel Lukas fliegen morgen nach London.'''
#text = u'''Herbert ist der Vater von Hans'''
#text = u'''Peter und Maria gehen morgen ins Kino'''
#text = u'''Herbert sein Sohn und ich gehen heute ins Kino'''
text = u'''Sein Sohn Hans und ich gehen heute ins Kino'''
#text = u'''Subsequent members of the Hohenzollern family ruled until 1918 in Berlin, first as electors of Brandenburg.'''

In [255]:
text = re.sub(r'\W', ' ', text)
text = re.sub(r'\s{2}', ' ', text)
text

'Sein Sohn Hans und ich gehen heute ins Kino'

In [256]:
doc = nlp(text)
displacy.render(doc, style='dep', jupyter=True)

In [257]:
import pandas as pd
feature_columns = ['ne', 'ne_type', 'ne_dep', 'ne_head', 'ne_pos', 'children']
features = pd.DataFrame(columns=feature_columns)
  
tokens = []
for token in doc:
    ne = token.text
    ne_dep = token.dep_
    head = token.head.text
    pos = token.pos_
    children = [child for child in token.children]
    #lemma = token.lemma_.lower()
    #print(token.text, token.dep_, token.head.text, token.head.pos_,
     # [child for child in token.children])
    data = {'ne': ne.lower(), 'ne_type': None, 'ne_dep': ne_dep,'ne_head': head.lower(), 'ne_pos': pos, 'children': children}
    training_ex = pd.Series(data, index=feature_columns)
    features = features.append(training_ex, ignore_index=True)

for ent in doc.ents:
    features.loc[features['ne'] == ent.text.lower(), 'ne_type'] = ent.label_

features

Unnamed: 0,ne,ne_type,ne_dep,ne_head,ne_pos,children
0,sein,,nk,sohn,DET,[]
1,sohn,,ROOT,sohn,NOUN,"[Sein, Hans, und]"
2,hans,PER,nk,sohn,PROPN,[]
3,und,,cd,sohn,CONJ,[gehen]
4,ich,,sb,gehen,PRON,[]
5,gehen,,cj,und,VERB,"[ich, heute, ins]"
6,heute,,mo,gehen,ADV,[]
7,ins,,mo,gehen,ADP,[Kino]
8,kino,,nk,ins,NOUN,[]


### Build skeleton
iterate dependecies back from NE

In [258]:
for chunk in doc.noun_chunks:
    print(chunk.text, chunk.root.text, chunk.root.dep_,
          chunk.root.head.text)

Sein Sohn Hans Sohn ROOT Sohn
ich ich sb gehen
Kino Kino nk ins


In [259]:
ne_list = []

for ent in doc.ents:
    print(ent.text, ent.label_)
    #if ent.label_ == 'PERSON' or ent.label_ == 'GPE': # en
    #if ent.label_ == 'PER':  # de
    ne_list.append(ent.text)
    
ne_list

Hans PER


['Hans']

In [262]:

class Node:
    name = None
    ancestors = []
    root = None
    

In [263]:
root = [token for token in doc if token.head == token][0]
entity_dict = {}
nodes = []

#print(list(root.lefts))[0]
for token in doc:
    ancestors = []
    node = Node()
    if token.text in ne_list:
        node.name = token.text
        for ancestor in token.ancestors:
            
            if ancestor == root:
                node.root = ancestor.text
                
            else:
                ancestors.append([ancestor, ancestor.dep_])
        
        nodes.append(node)            
        entity_dict[token] = ancestors

print(entity_dict)
nodes

Sohn
gehen
und
Sohn
{Sohn: [], Hans: [], ich: [[gehen, 'cj'], [und, 'cd']]}


[<__main__.Node at 0x26c9ec4b6a0>,
 <__main__.Node at 0x26c9ec4b128>,
 <__main__.Node at 0x26c9ec4bfd0>]

In [158]:
root = [token for token in doc if token.head == token][0]
for node in nodes:
    for a in 
    print(f'{node.name}')
    


[[ist, 'ROOT']]
[[von, 'pg'], [Vater, 'sb'], [ist, 'ROOT']]


In [None]:
from networkx import nx_pydot
import os
os.environ["PATH"] += os.pathsep + 'C:/Program Files (x86)/Graphviz2.38/bin/'

ng.add_edge(source, target, weight=int(w))

e_small = [(u, v) for (u, v, d) in ng.edges(data=True) if d['weight'] <= 3]
e_middle = [(u, v) for (u, v, d) in ng.edges(data=True) if d['weight'] <= 6]
e_large = [(u, v) for (u, v, d) in ng.edges(data=True) if d['weight'] > 6]
#pos = nx.spring_layout(ng)  # positions for all nodes
pos = nx.nx_pydot.graphviz_layout(ng, prog='dot')

# nodes
nx.draw_networkx_nodes(ng, pos, node_size=300)

# edges
nx.draw_networkx_edges(ng, pos, edgelist=e_small, width=1)
nx.draw_networkx_edges(ng, pos, edgelist=e_middle, width=2)
nx.draw_networkx_edges(ng, pos, edgelist=e_large, width=6)

# labels
nx.draw_networkx_labels(ng, pos, font_size=11, font_family='sans-serif')

plt.axis('off')  # disable axis
plt.show()


### select subject and object

for english texts extract verb correspondants

In [127]:
subjects = set()
objects = set()
de_objects = ['oa', 'oc', 'og', 'op']

for elem in doc:
    if elem.dep == nsubj and elem.head.pos == VERB:
        subjects.add(elem.text)
        
    if elem.dep == pobj and elem.head.pos == VERB:
        objects.add(elem.text)
        

print(subjects, objects)

Herbert
ist
der
Vater
von
Hans
set() set()


#### search subject, predicate and object

In [145]:
subjects = set()
objects = set()
de_objects = ['oa', 'oc', 'og', 'op', 'pg']


for elem in doc:
    if elem.dep_ == 'sb':
        subjects.add(elem.text)
    
    if elem.dep_ in de_objects:
        objects.add(elem.text)
        

print(subjects, objects)

{'Vater'} {'von'}
