## Approach
* search for NE ins text column
* for each NE search for related NE via head column
* iterate through head-text unitl NE found

In [1]:
import spacy
import nltk
import re
import pandas as pd

from spacy import displacy
from spacy.matcher import Matcher

In [68]:
nlp = spacy.load('de')
#text = u'''Meine kleine Enkelin Lisa und mein Enkel Lukas fliegen morgen nach London.'''
#text = u'''Herbert ist der Vater von Hans'''
text = u'''Peter und Maria gehen morgen ins Kino'''
#text = u'''Herbert sein Sohn und ich gehen heute ins Kino'''
#text = u'''Ich gehe mit Johann in den Zoo'''
#text = u'''Hans und sein Sohn Hubert gehen in den Zoo.'''
#text = u'''Hans, welcher der Sohn von Hubert ist, geht mit Peter ins Kino.'''
#text = u'''Hubert ist der Vater von Hans.'''
#text = u'''Peter und Michael gehen ins Kino.'''

In [69]:
text_long = u'''Meine kleine Enkelin Lisa und mein Enkel Lukas fliegen morgen nach London. Sie sind zum ersten Mal in England. 
        Peter und Maria gehen morgen ins Kino. Ich und mein Sohn gehen heute zum Fußball. 
        Ich gehe mit Johann in den Zoo. Hans und sein Sohn Hubert.'''

In [70]:
sentences = nltk.sent_tokenize(text)
sentences

['Peter und Maria gehen morgen ins Kino']

## Build Data Frame with NLP features

In [71]:
feature_columns = ['ne', 'ne_type', 'ne_pos', 'ne_dep', 'ne_head', 'ne_deppath', 'ne_children']
features = pd.DataFrame(columns=feature_columns)

In [72]:
for sentence in sentences:
    #sentence = re.sub('\W', ' ', sentence)
    #sentence = re.sub('\s{2,}', ' ', sentence)
    doc = nlp(sentence)
    
    tokens = []
    for token in doc:
        ne = token.text
        ne_dep = token.dep_
        head = token.head.text
        pos = token.pos_
        children = [child for child in token.children]
        #lemma = token.lemma_.lower()
        dep_path = []
        for chunk in doc.noun_chunks:
            if chunk.root.text == ne:
                dep_path.append([chunk.root.text, chunk.root.dep_, chunk.root.head.text])
        #print(token.text, token.dep_, token.head.text, token.head.pos_,
         # [child for child in token.children])
        data = {'ne': ne.lower(), 'ne_type': None, 'ne_pos': pos, 'ne_dep': ne_dep, 'ne_head': head.lower(), 
                'ne_deppath': dep_path, 'ne_children': children}
        training_ex = pd.Series(data, index=feature_columns)
        features = features.append(training_ex, ignore_index=True)
        
    for ent in doc.ents:
        features.loc[features['ne'] == ent.text.lower(), 'ne_type'] = ent.label_
        
    displacy.render(doc, style='dep', jupyter=True)
features
    

Unnamed: 0,ne,ne_type,ne_pos,ne_dep,ne_head,ne_deppath,ne_children
0,peter,PER,PROPN,sb,gehen,"[[Peter, sb, gehen]]",[und]
1,und,,CONJ,cd,peter,[],[Maria]
2,maria,PER,PROPN,cj,und,"[[Maria, cj, und]]",[]
3,gehen,,VERB,ROOT,gehen,[],"[Peter, morgen, ins]"
4,morgen,,ADV,mo,gehen,[],[]
5,ins,,ADP,mo,gehen,[],[Kino]
6,kino,,NOUN,nk,ins,"[[Kino, nk, ins]]",[]


## Extract Relationships
#### Apporach:
* search for NEs (according to entity list) in ne-column in the data frame
* if found
    * get all rows with the same *ne_head* value
    * search for NEs in these rows inside the *ne* column
* if found, assume relationship between these NEs
* if no NE found, assume transitive relation
    * get all rows with *ne* value in *ne_head* column
    * search for NEs in thes rows inside the *ne* column

In [8]:
relationships = ['vater', 'mutter', 'sohn', 'tochter', 'bruder', 'schwester', 'enkel', 'enkelin', 'nichte',
            'neffe', 'onkel', 'tante']
me_entities = ['ich', 'mein', 'meine', 'meinen', 'meines']

In [9]:
direct_rels = possible_rel[(possible_rel['ne_type'] == 'PER') | (possible_rel['ne'].isin(me_entities)) 
                                   & ~possible_rel['ne'].isin([exclude_elem])]
direct_rels

NameError: name 'possible_rel' is not defined

In [10]:
rel_list = []
def iterate(exclude_elem, head):
    possible_rel = features[(features['ne_head'] == head) & ~features['ne'].isin([exclude_elem])
                                & ~features['ne_dep'].isin(['ROOT'])]

    # check if column 'ne' of possible_rel contains one or more named entities (real world entites)
    direct_rels = possible_rel[(possible_rel['ne_type'] == 'PER') | (possible_rel['ne'].isin(me_entities)) 
                                   & ~possible_rel['ne'].isin([exclude_elem])]
    
    if len(direct_rels) > 0:
        for ent in direct_rels.iterrows():
            entity = ent[1]['ne']

            if rel_list:
                relationship = [word for word in rel_list if word in relationships]
                if relationship:
                    print(f"({exclude_elem})-[{relationship}]->({entity})")
                else:
                    print(f"({exclude_elem})-['KNOWS']->({entity})")
            else:
                print(f"({exclude_elem})-[{head}]->({entity})")

            rel_list.clear()


    else:  # if no direct relationship between names was found iterate possible transitive rels
        for row in possible_rel.iterrows():
            entity = row[1]['ne']

            #if row[1]['ne_dep'] != 'root':  # look for transitive relationship
            rel_list.append(entity)
            iterate(exclude_elem, entity)


for i, row in enumerate(features['ne'].iteritems()):
    elem = row[1].lower()
    rel = []
    if elem in me_entities or features['ne_type'][i] == 'PER':
        head = features['ne_head'][i].lower()
        #print(f'{elem} {head}')
        iterate(elem, head)

rel_list

(hans)-['KNOWS']->(peter)
(hans)-[mit]->(peter)
(hans)-['KNOWS']->(peter)
(hans)-[mit]->(peter)
(hans)-['KNOWS']->(peter)
(hans)-[mit]->(peter)
(hans)-['KNOWS']->(peter)
(hans)-[mit]->(peter)


['ins', 'kino', 'kino', '.', 'welcher', 'welcher']