### Dataset preprocessing
Process the text to extract utterances and non-utterances and match the samples with the labelled dataset

#### Book utterances and non-utterances extraction

In [1]:
from src.character import characters
from src.curation import curation

# Run the project's curation step over the imported character list.
# NOTE(review): presumably this is what produces corpus/curated_text.txt, read in a later cell — confirm in src/curation.
curation(characters)

In [2]:
# Display every character (name, gender, aliases) to eyeball the curated list.
# NOTE(review): the recorded output lists Mr_Denny with gender F — worth checking in src.character.
for character in characters:
    print(character)

name: Mrs_Annesley
gender: F
aliases: ['Mrs. Annesley', 'Annesley']
name: Elizabeth_Bennet
gender: F
aliases: ['Elizabeth Bennet', 'Miss Elizabeth Bennet', 'Miss Elizabeth', 'Miss Lizzy', 'Miss Bennet', 'Miss Eliza', 'Eliza Bennet', 'Elizabeth', 'Lizzy', 'Liz', 'Eliza']
name: Jane_Bennet
gender: F
aliases: ['Jane Bennet', 'Jane']
name: Lydia_Bennet
gender: F
aliases: ['Lydia Bennet', 'Miss Lydia Bennet', 'Miss Lydia', 'Lydia']
name: Kitty_Bennet
gender: F
aliases: ['Kitty Bennet', 'Catherine Bennet', 'Kitty']
name: Mary_Bennet
gender: F
aliases: ['Mary Bennet', 'Mary']
name: Mrs_Bennet
gender: F
aliases: ['Mrs. Bennet']
name: Caroline_Bingley
gender: F
aliases: ['Caroline Bingley', 'Caroline', 'Miss Bingley']
name: Charlotte
gender: F
aliases: ['Charlotte', 'Charlotte Lucas', 'Mrs. Collins', 'Miss Lucas']
name: Lady_Catherine
gender: F
aliases: ['Lady Catherine', 'Catherine']
name: Mr_Denny
gender: F
aliases: ['Mr. Denny']
name: Lady_Anne_Darcy
gender: F
aliases: ['Lady Anne Darcy', 'L

In [3]:
# Load the whole curated book into memory in one read.
# The file is only read, so it is opened read-only: 'r+' needlessly required
# write permission on the corpus file.
with open('corpus/curated_text.txt', 'r') as raw_text_file:
    text = raw_text_file.read()

In [4]:
import re

# Segment the book into samples, driven by the `` (opening) and '' (closing) quote markers.
annotations = []  # finished samples: dicts with "only_utterance_us", "source" and "parts"
is_utterance = False  # True after an opening ``, False after a closing ''
processed = ""  # utterance-only text of the current sample; narration in between becomes " [X] "
source = ""  # text of the current sample as accumulated below, quote markers included
sample_parts = []  # ordered {"text", "utterance"} fragments of the current sample
# Collapse runs of spaces into one; a single space is prepended to the text first.
text = re.sub(' +', ' ', " "+text) 
# Split on the quote markers and keep them as parts; re.split yields None for the
# capture group that did not match, hence the filter.
parts = list(p for p in re.split("(``)|('')", text) if p is not None)
i = 0
next_quote_doesnt_count = False  # NOTE(review): never read in this cell — looks like a leftover
while i < len(parts):
    part = parts[i]
    # Quote marker: switch between utterance and narration mode and record it in the source.
    if part == '``' or part == "''":
        is_utterance = part == '``'
        source += part
        i += 1
        continue
    if not is_utterance:
        if "\n\n" in part: # before or after an utterance
            # A blank line ends the current sample: the first fragment closes it,
            # the last fragment opens the next one.
            lines = part.split("\n\n")
            if processed != "":
                if lines[0] != "":
                    sample_parts.append({"text": lines[0], "utterance": False})
                source += lines[0]
                # Drop a trailing placeholder so the utterance text does not end with " [X] ".
                if processed[-5:] == " [X] ":
                    processed = processed[:-5]
                if processed != "":
                    annotations.append({
                        "only_utterance_us": processed,
                        "source": source,
                        "parts": sample_parts
                    })
            # Start a fresh sample seeded with the narration that follows the blank line.
            processed = ""
            if lines[-1] != "":
                sample_parts = [({"text": lines[-1], "utterance": False})]
            else:
                sample_parts = []
            source = lines[-1]
        else: # in the middle of an utterance
            # Narration between two quoted pieces of the same paragraph.
            sample_parts.append({"text": part, "utterance": False})
            source += part
            if part != " -- ":
                # Mark the narration with a single " [X] " placeholder (never two in a row).
                if processed[-5:] != " [X] ":
                    processed += " [X] "
            else:
                # A bare dash between quotes only contributes a space.
                processed += " "
    else:
        # Quoted text: keep it as an utterance part, flattening paragraph breaks to spaces.
        sample_parts.append({"text": part, "utterance": True})
        monoline = " ".join(part.split("\n\n"))
        processed += monoline
        source += monoline
    i += 1
# NOTE(review): samples are only appended in the "\n\n" branch above, so a sample still
# open when the loop ends is not added to `annotations`.

#### Match the annotated dataset with the re-processed dataset

In [5]:
# Map each utterance-only text to its index in `annotations`.
# Iterating in reverse means that, for duplicated texts, the earliest index is the one kept.
processed_to_index = {annotation["only_utterance_us"]: i for i, annotation in reversed(list(enumerate(annotations)))}

In [6]:
def strip_equal(a, b, l):
    """Tell whether ``a`` and ``b`` agree on their first ``l`` significant characters.

    ``[X]`` placeholders and all whitespace are removed from both strings
    before their ``l``-character prefixes are compared.
    """
    def _significant_prefix(value):
        return re.sub(r'(\[X\])|\s', '', value)[:l]

    return _significant_prefix(a) == _significant_prefix(b)

In [7]:
# Attach the hand-made labels of corpus/curated_dialogs.txt to the re-processed samples.
# Each line of the file is "<index>\t<label>\t<utterance>".
with open('corpus/curated_dialogs.txt') as annoted_text_file:
    annotated_text_lines = annoted_text_file.readlines()
    for annoted_line in annotated_text_lines:
        annotation_i, label, utterance = annoted_line.split('\t')
        # Normalise whitespace so the text can be compared with "only_utterance_us".
        # (raw string: '\s' in a plain literal is an invalid escape sequence)
        utterance = re.sub(r'\s+', ' ', utterance.strip())
        if utterance in processed_to_index and "target" not in annotations[processed_to_index[utterance]]:
            # Exact match on a sample that has not been labelled yet.
            annotation = annotations[processed_to_index[utterance]]
        else:
            # Fallback: first unlabelled sample whose first 100 significant characters agree.
            annotation = next((a for a in annotations if strip_equal(a['only_utterance_us'], utterance, 100) and "target" not in a), None)
            if annotation is None:
                # Previously this surfaced as an opaque "'NoneType' object is not subscriptable".
                raise ValueError(
                    "no unlabelled sample matches annotated line {!r} (label {!r}): {!r}".format(
                        annotation_i, label, utterance
                    )
                )
            if annotation['only_utterance_us'] != utterance:
                # Show approximate matches so they can be reviewed by hand.
                print(annotation['only_utterance_us'])
                print("--")
        assert "target" not in annotation
        annotation["only_utterance_article"] = utterance
        annotation["target"] = label

My dear Elizabeth_Bennet, I have the highest opinion in the world of your excellent judgment in all matters within the scope of your understanding, but permit me to say that there must be a wide difference between the established forms of ceremony amongst the laity, and those which regulate the clergy; for give me leave to observe that I consider the clerical office as equal in point of dignity with the highest rank in the kingdom -- provided that a proper humility of behaviour is at the same time maintained. You must therefore allow me to follow the dictates of my conscience on this occasion, which leads me to perform what I look on as a point of duty. Pardon me for neglecting to profit by your advice, which on every other subject shall be my constant guide, though in the case before us I consider myself more fitted by education and habitual study to decide on what is right than a young lady like yourself. [X] apology, [X] Hunsford, [X] Lady_Catherine de Anne_de_Bourgh.
--
delightful,

In [8]:
# List the samples that received no label ("target") in the matching step.
for a in annotations:
    if "target" not in a:
        print(a)

#### Stanford parser annotations

Load the Stanford parser

In [10]:
import nltk
from nltk.parse import stanford

# Paths to the Stanford parser jar and to its models jar, relative to the working directory.
jar = 'stanford-parser-full-2017-06-09/stanford-parser.jar'
model = 'stanford-parser-full-2017-06-09/stanford-parser-3.8.0-models.jar'

# Dependency parser using the English PCFG model.
# NOTE(review): `model` is passed before `jar`; NLTK documents the order as
# (path_to_jar, path_to_models_jar) — confirm the swap is harmless/intentional.
dep_parser = stanford.StanfordDependencyParser(model, jar, model_path='edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz', encoding='utf8')

# Add Stanford parser annotations such as the speaker's name or gender

#######################################################
# Stanford parser rules: triples -> name, gender, etc #
#######################################################
def extract_features(speaker_name, speaker_role, speaker_gender, triple):
    """Update the speaker features from one dependency triple.

    ``triple`` is ``((word1, tag1), dep, (word2, tag2))``. It is only used when
    its first term is a past-tense verb (tag ``VBD``) or a known speech verb,
    its second term is a noun or pronoun, and the relation is not an ``nmod``.
    A proper noun then becomes the speaker name, any other noun the speaker
    role, and a gendered word sets the speaker gender.

    Returns the (possibly updated) ``(speaker_name, speaker_role, speaker_gender)``.
    """
    (head_word, head_tag), relation, (child_word, child_tag) = triple

    head_is_speech_verb = head_tag.startswith('VBD') or head_word in (
        'said', 'added', 'cried', 'asked', 'replied', 'returned', 'continued', 'observed')
    child_is_nominal = child_tag.startswith('NN') or child_tag.startswith('PRP')
    if not head_is_speech_verb or not child_is_nominal or relation.startswith('nmod'):
        # Not a "verb -> speaker" relation: leave the features untouched.
        return (speaker_name, speaker_role, speaker_gender)

    if child_tag.startswith('NNP'):
        speaker_name = child_word
    elif child_tag.startswith('NN'):
        speaker_role = child_word

    if child_word in ('he', 'man', 'boy', 'lad'):
        speaker_gender = 'M'
    if child_word in ('she', 'lady', 'girl'):
        speaker_gender = 'F'
    return (speaker_name, speaker_role, speaker_gender)


# Annotate every narration (non-utterance) part of every sample with the
# speaker features found by the dependency parser.
# NOTE(review): the recorded output further down shows a 'speaker_function' key while this
# code writes 'speaker_role' — that output was likely produced by an earlier version of the cell.
for sample in annotations:
    for part in sample["parts"]:
        if not part["utterance"]:
            speaker_name = None
            speaker_role = None
            speaker_gender = None
            # Only the first 200 characters of the narration are tokenized and tagged.
            tokens = nltk.word_tokenize(part["text"][:200])
            tagged = nltk.pos_tag(tokens)
            try:
                # Flatten the parser output into a list of triple lists, one per parse.
                dependencies = sum([[list(parse.triples()) for parse in dep_graphs] for dep_graphs in dep_parser.tagged_parse_sents([tagged])],[])
                # Only the triples of the first parse are used.
                for (term1,dep,term2) in dependencies[0]:
                    #print(term1, dep, term2)
                    speaker_name, speaker_role, speaker_gender = extract_features(speaker_name, speaker_role, speaker_gender, (term1,dep,term2) )
                    # try reverse order
                    speaker_name, speaker_role, speaker_gender =  extract_features(speaker_name, speaker_role, speaker_gender, (term2,dep,term1) )
            except Exception as e:
                    # Best effort: report the failing part and keep whatever features were found so far.
                    print(e)
                    print(part)
            part["speaker_name"] = speaker_name
            part["speaker_role"] = speaker_role
            part["speaker_gender"] = speaker_gender
            #print(part)

In [14]:
# Number of extracted samples.
print(len(annotations))

1294


In [15]:
# Show the first 20 samples.
annotations[:20]

[{'only_utterance_article': 'My dear Mr_Bennet, [X] have you heard that Netherfield Park is let at last?',
  'only_utterance_us': 'My dear Mr_Bennet, [X] have you heard that Netherfield Park is let at last?',
  'parts': [{'text': 'My dear Mr_Bennet,', 'utterance': True},
   {'speaker_function': 'lady',
    'speaker_gender': 'F',
    'speaker_name': None,
    'text': ' said his lady to him one day, ',
    'utterance': False},
   {'text': 'have you heard that Netherfield Park is let at last?',
    'utterance': True}],
  'source': "``My dear Mr_Bennet,'' said his lady to him one day, ``have you heard that Netherfield Park is let at last?''",
  'target': 'Mrs_Bennet'},
 {'only_utterance_article': 'But it is, [X] for Mrs_Long has just been here, and she told me all about it.',
  'only_utterance_us': 'But it is, [X] for Mrs_Long has just been here, and she told me all about it.',
  'parts': [{'text': 'But it is,', 'utterance': True},
   {'speaker_function': None,
    'speaker_gender': 'F',
 

Dump the annotated dataset

In [16]:
# Persist the annotated samples; the context manager makes sure the file is
# flushed and closed (the bare open() call left the handle dangling).
with open("corpus/dataset.pkl", "wb") as dataset_file:
    pickle.dump(annotations, dataset_file)

In [12]:
import pickle

In [17]:
# To facilitate comparison with curated_dialogs.txt: write one line per sample,
# made of its parts joined by spaces, newlines flattened, cut to 200 characters.
# The output handle and the per-sample string have their own names so the
# notebook-level `text` (book contents) and `parts` (quote split) are not overwritten.
with open('corpus/dataset.pkl', 'rb') as pick:
    with open('corpus/dataset.txt', 'w+') as dataset_txt_file:
        # pickle must only be used on trusted files; this one is the notebook's own dump.
        annotations = pickle.load(pick)
        for a in annotations:
            sample_line = " ".join(re.sub(r'\n', ' ', part["text"]) for part in a["parts"])
            dataset_txt_file.write(sample_line[:200] + "\n")

In [13]:
# Reload the pickled dataset into a separate variable and print sample 858 as a spot check.
with open('corpus/dataset.pkl', 'rb') as pick:
    annotations2 = pickle.load(pick)
    print(annotations2[858])

{'only_utterance_us': 'for your housekeeper, [X] informed us that you would certainly not be here till to-morrow; and indeed, before we left Bakewell we understood that you were not immediately expected in the country. [X] They will join me early tomorrow, [X] and among them are some who will claim an acquaintance with you, -- Mr_Bingley and his sisters.', 'source': "After walking some time in this way, the two ladies in front, the two gentlemen behind, on resuming their places after descending to the brink of the river for the better inspection of some curious water-plant, there chanced to be a little alteration. It originated in Mrs_Gardiner, who, fatigued by the exercise of the morning, found Elizabeth_Bennet's arm inadequate to her support, and consequently preferred her husband's. Mr_Darcy took her place by her niece, and they walked on together. After a short silence, the lady first spoke. She wished him to know that she had been assured of his absence before she came to the plac

In [1]:
import nltk
from nltk.parse import stanford

# Paths to the Stanford parser jar and to its models jar, one directory above the working directory.
jar = '../stanford-parser-full-2017-06-09/stanford-parser.jar'
model = '../stanford-parser-full-2017-06-09/stanford-parser-3.8.0-models.jar'

# Dependency parser using the English PCFG model.
# NOTE(review): `model` is passed before `jar`; NLTK documents the order as
# (path_to_jar, path_to_models_jar) — confirm the swap is harmless/intentional.
dep_parser = stanford.StanfordDependencyParser(model, jar, model_path='edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz', encoding='utf8')

In [3]:
import re

In [31]:
def extract_features_from_triple(triple, speaker_name, speaker_role, speaker_gender, tokens_position):
    """Update the speaker features from one ``((word1, tag1), dep, (word2, tag2))`` triple.

    The triple is used when either
    * its first term is a ``VBD`` verb or a known speech verb, its second term
      is a noun or pronoun and the relation is not an ``nmod``; or
    * its first term is a ``VBD`` verb at position 0 (per ``tokens_position``)
      and the relation is ``dobj``.

    A proper noun becomes the speaker name, a common noun found in the global
    ``true_roles`` becomes the speaker role, and a gendered word sets the
    speaker gender. Every update is printed. Returns the (possibly updated)
    ``(speaker_name, speaker_role, speaker_gender)``.
    """
    (head_word, head_tag), relation, (child_word, child_tag) = triple
    head_is_past = head_tag.startswith('VBD')

    speech_attribution = (
        (head_is_past or head_word in ['said', 'added', 'cried', 'asked', 'replied', 'returned', 'continued', 'observed'])
        and child_tag.startswith(('NN', 'PRP'))
        and not relation.startswith('nmod')
    )
    if not speech_attribution:
        # Second chance: direct object of a past-tense verb that opens the text.
        leading_verb_object = (
            head_is_past
            and relation == "dobj"
            and tokens_position[(head_word, head_tag)] == 0
        )
        if not leading_verb_object:
            return (speaker_name, speaker_role, speaker_gender)

    if child_tag.startswith('NNP'):
        speaker_name = child_word
        print("speaker_name = {}: {}".format(speaker_name, str(triple)))
    elif child_tag.startswith('NN') and child_word in true_roles:
        speaker_role = child_word
        print("speaker_role = {}: {}".format(speaker_role, str(triple)))

    if child_word in ['he', 'man', 'boy', 'lad']:
        speaker_gender = 'M'
        print("speaker_gender = {}: {}".format(speaker_gender, str(triple)))
    if child_word in ['she', 'lady', 'girl']:
        speaker_gender = 'F'
        print("speaker_gender = {}: {}".format(speaker_gender, str(triple)))
    return (speaker_name, speaker_role, speaker_gender)

def extract_features_from_text(text):
    """Parse ``text`` and derive the speaker features from its dependency triples.

    The text is tokenized and POS-tagged with nltk, parsed with the global
    ``dep_parser``, and every triple of the first parse is fed to
    ``extract_features_from_triple``.

    Returns ``((speaker_name, speaker_role, speaker_gender), triples)``.
    """
    tagged = nltk.pos_tag(nltk.word_tokenize(text))
    # Position of each (word, tag) pair; a repeated pair keeps its last position.
    tokens_position = dict((token, index) for index, token in enumerate(tagged))
    parses = [
        list(graph.triples())
        for sentence_graphs in dep_parser.tagged_parse_sents([tagged])
        for graph in sentence_graphs
    ]
    triples = parses[0]
    features = (None, None, None)
    for triple in triples:
        features = extract_features_from_triple(triple, *features, tokens_position)
    return features, triples

In [33]:
# NOTE(review): as written this call omits the required `text` argument; the recorded
# output below (triples for "... husband sang for me") suggests a sentence was passed originally.
extract_features_from_text()

((None, None, None),
 [(('sang', 'NN'), 'nmod:poss', ('My', 'PRP$')),
  (('sang', 'NN'), 'compound', ('husband', 'NN')),
  (('sang', 'NN'), 'nmod', ('me', 'PRP')),
  (('me', 'PRP'), 'case', ('for', 'IN'))])

In [604]:
# Word lists used by the ranking heuristics.
# NOTE(review): `stemmer` is not defined in the cells shown here — it must come from an earlier cell.
# Pronouns that can stand for the subject.
subj_prp = ["i", "you", "he", "she", "we", "they", "somebody", "anybody", "noone", "anyone", "it"]
# Pronouns that can stand for the object.
obj_prp = ["me", "you", "him", "her", "us", "them", "somebody", "anybody", "noone", "anyone", "it"]
# Stems of the verbs that introduce or accompany speech.
expression_verbs = [stemmer.stem(v) for v in ['said', 'added', 'cried', 'asked', 'replied', 'returned', 'continued', 'observed', 'call', 'read']]
# Role lexicon: keys are stems (e.g. "ladi", "uncl"), values are the role names written into relations.
true_roles = {
    "husband": "husband",
    "ladi": "wife",
    "wife": "wife",
    "aunt": "aunt",
    "uncl": "uncle",
    "father": "father",
    "sister": "sister",
    "mother": "mother",
    "daughter": "daughter",
    "son": "son",
    "cousin": "cousin",
    "fiance": "married"
}

In [517]:
# Show the stem of every key of true_roles.
[stemmer.stem(r) for r in true_roles]

['fiance',
 'son',
 'ladi',
 'aunt',
 'husband',
 'sister',
 'father',
 'wife',
 'cousin',
 'mother',
 'daughter',
 'uncl']

In [184]:
# Scratch example: parse a narration fragment with the words replaced by their positions.
tokens = nltk.word_tokenize(
    "repeated Caroline_Bingley to her sister."
)
# Each word is swapped for its index (as a string) so the parser's triples carry token positions.
tagged = [(str(i), t) for i, (s, t) in enumerate(nltk.pos_tag(tokens))]
# Stems, index-aligned with `tagged`.
stemmed = [stemmer.stem(t) for t in tokens]
# Triples of the first parse returned for the single tagged sentence.
triples = sum([[list(parse.triples()) for parse in dep_graphs] for dep_graphs in dep_parser.tagged_parse_sents([tagged])],[])[0]
# detect the subj of the sentence
# detect the subj of the sentence

In [327]:
# Extract the subject
def extract_subj_from_triples(triples, stemmed, return_ranks=False):
    """Pick the token most likely to be the subject.

    ``triples`` are position-based: ``((head_pos, head_tag), dep, (child_pos, child_tag))``.
    ``stemmed`` holds the stem of every token, indexed by position. Each triple
    gives points to its child token: for being a noun, a subject pronoun
    (``subj_prp``), an ``nsubj``, a dependent of a verb (more if that verb is
    the first token, more again for a ``dobj``), or a dependent of a speech
    verb (``expression_verbs``).

    Returns ``(best_score, best_position)``, plus the full score list when
    ``return_ranks`` is true. Ties go to the highest position.
    """
    scores = [0 for _ in stemmed]

    def credit(position, points=1):
        scores[position] += points

    for (head, head_tag), relation, (child, child_tag) in triples:
        head, child = int(head), int(child)
        governed_by_first_token = head == 0
        if child_tag.startswith("NN"):
            credit(child)
        if child_tag.startswith("PRP") and stemmed[child] in subj_prp:
            credit(child)
        if relation == 'nsubj':  # child is marked as subject
            credit(child, 1 if governed_by_first_token else 2)
        if head_tag.startswith("VB"):  # dependency toward a verb
            credit(child)
            if governed_by_first_token:  # first verb after the utterance
                credit(child)
                if relation.startswith("dobj"):
                    credit(child)
        if stemmed[head] in expression_verbs:
            credit(child)

    best_score, best_position = max(
        (score, position) for position, score in enumerate(scores)
    )
    if not return_ranks:
        return best_score, best_position
    return best_score, best_position, scores

In [570]:
import string

# Extract the destination character
def extract_dest_from_triples(triples, stemmed, return_ranks=False):
    """Pick the token most likely to be the destination of the speech.

    ``triples`` are position-based: ``((head_pos, head_tag), dep, (child_pos, child_tag))``.
    ``stemmed`` holds the stem of every token, indexed by position. Each triple
    gives points to its child token: for being a noun, an object pronoun
    (``obj_prp``), a dependent of a verb (more if that verb is the first
    token), an ``nmod``, or a dependent of a speech verb (``expression_verbs``).

    Returns ``(best_score, best_position)``, plus the full score list when
    ``return_ranks`` is true. Ties go to the highest position.
    """
    scores = [0 for _ in stemmed]

    def credit(position):
        scores[position] += 1

    for (head, head_tag), relation, (child, child_tag) in triples:
        head, child = int(head), int(child)
        if child_tag.startswith("NN"):
            credit(child)
        if child_tag.startswith("PRP") and stemmed[child] in obj_prp:
            credit(child)
        if head_tag.startswith("VB"):  # dependency toward a verb
            credit(child)
            if head == 0:  # first verb after the utterance
                credit(child)
        if relation.startswith("nmod"):
            credit(child)
        if stemmed[head] in expression_verbs:
            credit(child)

    best_score, best_position = max(
        (score, position) for position, score in enumerate(scores)
    )
    if not return_ranks:
        return best_score, best_position
    return best_score, best_position, scores


#Extract the relation
def extract_relational_mod_from_triples(triples, stemmed, token_i, return_ranks=False):
    """Pick the dependent of token ``token_i`` most likely to be its relational modifier.

    Only the triples whose head position equals ``token_i`` are considered.
    Their child token gets points for being a noun, a pronoun, an ``nmod``
    dependent, or for hanging off a speech verb (``expression_verbs``).

    Returns ``(best_score, best_position)``, plus the full score list when
    ``return_ranks`` is true. Ties go to the highest position, so when no
    triple starts at ``token_i`` the result is ``(0, len(stemmed) - 1)``;
    callers are expected to check the score.
    """
    nmod_ranks = [0] * len(stemmed)
    for (p1, t1), dep, (p2, t2) in triples:
        p1, p2 = int(p1), int(p2)
        if p1 != token_i:
            continue
        if t2.startswith("NN"):
            nmod_ranks[p2] += 1
        if t2.startswith("PRP"):
            nmod_ranks[p2] += 1
        if dep.startswith("nmod"):
            nmod_ranks[p2] += 1
        if stemmed[p1] in expression_verbs:
            nmod_ranks[p2] += 1
    # A first max(..., key=...) used to run here and was immediately overwritten
    # by this line; the dead computation has been removed.
    nmod_score, nmod = max(zip(nmod_ranks, range(len(stemmed))))
    if return_ranks:
        return nmod_score, nmod, nmod_ranks
    return nmod_score, nmod

def extract_features_from_token(token, *token_codes):
    """Placeholder: ignore the arguments and return a fresh, empty ``(names, properties)`` pair of sets."""
    return set(), set()

def get_tree_leaves(tree, root_i):
    """Return the sorted 0-based positions of every token in the subtree rooted at ``root_i``.

    ``root_i`` is a 0-based token position; tree addresses are 1-based, hence
    the ``+ 1`` on the way in and the ``- 1`` on the way out. Despite the name,
    the root itself and all intermediate nodes are included, not only leaves.

    Each entry of a node's ``deps`` maps a relation to a *list* of child
    addresses. All of them are visited: the previous ``for [v] in ...``
    unpacking raised ``ValueError`` as soon as a relation had zero or several
    children (``custom_triples`` already iterates them this way).
    """
    def _rec(address):
        node = tree.get_by_address(address)
        yield node['address'] - 1
        for child_addresses in node['deps'].values():
            for child_address in child_addresses:
                yield from _rec(child_address)

    return sorted(_rec(root_i + 1))

def switch_tokens_in_tree(token1_root, token2_root, tree, lists, remove1=False, remove2=False):
    """Swap the tokens of two subtrees inside every list of ``lists``.

    ``token1_root`` / ``token2_root`` are 0-based positions of the subtree
    roots; their member positions come from ``get_tree_leaves``. This is a
    generator yielding one rebuilt list per input list: where the first
    subtree started, the second subtree's items are inserted (unless
    ``remove2``), and vice versa (unless ``remove1``); items of neither
    subtree are copied through unchanged.
    """
    tokens1 = [t for t in get_tree_leaves(tree, token1_root)]
    tokens2 = [t for t in get_tree_leaves(tree, token2_root)]
    # Insertion points: first position of each subtree, taken before the sort below.
    f1, f2 = tokens1[0], tokens2[0]
    # NOTE(review): this orders the two position lists, so when subtree 1 starts after
    # subtree 2 the names tokens1/tokens2 swap while f1/f2 do not — confirm this is intended.
    tokens1, tokens2 = sorted((tokens1, tokens2))
    for l in lists:
        newl = []
        for i in range(len(l)):
            if i == f1 and not remove2:
                newl.extend([l[t] for t in tokens2])
            if i == f2 and not remove1:
                newl.extend([l[t] for t in tokens1])
            if i not in tokens1 and i not in tokens2:
                newl.append(l[i])
        yield newl

In [576]:
def extract_features_from_text(text, debug=False):
    """Extract names and properties of the subject and destination of a narration text.

    The text is parsed with the global ``parser``; tokens are ranked with
    ``extract_subj_from_triples`` and ``extract_dest_from_triples``, and the
    subtree of each selected token is scanned for names (capitalized tokens),
    gender words and roles (``true_roles``).

    Returns ``((subj_names, subj_properties), (dest_names, dest_properties))``,
    four sets of strings. If parsing fails, two 4-tuples of ``None`` are
    returned instead.
    NOTE(review): the failure value does not have the same shape as the normal one.
    NOTE(review): ``parser``, ``stemmer`` and ``display`` are not defined in the cells shown here.
    """
    # Texts that do not start with an ASCII capital get a placeholder 'XXX ' prefix.
    # NOTE(review): text.strip()[0] raises IndexError on an empty or blank text.
    text = text if text.strip()[0] in string.ascii_uppercase else 'XXX ' + text
    try:
        tree = next(parser.raw_parse(text))
    except Exception as e:
        return (tuple([None]*4), tuple([None]*4))
        
    # Per-position stems and (word, tag) pairs; tree addresses are 1-based, lists are 0-based.
    token_count = max(a for a in tree.nodes)
    stemmed = ['']*token_count
    tagged = [('', '') for i in range(token_count)]
    for i, n in tree.nodes.items():
        if i != 0:
            # Capitalized words (names, the 'XXX' placeholder) are kept as they are.
            stemmed[i-1] = stemmer.stem(n['word']) if n['word'][0] not in string.ascii_uppercase else n['word']
            tagged[i-1] = (n['word'], n['ctag'])

    # Position-based dependency triples.
    triples = list(custom_triples(tree.root, tree))

    if debug:
        print("--- triples ---")
        display(triples)
        print("--- stemmed ---")
        display(stemmed, token_count)

    subj_score, subj, subj_ranks = extract_subj_from_triples(triples, stemmed, return_ranks=True)
    if debug:
        display(list(enumerate(zip(subj_ranks, stemmed))))

    # NOTE(review): not used anywhere below in this function.
    parser_nsubj_triples = [t for t in triples if t[1].startswith('nsubj')]

    # If the fake subject was picked, then switch it with the dobj and remove it from the sentence
    if stemmed[subj] == 'XXX':
        # Verb whose nsubj is the placeholder, then that verb's direct object (if any).
        # NOTE(review): next() has no default here and raises StopIteration when no such nsubj triple exists.
        parser_verb = next(t[0][0] for t in triples if t[2][0] == subj and t[1].startswith('nsubj'))
        parser_dobj = next((t[2][0] for t in triples if t[0][0] == parser_verb and t[1].startswith('dobj')), None)

        if parser_dobj is not None:
            # NOTE(review): printed even when debug is False.
            print(tagged)
            # Move the dobj subtree to where the placeholder was, drop the placeholder,
            # then re-parse the re-ordered tagged tokens and rank again.
            tagged, stemmed = tuple(switch_tokens_in_tree(subj, parser_dobj, tree, (tagged, stemmed), remove1=True))
            tree = next(dep_parser.tagged_parse(tagged))
            triples = list(custom_triples(tree.root, tree))

            subj_score, subj, subj_ranks = extract_subj_from_triples(triples, stemmed, return_ranks=True)
            if debug:
                print("re-parsing with", stemmed)
                print("--- triples ---")
                display(triples)
                print("--- stemmed ---")
                display(stemmed, token_count)
                display(list(enumerate(zip(subj_ranks, stemmed))))


    dest_score, dest, dest_ranks = extract_dest_from_triples(triples, stemmed, return_ranks=True)
    if debug:
        display(list(enumerate(zip(dest_ranks, stemmed))))
    # Discard the destination when it is the subject itself, scores below 3,
    # or lies more than 40 tokens away from the subject.
    if dest == subj or dest_score < 3 or abs(dest-subj) > 40:
        dest = None

    subj_names, subj_properties = set(), set()
    dest_names, dest_properties = set(), set()

    if subj is not None:
        # + 1 because the 0 node is for the empty root only
        # Scan every token of the subject's subtree.
        for sub_subj in get_tree_leaves(tree, subj):
            subj_token = stemmed[sub_subj]
            if subj_token[0] in string.ascii_uppercase:
                subj_names.add(subj_token)
            if subj_token in ['he', 'man', 'boy', 'lad', 'him']:
                subj_properties.add('status(X, male)')
            elif subj_token in ['she', 'ladi', 'girl']:
                subj_properties.add('status(X, female)')
            if subj_token in true_roles:
                # NOTE(review): the stem itself is written here, whereas extract_features_from_utterance
                # writes true_roles[token] — confirm which one is wanted.
                subj_properties.add('related(X, Y, {})'.format(subj_token))

            # Gender of the related person Y, from the best modifier of the subject token.
            # NOTE(review): the lookup uses `subj`, not `sub_subj`, so it is the same on every iteration.
            subj_nmod_score, subj_nmod = extract_relational_mod_from_triples(triples, stemmed, subj)
            if subj_nmod_score >= 1:
                subj_nmod_token = stemmed[subj_nmod]
                if subj_nmod_token in ['his', 'he', 'him']:
                    subj_properties.add('status(Y, male)')
                elif subj_nmod_token in ['her', 'she']:
                    subj_properties.add('status(Y, female)')


    if dest is not None:
        # + 1 because the 0 node is for the empty root only
        # Same scan for the destination's subtree, with variables U (destination) and V (related person).
        for sub_dest in get_tree_leaves(tree, dest):
            dest_token = stemmed[sub_dest]

            if dest_token[0] in string.ascii_uppercase:
                dest_names.add(dest_token)
            if dest_token in ['he', 'man', 'boy', 'lad', 'him']:
                dest_properties.add('status(U, male)')
            elif dest_token in ['she', 'ladi', 'girl']:
                dest_properties.add('status(U, female)')
            if dest_token in true_roles:
                dest_properties.add('related(U, V, {})'.format(dest_token))

            dest_nmod_score, dest_nmod = extract_relational_mod_from_triples(triples, stemmed, dest)
            if dest_nmod_score >= 1:
                dest_nmod_token = stemmed[dest_nmod]
                if dest_nmod_token in ['his', 'he', 'him']:
                    dest_properties.add('status(V, male)')
                elif dest_nmod_token in ['her', 'she']:
                    dest_properties.add('status(V, female)')
                
    return (subj_names, subj_properties), (dest_names, dest_properties)

In [585]:
import pickle

In [586]:
# Load the list of people; the context manager closes the file that the bare
# open() call used to leave dangling.
# pickle must only be used on trusted files.
with open("corpus/people.pkl", "rb") as people_file:
    people = pickle.load(people_file)

In [612]:
# Scratch check of the regex used on utterances: a run of "s" directly before a
# non-word character is dropped ("Tests ok " -> "Test ok ").
re.sub('([A-Z][a-z])*s+([^\w])', r'\1\2', "Tests ok ")

'Test ok '

In [677]:
def extract_features_from_utterance(text, people, debug=False):
    """Extract who is speaking to whom from the content of an utterance.

    ``people`` is an iterable of dicts with at least a ``'main'`` name and a
    ``'code'``. The utterance is parsed with the global ``parser``;
    possessives are then used as clues: "my <role/name>" describes someone
    related to the speaker, "your <role/name>" someone related to the
    destination. A known person mentioned close to "you", "your" or "dear",
    and not the subject of a verb, is taken as the destination.

    Returns ``((subj_names, subj_properties), (dest_names, dest_properties))``,
    four sets of strings. If parsing fails, two 4-tuples of ``None`` are
    returned instead (kept as is for existing callers, although the shape differs).

    With ``debug`` true, the triples, the stems and the intermediate tokens
    are printed; nothing is printed otherwise.
    """
    # Names are keyed without leading/trailing "s" so they still match once
    # the text has lost its "s" endings below.
    people_to_code = {p['main'].strip('s'): p['code'] for p in people}
    # Drop every run of "s" that directly precedes a non-word character
    # (raw string: '\w' in a plain literal is an invalid escape sequence).
    text = re.sub(r'([A-Z][a-z])*s+([^\w])', r'\1\2', text)

    try:
        tree = next(parser.raw_parse(text))
    except Exception:
        return (tuple([None]*4), tuple([None]*4))

    # Per-position stems and (word, tag) pairs; tree addresses are 1-based, lists are 0-based.
    token_count = max(a for a in tree.nodes)
    stemmed = ['']*token_count
    tagged = [('', '') for i in range(token_count)]
    for i, n in tree.nodes.items():
        if i != 0:
            # Known people keep their exact spelling, everything else is stemmed.
            stemmed[i-1] = stemmer.stem(n['word']) if n['word'] not in people_to_code else n['word']
            tagged[i-1] = (n['word'], n['ctag'])

    # Position-based dependency triples.
    triples = list(custom_triples(tree.root, tree))

    if debug:
        print("--- triples ---")
        display(sorted(triples, key=lambda x: (x[0][0], x[2][0])))
        print("--- stemmed ---")
        display(list(enumerate(stemmed)), token_count)

    subj_names, subj_properties = set(), set()
    dest_names, dest_properties = set(), set()

    for_subj = True
    non_anaphore = set()  # positions that are the subject of a verb
    for i, triple in enumerate(triples):
        if triple[1] == 'nsubj' and triple[0][1].startswith('VB'):
            non_anaphore.add(triple[2][0])
        if triple[1] == 'nmod:poss':
            # "my ..." talks about the speaker, "your ..." about the destination.
            if stemmed[triple[2][0]] == 'my':
                for_subj = True
            elif stemmed[triple[2][0]] == 'your':
                for_subj = False
            else:
                continue

            poss = triple[0][0]
            if debug:
                print(people_to_code)
            # Scan the possessed phrase for a role and/or a known person; the
            # triple index keeps the variable names (Y<i> / V<i>) distinct.
            for sub in get_tree_leaves(tree, poss):
                token = stemmed[sub]
                if debug:
                    print(token)
                if token in true_roles:
                    if for_subj:
                        subj_properties.add('related(Y{}, X, {})'.format(i, true_roles[token]))
                    else:
                        dest_properties.add('related(V{}, U, {})'.format(i, true_roles[token]))
                elif token in people_to_code:
                    if for_subj:
                        subj_properties.add('Y{}={}'.format(i, people_to_code[token]))
                    else:
                        dest_properties.add('V{}={}'.format(i, people_to_code[token]))

    # A known person within 5 tokens of "you"/"your"/"dear", and not the
    # subject of a verb, is a candidate destination; the first one wins.
    other_mentions = [i for i, t in enumerate(stemmed) if t in ('you', 'your', 'dear')]
    people_names = [t for i, t in enumerate(stemmed)
                    if t in people_to_code and
                       i not in non_anaphore and
                       len([j for j in other_mentions if abs(i-j)<5]) > 0]

    if len(people_names) > 0:
        dest_names.add(people_names[0])

    return (subj_names, subj_properties), (dest_names, dest_properties)

In [678]:
# Smoke test: a name used as a vocative next to "you" should come out as the destination.
extract_features_from_utterance("But I must say, Elizabeth_Bennet, that you look nice", people, debug=True)

--- triples ---


[((3, 'VB'), 'cc', (0, 'CC')),
 ((3, 'VB'), 'nsubj', (1, 'PRP')),
 ((3, 'VB'), 'aux', (2, 'MD')),
 ((3, 'VB'), 'dep', (5, 'VB')),
 ((3, 'VB'), 'ccomp', (9, 'VBP')),
 ((9, 'VBP'), 'mark', (7, 'IN')),
 ((9, 'VBP'), 'nsubj', (8, 'PRP')),
 ((9, 'VBP'), 'xcomp', (10, 'JJ'))]

--- stemmed ---


[(0, 'but'),
 (1, 'i'),
 (2, 'must'),
 (3, 'say'),
 (4, ''),
 (5, 'Elizabeth_Bennet'),
 (6, ''),
 (7, 'that'),
 (8, 'you'),
 (9, 'look'),
 (10, 'nice')]

11

((set(), set()), ({'Elizabeth_Bennet'}, set()))

In [679]:
# Smoke test: "Dear <name>" gives the destination, "my uncle <name>" a relation of the speaker.
extract_features_from_utterance("Dear Elizabeth_Bennet, I like my uncle Mr_Bennet who left me alone", people, debug=True)

--- triples ---


[((1, 'NNP'), 'compound', (0, 'NNP')),
 ((1, 'NNP'), 'appos', (3, 'PRP')),
 ((3, 'PRP'), 'nmod', (7, 'NN')),
 ((3, 'PRP'), 'acl:relcl', (9, 'VBD')),
 ((7, 'NN'), 'case', (4, 'IN')),
 ((7, 'NN'), 'nmod:poss', (5, 'PRP$')),
 ((7, 'NN'), 'compound', (6, 'NN')),
 ((9, 'VBD'), 'nsubj', (8, 'WP')),
 ((9, 'VBD'), 'xcomp', (11, 'RB')),
 ((11, 'RB'), 'nsubj', (10, 'PRP'))]

--- stemmed ---


[(0, 'dear'),
 (1, 'Elizabeth_Bennet'),
 (2, ''),
 (3, 'i'),
 (4, 'like'),
 (5, 'my'),
 (6, 'uncl'),
 (7, 'Mr_Bennet'),
 (8, 'who'),
 (9, 'left'),
 (10, 'me'),
 (11, 'alon')]

12

{'Mr_Robinson': 'mr_robinson', 'William_Goulding': 'william_goulding', 'Young_Luca': 'young_lucas', 'Captain_Carter': 'captain_carter', 'Mr_Morri': 'mr_morris', 'Miss_Mary_King': 'miss_mary_king', 'Haggerston': 'haggerston', 'Mr_Pratt': 'mr_pratt', 'Miss_Pope': 'miss_pope', 'Mr_Gardiner': 'mr_gardiner', 'Mr_Hurst': 'mr_hurst', 'Old_Mr_Wickham': 'old_mr_wickham', 'Anne_de_Bourgh': 'anne_de_bourgh', 'Mr_Collin': 'mr_collins', 'Mr_Bingley': 'mr_bingley', 'Mr_Bennet': 'mr_bennet', 'Lady_Anne_Darcy': 'lady_anne_darcy', 'Mrs_Hill': 'mrs_hill', 'Mr_Philip': 'mr_philips', 'Miss_Grantley': 'miss_grantley', 'Mrs_Bennet': 'mrs_bennet', 'Miss_Watson': 'miss_watson', 'Mr_Jone': 'mr_jones', 'Louisa_Hurst': 'louisa_hurst', 'Mrs_Reynold': 'mrs_reynolds', 'The_Butler': 'the_butler', 'Mrs_Annesley': 'mrs_annesley', 'Jane_Bennet': 'jane_bennet', 'Mr_Chamberlayne': 'mr_chamberlayne', 'Colonel_Fitzwilliam': 'colonel_fitzwilliam', 'Mr_Denny': 'mr_denny', 'Old_Mr_Darcy': 'old_mr_darcy', 'Caroline_Bingley': '

((set(), {'Y5=mr_bennet', 'related(Y5, X, uncle)'}),
 ({'Elizabeth_Bennet'}, set()))

In [680]:
# Smoke test on a longer, multi-sentence utterance from the book.
extract_features_from_utterance("Do you know, Mama, that my uncle Mr_Philips talks of turning away Richard, and if he does, Colonel Forster will hire him. My aunt told me so herself on Saturday.  I shall walk to Meryton to-morrow to hear more about it, and to ask when Mr. Denny comes back from town.", people, debug=True)

--- triples ---


[((2, 'VB'), 'aux', (0, 'VBP')),
 ((2, 'VB'), 'nsubj', (1, 'PRP')),
 ((2, 'VB'), 'discourse', (4, 'UH')),
 ((2, 'VB'), 'ccomp', (10, 'VBP')),
 ((9, 'NN'), 'nmod:poss', (7, 'PRP$')),
 ((9, 'NN'), 'compound', (8, 'NN')),
 ((10, 'VBP'), 'mark', (6, 'IN')),
 ((10, 'VBP'), 'nsubj', (9, 'NN')),
 ((10, 'VBP'), 'advcl', (12, 'VBG')),
 ((10, 'VBP'), 'cc', (16, 'CC')),
 ((10, 'VBP'), 'conj', (24, 'VB')),
 ((12, 'VBG'), 'mark', (11, 'IN')),
 ((12, 'VBG'), 'compound:prt', (13, 'RP')),
 ((12, 'VBG'), 'dobj', (14, 'NNP')),
 ((19, 'VBP'), 'mark', (17, 'IN')),
 ((19, 'VBP'), 'nsubj', (18, 'PRP')),
 ((22, 'NNP'), 'compound', (21, 'NNP')),
 ((24, 'VB'), 'advcl', (19, 'VBP')),
 ((24, 'VB'), 'nsubj', (22, 'NNP')),
 ((24, 'VB'), 'aux', (23, 'MD')),
 ((24, 'VB'), 'dobj', (25, 'PRP')),
 ((28, 'NN'), 'nmod:poss', (27, 'PRP$')),
 ((29, 'VBD'), 'dep', (2, 'VB')),
 ((29, 'VBD'), 'nsubj', (28, 'NN')),
 ((29, 'VBD'), 'xcomp', (32, 'JJ')),
 ((32, 'JJ'), 'nsubj', (30, 'PRP')),
 ((32, 'JJ'), 'advmod', (31, 'RB')),
 (

--- stemmed ---


[(0, 'do'),
 (1, 'you'),
 (2, 'know'),
 (3, ''),
 (4, 'mama'),
 (5, ''),
 (6, 'that'),
 (7, 'my'),
 (8, 'uncl'),
 (9, 'Mr_Philip'),
 (10, 'talk'),
 (11, 'of'),
 (12, 'turn'),
 (13, 'away'),
 (14, 'richard'),
 (15, ''),
 (16, 'and'),
 (17, 'if'),
 (18, 'he'),
 (19, 'doe'),
 (20, ''),
 (21, 'colonel'),
 (22, 'forster'),
 (23, 'will'),
 (24, 'hire'),
 (25, 'him'),
 (26, ''),
 (27, 'my'),
 (28, 'aunt'),
 (29, 'told'),
 (30, 'me'),
 (31, 'so'),
 (32, 'herself'),
 (33, 'on'),
 (34, 'saturday'),
 (35, ''),
 (36, 'i'),
 (37, 'shall'),
 (38, 'walk'),
 (39, 'to'),
 (40, 'meryton'),
 (41, 'to-morrow'),
 (42, 'to'),
 (43, 'hear'),
 (44, 'more'),
 (45, 'about'),
 (46, 'it'),
 (47, ''),
 (48, 'and'),
 (49, 'to'),
 (50, 'ask'),
 (51, 'when'),
 (52, 'mr.'),
 (53, 'denni'),
 (54, 'come'),
 (55, 'back'),
 (56, 'from'),
 (57, 'town')]

58

{'Mr_Robinson': 'mr_robinson', 'William_Goulding': 'william_goulding', 'Young_Luca': 'young_lucas', 'Captain_Carter': 'captain_carter', 'Mr_Morri': 'mr_morris', 'Miss_Mary_King': 'miss_mary_king', 'Haggerston': 'haggerston', 'Mr_Pratt': 'mr_pratt', 'Miss_Pope': 'miss_pope', 'Mr_Gardiner': 'mr_gardiner', 'Mr_Hurst': 'mr_hurst', 'Old_Mr_Wickham': 'old_mr_wickham', 'Anne_de_Bourgh': 'anne_de_bourgh', 'Mr_Collin': 'mr_collins', 'Mr_Bingley': 'mr_bingley', 'Mr_Bennet': 'mr_bennet', 'Lady_Anne_Darcy': 'lady_anne_darcy', 'Mrs_Hill': 'mrs_hill', 'Mr_Philip': 'mr_philips', 'Miss_Grantley': 'miss_grantley', 'Mrs_Bennet': 'mrs_bennet', 'Miss_Watson': 'miss_watson', 'Mr_Jone': 'mr_jones', 'Louisa_Hurst': 'louisa_hurst', 'Mrs_Reynold': 'mrs_reynolds', 'The_Butler': 'the_butler', 'Mrs_Annesley': 'mrs_annesley', 'Jane_Bennet': 'jane_bennet', 'Mr_Chamberlayne': 'mr_chamberlayne', 'Colonel_Fitzwilliam': 'colonel_fitzwilliam', 'Mr_Denny': 'mr_denny', 'Old_Mr_Darcy': 'old_mr_darcy', 'Caroline_Bingley': '

((set(),
  {'Y39=mr_philips', 'related(Y27, X, aunt)', 'related(Y39, X, uncle)'}),
 (set(), set()))

In [480]:
def custom_triples(root, tree):
    """Yield the dependency triples below ``root``, depth first, by token position.

    Each triple is ``((head_position, head_ctag), relation, (child_position, child_ctag))``
    where positions are 0-based (node address minus one). A child's own
    triples follow immediately after the triple that introduces it.
    """
    for relation, addresses in root['deps'].items():
        for address in addresses:
            node = tree.get_by_address(address)
            head = (root['address'] - 1, root['ctag'])
            dependent = (address - 1, node['ctag'])
            yield head, relation, dependent
            yield from custom_triples(node, tree)

In [481]:
# Position-based triples of the tree currently held in `tree`.
list(custom_triples(tree.root, tree))

[((1, 'VBD'), 'dobj', (3, 'NN')),
 ((3, 'NN'), 'nmod:poss', (2, 'PRP$')),
 ((1, 'VBD'), 'nsubj', (0, 'NNP'))]

In [272]:
# Inspect the raw node stored at address 8 of the current tree.
tree.get_by_address(8)

{'address': 8,
 'ctag': 'CC',
 'deps': defaultdict(list, {}),
 'feats': '_',
 'head': 4,
 'lemma': '_',
 'rel': 'cc',
 'tag': 'CC',
 'word': 'and'}

In [262]:
# Word-based triples as produced by the tree's own triples() method.
list(tree.triples())

[(('sparkled', 'VBD'), 'nsubj', ('eyes', 'NNS')),
 (('eyes', 'NNS'), 'nmod:poss', ('Mrs_Bennet', 'NNP')),
 (('Mrs_Bennet', 'NNP'), 'case', ("'s", 'POS')),
 (('sparkled', 'VBD'), 'nmod', ('pleasure', 'NN')),
 (('pleasure', 'NN'), 'case', ('with', 'IN')),
 (('sparkled', 'VBD'), 'cc', ('and', 'CC')),
 (('sparkled', 'VBD'), 'conj', ('calling', 'VBG')),
 (('calling', 'VBG'), 'nsubj', ('she', 'PRP')),
 (('calling', 'VBG'), 'aux', ('was', 'VBD')),
 (('calling', 'VBG'), 'advmod', ('eagerly', 'RB')),
 (('calling', 'VBG'), 'compound:prt', ('out', 'RP')),
 (('calling', 'VBG'), 'advcl', ('read', 'VBD')),
 (('read', 'VBD'), 'mark', ('while', 'IN')),
 (('read', 'VBD'), 'nsubj', ('daughter', 'NN')),
 (('daughter', 'NN'), 'nmod:poss', ('her', 'PRP$'))]

In [252]:
# Smoke test of the narration feature extraction with debug output.
# NOTE(review): the recorded output below does not have the shape returned by the current
# definition, so it was presumably produced by an earlier version of the function.
extract_features_from_text("Mrs_Bennet's eyes sparkled with pleasure, and she was eagerly calling out, while her daughter read,", debug=True)

triples


[(('3', 'VBD'), 'nsubj', ('2', 'NNS')),
 (('2', 'NNS'), 'nmod:poss', ('0', 'NNP')),
 (('0', 'NNP'), 'case', ('1', 'POS')),
 (('3', 'VBD'), 'advcl', ('9', 'VBD')),
 (('9', 'VBD'), 'mark', ('4', 'IN')),
 (('9', 'VBD'), 'nsubj', ('5', 'NN')),
 (('5', 'NN'), 'cc', ('7', 'CC')),
 (('5', 'NN'), 'conj', ('8', 'PRP')),
 (('9', 'VBD'), 'xcomp', ('11', 'VBG')),
 (('11', 'VBG'), 'advmod', ('10', 'RB')),
 (('11', 'VBG'), 'compound:prt', ('12', 'RP')),
 (('11', 'VBG'), 'nmod', ('17', 'NN')),
 (('17', 'NN'), 'case', ('14', 'IN')),
 (('17', 'NN'), 'nmod:poss', ('15', 'PRP$')),
 (('17', 'NN'), 'compound', ('16', 'NN'))]

[(0, (1, 'mrs_bennet')),
 (1, (0, "'s")),
 (2, (4, 'eye')),
 (3, (0, 'sparkl')),
 (4, (1, 'with')),
 (5, (4, 'pleasur')),
 (6, (0, ',')),
 (7, (0, 'and')),
 (8, (1, 'she')),
 (9, (1, 'was')),
 (10, (2, 'eager')),
 (11, (1, 'call')),
 (12, (2, 'out')),
 (13, (0, ',')),
 (14, (0, 'while')),
 (15, (0, 'her')),
 (16, (1, 'daughter')),
 (17, (3, 'read')),
 (18, (0, ','))]

[(0, (2, 'mrs_bennet')),
 (1, (0, "'s")),
 (2, (2, 'eye')),
 (3, (0, 'sparkl')),
 (4, (1, 'with')),
 (5, (2, 'pleasur')),
 (6, (0, ',')),
 (7, (0, 'and')),
 (8, (0, 'she')),
 (9, (1, 'was')),
 (10, (2, 'eager')),
 (11, (1, 'call')),
 (12, (2, 'out')),
 (13, (0, ',')),
 (14, (0, 'while')),
 (15, (2, 'her')),
 (16, (1, 'daughter')),
 (17, (4, 'read')),
 (18, (0, ','))]

pleasur {'fiancee': 'married', 'son': 'son', 'lady': 'wife', 'aunt': 'aunt', 'husband': 'husband', 'sister': 'sister', 'father': 'father', 'wife': 'wife', 'cousin': 'cousin', 'mother': 'mother', 'daughter': 'daughter', 'uncle': 'uncle'} False
read {'fiancee': 'married', 'son': 'son', 'lady': 'wife', 'aunt': 'aunt', 'husband': 'husband', 'sister': 'sister', 'father': 'father', 'wife': 'wife', 'cousin': 'cousin', 'mother': 'mother', 'daughter': 'daughter', 'uncle': 'uncle'} False
None None None


(['F', None, None], [None, None, None])

In [179]:
# Positions of the nsubj dependents whose governing token is tagged VBN.
[int(t[2][0]) for t in parser_nsubj_triples if t[0][1] == 'VBN']

[]

In [205]:
"daughter" in true_roles

True