In [86]:
import nltk
from pprint import pprint
from collections import Counter
import itertools as it
def pairwise(iterable):
    "s -> (s0,s1), (s1,s2), (s2, s3), ..."
    a, b = it.tee(iterable)
    next(b, None)
    return zip(a, b)

from functools import reduce

In [87]:
from sample_chapters import *

In [88]:
def ne_preprocess(raw_text):
    sents = nltk.sent_tokenize(raw_text)
    tokenised_sents = [nltk.word_tokenize(sent) for sent in sents]
    tagged_sents = nltk.pos_tag_sents(tokenised_sents)
    ne_sents = nltk.ne_chunk_sents(tagged_sents, binary=True)
    return ne_sents

In [189]:
def get_name(ne_tree):
    return " ".join([tagged_leaf[0] for tagged_leaf in ne_tree.leaves()])
    
def get_named_entities(sent):
    for subsent in sent.subtrees(lambda ss:ss.label()=='NE'):
        yield get_name(subsent)


In [190]:
def get_main_character_by_voting(raw_text, breakdown=False, vote_over=get_named_entities):
    ne_sents = ne_preprocess(raw_text)

    votes=Counter(it.chain(*[vote_over(sent) for sent in ne_sents]))
    if breakdown:
        return votes
    else:
        if len(votes)>0:
            return votes.most_common(1)[0][0]
        else:
            return None


In [211]:
VERBS = frozenset({"MD","VB","VBD","VBG","VBN","VBP","VBZ"})
def patterned_name_entities(ne_sent):
    for e1,e2 in pairwise(ne_sent):
        if type(e1)!=nltk.tree.Tree or e1.label()!='NE':
            continue
        
        if type(e2)==tuple and e2[1] in VERBS:
            
            yield get_name(e1)

In [212]:
print(get_main_character_by_voting(sample_memoriesoflight_rand,vote_over=patterned_name_entities))
print(get_main_character_by_voting(sample_falloflight_renarr_1,vote_over=patterned_name_entities))
print(get_main_character_by_voting(sample_falloflight_Renarr_2,vote_over=patterned_name_entities))
print(get_main_character_by_voting(sample_falloflight_havaral_1,vote_over=patterned_name_entities))
print(get_main_character_by_voting(sample_falloflight_havaral_2,vote_over=patterned_name_entities))
print(get_main_character_by_voting(sample_falloflight_none,vote_over=patterned_name_entities))

Rand
Renarr
Renarr
Havaral
Havaral
Tug


In [197]:
import json
from collections import defaultdict

In [217]:
annotated_data = []

def load_append(fn):    
    with open(fn,"r") as fh:
        annotated_data.extend(json.load(fh))
        
load_append("test_data/asoiaf01-04.json")
load_append("test_data/aDwD.json")
load_append("test_data/Rick Riordan - [Heroes of Olympus 02] - The Son of Neptune (epub).json")
load_append("test_data/Rick Riordan - [Heroes of Olympus 05] - The Blood of Olympus (epub).json")
load_append("test_data/Leigh Bardugo - [Dregs 01] - Six of Crows.json")

#load_append("test_data/Jonathan Stroud - [Bartimaeus 01] - The Amulet of Samarkand.json")
#load_append("test_data/Jonathan Stroud - [Bartimaeus 02] - The Golem's Eye.json")
#load_append("test_data/Jonathan Stroud - [Bartimaeus 03] - Ptolemy's Gate.json")
#load_append("test_data/Jonathan Stroud - [Bartimaeus 04] - The Ring of Solomon.json")



In [218]:
unique_texts=set(datum['text'] for datum in annotated_data)
assert len(annotated_data)==len(unique_texts)

In [219]:
reference_characters = [datum['character'] for datum in annotated_data]

In [220]:
#reference_characters

In [225]:
output_characters = [get_main_character_by_voting(datum['text'], vote_over=get_named_entities) for datum in annotated_data]


In [226]:
name2nicknames = defaultdict(lambda:tuple(),{
    "Daenerys": {"Dany",},
    "Eddard" : {"Ned"},
    "Samwell" :{"Sam"},
})

def correct(actual, ref):
    return actual==ref or actual in name2nicknames[ref]   

In [227]:
errors = [(index, actual, ref) for index, (actual, ref) in enumerate(zip(output_characters, reference_characters)) 
    if not (correct(actual,ref))]
errors

[(3, 'Robert', 'Eddard'),
 (11, 'Robert', 'Eddard'),
 (38, 'Robert', 'Eddard'),
 (41, 'Bronn', 'Tyrion'),
 (62, 'Robb', 'Catelyn'),
 (104, 'Brienne', 'Catelyn'),
 (125, 'Cersei', 'Tyrion'),
 (127, 'Asha', 'Theon'),
 (143, 'Gendry', 'Arya'),
 (151, 'Brienne', 'Jaime'),
 (154, 'Robb', 'Catelyn'),
 (160, 'Robb', 'Catelyn'),
 (171, 'Brienne', 'Jaime'),
 (174, 'Hound', 'Arya'),
 (177, 'Brienne', 'Jaime'),
 (179, 'Lord Beric', 'Arya'),
 (185, 'Robb', 'Catelyn'),
 (189, 'Robb', 'Catelyn'),
 (193, 'Cersei', 'Tyrion'),
 (207, 'Kingsguard', 'Jaime'),
 (260, 'Illyrio', 'Tyrion'),
 (314, 'Percy', 'Hazel'),
 (324, 'Frank', 'Hazel'),
 (331, 'Phineas', 'Percy'),
 (342, 'Frank', 'Percy'),
 (353, 'Percy', 'Frank'),
 (380, 'Hylla', 'Reyna'),
 (385, 'Percy', 'Jason'),
 (387, 'Reyna', 'Nico'),
 (390, 'Apollo', 'Leo'),
 (391, 'Apollo', 'Leo'),
 (401, 'Jason', 'Piper'),
 (417, 'Kaz', 'Inej'),
 (419, 'Kaz', 'Inej'),
 (433, 'Jordie', 'Kaz'),
 (443, 'Nina', 'Inej'),
 (456, 'Nina', 'Matthias'),
 (457, 'Nina', '

In [228]:
len(errors)/len(reference_characters)

0.08874458874458875

In [230]:
#Basic Voting 0.08874458874458875
[(3, 'Robert', 'Eddard'),
 (11, 'Robert', 'Eddard'),
 (38, 'Robert', 'Eddard'),
 (41, 'Bronn', 'Tyrion'),
 (62, 'Robb', 'Catelyn'),
 (104, 'Brienne', 'Catelyn'),
 (125, 'Cersei', 'Tyrion'),
 (127, 'Asha', 'Theon'),
 (143, 'Gendry', 'Arya'),
 (151, 'Brienne', 'Jaime'),
 (154, 'Robb', 'Catelyn'),
 (160, 'Robb', 'Catelyn'),
 (171, 'Brienne', 'Jaime'),
 (174, 'Hound', 'Arya'),
 (177, 'Brienne', 'Jaime'),
 (179, 'Lord Beric', 'Arya'),
 (185, 'Robb', 'Catelyn'),
 (189, 'Robb', 'Catelyn'),
 (193, 'Cersei', 'Tyrion'),
 (207, 'Kingsguard', 'Jaime'),
 (260, 'Illyrio', 'Tyrion'),
 (314, 'Percy', 'Hazel'),
 (324, 'Frank', 'Hazel'),
 (331, 'Phineas', 'Percy'),
 (342, 'Frank', 'Percy'),
 (353, 'Percy', 'Frank'),
 (380, 'Hylla', 'Reyna'),
 (385, 'Percy', 'Jason'),
 (387, 'Reyna', 'Nico'),
 (390, 'Apollo', 'Leo'),
 (391, 'Apollo', 'Leo'),
 (401, 'Jason', 'Piper'),
 (417, 'Kaz', 'Inej'),
 (419, 'Kaz', 'Inej'),
 (433, 'Jordie', 'Kaz'),
 (443, 'Nina', 'Inej'),
 (456, 'Nina', 'Matthias'),
 (457, 'Nina', 'Inej'),
 (458, 'Kuwei', 'Nina'),
 (460, 'Van Eck', 'Kaz'),
 (461, 'Rollins', 'Pekka')]


#VERBs after Errors: 0.06493506493506493
[(3, 'Robert', 'Eddard'),
 (11, 'Robert', 'Eddard'),
 (38, 'Robert', 'Eddard'),
 (41, 'Bronn', 'Tyrion'),
 (151, 'Brienne', 'Jaime'),
 (160, 'Robb', 'Catelyn'),
 (171, 'Qyburn', 'Jaime'),
 (174, 'Hound', 'Arya'),
 (177, 'Bolton', 'Jaime'),
 (193, 'Lord Tywin', 'Tyrion'),
 (205, 'Hound', 'Arya'),
 (314, 'Percy', 'Hazel'),
 (324, 'Frank', 'Hazel'),
 (331, 'Phineas', 'Percy'),
 (333, 'Phineas', 'Percy'),
 (342, 'Frank', 'Percy'),
 (353, 'Percy', 'Frank'),
 (380, 'Hylla', 'Reyna'),
 (385, 'Percy', 'Jason'),
 (387, 'Bryce', 'Nico'),
 (390, 'Apollo', 'Leo'),
 (393, 'Asclepius', 'Leo'),
 (401, 'Hazel', 'Piper'),
 (407, 'Zeus', 'Jason'),
 (419, 'Kaz', 'Inej'),
 (433, 'Jordie', 'Kaz'),
 (456, 'Nina', 'Matthias'),
 (457, 'Nina', 'Inej'),
 (458, 'Kuwei', 'Nina'),
 (461, 'Rollins', 'Pekka'
#

SyntaxError: unexpected EOF while parsing (<ipython-input-230-31f149eda328>, line 32)

In [231]:
get_main_character_by_voting(annotated_data[1]['text'],True).most_common(10)

[('Catelyn', 11),
 ('Ned', 7),
 ('Eyrie', 4),
 ('Riverrun', 4),
 ('Starks', 3),
 ('Worship', 2),
 ('Jon', 2),
 ('Robert', 2),
 ('Bran', 2),
 ('Winterfell', 2)]

In [None]:
text = list(ne_preprocess( annotated_data[1]['text']))

In [None]:
print(type(text[0]))