In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import numpy as np
from pprint import pprint

In [3]:
from collections import Counter, defaultdict

import json
from xgboost import XGBClassifier
import sklearn.tree
import sklearn.metrics
from sklearn.model_selection import cross_val_score

In [4]:
from sample_chapters import *
from feature_extraction import *

In [5]:
def character_scores(classifier, raw_text, feature_extractor = get_feature_vectors):
    """Score every candidate name found in ``raw_text``.

    Parameters:
        classifier: fitted model exposing ``predict_proba``.
        raw_text: text handed unchanged to ``feature_extractor``.
        feature_extractor: callable returning a 3-tuple
            ``(names, feature_vectors, vector_keys)``; only the first two
            elements are used here.

    Returns:
        ``(scores, names)`` where ``scores`` is column 1 of the
        ``predict_proba`` output (the positive class) and ``names`` is the
        name sequence returned by the extractor, in the same order.
    """
    candidate_names, vectors, _unused_keys = feature_extractor(raw_text)
    # One feature vector is expected per candidate name.
    assert len(candidate_names) == len(vectors)
    probabilities = classifier.predict_proba(vectors)
    # Column 1 holds the probability of the positive class.
    return probabilities[:, 1], candidate_names

def sanitize_name_scores(scores,names, nicknames2name):
    """Merge the scores of characters' nicknames into the real name.

    Both ``scores`` and ``names`` are modified in place; nothing is returned.

    Parameters:
        scores: mutable sequence of scores, parallel to ``names``.
        names: list of candidate names (must support ``.index`` and item
            assignment).
        nicknames2name: mapping from nickname to real name.

    For each (nickname, real name) pair, in mapping order:
      * nickname absent from ``names``: nothing happens;
      * both present: the nickname's score is added to the real name's score
        and the nickname's score is set to 0;
      * only the nickname present: its entry in ``names`` is renamed to the
        real name.
    """
    assert len(names) == len(scores)
    for nickname, truename in nicknames2name.items():
        try:
            nick_pos = names.index(nickname)
        except ValueError:
            # Nickname does not occur in this text; nothing to merge.
            continue

        try:
            true_pos = names.index(truename)
        except ValueError:
            # Real name not seen: let the nickname entry stand in for it.
            names[nick_pos] = truename
        else:
            # Transfer the nickname's score onto the real name.
            scores[true_pos] += scores[nick_pos]
            scores[nick_pos] = 0
    

def choose_character(classifier, raw_text, nicknames2name=dict(), feature_extractor = get_feature_vectors):
    """Return the single highest-scoring name for ``raw_text``.

    Parameters:
        classifier: fitted model exposing ``predict_proba``.
        raw_text: text to analyse.
        nicknames2name: mapping from nickname to real name; nickname scores
            are merged into the real name before the best one is picked.
        feature_extractor: forwarded to ``character_scores``.

    Returns:
        The entry of the (nickname-merged) name list whose score is largest.
    """
    scores, names = character_scores(classifier, raw_text, feature_extractor)
    # Mutates ``scores`` and ``names`` in place.
    sanitize_name_scores(scores, names, nicknames2name)

    best_index = np.argmax(scores)
    return names[best_index]

In [11]:
def get_binary_choice_feature_vectors(raw_text, reference_name, name2nicknames):
    """Build per-name feature vectors and matching binary labels for one text.

    Parameters:
        raw_text: text passed to ``get_feature_vectors``.
        reference_name: the name that counts as the positive example.
        name2nicknames: forwarded as the second argument of
            ``get_feature_vectors``.

    Returns:
        ``(vectors, labels)`` where ``labels[i]`` is ``True`` exactly when the
        i-th extracted name equals ``reference_name``.
    """
    names, vectors, _vector_keys = get_feature_vectors(raw_text, name2nicknames)
    labels = []
    for name in names:
        labels.append(name == reference_name)
    return vectors, labels

def train_classifier(annotated_data, classifier, nicknames2name=dict()):
    """Fit ``classifier`` on the name-level examples of every annotated text.

    Parameters:
        annotated_data: iterable of records with ``'text'`` and
            ``'character'`` keys.
        classifier: estimator exposing ``fit(X, y)``; it is fitted in place.
        nicknames2name: forwarded as the third argument of
            ``get_binary_choice_feature_vectors``.
            NOTE(review): that parameter is named ``name2nicknames`` there,
            so confirm which mapping direction the extractor expects.

    Returns:
        The same ``classifier`` object, after fitting.

    Raises:
        ValueError: if ``annotated_data`` yields no records.
    """
    # Imported locally so the function does not rely on an ``it`` alias
    # leaking in from a star import.
    import itertools

    per_text = [get_binary_choice_feature_vectors(datum['text'], datum['character'], nicknames2name)
                for datum in annotated_data]
    if not per_text:
        raise ValueError("annotated_data is empty; nothing to train on")

    Xs, Ys = zip(*per_text)
    # Flatten the per-text lists into one row per candidate name.
    # np.fromiter cannot be used here: it requires a dtype argument and
    # builds 1-D arrays from scalar items, whereas each X item is a vector.
    Xs = np.asarray(list(itertools.chain.from_iterable(Xs)))
    Ys = np.asarray(list(itertools.chain.from_iterable(Ys)))

    classifier.fit(Xs, Ys)
    return classifier

def run_classifier(annotated_data, classifier, nicknames2name=dict()):
    """Predict one character name per record.

    Parameters:
        annotated_data: iterable of records with a ``'text'`` key.
        classifier: fitted model passed to ``choose_character``.
        nicknames2name: nickname -> real-name mapping passed to
            ``choose_character``.

    Returns:
        List of predicted names, in the order of ``annotated_data``.
    """
    predictions = []
    for datum in annotated_data:
        predictions.append(choose_character(classifier, datum['text'], nicknames2name))
    return predictions
    
        
def test_classifier(annotated_data, classifier, nicknames2name=dict()):
    """Return the accuracy of ``classifier`` on ``annotated_data``.

    Parameters:
        annotated_data: iterable of records with ``'text'`` and
            ``'character'`` keys; ``'character'`` is the reference answer.
        classifier: fitted model used for prediction.
        nicknames2name: nickname -> real-name mapping applied when choosing
            the predicted character.

    Returns:
        Fraction of records whose predicted name equals the reference name.
    """
    # Forward the nickname mapping; without it predictions made under a
    # nickname are never merged into the real name.
    output_characters = run_classifier(annotated_data, classifier, nicknames2name)
    reference_characters = [datum['character'] for datum in annotated_data]
    # accuracy_score takes (y_true, y_pred).
    return sklearn.metrics.accuracy_score(reference_characters, output_characters)

In [7]:
from sklearn.model_selection import KFold

def evaluate(annotated_data, nicknames2name, classifier=XGBClassifier(), n_splits=10):
    """Run K-fold cross-validation and return the accuracy of every fold.

    Parameters:
        annotated_data: records to split; must support indexing with the
            index arrays produced by ``KFold.split`` (e.g. a NumPy array).
        nicknames2name: nickname -> real-name mapping passed to training and
            testing.
        classifier: estimator refitted on each fold. The default instance is
            created once, when this function is defined, and reused by every
            call that omits the argument.
        n_splits: number of folds.

    Returns:
        List with one accuracy score per fold; each score is also printed as
        soon as it is computed.
    """
    fold_scores = []
    splitter = KFold(n_splits=n_splits)
    for train_inds, test_inds in splitter.split(annotated_data):
        train_chapters, test_chapters = annotated_data[train_inds], annotated_data[test_inds]

        train_classifier(train_chapters, classifier, nicknames2name)
        fold_score = test_classifier(test_chapters, classifier, nicknames2name)

        print(fold_score)
        fold_scores.append(fold_score)
    return fold_scores

In [8]:
# Nickname -> real-name mapping handed to train_classifier / run_classifier /
# evaluate in the cells below.
nicknames2name_GoT = {
    "Dany":"Daenerys",
    "Ned" : "Eddard",
    # NOTE(review): "Robert" is mapped to "Eddard" although it does not look
    # like a nickname of that name -- confirm this entry is intentional.
    "Robert" : "Eddard",
    "Sam" : "Samwell",
}
  
# Load the annotated records (each is accessed via 'text' and 'character'
# keys elsewhere in this notebook). Wrapped in a NumPy array so that
# evaluate() can index it with the index arrays yielded by KFold.split.
with open("../flat_data/asoif01-04.json","r") as fh:
    ann_GoT = np.asarray(json.load(fh))

In [10]:
# Fit a fresh XGBClassifier on the full annotated set; `cls` is reused by the
# prediction and feature-importance cells below.
cls = train_classifier(ann_GoT, XGBClassifier(), nicknames2name_GoT)

In [13]:
# Predict on the same records `cls` was trained on, and collect the annotated
# reference names in the same order.
output_characters = run_classifier(ann_GoT, cls, nicknames2name_GoT)
reference_characters = [datum['character'] for datum in ann_GoT]

In [14]:
# Accuracy on the training data itself (predictions above were made on the
# records used for fitting), so this is not a held-out estimate.
sklearn.metrics.accuracy_score(output_characters, reference_characters)

1.0

In [19]:
# NOTE(review): leftover interactive help lookup; safe to delete from the
# finished notebook.
?list.sort

In [24]:
# Pair every feature importance of `cls` with its feature name and list them
# from most to least important. The names are taken from the extractor output
# for the second record (presumably every record yields the same keys --
# TODO confirm).
_, _, vector_keys = get_feature_vectors(ann_GoT[1]['text'])
feature_weights = sorted(zip(cls.feature_importances_, vector_keys), reverse=True)
feature_weights

[(0.122, 'before_POS_was_percent_,'),
 (0.097999997, 'rank_percent'),
 (0.092, 'occur_percent'),
 (0.082000002, 'after_POS_was_percent_.'),
 (0.074000001, 'after_POS_was_VBD'),
 (0.071999997, 'after_POS_was_percent_VBD'),
 (0.068000004, 'before_POS_was_.'),
 (0.064000003, 'rank'),
 (0.048, 'occur_count'),
 (0.039999999, 'before_POS_was_,'),
 (0.035999998, 'last_occur_percent'),
 (0.029999999, 'before_POS_was_percent_JJ'),
 (0.026000001, 'after_POS_was_percent_VBZ'),
 (0.02, 'before_POS_was_NN'),
 (0.016000001, 'after_POS_was_MD'),
 (0.014, 'after_POS_was_percent_NN'),
 (0.012, 'after_POS_was_percent_,'),
 (0.0099999998, 'last_occur_position'),
 (0.0099999998, "before_POS_was_percent_''"),
 (0.0080000004, 'before_POS_was_percent_.'),
 (0.0080000004, 'after_POS_was_percent_CC'),
 (0.0060000001, 'before_POS_was_percent_WRB'),
 (0.0060000001, 'before_POS_was_percent_VBN'),
 (0.0060000001, 'before_POS_was_percent_VBD'),
 (0.0060000001, 'before_POS_was_percent_NNP'),
 (0.0040000002, 'before_

array([ 0.   ,  0.   ,  0.   ,  0.   ,  0.   ,  0.   ,  0.006,  0.   ,
        0.   ,  0.   ,  0.   ,  0.064,  0.   ,  0.   ,  0.   ,  0.03 ,
        0.006,  0.   ,  0.   ,  0.   ,  0.008,  0.   ,  0.002,  0.   ,
        0.   ,  0.098,  0.004,  0.   ,  0.01 ,  0.   ,  0.   ,  0.   ,
        0.   ,  0.   ,  0.   ,  0.   ,  0.004,  0.   ,  0.   ,  0.   ,
        0.04 ,  0.   ,  0.   ,  0.   ,  0.   ,  0.   ,  0.   ,  0.   ,
        0.   ,  0.   ,  0.   ,  0.014,  0.   ,  0.   ,  0.   ,  0.072,
        0.074,  0.   ,  0.   ,  0.   ,  0.   ,  0.   ,  0.   ,  0.004,
        0.   ,  0.006,  0.   ,  0.   ,  0.   ,  0.   ,  0.   ,  0.   ,
        0.   ,  0.   ,  0.   ,  0.   ,  0.   ,  0.   ,  0.   ,  0.   ,
        0.016,  0.002,  0.   ,  0.   ,  0.   ,  0.   ,  0.   ,  0.   ,
        0.   ,  0.   ,  0.   ,  0.   ,  0.   ,  0.   ,  0.   ,  0.026,
        0.   ,  0.   ,  0.01 ,  0.   ,  0.   ,  0.   ,  0.   ,  0.   ,
        0.   ,  0.   ,  0.   ,  0.   ,  0.002,  0.   ,  0.   ,  0.   ,
      

In [None]:

# Cross-validate (evaluate() defaults to 10 folds) with a 100-estimator
# XGBClassifier, then show the mean of the per-fold accuracies.
scores = evaluate(ann_GoT, nicknames2name_GoT, XGBClassifier(n_estimators=100))
np.mean(scores)

In [None]:
# Load the Warbreaker records; kept as the plain object returned by json.load
# (not wrapped in a NumPy array like ann_GoT).
with open("../flat_data/Warbreaker.json","r") as fh:
    warbreaker = json.load(fh)

In [None]:
# Apply the classifier trained on the GoT data to Warbreaker; no nickname
# mapping is passed, so run_classifier uses its empty default.
warbreaker_characters = run_classifier(warbreaker, cls)
# Pair each predicted name with a preview of its text: characters 1..499
# (index 0 is skipped by the slice).
text_previews = (datum['text'][1:500] for datum in warbreaker)
ann_warbreaker = list(zip(warbreaker_characters, text_previews))
ann_warbreaker

In [None]:
# Print every feature name next to its importance in the trained model.
# Uses `cls` (the classifier fitted above): no top-level variable named
# `classifier` exists in this notebook, so the previous form raised NameError
# on a fresh kernel.
for imp, name in zip(cls.feature_importances_, FeatureVec().keys()):
    print(name, "\t", imp)

In [None]:
# Render a fitted decision tree as a PNG inside the notebook.
from io import StringIO  # sklearn.externals.six was removed from scikit-learn; io.StringIO is the Python 3 equivalent
from IPython.display import Image
import pydotplus as pydot

# NOTE(review): `classifier` is not defined at the top level of this notebook,
# and export_graphviz expects a scikit-learn decision tree rather than the
# XGBClassifier stored in `cls` -- fit/assign a tree model before running.
dot_data = StringIO()
sklearn.tree.export_graphviz(classifier, out_file=dot_data,
                     feature_names=list(FeatureVec().keys()),
                     #class_names=iris.target_names,
                     filled=True, rounded=True,
                     special_characters=True)
graph = pydot.graph_from_dot_data(dot_data.getvalue())
Image(graph.create_png())