In [34]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [35]:
import numpy as np
from pprint import pprint

In [36]:
from collections import Counter, defaultdict

import json
from xgboost import XGBClassifier
import sklearn.tree
import sklearn.metrics
from sklearn.model_selection import cross_val_score
from sklearn.externals import joblib

In [37]:
from sample_chapters import *
from feature_extraction import *
from classify import *

In [38]:
def extract_texts_and_characters(annotated_data):
    full_characters = np.asarray([datum['character'] for datum in annotated_data])
    full_texts = np.asarray([datum['text'] for datum in annotated_data])
    return full_texts, full_characters

In [39]:
from sklearn.model_selection import KFold

def evaluate(annotated_data, classifier=XGBClassifier(), nicknames2name=dict(), n_splits=10):
    
    full_texts, full_characters = extract_texts_and_characters(annotated_data)
    
    scores = []
    for train_inds, test_inds in KFold(n_splits=n_splits).split(annotated_data):
        train_texts = full_texts[train_inds]
        train_characters = full_characters[train_inds]
        
        test_characters = full_characters[test_inds]
        test_texts = full_texts[test_inds]
        

        train_classifier(train_texts, train_characters, classifier, nicknames2name)
        score = test_classifier(test_texts, test_characters, classifier, nicknames2name)
        
        print(score)
        scores.append(score)
    return scores

In [40]:
nicknames2name_GoT = {
    "Dany":"Daenerys",
    "Ned" : "Eddard",
    "Robert" : "Eddard",
    "Sam" : "Samwell",
}
  
with open("../flat_data/asoif01-04.json","r") as fh:
    ann_GoT = np.asarray(json.load(fh))

In [18]:
scores = evaluate(ann_GoT, XGBClassifier(), nicknames2name_GoT, n_splits=2)
np.mean(scores)

0.7421875
0.828125


0.78515625

In [41]:
cls = train_classifier(*extract_texts_and_characters(ann_GoT), 
                       classifier=XGBClassifier(),
                       nicknames2name=nicknames2name_GoT)

output_characters = run_classifier(extract_texts_and_characters(ann_GoT)[0], 
                       classifier=cls,
                       nicknames2name=nicknames2name_GoT)
reference_characters = [datum['character'] for datum in ann_GoT]

print("acc: ", sklearn.metrics.accuracy_score(output_characters, reference_characters))

joblib.dump(cls, "trained_models/GoT-no-headings.pkl")


acc:  1.0


['trained_models/GoT-no-headings.pkl']

## Feature importance stuff

In [24]:
_, _,vector_keys = get_feature_vectors(ann_GoT[1]['text'])
feature_weights = list(zip(cls.feature_importances_,vector_keys))
feature_weights.sort(reverse=True)
feature_weights

[(0.122, 'before_POS_was_percent_,'),
 (0.097999997, 'rank_percent'),
 (0.092, 'occur_percent'),
 (0.082000002, 'after_POS_was_percent_.'),
 (0.074000001, 'after_POS_was_VBD'),
 (0.071999997, 'after_POS_was_percent_VBD'),
 (0.068000004, 'before_POS_was_.'),
 (0.064000003, 'rank'),
 (0.048, 'occur_count'),
 (0.039999999, 'before_POS_was_,'),
 (0.035999998, 'last_occur_percent'),
 (0.029999999, 'before_POS_was_percent_JJ'),
 (0.026000001, 'after_POS_was_percent_VBZ'),
 (0.02, 'before_POS_was_NN'),
 (0.016000001, 'after_POS_was_MD'),
 (0.014, 'after_POS_was_percent_NN'),
 (0.012, 'after_POS_was_percent_,'),
 (0.0099999998, 'last_occur_position'),
 (0.0099999998, "before_POS_was_percent_''"),
 (0.0080000004, 'before_POS_was_percent_.'),
 (0.0080000004, 'after_POS_was_percent_CC'),
 (0.0060000001, 'before_POS_was_percent_WRB'),
 (0.0060000001, 'before_POS_was_percent_VBN'),
 (0.0060000001, 'before_POS_was_percent_VBD'),
 (0.0060000001, 'before_POS_was_percent_NNP'),
 (0.0040000002, 'before_

array([ 0.   ,  0.   ,  0.   ,  0.   ,  0.   ,  0.   ,  0.006,  0.   ,
        0.   ,  0.   ,  0.   ,  0.064,  0.   ,  0.   ,  0.   ,  0.03 ,
        0.006,  0.   ,  0.   ,  0.   ,  0.008,  0.   ,  0.002,  0.   ,
        0.   ,  0.098,  0.004,  0.   ,  0.01 ,  0.   ,  0.   ,  0.   ,
        0.   ,  0.   ,  0.   ,  0.   ,  0.004,  0.   ,  0.   ,  0.   ,
        0.04 ,  0.   ,  0.   ,  0.   ,  0.   ,  0.   ,  0.   ,  0.   ,
        0.   ,  0.   ,  0.   ,  0.014,  0.   ,  0.   ,  0.   ,  0.072,
        0.074,  0.   ,  0.   ,  0.   ,  0.   ,  0.   ,  0.   ,  0.004,
        0.   ,  0.006,  0.   ,  0.   ,  0.   ,  0.   ,  0.   ,  0.   ,
        0.   ,  0.   ,  0.   ,  0.   ,  0.   ,  0.   ,  0.   ,  0.   ,
        0.016,  0.002,  0.   ,  0.   ,  0.   ,  0.   ,  0.   ,  0.   ,
        0.   ,  0.   ,  0.   ,  0.   ,  0.   ,  0.   ,  0.   ,  0.026,
        0.   ,  0.   ,  0.01 ,  0.   ,  0.   ,  0.   ,  0.   ,  0.   ,
        0.   ,  0.   ,  0.   ,  0.   ,  0.002,  0.   ,  0.   ,  0.   ,
      

In [None]:

scores = evaluate(ann_GoT, nicknames2name_GoT, XGBClassifier(n_estimators=100))
np.mean(scores)

In [30]:
with open("../flat_data/Warbreaker.json","r") as fh:
    warbreaker = json.load(fh)

In [31]:
cls = joblib.load("trained_models/GoT-no-headings.pkl")
warbreaker_characters = run_classifier(extract_texts_and_characters(warbreaker)[0], 
                       classifier=cls,)
ann_warbreaker = [(char, datum['text'][1:125]) for char,datum in zip(warbreaker_characters, warbreaker)]
ann_warbreaker

[('Brandon Sanderson',
  '\nIntroduction to WARBREAKER\nWelcome!  My name is Brandon Sanderson.  Before anything else, I’d like to thank you for your in'),
 ('Breath',
  '\nPrologue\nIt’s funny,  Vasher thought,  how many things begin with me getting thrown into prison. \nThe guards laughed to one'),
 ('Austre',
  '\nChapter One\nThere were great advantages to being unimportant.\nTrue, by many people’s standards, Siri wasn’t ‘unimportant.’ '),
 ('Dedelin',
  '\nChapter Two\nSiri sat, stunned, in a rattling carriage, her homeland growing more and more distant with each bump and shake.'),
 ('Hallandren',
  '\nChapter Three\nLightsong didn’t remember dying.\nHis priests, however, assured him that his death had been extremely inspirin'),
 ('Father',
  '\nChapter Four\nSiri’s carriage rolled to a stop outside of T’Telir, capital of Hallandren.  She stared out the window and rea'),
 ('Gods',
  '\nChapter Five\nThis will complicate things,  Vasher thought, standing in the shadows atop the wal

In [32]:
for imp, name in zip(classifier.feature_importances_, FeatureVec().keys()):
    print(name, "\t", imp )

NameError: name 'classifier' is not defined