In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import numpy as np
from pprint import pprint

In [3]:
from collections import Counter, defaultdict

import json
from xgboost import XGBClassifier
import sklearn.tree
import sklearn.metrics
from sklearn.model_selection import cross_val_score
from sklearn.externals import joblib

In [4]:
from sample_chapters import *
from feature_extraction import *
from classify import *

In [5]:
def extract_texts_and_characters(annotated_data):
    """Split annotated records into parallel arrays of texts and characters.

    Each record in ``annotated_data`` is indexed with the keys ``'character'``
    and ``'text'``.

    Returns:
        A ``(texts, characters)`` tuple of NumPy arrays, where position ``i``
        of both arrays comes from the ``i``-th record.
    """
    # Labels are gathered first and texts second, in two separate passes.
    character_labels = []
    for record in annotated_data:
        character_labels.append(record['character'])

    passages = []
    for record in annotated_data:
        passages.append(record['text'])

    return np.asarray(passages), np.asarray(character_labels)

In [6]:
from sklearn.model_selection import KFold

def evaluate(annotated_data, classifier=None, nicknames2name=None, n_splits=10):
    """Score a classifier on ``annotated_data`` with K-fold cross-validation.

    For every fold the classifier is trained on the training part with
    ``train_classifier`` and scored on the held-out part with
    ``test_classifier``. Each fold score is printed as soon as it is known.

    Args:
        annotated_data: sequence of records accepted by
            ``extract_texts_and_characters``.
        classifier: model handed to ``train_classifier``/``test_classifier``.
            When ``None`` a new ``XGBClassifier()`` is created for this call.
        nicknames2name: mapping forwarded to ``test_classifier``. When
            ``None`` an empty dict is used.
        n_splits: number of folds given to ``KFold``.

    Returns:
        List with one score per fold, in fold order.
    """
    # Build the defaults per call. Default values in the signature are
    # evaluated only once, at definition time, so a model or dict placed
    # there would be shared by every call that relies on the default.
    if classifier is None:
        classifier = XGBClassifier()
    if nicknames2name is None:
        nicknames2name = {}

    full_texts, full_characters = extract_texts_and_characters(annotated_data)

    scores = []
    for train_inds, test_inds in KFold(n_splits=n_splits).split(annotated_data):
        train_texts = full_texts[train_inds]
        train_characters = full_characters[train_inds]

        test_texts = full_texts[test_inds]
        test_characters = full_characters[test_inds]

        # The same classifier object is passed to every fold, exactly as the
        # caller supplied it.
        train_classifier(train_texts, train_characters, classifier)
        score = test_classifier(test_texts, test_characters, classifier, nicknames2name)

        print(score)
        scores.append(score)
    return scores

In [7]:
# Nickname -> name lookup that is handed to `evaluate` as `nicknames2name`.
nicknames2name_comb = dict(
    Dany="Daenerys",
    Ned="Eddard",
    Sam="Samwell",
    Rollins="Pekka",
)

# Read the annotated records and keep them as a NumPy array; later cells
# stack this array with other corpora via np.hstack.
got_annotations_path = "../flat_data/asoif01-04.json"
with open(got_annotations_path, "r") as got_file:
    ann_GoT = np.asarray(json.load(got_file))

In [None]:
# Cross-validate a fresh XGBoost classifier on the GoT annotations; the final
# expression displays the mean of the per-fold scores.
scores = evaluate(
    ann_GoT,
    classifier=XGBClassifier(),
    nicknames2name=nicknames2name_comb,
)
np.mean(scores)

0.9230769230769231
0.8846153846153846
0.9615384615384616
1.0
0.9230769230769231


In [None]:
# Annotation files that make up the Dregs corpus. Every file must appear only
# once: stacking the same file twice duplicates each sample, and after the
# corpus is shuffled those identical samples end up in both the training and
# the test folds of `evaluate`, which inflates the scores.
# NOTE(review): if a second Dregs file was meant to be included, add its path
# to this tuple.
dregs_paths = ("../flat_data/dregs01.json",)

dregs_parts = []
for dregs_path in dregs_paths:
    with open(dregs_path, "r") as fh:
        dregs_parts.append(np.asarray(json.load(fh)))

# np.hstack keeps the result a flat array regardless of how many files are listed.
ann_Dregs = np.hstack(dregs_parts)

In [None]:
# Combine both corpora. `evaluate` builds its folds with an unshuffled KFold,
# i.e. from consecutive slices, so the records are shuffled here first to
# avoid folds made of a single corpus.
ann_comb = np.hstack([ann_GoT, ann_Dregs])

# A seeded generator keeps the fold composition identical across re-runs.
np.random.RandomState(0).shuffle(ann_comb)

# `nicknames2name_comb` is the only nickname table defined in this notebook.
# Keyword arguments make the classifier/nickname order explicit.
scores = evaluate(
    ann_comb,
    classifier=XGBClassifier(),
    nicknames2name=nicknames2name_comb,
)
np.mean(scores)

In [None]:

# Train in this cell so it runs on a fresh kernel: the only other assignment
# of `cls` from `train_classifier` is further down the notebook.
# NOTE(review): training on the GoT annotations mirrors that later cell;
# confirm this is the model that is meant to be scored on the Dregs data.
cls = train_classifier(*extract_texts_and_characters(ann_GoT),
                       classifier=XGBClassifier())

# Extract texts and reference labels in one call instead of two.
dregs_texts, reference_characters = extract_texts_and_characters(ann_Dregs)
output_characters = list(run_classifier(dregs_texts, classifier=cls))
print("acc: ", sklearn.metrics.accuracy_score(output_characters, reference_characters))

In [None]:
# Fit on all GoT annotations, report accuracy, then persist the model.
got_texts, reference_characters = extract_texts_and_characters(ann_GoT)
cls = train_classifier(got_texts, reference_characters,
                       classifier=XGBClassifier())

# `nicknames2name_comb` is the only nickname table defined in this notebook.
output_characters = list(run_classifier(got_texts,
                                        classifier=cls,
                                        nicknames2name=nicknames2name_comb))

# The model is scored on the very data it was fitted on, so this is a
# training accuracy and not a held-out estimate.
print("acc: ", sklearn.metrics.accuracy_score(output_characters, reference_characters))

joblib.dump(cls, "../trained_models/GoT-no-headings.pkl")


## Feature importances

In [None]:
# Pair each importance of the trained model with its feature key and show
# them from largest to smallest importance.
# Only the third value returned by get_feature_vectors is used here.
_first, _second, vector_keys = get_feature_vectors(ann_GoT[1]['text'])
feature_weights = sorted(zip(cls.feature_importances_, vector_keys), reverse=True)
feature_weights

In [None]:

# Cross-validate a 100-tree XGBoost model on the GoT annotations.
# The arguments are passed by keyword because `evaluate` expects the
# classifier before the nickname table; passing them positionally in the
# opposite order would hand the dict over as the classifier.
# `nicknames2name_comb` is the only nickname table defined in this notebook.
scores = evaluate(
    ann_GoT,
    classifier=XGBClassifier(n_estimators=100),
    nicknames2name=nicknames2name_comb,
)
np.mean(scores)

In [None]:
# Load the Warbreaker records; they stay a plain Python list as returned by
# json.load (no NumPy conversion here).
warbreaker_path = "../flat_data/Warbreaker.json"
with open(warbreaker_path, "r") as warbreaker_file:
    warbreaker = json.load(warbreaker_file)

In [None]:
# Reload the persisted model from the same location the training cell dumps
# it to ("../trained_models/..."), rather than a path relative to the
# notebook directory.
# joblib files are pickle-based: only load model files you produced yourself.
cls = joblib.load("../trained_models/GoT-no-headings.pkl")

warbreaker_texts = extract_texts_and_characters(warbreaker)[0]
warbreaker_characters = run_classifier(warbreaker_texts, classifier=cls)

# Pair every predicted character with a short excerpt of its text
# (characters 1 to 124) so the predictions can be checked by eye.
ann_warbreaker = [(char, datum['text'][1:125])
                  for char, datum in zip(warbreaker_characters, warbreaker)]
ann_warbreaker

In [None]:
# Print one "<feature key> <tab> <importance>" line per feature.
# `cls` is the model variable used by the other cells; no notebook-level
# variable called `classifier` is assigned anywhere.
# NOTE(review): confirm that none of the star imports was expected to
# provide a `classifier` object.
for imp, name in zip(cls.feature_importances_, FeatureVec().keys()):
    print(name, "\t", imp)