In [11]:
# Reload imported modules automatically before each cell executes, so edits to
# the project's modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [23]:
import numpy as np
from pprint import pprint

from collections import Counter, defaultdict

import json
from xgboost import XGBClassifier
import sklearn.tree
from sklearn.metrics import *
from sklearn.model_selection import cross_val_score
from sklearn.externals import joblib

from sample_chapters import *
from feature_extraction import *
from classify import *

In [13]:
def extract_texts_and_characters(annotated_data):
    """Split annotated records into parallel arrays of texts and characters.

    Parameters
    ----------
    annotated_data : iterable of mapping
        Each record must provide a ``'text'`` and a ``'character'`` entry.
        Any iterable is accepted, including one-shot generators, because the
        input is traversed exactly once.

    Returns
    -------
    (full_texts, full_characters) : tuple of numpy.ndarray
        Two arrays of equal length; position ``i`` of each comes from record ``i``.

    Raises
    ------
    KeyError
        If a dict record lacks the ``'text'`` or ``'character'`` key.
    """
    full_texts = []
    full_characters = []
    # Single pass on purpose: building the two arrays with two separate
    # comprehensions exhausts a generator on the first one and silently
    # returns an empty text array.
    for datum in annotated_data:
        full_characters.append(datum['character'])
        full_texts.append(datum['text'])
    return np.asarray(full_texts), np.asarray(full_characters)

In [24]:
from sklearn.model_selection import KFold

def cross_validation_evaluate(annotated_data, solver, n_splits=10, metric=accuracy_score):
    """Score ``solver`` on ``annotated_data`` with k-fold cross-validation.

    The data is split with ``KFold(n_splits=n_splits)`` (no shuffling is
    requested here). For every fold the solver is trained on the training
    part and scored on the held-out part; each fold's score is printed as it
    is produced.

    Parameters
    ----------
    annotated_data : sequence of records
        Passed to ``extract_texts_and_characters`` and to ``KFold.split``.
    solver : object
        Must provide ``train(texts, characters)`` and
        ``test(texts, characters, metric=...)``.
        NOTE(review): the same solver object is trained once per fold;
        whether ``train`` discards earlier state is up to the solver - confirm.
    n_splits : int
        Number of folds.
    metric : callable
        Forwarded unchanged to ``solver.test``.

    Returns
    -------
    list
        One score per fold, in fold order.
    """
    full_texts, full_characters = extract_texts_and_characters(annotated_data)
    folds = KFold(n_splits=n_splits).split(annotated_data)

    scores = []
    for train_idx, test_idx in folds:
        solver.train(full_texts[train_idx], full_characters[train_idx])
        fold_score = solver.test(
            full_texts[test_idx],
            full_characters[test_idx],
            metric=metric,
        )
        print(fold_score)
        scores.append(fold_score)
    return scores

In [15]:
# Nickname -> full character name; this map is handed to the solvers in the
# evaluation cells of this notebook.
nicknames2name_comb = dict(
    Dany="Daenerys",
    Ned="Eddard",
    Sam="Samwell",
    Rollins="Pekka",
)

# Read the annotated records (presumably A Song of Ice and Fire books 1-4,
# going by the file name) and hold them as a NumPy array so they can be
# indexed with the fold index arrays produced during cross-validation.
with open("../flat_data/asoif01-04.json", "r") as fh:
    got_records = json.load(fh)
ann_GoT = np.asarray(got_records)

In [35]:
# Per-fold macro-averaged (precision, recall, F-score) for the XGBoost-backed
# MLCharacterSolver; the value displayed by the cell is the column-wise mean
# over the folds.
scores = cross_validation_evaluate(
    ann_GoT,
    MLCharacterSolver(XGBClassifier(), nicknames2name_comb),
    metric=lambda y_true, y_pred: precision_recall_fscore_support(y_true, y_pred, average='macro')[:3],
)
np.mean(scores, axis=0)

  'precision', 'predicted', average, warn_for)


(0.8666666666666667, 0.8888888888888888, 0.8765432098765432)


  'precision', 'predicted', average, warn_for)


(0.7333333333333333, 0.8, 0.76)


  'precision', 'predicted', average, warn_for)


(0.8611111111111112, 0.8888888888888888, 0.873015873015873)
(1.0, 1.0, 1.0)


  'precision', 'predicted', average, warn_for)


(0.7772727272727273, 0.8181818181818182, 0.7950937950937952)
(1.0, 1.0, 1.0)


  'precision', 'predicted', average, warn_for)


(0.6666666666666666, 0.7692307692307693, 0.7025641025641025)
(1.0, 1.0, 1.0)
(1.0, 1.0, 1.0)
(1.0, 1.0, 1.0)


0.9025819283597062

In [36]:
# Column-wise mean over folds of the most recently computed per-fold scores.
np.asarray(scores).mean(axis=0)

array([0.89050505, 0.91651904, 0.9007217 ])

In [33]:
# Same macro-averaged (precision, recall, F-score) evaluation, this time for
# FirstMentionedSolver; the displayed value is the column-wise mean over folds.
scores = cross_validation_evaluate(
    ann_GoT,
    FirstMentionedSolver(nicknames2name_comb),
    metric=lambda y_true, y_pred: precision_recall_fscore_support(y_true, y_pred, average='macro')[:3],
)
np.mean(scores, axis=0)

  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)


(0.09375, 0.16666666666666666, 0.11071428571428571)
(0.08333333333333333, 0.14285714285714285, 0.09999999999999999)
(0.04285714285714286, 0.10714285714285714, 0.058333333333333334)
(0.07976190476190477, 0.21428571428571427, 0.11394557823129252)
(0.07183908045977012, 0.13793103448275862, 0.08850574712643677)
(0.06055555555555556, 0.13333333333333333, 0.08126984126984128)
(0.1358974358974359, 0.23076923076923078, 0.16025641025641027)
(0.0734567901234568, 0.16666666666666666, 0.09770723104056438)
(0.18939393939393942, 0.29545454545454547, 0.22121212121212122)
(0.03518518518518518, 0.1111111111111111, 0.05185185185185185)


array([0.08660304, 0.17062183, 0.10837964])

In [29]:
# NOTE(review): leftover interactive help lookup for np.mean; it assigns
# nothing and no other cell depends on it, so it can be dropped from the
# finished notebook.
?np.mean

In [39]:
# Per-fold (precision, recall, F-score) for MostMentionedSolver; the displayed
# value is the column-wise mean over folds.
# NOTE(review): this run averages with 'weighted', whereas the XGBoost and
# first-mentioned runs in this notebook use 'macro', so the figures are not
# directly comparable - confirm this is intended.
scores = cross_validation_evaluate(
    ann_GoT,
    MostMentionedSolver(nicknames2name_comb),
    metric=lambda y_true, y_pred: precision_recall_fscore_support(y_true, y_pred, average='weighted')[:3],
)
np.mean(scores, axis=0)

  'precision', 'predicted', average, warn_for)


(0.9019230769230769, 0.9230769230769231, 0.9065934065934067)
(0.6965811965811965, 0.8076923076923077, 0.7373626373626373)
(0.8846153846153846, 0.9230769230769231, 0.8974358974358974)
(1.0, 1.0, 1.0)
(0.8057692307692308, 0.8846153846153846, 0.8382173382173382)
(0.8782051282051282, 0.9230769230769231, 0.8948717948717948)
(0.6746666666666666, 0.8, 0.7235555555555555)
(0.8266666666666667, 0.88, 0.8440000000000001)
(0.9333333333333332, 0.96, 0.9440000000000001)
(1.0, 1.0, 1.0)


array([0.86017607, 0.91015385, 0.87860366])

In [19]:
# Load the Dregs annotations exactly once.
# NOTE(review): this cell used to read dregs01.json a second time and stack the
# copy onto the first, so every record appeared twice. Once the data is pooled
# and shuffled for cross-validation, such duplicates land in both the training
# and the held-out fold and inflate the scores. If a second annotation file was
# meant, load it under its own path and np.hstack it onto ann_Dregs here.
with open("../flat_data/dregs01.json", "r") as fh:
    ann_Dregs = np.asarray(json.load(fh))

In [None]:
# Cross-validate the XGBoost-backed solver on the pooled GoT + Dregs records.
ann_comb = np.hstack([ann_GoT, ann_Dregs])
# KFold is used without shuffling, so mix the two corpora first. A private,
# fixed-seed RNG keeps the fold membership reproducible without touching
# NumPy's global random state.
np.random.RandomState(0).shuffle(ann_comb)

# Same entry point and solver wrapper as the executed evaluation cells above;
# nicknames2name_comb is the nickname map this notebook defines. With the
# default metric each fold score is a single accuracy, so a plain mean suffices.
scores = cross_validation_evaluate(
    ann_comb,
    MLCharacterSolver(XGBClassifier(), nicknames2name_comb),
)
np.mean(scores)

In [None]:

# Accuracy of the fitted classifier `cls` on the Dregs records.
# NOTE(review): `cls` is assigned by the training cell that follows this one,
# so on a fresh kernel this cell only works if that cell is run first.
dregs_texts, reference_characters = extract_texts_and_characters(ann_Dregs)
output_characters = list(run_classifier(dregs_texts, classifier=cls))
print("acc: ", sklearn.metrics.accuracy_score(output_characters, reference_characters))

In [None]:
# Fit an XGBoost classifier on all GoT records, score it on those same records
# (i.e. a training-set accuracy), then persist the fitted model to disk.
# NOTE(review): nicknames2name_GoT is not assigned in any visible cell of this
# notebook (only nicknames2name_comb is) - confirm where it comes from.
got_texts, got_characters = extract_texts_and_characters(ann_GoT)
cls = train_classifier(got_texts, got_characters, classifier=XGBClassifier())

output_characters = list(
    run_classifier(got_texts, classifier=cls, nicknames2name=nicknames2name_GoT)
)
reference_characters = [datum['character'] for datum in ann_GoT]

print("acc: ", sklearn.metrics.accuracy_score(output_characters, reference_characters))

joblib.dump(cls, "../trained_models/GoT-no-headings.pkl")


## Feature importances of the trained classifier

In [None]:
# Pair every feature importance of the fitted model with its feature name and
# list the pairs in descending order (ties fall back to the name).
# Only the third value returned by get_feature_vectors is used here;
# presumably the key list is the same for every text - TODO confirm.
_, _, vector_keys = get_feature_vectors(ann_GoT[1]['text'])
feature_weights = sorted(zip(cls.feature_importances_, vector_keys), reverse=True)
feature_weights

In [None]:

# Mean cross-validated accuracy (default folds and metric of
# cross_validation_evaluate) on the GoT records for an XGBoost model with 100
# trees. Uses the same entry point and solver wrapper as the executed
# evaluation cells; nicknames2name_comb is the nickname map this notebook defines.
scores = cross_validation_evaluate(
    ann_GoT,
    MLCharacterSolver(XGBClassifier(n_estimators=100), nicknames2name_comb),
)
np.mean(scores)

In [None]:
# Read the Warbreaker records; they are kept as the plain parsed JSON value
# (no NumPy conversion in this cell).
with open("../flat_data/Warbreaker.json", "r") as fh:
    warbreaker_raw = fh.read()
warbreaker = json.loads(warbreaker_raw)

In [None]:
# Reload the persisted model. The path carries the same "../" prefix as the
# joblib.dump call that writes this file and as the data paths in this
# notebook; without it the load looks in a different directory from the one
# the model was saved to.
# joblib.load unpickles the file, so only point it at model files you produced.
cls = joblib.load("../trained_models/GoT-no-headings.pkl")

warbreaker_texts = extract_texts_and_characters(warbreaker)[0]
warbreaker_characters = run_classifier(warbreaker_texts, classifier=cls)

# Pair each predicted character with a short preview of its text
# (string positions 1 through 124 of the record's text).
ann_warbreaker = [
    (char, datum['text'][1:125])
    for char, datum in zip(warbreaker_characters, warbreaker)
]
ann_warbreaker

In [None]:
# Print one "<feature name> \t <importance>" line per feature.
# `cls` is the fitted model this notebook trains and loads; the name
# `classifier` is never assigned in the notebook, so reading it would fail on
# a fresh kernel.
# NOTE(review): assumes FeatureVec().keys() yields names in the same order as
# cls.feature_importances_ - confirm.
for imp, name in zip(cls.feature_importances_, FeatureVec().keys()):
    print(name, "\t", imp)