In [11]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [23]:
import numpy as np
from pprint import pprint

from collections import Counter, defaultdict

import json
from xgboost import XGBClassifier
import sklearn.tree
from sklearn.metrics import *
from sklearn.model_selection import cross_val_score
from sklearn.externals import joblib

from sample_chapters import *
from feature_extraction import *
from classify import *

In [103]:
def extract_texts_and_characters(annotated_data):
    full_characters = np.asarray([datum['character'] for datum in annotated_data])
    full_texts = np.asarray([datum['text'] for datum in annotated_data])
    return full_texts, full_characters



In [125]:
from sklearn.model_selection import KFold

def xval_evaluate(annotated_data, solver, n_splits=10, metric=accuracy_score, mute=True):
    
    full_texts, full_characters = extract_texts_and_characters(annotated_data)
    
    scores = []
    for train_inds, test_inds in KFold(n_splits=n_splits).split(annotated_data):        
        train_ann_data = annotated_data[train_inds]
        test_ann_data = annotated_data[test_inds]

        score = train_test_evaluate(train_ann_data, test_ann_data, solver, n_splits, metric)
        
        if not(mute):
            print(score)
            
        scores.append(score)
    return np.mean(scores, axis=0)


def evaluate(train_ann_data, test_ann_data, solver, n_splits=10, metric=accuracy_score):   
    train_texts, train_characters = extract_texts_and_characters(train_ann_data)
    test_texts, test_characters = extract_texts_and_characters(test_ann_data)

    solver.train(train_texts, train_characters)
    score = solver.test(test_texts, test_characters, metric=metric)
    return score



In [138]:
nicknames2name_comb = {
    "Dany":"Daenerys",
    "Ned" : "Eddard",
    "Sam" : "Samwell",
    "Rollins" : "Pekka"
}

with open("../flat_data/asoif01-04.json","r") as fh:
    ann_ASOIAF = np.asarray(json.load(fh))
    
    
with open("../flat_data/dregs01.json","r") as fh:
    ann_SOC = np.asarray(json.load(fh))
with open("../flat_data/dregs01.json","r") as fh:
    ann_SOC = np.hstack([ann_SOC, np.asarray(json.load(fh))])

ann_comb = np.hstack([ann_SOC, ann_ASOIAF])

np.random.shuffle(ann_ASOIAF)
np.random.shuffle(ann_SOC)
np.random.shuffle(ann_comb)
print("lengths: ", len(ann_comb), len(ann_SOC), len(ann_ASOIAF))
print("POVs: ", *[len(np.unique(extract_texts_and_characters(ann)[1])) for ann in (ann_comb, ann_SOC, ann_ASOIAF)])

lengths:  348 92 256
POVs:  22 7 15


In [137]:
np.unique(extract_texts_and_characters(ann_SOC)[1])

array(['Inej', 'Jesper', 'Joost', 'Kaz', 'Matthias', 'Nina', 'Pekka'],
      dtype='<U8')

In [None]:
scores = cross_validation_evaluate(ann_ASOIAF, MLCharacterSolver(XGBClassifier(), nicknames2name_comb),
                                  metric=lambda tt,pp: precision_recall_fscore_support(tt,pp, average='macro', warn_for={})[0:3])
np.mean(scores, axis=0)

# Evaluation Program

In [60]:
import pandas as pd

## main eval

In [139]:
ML_mdl = MLCharacterSolver(XGBClassifier(), nicknames2name_comb)
FM_mdl = FirstMentionedSolver(nicknames2name_comb)
MC_mdl = MostMentionedSolver(nicknames2name_comb)


program = {
    ("ML Classical Features Trained on SOC","ASOIAF"): (ann_SOC, ann_ASOIAF, ML_mdl) ,
    ("ML Classical Features Trained on ASOIAF","SOC"): (ann_ASOIAF, ann_SOC, ML_mdl),   
    ("First Mentioned","SOC") :                        ([], ann_SOC, FM_mdl),
    ("First Mentioned","ASOIAF"):                      ([], ann_ASOIAF, FM_mdl),
    ("Most Commonly Mentioned","SOC"):                 ([], ann_SOC, MC_mdl),
    ("Most Commonly Mentioned","ASOIAF"):              ([], ann_ASOIAF, MC_mdl),
}


res = pd.DataFrame(index=program.keys())
res.sort_index(inplace=True)

for ind in res.index:
    print(ind, end="")
    score = evaluate(*program[ind])
    res.loc[ind, "Acc"] = score
    print(" ", score)
    
res

('First Mentioned', 'ASOIAF')  0.25
('First Mentioned', 'SOC')  0.3695652173913043
('ML Classical Features Trained on ASOIAF', 'SOC')  0.9130434782608695
('ML Classical Features Trained on SOC', 'ASOIAF')  0.91796875
('Most Commonly Mentioned', 'ASOIAF')  0.91015625
('Most Commonly Mentioned', 'SOC')  0.782608695652174


Unnamed: 0,Acc
"(First Mentioned, ASOIAF)",0.25
"(First Mentioned, SOC)",0.369565
"(ML Classical Features Trained on ASOIAF, SOC)",0.913043
"(ML Classical Features Trained on SOC, ASOIAF)",0.917969
"(Most Commonly Mentioned, ASOIAF)",0.910156
"(Most Commonly Mentioned, SOC)",0.782609


## Cross Evaluation
To test how much it effects things from different styles.

In [140]:
program = {
    ("ML Classical Features","ASOIAF"):                (ann_ASOIAF, ML_mdl),
    ("ML Classical Features","SOC"):                   (ann_SOC, ML_mdl),
    ("ML Classical Features","combined"):              (ann_comb, ML_mdl),
    
    ("First Mentioned","SOC") :                        (ann_SOC, FM_mdl),
    ("First Mentioned","ASOIAF"):                      (ann_ASOIAF, FM_mdl),
    ("First Mentioned","combined"):                    (ann_comb, FM_mdl),
    
    ("Most Commonly Mentioned","SOC"):                 (ann_SOC, MC_mdl),
    ("Most Commonly Mentioned","ASOIAF"):              (ann_ASOIAF, MC_mdl),
    ("Most Commonly Mentioned","combined"):            (ann_comb, MC_mdl),
}


res_xval = pd.DataFrame(index=program.keys())
res_xval.sort_index(inplace=True)

for ind in res_xval.index:
    print(ind, end="")
    score = xval_evaluate(*program[ind]) 
    res_xval.loc[ind, "Acc"] = score
    print(" ", score)
    
res_xval

('First Mentioned', 'ASOIAF')  0.25015384615384617
('First Mentioned', 'SOC')  0.37
('First Mentioned', 'combined')  0.28168067226890753
('ML Classical Features', 'ASOIAF')  0.9607692307692307
('ML Classical Features', 'SOC')  0.9777777777777779
('ML Classical Features', 'combined')  0.9598319327731092
('Most Commonly Mentioned', 'ASOIAF')  0.9099999999999999
('Most Commonly Mentioned', 'SOC')  0.7844444444444444
('Most Commonly Mentioned', 'combined')  0.876470588235294


Unnamed: 0,Acc
"(First Mentioned, ASOIAF)",0.250154
"(First Mentioned, SOC)",0.37
"(First Mentioned, combined)",0.281681
"(ML Classical Features, ASOIAF)",0.960769
"(ML Classical Features, SOC)",0.977778
"(ML Classical Features, combined)",0.959832
"(Most Commonly Mentioned, ASOIAF)",0.91
"(Most Commonly Mentioned, SOC)",0.784444
"(Most Commonly Mentioned, combined)",0.876471


# Save some trained model

In [None]:
cls = train_classifier(*extract_texts_and_characters(ann_ASOIAF), 
                       classifier=XGBClassifier())

output_characters = list(run_classifier(extract_texts_and_characters(ann_ASOIAF)[0], 
                       classifier=cls,
                       nicknames2name=nicknames2name_ASOIAF))
reference_characters = [datum['character'] for datum in ann_ASOIAF]

print("acc: ", sklearn.metrics.accuracy_score(output_characters, reference_characters))

joblib.dump(cls, "../trained_models/ASOIAF-no-headings.pkl")


## Feature importance stuff

In [None]:
_, _,vector_keys = get_feature_vectors(ann_ASOIAF[1]['text'])
feature_weights = list(zip(cls.feature_importances_,vector_keys))
feature_weights.sort(reverse=True)
feature_weights

In [None]:

scores = evaluate(ann_ASOIAF, nicknames2name_ASOIAF, XGBClassifier(n_estimators=100))
np.mean(scores)

In [None]:
with open("../flat_data/Warbreaker.json","r") as fh:
    warbreaker = json.load(fh)

In [None]:
cls = joblib.load("trained_models/ASOIAF-no-headings.pkl")
warbreaker_characters = run_classifier(extract_texts_and_characters(warbreaker)[0], 
                       classifier=cls,)
ann_warbreaker = [(char, datum['text'][1:125]) for char,datum in zip(warbreaker_characters, warbreaker)]
ann_warbreaker

In [None]:
for imp, name in zip(classifier.feature_importances_, FeatureVec().keys()):
    print(name, "\t", imp )