In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import numpy as np
from pprint import pprint

from collections import Counter, defaultdict

import json
from xgboost import XGBClassifier
import sklearn.tree
from sklearn.metrics import *
from sklearn.model_selection import cross_val_score
from sklearn.externals import joblib

from sample_chapters import *
from feature_extraction import *
from classify import *
from book import *

In [3]:
def extract_texts_and_characters(annotated_data):
    full_characters = np.asarray([datum['character'] for datum in annotated_data])
    full_texts = np.asarray([datum['text'] for datum in annotated_data])
    return full_texts, full_characters

In [6]:
from sklearn.model_selection import KFold

def xval_evaluate(annotated_data, solver, n_splits=10, metric=accuracy_score, mute=True):
    
    full_texts, full_characters = extract_texts_and_characters(annotated_data)
    
    scores = []
    for train_inds, test_inds in KFold(n_splits=n_splits).split(annotated_data):        
        train_ann_data = annotated_data[train_inds]
        test_ann_data = annotated_data[test_inds]

        score = train_test_evaluate(train_ann_data, test_ann_data, solver, n_splits, metric)
        if not(mute):
            print(score)
            
        scores.append(score)
    return np.mean(scores, axis=0)


def evaluate(train_ann_data, test_ann_data, solver, metric=accuracy_score):   
    train_texts, train_characters = extract_texts_and_characters(train_ann_data)
    test_texts, test_characters = extract_texts_and_characters(test_ann_data)

    solver.train(train_texts, train_characters)
    score = solver.test(test_texts, test_characters, metric=metric)
    return score



In [7]:
nicknames2name_comb = {
    "Dany":"Daenerys",
    "Ned" : "Eddard",
    "Sam" : "Samwell",
    "Rollins" : "Pekka"
}

with open("../flat_data/asoif01-04.json","r") as fh:
    ann_ASOIAF = np.asarray(json.load(fh))
    
    
with open("../flat_data/dregs01.json","r") as fh:
    ann_SOC = np.asarray(json.load(fh))
with open("../flat_data/dregs01.json","r") as fh:
    ann_SOC = np.hstack([ann_SOC, np.asarray(json.load(fh))])

ann_comb = np.hstack([ann_SOC, ann_ASOIAF])

np.random.shuffle(ann_ASOIAF)
np.random.shuffle(ann_SOC)
np.random.shuffle(ann_comb)
print("lengths: ", len(ann_comb), len(ann_SOC), len(ann_ASOIAF))
print("POVs: ", *[len(np.unique(extract_texts_and_characters(ann)[1])) for ann in (ann_comb, ann_SOC, ann_ASOIAF)])

lengths:  348 92 256
POVs:  22 7 15


In [8]:
def hybrid_features(x):
    emb_names, emb_vecs, _ = get_embedding_features(x, 5)
    cl_names, cl_vecs, _ = get_feature_vectors(x)
    #print(cl_names)
    #print("*********")
    #print(emb_names)
    assert cl_names == emb_names
    vecs = np.hstack([cl_vecs, emb_vecs])
    assert vecs.shape[0]==len(cl_names)
    assert vecs.shape[1]>2
    return cl_names, vecs, "hybrid"

# Evaluation Program

In [9]:
import pandas as pd

def all_metrics(tt,pp):
    prf = precision_recall_fscore_support(tt,pp, average='macro', labels=np.unique(tt))[0:3]
    acc = accuracy_score(tt,pp)
    return np.hstack([prf, acc])

all_metrics_names = ["P", "R", "F1", "Acc"]

In [10]:
WE_mdl = lambda: MLCharacterSolver(XGBClassifier(), nicknames2name_comb, lambda x:get_embedding_features(x,5))
CL_mdl = lambda: MLCharacterSolver(XGBClassifier(), nicknames2name_comb)
HY_mdl = lambda: MLCharacterSolver(XGBClassifier(), nicknames2name_comb, hybrid_features)

FM_mdl = lambda: FirstMentionedSolver(nicknames2name_comb)
MC_mdl = lambda: MostMentionedSolver(nicknames2name_comb)

datasets = [("ASIAF", ann_ASOIAF), ("SOC", ann_SOC)]
base_mdls = [("ML Classical Features", CL_mdl),
        ("ML Hybrid Features", HY_mdl),
        ("ML Word Emb. Features", WE_mdl),
        ("First Mentioned", FM_mdl),
        ("Most Commonly Mentioned", MC_mdl)
       ]
        


## main eval

In [None]:
def make_program(datasets, mdls):
    program = dict()    
    for (test_data_name, test_data),(mdl_name,mdl) in it.product(datasets, mdls):
        if mdl_name[0:2]=="ML":
            for (train_data_name, train_data) in datasets:
                program[(test_data_name, mdl_name+" trained on " + train_data_name)] = (
                    train_data,
                    test_data,
                    mdl
                )
        else:
            program[(test_data_name, mdl_name)] = ([], test_data, mdl())
    return program

program = make_program(datasets, base_mdls)


res = pd.DataFrame(index=pd.MultiIndex.from_tuples(program.keys()),
                   columns = all_metrics_names)
res.sort_index(inplace=True)

for ind in res.index:
    print(ind, end="")
    score = evaluate(*program[ind], metric=all_metrics)
    res.loc[ind,:] = score
    print(" ", score)

    
res.to_csv("../results/maineval.csv", index_label=["Testset", "Method"])
res

## Cross Evaluation
To test how much it effects things from different styles.

In [None]:
def make_program(datasets, mdls):
    program = dict()    
    for (data_name, data),(mdl_name,mdl) in it.product(datasets, mdls):
        program[(data_name, mdl_name)] = (data, mdl())
    return program
program = make_program(datasets+[("Combined", ann_comb)], base_mdls)


res_xval = pd.DataFrame(index=pd.MultiIndex.from_tuples(program.keys()),,
                        columns = all_metrics_names)
res_xval.sort_index(inplace=True)

for ind in res_xval.index:
    print(ind, end="")
    score = xval_evaluate(*program[ind], metric=all_metrics) 
    res_xval.loc[ind, :] = score
    print(" ", score)

res_xval.to_csv("../results/crosseval.csv", index_label=["Dataset", "Method"])
    
res_xval

# Save some trained model

In [None]:
def train_and_save_model(ann, filename):
    mdl = MLCharacterSolver(XGBClassifier())
    mdl.train(*extract_texts_and_characters(ann))
    joblib.dump(mdl, "../trained_models/"+filename+".pkl")
    return mdl

train_and_save_model(ann_ASOIAF, "classic_ASOIAF")
train_and_save_model(ann_SOC, "classic_SOC")

mdl_comb = train_and_save_model(ann_comb, "classic_comb")



## Feature importance stuff

In [None]:
_, _,vector_keys = get_feature_vectors(ann_comb[1]['text'])
feature_weights = list(zip(mdl_comb.classifier.feature_importances_,vector_keys))
feature_weights.sort(reverse=True)
feature_weights[1:10]

In [None]:
len([weight for weight, _ in feature_weights if weight>0])

# Webstuff

In [None]:
book= load_book("../input_books/unlabelled/warbreaker/Warbreaker.epub")
texts, indexes = load_chapters(book)


In [None]:
len(mdl_comb.feature_extractor(texts[4])[2])

In [None]:
mdl_comb.character_scores(texts[0])