In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import numpy as np
from pprint import pprint

from collections import Counter, defaultdict

import json
from xgboost import XGBClassifier
import sklearn.tree
from sklearn.metrics import *
from sklearn.model_selection import cross_val_score
from sklearn.externals import joblib

from sample_chapters import *
from feature_extraction import *
from classify import *
from book import *

In [3]:
def extract_texts_and_characters(annotated_data):
    """Split annotated records into parallel arrays of texts and characters.

    Each record is indexed with the keys ``'character'`` and ``'text'``.

    Returns:
        ``(texts, characters)`` -- two numpy arrays in record order.
    """
    def column(key):
        # One pass over the records per requested key.
        return np.asarray([record[key] for record in annotated_data])

    characters = column('character')
    texts = column('text')
    return texts, characters

In [20]:
from sklearn.model_selection import KFold

def xval_evaluate(annotated_data, solver, n_splits=10, metric=accuracy_score, mute=True):
    """K-fold cross-validate ``solver`` on ``annotated_data``.

    Args:
        annotated_data: sequence of annotated records; it is coerced to a
            numpy array so the fold index arrays can be used for selection.
        solver: object passed straight through to ``evaluate``; the same
            instance is reused for every fold.
        n_splits: number of folds handed to ``KFold``.
        metric: scoring callable forwarded to ``evaluate``.
        mute: when false, print each fold's score as it is produced.

    Returns:
        The mean of the fold scores along axis 0 (so a metric that returns a
        vector yields a per-component mean).
    """
    # Index-array selection below needs an ndarray; this also lets callers pass a plain list.
    annotated_data = np.asarray(annotated_data)

    fold_scores = []
    for train_inds, test_inds in KFold(n_splits=n_splits).split(annotated_data):
        score = evaluate(annotated_data[train_inds], annotated_data[test_inds], solver, metric)
        if not mute:
            print(score)
        fold_scores.append(score)
    return np.mean(fold_scores, axis=0)


def evaluate(train_ann_data, test_ann_data, solver, metric=accuracy_score):
    """Train ``solver`` on one annotated set and score it on another.

    Both sets are split into (texts, characters) before training starts.
    Returns whatever ``solver.test`` returns for the given ``metric``.
    """
    train_pair = extract_texts_and_characters(train_ann_data)
    test_texts, test_characters = extract_texts_and_characters(test_ann_data)

    solver.train(*train_pair)
    return solver.test(test_texts, test_characters, metric=metric)



In [60]:
# Nickname -> canonical character name, shared by every solver built in this notebook.
nicknames2name_comb = dict(
    Dany="Daenerys",
    Ned="Eddard",
    Sam="Samwell",
    Rollins="Pekka",
)

with open("../flat_data/asoif01-04.json","r") as fh:
    ann_ASOIAF = np.asarray(json.load(fh))

# NOTE(review): the same file is read twice, so every SOC record appears twice
# in ann_SOC (and therefore in ann_comb). This looks like a copy-paste slip where
# the second entry was meant to be a different file -- confirm and correct the path.
soc_parts = []
for soc_path in ("../flat_data/dregs01.json", "../flat_data/dregs01.json"):
    with open(soc_path, "r") as fh:
        soc_parts.append(np.asarray(json.load(fh)))
ann_SOC = np.hstack(soc_parts)

ann_comb = np.hstack([ann_SOC, ann_ASOIAF])

# In-place shuffles using numpy's global RNG; no seed is set in this notebook,
# so the record order differs from run to run.
for annotations in (ann_ASOIAF, ann_SOC, ann_comb):
    np.random.shuffle(annotations)

print("lengths: ", len(ann_comb), len(ann_SOC), len(ann_ASOIAF))
pov_counts = [
    len(np.unique(extract_texts_and_characters(ann)[1]))
    for ann in (ann_comb, ann_SOC, ann_ASOIAF)
]
print("POVs: ", *pov_counts)

lengths:  348 92 256
POVs:  22 7 15


In [8]:
def hybrid_features(x):
    """Concatenate the classical and the embedding feature vectors for ``x``.

    Both extractors must report the same candidate names; their vectors are
    stacked column-wise with the classical features first.

    Returns:
        ``(names, vectors, "hybrid")``.
    """
    embedding_names, embedding_matrix, _ = get_embedding_features(x, 5)
    classical_names, classical_matrix, _ = get_feature_vectors(x)

    # The two extractors have to agree on the candidates, otherwise rows would not line up.
    assert classical_names == embedding_names

    combined = np.hstack([classical_matrix, embedding_matrix])
    assert combined.shape[0]==len(classical_names)
    assert combined.shape[1]>2
    return classical_names, combined, "hybrid"

# Evaluation Program

In [14]:
import pandas as pd

def all_metrics(tt,pp):
    """Return ``[precision, recall, F1, accuracy]`` for true labels ``tt`` and predictions ``pp``.

    Precision, recall and F1 are micro-averaged over the labels that occur in ``tt``.
    """
    precision, recall, f1 = precision_recall_fscore_support(
        tt, pp, average='micro', labels=np.unique(tt)
    )[0:3]
    accuracy = accuracy_score(tt, pp)
    return np.hstack([[precision, recall, f1], accuracy])

# Column labels matching the order of the values returned by all_metrics.
all_metrics_names = ["P", "R", "F1", "Acc"]

In [160]:
def relabel_grp_preds(y_true, y_pred):
    """Adjust, in place, the predictions of one group that holds at most one positive.

    Groups without a positive are left alone. Otherwise the group must contain
    exactly one non-zero label (checked with an assertion). Both penalties are
    currently zero, so ``y_pred`` keeps its values; earlier formulas are kept
    in the comments next to them.

    Returns:
        None -- ``y_pred`` is modified in place.
    """
    if y_true.sum() < 1:
        return  # no positive sample in this group, nothing to rescale

    negatives = (y_true == 0).sum()
    assert negatives == len(y_true) - 1

    is_target = y_true > 0.5  # labels are 0/1; the comparison just yields a boolean mask
    is_other = ~is_target

    penalty_for_target = 0  # previously: y_pred[is_other].max()
    bonus_for_others = 0    # previously: target score / (len(y_true) - 1)

    y_pred[is_target] -= penalty_for_target  # would increase the target's loss
    y_pred[is_other] += bonus_for_others     # would decrease the others' loss

class GroupedMLCharacterSolver(MLCharacterSolver):
    """MLCharacterSolver that fits its classifier with a hand-written logistic objective.

    Every candidate name of every chapter becomes one binary training row
    (is this candidate the chapter's reference character?). The per-chapter
    regrouping experiment around ``relabel_grp_preds`` was already switched
    off in the objective that ``train`` installed, so rows are treated
    independently and no per-chapter index bookkeeping is kept.
    """

    @staticmethod
    def var_logregobj(y_true, y_pred):
        """Gradient and Hessian of the logistic loss with respect to the raw scores.

        Args:
            y_true: array of 0/1 labels.
            y_pred: array of raw (pre-sigmoid) scores, same length as ``y_true``.

        Returns:
            ``(grad, hess)`` where ``grad = p - y_true`` and
            ``hess = p * (1 - p)`` with ``p = sigmoid(y_pred)``.
        """
        prob = 1.0 / (1.0 + np.exp(-y_pred))
        grad = prob - y_true
        # Floor the Hessian so saturated predictions never yield an exact zero.
        eps = 1e-16
        hess = np.maximum(prob * (1.0 - prob), eps)
        return grad, hess

    def train(self, texts, reference_characters):
        """Fit the classifier on all candidates of all chapters.

        Args:
            texts: iterable of raw chapter texts.
            reference_characters: iterable of the reference name for each text,
                aligned with ``texts``.

        Returns:
            ``self``.

        Raises:
            AssertionError: if the stacked feature matrix is not 2-D, has two
                or fewer columns, or does not match the label count.
        """
        Xs = []  # feature vectors, one row per candidate name
        Ys = []  # True where the candidate equals the chapter's reference name
        for reference_name, raw_text in zip(reference_characters, texts):
            names, vectors, _ = self.feature_extractor(raw_text)
            Xs.extend(vectors)
            Ys.extend((name == reference_name) for name in names)

        Xs = np.asarray(Xs)
        Ys = np.asarray(Ys)
        assert Xs.shape[0]==Ys.shape[0], (Xs.shape[0], Ys.shape[0])
        assert len(Xs.shape)==2, "Xs.shape = "+str(Xs.shape)
        assert Xs.shape[1]>2, "Xs.shape[1] = "+str(Xs.shape[1])

        # Overwrite the classifier's objective with the custom logistic one.
        self.classifier.objective = self.var_logregobj
        self.classifier.fit(Xs,Ys)
        return self

In [116]:
# Parallel text / reference-character arrays for SOC, used by the error-analysis cells.
ann_SOC_texts, ann_SOC_chars  = extract_texts_and_characters(ann_SOC)

In [99]:
# NOTE(review): CL_mdl is only defined in a later cell (under "# Eval"), so this
# cell fails on a fresh Restart & Run All.
clmdd = CL_mdl()
# Trains and tests on the same data, so the score is a training-set score.
evaluate(ann_SOC, ann_SOC, clmdd)
# NOTE(review): bare attribute reference that is never called; other cells use
# `choose_characters` -- confirm what was intended here.
clmdd.choose_character

1.0

In [159]:
# Grouped solver trained and scored on the same SOC data (training-set score).
GML_mdl = GroupedMLCharacterSolver(XGBClassifier(), nicknames2name=nicknames2name_comb)
print(evaluate(ann_SOC, ann_SOC, GML_mdl))
# Solver output for every SOC text, compared against ann_SOC_chars in the next cell
# (presumably one chosen character per text -- confirm against choose_characters).
out_chars = list(GML_mdl.choose_characters(ann_SOC_texts) )

0.695652173913


In [155]:
# Mask of SOC entries whose reference character differs from out_chars,
# then the references and texts of just those entries.
inds = ann_SOC_chars!=out_chars
hard_chars = ann_SOC_chars[inds]
hard_texts = ann_SOC_texts[inds]

In [156]:
# Transpose clmdd's character_scores output for hard_texts[5] into per-candidate tuples.
list(zip(*clmdd.character_scores(hard_texts[5])))

[(0.071906172, 'Nina'),
 (0.88273704, 'Matthias'),
 (0.0004262751, 'Brum'),
 (0.01146984, 'Jarl'),
 (0.00045013832, 'Elderclock'),
 (0.00040778279, 'Ferolind'),
 (0.00032514276, 'Grisha'),
 (0.00049540796, 'Fjerdan'),
 (0.0022502665, 'Specht'),
 (0.00027771172, 'Inej'),
 (0.00020608658, 'Grisha Fabrikators'),
 (0.00043655053, 'Claas'),
 (0.00027771172, 'Corecloth'),
 (0.00043655053, 'Giert'),
 (0.00023983637, 'Avfalle'),
 (0.00022914723, 'Djel'),
 (0.005693309, 'Wylan'),
 (0.00027771172, 'Nothing'),
 (0.00027771172, 'Lars'),
 (0.00022914723, 'Jesper'),
 (0.00020608658, 'None')]

In [157]:
# Same text as the previous cell, scored by the grouped solver for comparison.
list(zip(*GML_mdl.character_scores(hard_texts[5])))

[(0.019663431, 'Nina'),
 (1.5805703e-09, 'Matthias'),
 (1.5805703e-09, 'Brum'),
 (1.5805703e-09, 'Jarl'),
 (1.5805703e-09, 'Elderclock'),
 (1.5805703e-09, 'Ferolind'),
 (1.5805703e-09, 'Grisha'),
 (1.5805703e-09, 'Fjerdan'),
 (1.5805703e-09, 'Specht'),
 (1.5805703e-09, 'Inej'),
 (1.5805703e-09, 'Grisha Fabrikators'),
 (1.5805703e-09, 'Claas'),
 (1.5805703e-09, 'Corecloth'),
 (1.5805703e-09, 'Giert'),
 (1.5805703e-09, 'Avfalle'),
 (1.5805703e-09, 'Djel'),
 (1.5805703e-09, 'Wylan'),
 (1.5805703e-09, 'Nothing'),
 (1.5805703e-09, 'Lars'),
 (1.5805703e-09, 'Jesper'),
 (1.5805703e-09, 'None')]

In [129]:
# Reference characters of the entries selected by `inds`.
hard_chars

array(['Inej', 'Matthias', 'Inej', 'Kaz', 'Nina', 'Kaz', 'Nina', 'Matthias'], 
      dtype='<U8')

In [76]:
# Fresh grouped solver, trained on ASOIAF and tested on SOC.
# The trailing number is presumably a score noted from an earlier run.
GML_mdl = GroupedMLCharacterSolver(XGBClassifier(), nicknames2name=nicknames2name_comb)
evaluate(ann_ASOIAF, ann_SOC, GML_mdl) #0.86956521739130432

0.84782608695652173

In [64]:
# Same train/test split as above with a fresh CL_mdl() solver.
# NOTE(review): CL_mdl is defined in a later cell -- this fails on a fresh top-to-bottom run.
evaluate(ann_ASOIAF, ann_SOC, CL_mdl())

0.91304347826086951

# Eval

In [175]:
def WE_mdl():
    """Fresh MLCharacterSolver using get_embedding_features(x, 5) as feature extractor."""
    return MLCharacterSolver(XGBClassifier(), nicknames2name_comb, lambda x:get_embedding_features(x,5))


def CL_mdl():
    """Fresh MLCharacterSolver with its default feature extractor."""
    return MLCharacterSolver(XGBClassifier(), nicknames2name_comb)


def HY_mdl():
    """Fresh MLCharacterSolver using hybrid_features as feature extractor."""
    return MLCharacterSolver(XGBClassifier(), nicknames2name_comb, hybrid_features)


def FM_mdl():
    """Fresh FirstMentionedSolver."""
    return FirstMentionedSolver(nicknames2name_comb)


def MC_mdl():
    """Fresh MostMentionedSolver."""
    return MostMentionedSolver(nicknames2name_comb)


# NOTE(review): the first label is spelled "ASIAF" (not "ASOIAF"); it is kept as-is
# because it ends up as an index key in the saved result tables.
datasets = [("ASIAF", ann_ASOIAF), ("SOC", ann_SOC)]

# (display name, zero-argument factory) pairs; names starting with "ML" are the
# ones make_program treats as trainable.
base_mdls = [
    ("ML Classical Features", CL_mdl),
    ("ML Hybrid Features", HY_mdl),
    ("ML Word Emb. Features", WE_mdl),
    ("First Mentioned", FM_mdl),
    ("Most Commonly Mentioned", MC_mdl),
]

## main eval

In [None]:
def WO_mdl():
    """Fresh MLCharacterSolver using get_embedding_features(x, 5, True) as feature extractor."""
    # The meaning of the third argument is defined in the project's feature-extraction code.
    return MLCharacterSolver(XGBClassifier(), nicknames2name_comb, lambda x:get_embedding_features(x,5, True))

In [169]:
# Train on ASOIAF, test on SOC, reporting [P, R, F1, Acc].
# The trailing number is presumably a score noted from an earlier run.
evaluate(ann_ASOIAF, ann_SOC, WO_mdl(), metric=all_metrics) #0.84

array([ 0.84782609,  0.84782609,  0.84782609,  0.84782609])

In [170]:
# Reverse direction: train on SOC, test on ASOIAF, reporting [P, R, F1, Acc].
evaluate(ann_SOC, ann_ASOIAF, WO_mdl(), metric=all_metrics) #0.921

array([ 0.921875,  0.921875,  0.921875,  0.921875])

In [162]:
def make_program(datasets, mdls):
    """Build the train/test plan for the main evaluation.

    Args:
        datasets: list of ``(name, data)`` pairs.
        mdls: list of ``(name, factory)`` pairs; ``factory()`` returns a fresh solver.

    Returns:
        dict mapping ``(test_set_name, method_name)`` to
        ``(train_data, test_data, solver)``. Methods whose name starts with
        ``"ML"`` get one entry per *other* dataset, used as training data, and
        the method name is suffixed with ``" trained on <train set>"``. All
        other methods get a single entry with an empty training list.
    """
    # Local import: no visible cell imports the `it` alias this function used to rely on.
    from itertools import product

    program = dict()
    for (test_data_name, test_data), (mdl_name, mdl) in product(datasets, mdls):
        if mdl_name.startswith("ML"):
            for train_data_name, train_data in datasets:
                if train_data_name == test_data_name:
                    continue  # never train and test on the same dataset here

                program[(test_data_name, mdl_name + " trained on " + train_data_name)] = (
                    train_data,
                    test_data,
                    mdl()
                )
        else:
            program[(test_data_name, mdl_name)] = ([], test_data, mdl())
    return program

program = make_program(datasets, base_mdls)


# One row per (test set, method), one column per value returned by all_metrics.
res = pd.DataFrame(index=pd.MultiIndex.from_tuples(program.keys()),
                   columns = all_metrics_names)
res.sort_index(inplace=True)

for ind in res.index:
    print(ind, end="")
    # program[ind] is (train_data, test_data, solver): exactly evaluate's positional arguments.
    score = evaluate(*program[ind], metric=all_metrics)
    res.loc[ind,:] = score
    print(" ", score)


res.to_csv("../results/maineval.csv", index_label=["Testset", "Method"])
res

('ASIAF', 'First Mentioned')  [ 0.25101215  0.2421875   0.24652087  0.2421875 ]
('ASIAF', 'ML Classical Features trained on SOC')  [ 0.9140625  0.9140625  0.9140625  0.9140625]
('ASIAF', 'ML Hybrid Features trained on SOC')  [ 0.9140625  0.9140625  0.9140625  0.9140625]
('ASIAF', 'ML Word Emb. Features trained on SOC')  [ 0.89453125  0.89453125  0.89453125  0.89453125]
('ASIAF', 'Most Commonly Mentioned')  [ 0.92578125  0.92578125  0.92578125  0.92578125]
('SOC', 'First Mentioned')  [ 0.34090909  0.32608696  0.33333333  0.32608696]
('SOC', 'ML Classical Features trained on ASIAF')  [ 0.91304348  0.91304348  0.91304348  0.91304348]
('SOC', 'ML Hybrid Features trained on ASIAF')  [ 0.91304348  0.91304348  0.91304348  0.91304348]
('SOC', 'ML Word Emb. Features trained on ASIAF')  [ 0.69565217  0.69565217  0.69565217  0.69565217]
('SOC', 'Most Commonly Mentioned')  [ 0.84782609  0.84782609  0.84782609  0.84782609]


Unnamed: 0,Unnamed: 1,P,R,F1,Acc
ASIAF,First Mentioned,0.2510121,0.2421875,0.2465209,0.2421875
ASIAF,ML Classical Features trained on SOC,0.9140625,0.9140625,0.9140625,0.9140625
ASIAF,ML Hybrid Features trained on SOC,0.9140625,0.9140625,0.9140625,0.9140625
ASIAF,ML Word Emb. Features trained on SOC,0.8945312,0.8945312,0.8945312,0.8945312
ASIAF,Most Commonly Mentioned,0.9257812,0.9257812,0.9257812,0.9257812
SOC,First Mentioned,0.3409091,0.326087,0.3333333,0.326087
SOC,ML Classical Features trained on ASIAF,0.9130435,0.9130435,0.9130435,0.9130435
SOC,ML Hybrid Features trained on ASIAF,0.9130435,0.9130435,0.9130435,0.9130435
SOC,ML Word Emb. Features trained on ASIAF,0.6956522,0.6956522,0.6956522,0.6956522
SOC,Most Commonly Mentioned,0.8478261,0.8478261,0.8478261,0.8478261


## Cross Evaluation
To test how much differences in style affect the results.

In [None]:
def make_program(datasets, mdls):
    """Build the cross-validation plan: one fresh solver per (dataset, method) pair.

    NOTE: this redefines the ``make_program`` used by the main evaluation with
    a different return shape.

    Args:
        datasets: list of ``(name, data)`` pairs.
        mdls: list of ``(name, factory)`` pairs; ``factory()`` returns a fresh solver.

    Returns:
        dict mapping ``(data_name, mdl_name)`` to ``(data, solver)``.
    """
    # Local import: no visible cell imports the `it` alias this function used to rely on.
    from itertools import product

    program = dict()
    for (data_name, data), (mdl_name, mdl) in product(datasets, mdls):
        program[(data_name, mdl_name)] = (data, mdl())
    return program
# Plan over both datasets plus the combined one; each entry is (data, solver).
program = make_program(datasets+[("Combined", ann_comb)], base_mdls)


# One row per (dataset, method), one column per value returned by all_metrics.
res_xval = pd.DataFrame(index=pd.MultiIndex.from_tuples(program.keys()),
                        columns = all_metrics_names)
res_xval.sort_index(inplace=True)

for ind in res_xval.index:
    print(ind, end="")
    # program[ind] unpacks to xval_evaluate's (annotated_data, solver) arguments.
    score = xval_evaluate(*program[ind], metric=all_metrics) 
    res_xval.loc[ind, :] = score
    print(" ", score)

res_xval.to_csv("../results/crosseval.csv", index_label=["Dataset", "Method"])
    
res_xval

('ASIAF', 'First Mentioned') 

# Save some trained models

In [None]:
def train_and_save_model(ann, filename):
    """Train a default-feature MLCharacterSolver on ``ann`` and dump it with joblib.

    The model is written to ``../trained_models/<filename>.pkl`` and also returned.
    """
    # NOTE(review): unlike the solvers built for evaluation, no nickname map is passed here.
    solver = MLCharacterSolver(XGBClassifier())
    texts, characters = extract_texts_and_characters(ann)
    solver.train(texts, characters)

    target_path = "../trained_models/"+filename+".pkl"
    joblib.dump(solver, target_path)
    return solver

# Train and save one model per corpus; only the combined one is kept in a
# variable, because later cells use mdl_comb.
train_and_save_model(ann_ASOIAF, "classic_ASOIAF")
train_and_save_model(ann_SOC, "classic_SOC")

mdl_comb = train_and_save_model(ann_comb, "classic_comb")



## Feature importance stuff

In [None]:
# Only the third return value (the feature keys) is needed; any record's text will do.
_, _,vector_keys = get_feature_vectors(ann_comb[1]['text'])
# Pair each importance with its feature key and rank from most to least important.
feature_weights = list(zip(mdl_comb.classifier.feature_importances_,vector_keys))
feature_weights.sort(reverse=True)
# Start at index 0: the previous `[1:10]` slice dropped the highest-ranked feature.
feature_weights[:10]

In [None]:
# Number of features with a strictly positive importance.
len([weight for weight, _ in feature_weights if weight>0])

# Webstuff

In [None]:
# Load an unlabelled book and split it into chapters.
# load_book / load_chapters come from the project's star imports.
book= load_book("../input_books/unlabelled/warbreaker/Warbreaker.epub")
texts, indexes = load_chapters(book)


In [None]:
# Length of the third value returned by the feature extractor for texts[4]
# (presumably the feature-key list, as in the feature-importance cell -- confirm).
len(mdl_comb.feature_extractor(texts[4])[2])

In [None]:
# Candidate scores from the combined-corpus model for the first loaded chapter.
mdl_comb.character_scores(texts[0])