In [11]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [1]:
import numpy as np
from pprint import pprint

from collections import Counter, defaultdict

import json
from xgboost import XGBClassifier
import sklearn.tree
from sklearn.metrics import *
from sklearn.model_selection import cross_val_score
from sklearn.externals import joblib

from sample_chapters import *
from feature_extraction import *
from classify import *
from book import *

In [2]:
def extract_texts_and_characters(annotated_data):
    full_characters = np.asarray([datum['character'] for datum in annotated_data])
    full_texts = np.asarray([datum['text'] for datum in annotated_data])
    return full_texts, full_characters



In [125]:
from sklearn.model_selection import KFold

def xval_evaluate(annotated_data, solver, n_splits=10, metric=accuracy_score, mute=True):
    
    full_texts, full_characters = extract_texts_and_characters(annotated_data)
    
    scores = []
    for train_inds, test_inds in KFold(n_splits=n_splits).split(annotated_data):        
        train_ann_data = annotated_data[train_inds]
        test_ann_data = annotated_data[test_inds]

        score = train_test_evaluate(train_ann_data, test_ann_data, solver, n_splits, metric)
        
        if not(mute):
            print(score)
            
        scores.append(score)
    return np.mean(scores, axis=0)


def evaluate(train_ann_data, test_ann_data, solver, n_splits=10, metric=accuracy_score):   
    train_texts, train_characters = extract_texts_and_characters(train_ann_data)
    test_texts, test_characters = extract_texts_and_characters(test_ann_data)

    solver.train(train_texts, train_characters)
    score = solver.test(test_texts, test_characters, metric=metric)
    return score



In [3]:
nicknames2name_comb = {
    "Dany":"Daenerys",
    "Ned" : "Eddard",
    "Sam" : "Samwell",
    "Rollins" : "Pekka"
}

with open("../flat_data/asoif01-04.json","r") as fh:
    ann_ASOIAF = np.asarray(json.load(fh))
    
    
with open("../flat_data/dregs01.json","r") as fh:
    ann_SOC = np.asarray(json.load(fh))
with open("../flat_data/dregs01.json","r") as fh:
    ann_SOC = np.hstack([ann_SOC, np.asarray(json.load(fh))])

ann_comb = np.hstack([ann_SOC, ann_ASOIAF])

np.random.shuffle(ann_ASOIAF)
np.random.shuffle(ann_SOC)
np.random.shuffle(ann_comb)
print("lengths: ", len(ann_comb), len(ann_SOC), len(ann_ASOIAF))
print("POVs: ", *[len(np.unique(extract_texts_and_characters(ann)[1])) for ann in (ann_comb, ann_SOC, ann_ASOIAF)])

lengths:  348 92 256
POVs:  22 7 15


# Evaluation Program

In [236]:
[1,2,3],[1,5,3])

array([1, 3])

In [243]:
import pandas as pd

def all_metrics(tt,pp):
    prf = precision_recall_fscore_support(tt,pp, average='macro', labels=np.unique(tt))[0:3]
    acc = accuracy_score(tt,pp)
    return np.hstack([prf, acc])

all_metrics_names = ["P", "R", "F1", "Acc"]

## main eval

In [244]:
ML_mdl = MLCharacterSolver(XGBClassifier(), nicknames2name_comb)
FM_mdl = FirstMentionedSolver(nicknames2name_comb)
MC_mdl = MostMentionedSolver(nicknames2name_comb)


program = {
    ("ASOIAF", "ML Classical Features Trained on SOC"): (ann_SOC, ann_ASOIAF, ML_mdl) ,
    ("SOC", "ML Classical Features Trained on ASOIAF"): (ann_ASOIAF, ann_SOC, ML_mdl),   
    ("SOC", "First Mentioned") :                        ([], ann_SOC, FM_mdl),
    ("ASOIAF", "First Mentioned"):                      ([], ann_ASOIAF, FM_mdl),
    ("SOC", "Most Commonly Mentioned"):                 ([], ann_SOC, MC_mdl),
    ("ASOIAF", "Most Commonly Mentioned"):              ([], ann_ASOIAF, MC_mdl),
}


res = pd.DataFrame(index=program.keys(), columns = all_metrics_names)
res.sort_index(inplace=True)

for ind in res.index:
    print(ind, end="")
    score = evaluate(*program[ind], metric=all_metrics)
    res.loc[ind,:] = score
    print(" ", score)

res

('ASOIAF', 'First Mentioned')  [0.02190518 0.07219788 0.03227254 0.25      ]
('ASOIAF', 'ML Classical Features Trained on SOC')

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


  [0.46787058 0.48834536 0.47662706 0.91796875]
('ASOIAF', 'Most Commonly Mentioned')  [0.55891593 0.58322807 0.56732114 0.91015625]
('SOC', 'First Mentioned')  [0.1010101  0.19498747 0.13106484 0.36956522]
('SOC', 'ML Classical Features Trained on ASOIAF')  [0.72222222 0.74691358 0.73384168 0.91304348]
('SOC', 'Most Commonly Mentioned')  [0.53948577 0.57805326 0.55175282 0.7826087 ]


  'precision', 'predicted', average, warn_for)


Unnamed: 0,P,R,F1,Acc
"(ASOIAF, First Mentioned)",0.0219052,0.0721979,0.0322725,0.25
"(ASOIAF, ML Classical Features Trained on SOC)",0.467871,0.488345,0.476627,0.917969
"(ASOIAF, Most Commonly Mentioned)",0.558916,0.583228,0.567321,0.910156
"(SOC, First Mentioned)",0.10101,0.194987,0.131065,0.369565
"(SOC, ML Classical Features Trained on ASOIAF)",0.722222,0.746914,0.733842,0.913043
"(SOC, Most Commonly Mentioned)",0.539486,0.578053,0.551753,0.782609


## Cross Evaluation
To test how much it effects things from different styles.

In [245]:
program = {
    ("ASOIAF", "ML Classical Features"):                (ann_ASOIAF, ML_mdl),
    ("SOC", "ML Classical Features"):                   (ann_SOC, ML_mdl),
    ("combined", "ML Classical Features"):              (ann_comb, ML_mdl),
    
    ("SOC", "First Mentioned"):                         (ann_SOC, FM_mdl),
    ("ASOIAF", "First Mentioned"):                      (ann_ASOIAF, FM_mdl),
    ("combined", "First Mentioned"):                    (ann_comb, FM_mdl),
    
    ("SOC", "Most Commonly Mentioned"):                 (ann_SOC, MC_mdl),
    ("ASOIAF", "Most Commonly Mentioned"):              (ann_ASOIAF, MC_mdl),
    ("combined", "Most Commonly Mentioned"):            (ann_comb, MC_mdl),
}

res_xval = pd.DataFrame(index=program.keys(), columns = all_metrics_names)
res_xval.sort_index(inplace=True)

for ind in res_xval.index:
    print(ind, end="")
    score = xval_evaluate(*program[ind], metric=all_metrics) 
    res_xval.loc[ind, :] = score
    print(" ", score)
    
res_xval

('ASOIAF', 'First Mentioned')

  'precision', 'predicted', average, warn_for)


  [0.11669051 0.20019236 0.13732956 0.25015385]
('ASOIAF', 'ML Classical Features')

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


  [0.92700105 0.94379371 0.93134033 0.96076923]
('ASOIAF', 'Most Commonly Mentioned')  [0.81248182 0.84622183 0.82171873 0.91      ]
('SOC', 'First Mentioned')  [0.27833333 0.33738095 0.28857143 0.37      ]
('SOC', 'ML Classical Features')  [1.         0.97777778 0.98333333 0.97777778]
('SOC', 'Most Commonly Mentioned')  [0.75125    0.75875    0.73228571 0.78444444]
('combined', 'First Mentioned')

  'precision', 'predicted', average, warn_for)


  [0.12824748 0.21695928 0.15304311 0.28168067]
('combined', 'ML Classical Features')

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


  [0.90938834 0.92889816 0.91559743 0.95983193]
('combined', 'Most Commonly Mentioned')  [0.7859963  0.80562511 0.78858755 0.87647059]


Unnamed: 0,P,R,F1,Acc
"(ASOIAF, First Mentioned)",0.116691,0.200192,0.13733,0.250154
"(ASOIAF, ML Classical Features)",0.927001,0.943794,0.93134,0.960769
"(ASOIAF, Most Commonly Mentioned)",0.812482,0.846222,0.821719,0.91
"(SOC, First Mentioned)",0.278333,0.337381,0.288571,0.37
"(SOC, ML Classical Features)",1.0,0.977778,0.983333,0.977778
"(SOC, Most Commonly Mentioned)",0.75125,0.75875,0.732286,0.784444
"(combined, First Mentioned)",0.128247,0.216959,0.153043,0.281681
"(combined, ML Classical Features)",0.909388,0.928898,0.915597,0.959832
"(combined, Most Commonly Mentioned)",0.785996,0.805625,0.788588,0.876471


# Save some trained model

In [4]:
def train_and_save_model(ann, filename):
    mdl = MLCharacterSolver(XGBClassifier())
    mdl.train(*extract_texts_and_characters(ann))
    joblib.dump(mdl, "../trained_models/"+filename+".pkl")
    return mdl

train_and_save_model(ann_ASOIAF, "classic_ASOIAF")
train_and_save_model(ann_SOC, "classic_SOC")

mdl_comb = train_and_save_model(ann_comb, "classic_comb")



## Feature importance stuff

In [9]:
_, _,vector_keys = get_feature_vectors(ann_comb[1]['text'])
feature_weights = list(zip(mdl_comb.classifier.feature_importances_,vector_keys))
feature_weights.sort(reverse=True)
feature_weights[1:10]

[(0.07692308, 'occur_percent'),
 (0.07371795, 'last_occur_percent'),
 (0.057692308, 'rank'),
 (0.057692308, 'after_POS_was_VBD'),
 (0.051282052, 'rank_percent'),
 (0.049679488, 'before_POS_was_percent_.'),
 (0.043269232, 'last_occur_position'),
 (0.03846154, 'before_POS_was_percent_NN'),
 (0.033653848, 'before_POS_was_,')]

In [12]:
len([weight for weight, _ in feature_weights if weight>0])

41

In [None]:
""