In [1]:
import numpy as np

from collections import Counter, defaultdict

import json
from xgboost import XGBClassifier
from sklearn.metrics import *
from sklearn.model_selection import cross_val_score
from sklearn.externals import joblib

from feature_extraction import *
from classify import *
from book import *

In [2]:
from evaluation import *

ann_comb, ann_SOC, ann_ASOIAF, ann_SA
lengths:  [622, 91, 256, 275]
POVs:  30 9 15 6


In [3]:
def hybrid_features(x):
    """Concatenate the classical and the embedding feature vectors for `x`.

    Both extractors are run on the same input and must report the same
    names in the same order. Returns a ``(names, vectors, "hybrid")``
    triple; when no names are produced, both names and vectors are empty
    lists.
    """
    embedding_names, embedding_vectors, _ = get_embedding_features(x)
    classical_names, classical_vectors, _ = get_feature_vectors(x)

    # The two extractors must line up row for row before stacking.
    assert classical_names == emb_names if False else classical_names == embedding_names
    if len(classical_names) == 0:
        return [], [], "hybrid"

    # Classical columns first, embedding columns after them.
    stacked = np.hstack([classical_vectors, embedding_vectors])
    assert stacked.shape[0] == len(classical_names)
    assert stacked.shape[1] > 2
    return classical_names, stacked, "hybrid"

# Evaluation Program

In [4]:
import pandas as pd

def all_metrics(tt, pp):
    """Score predictions `pp` against true labels `tt`.

    Returns a flat array ``[precision, recall, f-score, accuracy]``.
    Precision, recall and f-score are micro-averaged and restricted to the
    labels that occur in `tt`; accuracy is computed over all items.
    """
    labels_in_truth = np.unique(tt)
    precision, recall, fscore = precision_recall_fscore_support(
        tt, pp, average='micro', labels=labels_in_truth
    )[0:3]
    accuracy = accuracy_score(tt, pp)
    return np.hstack([(precision, recall, fscore), accuracy])

# Column labels for the four values returned by all_metrics, in the same order
# (precision, recall, f-score, accuracy).
all_metrics_names = ["P", "R", "F1", "Acc"]

# Eval

In [5]:
# Zero-argument factories: every call builds a brand-new, untrained solver.

def WE_mdl():
    """XGBoost-backed solver that uses get_embedding_features as its feature extractor."""
    return MLCharacterSolver(XGBClassifier(), nicknames2name_comb, get_embedding_features)


def CL_mdl():
    """XGBoost-backed solver built without an explicit feature-extractor argument."""
    return MLCharacterSolver(XGBClassifier(), nicknames2name_comb)


def HY_mdl():
    """XGBoost-backed solver that uses hybrid_features as its feature extractor."""
    return MLCharacterSolver(XGBClassifier(), nicknames2name_comb, hybrid_features)


def FM_mdl():
    """Baseline: FirstMentionedSolver over the combined nickname table."""
    return FirstMentionedSolver(nicknames2name_comb)


def MC_mdl():
    """Baseline: MostMentionedSolver over the combined nickname table."""
    return MostMentionedSolver(nicknames2name_comb)

# (label, annotations) pairs. The labels become index values in the result
# tables, so they are spelled consistently with the annotation variable names
# ("ASOIAF" was previously misspelled "ASIAF").
datasets = [("ASOIAF", ann_ASOIAF), ("SOC", ann_SOC), ("SA", ann_SA)]

# (label, factory) pairs. make_program treats labels starting with "ML" as
# trainable models; every other label is handled as a baseline that gets no
# training data.
base_mdls = [("ML Classical Features", CL_mdl),
             ("ML Hybrid Features", HY_mdl),
             ("ML Word Emb. Features", WE_mdl),
             ("First Mentioned", FM_mdl),
             ("Most Commonly Mentioned", MC_mdl)
       ]

## main eval

In [None]:
def make_program(datasets, mdls):
    """Build the train/test evaluation plan for every dataset/model pair.

    Parameters
    ----------
    datasets : sequence of (name, data) pairs
        Each dataset is used once as the test set.
    mdls : sequence of (name, factory) pairs
        `factory` is a zero-argument callable returning a fresh model.
        Names starting with "ML" mark trainable models.

    Returns
    -------
    dict
        Maps ``(test_name, model_name, train_name)`` to
        ``(train_data, test_data, model)``. A trainable model gets one entry
        per other dataset and, when more than one other dataset exists, an
        extra entry trained on all of them stacked together (train name is
        the names joined by ", "). Any other model gets a single entry with
        train name "---" and an empty training list.
    """
    # Imported locally: the notebook never imports an `it` alias itself, so the
    # previous `it.product` only worked if a star import happened to leak it.
    import itertools

    program = dict()
    for (test_data_name, test_data), (mdl_name, mdl) in itertools.product(datasets, mdls):
        if not mdl_name.startswith("ML"):
            # Baselines need no training data.
            program[(test_data_name, mdl_name, "---")] = ([], test_data, mdl())
            continue

        # Every dataset except the one under test is a training candidate.
        train_sets = [(name, data) for name, data in datasets if name != test_data_name]
        for train_data_name, train_data in train_sets:
            program[(test_data_name, mdl_name, train_data_name)] = (
                train_data,
                test_data,
                mdl()
            )

        if len(train_sets) > 1:
            combined_name = ", ".join(name for name, _ in train_sets)
            program[(test_data_name, mdl_name, combined_name)] = (
                np.hstack([data for _, data in train_sets]),
                test_data,
                mdl()
            )
    return program

program = make_program(datasets, base_mdls)

# One row per program key, one column per metric; rows are put in sorted
# order before they are filled in.
res = pd.DataFrame(
    index=pd.MultiIndex.from_tuples(program.keys()),
    columns=all_metrics_names,
).sort_index()

# Evaluate each planned run and record its scores, echoing progress as we go.
for ind in res.index:
    print(ind, end="")
    score = evaluate(*program[ind], metric=all_metrics)
    res.loc[ind, :] = score
    print(" ", score)

res.to_csv(
    "../results/maineval.csv",
    index_label=["Test Set", "Method", "Train Set"],
)
res

## Cross Evaluation
To test how much differing styles affect the results.

In [None]:
def make_program(datasets, mdls):
    """Pair every dataset with a fresh instance of every model.

    Parameters
    ----------
    datasets : sequence of (name, data) pairs
    mdls : sequence of (name, factory) pairs
        `factory` is a zero-argument callable returning a fresh model.

    Returns
    -------
    dict
        Maps ``(data_name, model_name)`` to ``(data, model)``.
    """
    # Imported locally: the notebook never imports an `it` alias itself, so the
    # previous `it.product` only worked if a star import happened to leak it.
    import itertools

    return {
        (data_name, mdl_name): (data, mdl())
        for (data_name, data), (mdl_name, mdl) in itertools.product(datasets, mdls)
    }
# Cross-validate on each individual dataset and on the combined annotations.
program = make_program(datasets + [("Combined", ann_comb)], base_mdls)

# One row per (dataset, method) key, one column per metric; rows are put in
# sorted order before they are filled in.
res_xval = pd.DataFrame(
    index=pd.MultiIndex.from_tuples(program.keys()),
    columns=all_metrics_names,
).sort_index()

for ind in res_xval.index:
    print(ind, end="")
    score = xval_evaluate(*program[ind], metric=all_metrics)
    res_xval.loc[ind, :] = score
    print(" ", score)

res_xval.to_csv(
    "../results/crosseval.csv",
    index_label=["Dataset", "Method"],
)

res_xval

# Save some trained models

In [None]:
def train_and_save_model(ann, filename):
    """Train an XGBoost-backed MLCharacterSolver on `ann` and persist it.

    The training inputs are whatever extract_texts_and_characters(ann)
    returns, unpacked as positional arguments to the solver's train method.
    The trained solver is dumped with joblib to
    ``../trained_models/<filename>.pkl`` and also returned.
    """
    solver = MLCharacterSolver(XGBClassifier())
    training_inputs = extract_texts_and_characters(ann)
    solver.train(*training_inputs)

    target_path = "".join(("../trained_models/", filename, ".pkl"))
    joblib.dump(solver, target_path)
    return solver


# Train on the combined annotations and save the model as "classic_comb".
mdl_comb = train_and_save_model(ann_comb, "classic_comb")

In [None]:
# Train and save one model per individual annotation set.
mdl_SA = train_and_save_model(ann_SA, "classic_SA")
mdl_SOC = train_and_save_model(ann_SOC, "classic_SOC")
mdl_SOIAF = train_and_save_model(ann_ASOIAF, "classic_ASOIAF")

## Feature importance stuff

In [None]:
# Only the third return value (the feature keys) is needed here. A single
# annotated text is used to obtain them -- presumably the keys are the same
# for every text; TODO confirm.
_, _, vector_keys = get_feature_vectors(ann_comb[1]['text'])

# Pair each importance with its feature key and rank from highest to lowest.
feature_weights = list(zip(mdl_comb.classifier.feature_importances_, vector_keys))
feature_weights.sort(reverse=True)

# Show the ten highest-ranked features. The slice starts at index 0 so the
# single most important feature is included (the earlier [1:10] skipped it).
feature_weights[:10]

In [None]:
# Number of features whose importance is strictly positive.
sum(1 for weight, _ in feature_weights if weight > 0)