In [2]:
import sklearn
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import *
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import ExtraTreesClassifier, GradientBoostingClassifier
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline, make_union
from tpot.builtins import StackingEstimator

In [3]:
import numpy as np

from collections import *

import json
from xgboost import XGBClassifier
from sklearn.metrics import *
from sklearn.model_selection import cross_val_score
from sklearn.externals import joblib

from feature_extraction import *
from classify import *
from book import *

In [4]:
from evaluation import *

ann_comb, ann_SOC, ann_ASOIAF, ann_SA
lengths:  [622, 91, 256, 275]
POVs:  30 9 15 6


In [30]:
# Break every SOC annotation into its "*"-delimited sections. Each kept
# section becomes its own sample carrying the parent entry's character label;
# sections shorter than 200 characters are dropped.
#
# NOTE(review): the saved SOC accuracies further down (e.g.
# 0.4365079365079365 = 55/126) are multiples of 1/126, not 1/122, and this
# cell's execution count (In [30]) is later than the evaluation's (In [27]).
# The stored results were presumably computed with a different version of
# this split -- confirm with Restart Kernel -> Run All.
ann_SOC_split = []
for entry in ann_SOC:
    label, body = entry["character"], entry["text"]
    ann_SOC_split.extend(
        {"character": label, "text": piece}
        for piece in body.split("*")
        if len(piece) >= 200
    )

print(len(ann_SOC_split))

122


In [None]:
# TODO: verify that this actually worked sanely (presumably refers to the section splitting above).

# Evaluation Program

In [25]:
import pandas as pd

def all_metrics(tt, pp):
    """Score predictions `pp` against the ground truth `tt`.

    Only accuracy is reported at the moment. An earlier variant also stacked
    micro-averaged precision/recall/F-score (via
    `precision_recall_fscore_support`) in front of the accuracy; that variant
    is disabled, so the return value is the single accuracy score.

    Keep `all_metrics_names` below in sync with whatever this returns.
    """
    return accuracy_score(tt, pp)


# Column labels for the values produced by `all_metrics`.
all_metrics_names = ["Acc"]

In [26]:
def make_classic_classifier():
    """Return a new, unfitted pipeline: `MaxAbsScaler` -> logistic regression.

    The logistic regression uses an L2 penalty with C=1 and `dual=False`.
    A fresh pipeline is built on every call.
    """
    scaler = MaxAbsScaler()
    logreg = LogisticRegression(C=1, dual=False, penalty="l2")
    return make_pipeline(scaler, logreg)

def make_highdim_classifier():
    """Return a new, unfitted pipeline: `StandardScaler` -> SVC.

    The SVC is created with C=1.0 and `probability=True`. A fresh pipeline is
    built on every call.
    """
    # Import SVC explicitly. A bare `import sklearn` does not by itself load
    # the `sklearn.svm` submodule, so `sklearn.svm.SVC` only resolved when
    # some other import happened to pull it in.
    from sklearn.svm import SVC

    return make_pipeline(
        StandardScaler(),
        SVC(C=1.0, probability=True),
    )
    # Alternative explored earlier (stacked tree ensembles), kept for reference:
    #return make_pipeline(
    #    StackingEstimator(estimator=ExtraTreesClassifier(bootstrap=False, criterion="gini", max_features=0.1, min_samples_leaf=19, min_samples_split=19, n_estimators=100)),
    #    GradientBoostingClassifier(learning_rate=0.1, max_depth=10, max_features=0.2, min_samples_leaf=2, min_samples_split=3, n_estimators=100, subsample=0.1)
    #)


# Zero-argument model factories. Each call builds a brand-new solver, so
# every evaluation run gets its own instance.

def CL_mdl():
    """ML solver around the classic pipeline; no feature extractor is passed explicitly."""
    return MLCharacterSolver(make_classic_classifier(), nicknames2name_comb)


def WE_mdl():
    """ML solver around the high-dimensional pipeline, using `get_embedding_features`."""
    return MLCharacterSolver(make_highdim_classifier(), nicknames2name_comb, get_embedding_features)


def HY_mdl():
    """ML solver around the high-dimensional pipeline, using `hybrid_features`."""
    return MLCharacterSolver(make_highdim_classifier(), nicknames2name_comb, hybrid_features)


def FM_mdl():
    """Baseline solver: `FirstMentionedSolver`."""
    return FirstMentionedSolver(nicknames2name_comb)


def MC_mdl():
    """Baseline solver: `MostMentionedSolver`."""
    return MostMentionedSolver(nicknames2name_comb)


# (name, annotations) pairs used by the main evaluation.
datasets = [
    ("ASOIAF", ann_ASOIAF),
    ("SOC", ann_SOC_split),
]
supdatasets = [
    ("SA", ann_SA),
]

# (display name, factory) pairs. Names starting with "ML" are treated as
# trainable models by `make_program`.
base_mdls = [
    ("ML Classical Features", CL_mdl),
    ("ML Word Emb. Features", WE_mdl),
    ("ML Hybrid Features", HY_mdl),
    ("First Mentioned", FM_mdl),
    ("Most Commonly Mentioned", MC_mdl),
]

## main eval

In [27]:
def make_program(datasets, mdls):
    """Build the schedule of evaluation runs.

    Parameters
    ----------
    datasets : sequence of (name, data) pairs
        Every dataset is used as a test set; for trainable models every
        *other* dataset is used as a training set.
    mdls : sequence of (name, factory) pairs
        `factory` is called with no arguments to create a fresh model for
        each scheduled run. A model whose name starts with "ML" is treated
        as trainable; all others are treated as training-free baselines.

    Returns
    -------
    OrderedDict
        Maps `(test_name, model_name, train_name)` to
        `(train_data, test_data, model)`. Baselines get a single entry per
        test set with `train_name == "---"` and an empty list as train data.
    """
    # Local import: the original relied on an `it` alias that is not imported
    # anywhere in the visible notebook (presumably leaking in through one of
    # the wildcard imports).
    import itertools

    program = OrderedDict()
    for (test_data_name, test_data), (mdl_name, mdl) in itertools.product(datasets, mdls):
        if mdl_name.startswith("ML"):
            # Trainable model: one run per training set other than the test set.
            for train_data_name, train_data in datasets:
                if train_data_name == test_data_name:
                    continue
                program[(test_data_name, mdl_name, train_data_name)] = (
                    train_data,
                    test_data,
                    mdl(),
                )
        else:
            # Baseline: nothing to train on.
            program[(test_data_name, mdl_name, "---")] = ([], test_data, mdl())
    return program

# Schedule every (test set, method, train set) combination.
program = make_program(datasets, base_mdls)

# One row per scheduled run, one column per metric; rows are sorted by key.
res = pd.DataFrame(
    index=pd.MultiIndex.from_tuples(program.keys()),
    columns=all_metrics_names,
).sort_index()

for run_key in res.index:
    # Print the key before evaluating so progress is visible during slow runs.
    print(run_key, end="")
    score = evaluate(*program[run_key], metric=all_metrics)
    res.loc[run_key, :] = score
    print(" ", score)
    # The CSV is rewritten after every run, so the file on disk always
    # reflects the results gathered so far.
    res.to_csv(
        "../results/maineval.csv",
        index_label=["Test Set", "Method", "Train Set"],
    )

res

('ASOIAF', 'First Mentioned', '---')  0.25
('ASOIAF', 'ML Classical Features', 'SOC')  0.953125
('ASOIAF', 'ML Hybrid Features', 'SOC')  0.8828125
('ASOIAF', 'ML Word Emb. Features', 'SOC')  0.86328125
('ASOIAF', 'Most Commonly Mentioned', '---')  0.9140625
('SOC', 'First Mentioned', '---')  0.4365079365079365
('SOC', 'ML Classical Features', 'ASOIAF')  0.8809523809523809
('SOC', 'ML Hybrid Features', 'ASOIAF')  0.8412698412698413
('SOC', 'ML Word Emb. Features', 'ASOIAF')  0.8809523809523809
('SOC', 'Most Commonly Mentioned', '---')  0.7222222222222222


Unnamed: 0,Unnamed: 1,Unnamed: 2,Acc
ASOIAF,First Mentioned,---,0.25
ASOIAF,ML Classical Features,SOC,0.953125
ASOIAF,ML Hybrid Features,SOC,0.882812
ASOIAF,ML Word Emb. Features,SOC,0.863281
ASOIAF,Most Commonly Mentioned,---,0.914062
SOC,First Mentioned,---,0.436508
SOC,ML Classical Features,ASOIAF,0.880952
SOC,ML Hybrid Features,ASOIAF,0.84127
SOC,ML Word Emb. Features,ASOIAF,0.880952
SOC,Most Commonly Mentioned,---,0.722222


In [31]:
# I am guessing this comes down to sections not being split correctly,
# e.g. ending up with sections that contain nothing but random characters or similar.
# It would be good to investigate this further later.
