In [1]:
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import *
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import ExtraTreesClassifier, GradientBoostingClassifier
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline, make_union
from tpot.builtins import StackingEstimator

In [2]:
import itertools
import json
from collections import Counter, defaultdict

import numpy as np
from xgboost import XGBClassifier
from sklearn.metrics import *
from sklearn.model_selection import cross_val_score
from sklearn.externals import joblib

from feature_extraction import *
from classify import *
from book import *

In [3]:
from evaluation import *

ann_comb, ann_SOC, ann_ASOIAF, ann_SA
lengths:  [622, 91, 256, 275]
POVs:  30 9 15 6


# Evaluation Program

In [4]:
import pandas as pd

def all_metrics(tt, pp):
    """Score a set of predictions against the ground truth.

    Args:
        tt: the true labels.
        pp: the predicted labels, passed to ``accuracy_score`` together
            with ``tt``.

    Returns:
        The result of ``accuracy_score(tt, pp)``. Only accuracy is reported,
        which is why ``all_metrics_names`` has a single entry.
    """
    accuracy = accuracy_score(tt, pp)
    return accuracy


# Column labels for the value(s) returned by all_metrics, in order.
all_metrics_names = ["Acc"]

In [8]:
def make_classic_classifier():
    """Build a fresh, unfitted pipeline: MaxAbsScaler followed by logistic regression.

    Returns:
        The pipeline produced by ``make_pipeline`` from a ``MaxAbsScaler`` and
        an L2-penalised ``LogisticRegression`` (``C=1``, ``dual=False``).
    """
    scaler = MaxAbsScaler()
    logistic = LogisticRegression(C=1, dual=False, penalty="l2")
    return make_pipeline(scaler, logistic)

def make_highdim_classifier(random_state=None):
    """Build a fresh, unfitted stacked pipeline: extra-trees feeding gradient boosting.

    Both stages are randomised (random tree construction, and gradient boosting
    with ``subsample=0.1``), so repeated runs give different scores unless a
    seed is supplied.

    Args:
        random_state: optional seed (or ``RandomState``) forwarded to both the
            ``ExtraTreesClassifier`` and the ``GradientBoostingClassifier``.
            The default ``None`` keeps the previous, unseeded behaviour.

    Returns:
        The pipeline produced by ``make_pipeline`` from a ``StackingEstimator``
        wrapping the extra-trees model, followed by the gradient-boosting model.
    """
    stacked_trees = StackingEstimator(
        estimator=ExtraTreesClassifier(
            bootstrap=False,
            criterion="gini",
            max_features=0.1,
            min_samples_leaf=19,
            min_samples_split=19,
            n_estimators=100,
            random_state=random_state,
        )
    )
    booster = GradientBoostingClassifier(
        learning_rate=0.1,
        max_depth=10,
        max_features=0.2,
        min_samples_leaf=2,
        min_samples_split=3,
        n_estimators=100,
        subsample=0.1,
        random_state=random_state,
    )
    return make_pipeline(stacked_trees, booster)


# Zero-argument factories: each call builds a brand-new, untrained solver so
# that no fitted state is shared between evaluation runs.

def WE_mdl():
    """New ML solver using the high-dimensional classifier and get_embedding_features."""
    return MLCharacterSolver(make_highdim_classifier(), nicknames2name_comb, get_embedding_features)


def CL_mdl():
    """New ML solver using the classic classifier and the solver's default features."""
    return MLCharacterSolver(make_classic_classifier(), nicknames2name_comb)


def HY_mdl():
    """New ML solver using the high-dimensional classifier and hybrid_features."""
    return MLCharacterSolver(make_highdim_classifier(), nicknames2name_comb, hybrid_features)


def FM_mdl():
    """New FirstMentionedSolver baseline."""
    return FirstMentionedSolver(nicknames2name_comb)


def MC_mdl():
    """New MostMentionedSolver baseline."""
    return MostMentionedSolver(nicknames2name_comb)


# (name, annotations) pairs that are used as test sets.
datasets = [
    ("ASOIAF", ann_ASOIAF),
    ("SOC", ann_SOC),
]

# (name, annotations) pairs that are only ever used as extra training data.
supdatasets = [
    ("SA", ann_SA),
]

# (display name, factory) pairs. Names starting with "ML" mark trainable
# models; the make_program functions rely on that prefix.
base_mdls = [
    ("ML Classical Features", CL_mdl),
    ("ML Hybrid Features", HY_mdl),
    ("ML Word Emb. Features", WE_mdl),
    ("First Mentioned", FM_mdl),
    ("Most Commonly Mentioned", MC_mdl),
]

## Main evaluation

In [9]:
def make_program(datasets, mdls):
    """Build the run plan for the main evaluation (train on one dataset, test on another).

    Args:
        datasets: sequence of ``(name, data)`` pairs. It is iterated more than
            once, so it must be a re-iterable container such as a list.
        mdls: iterable of ``(name, factory)`` pairs, where ``factory()``
            returns a new model instance.

    Returns:
        A dict mapping ``(test_name, model_name, train_name)`` to
        ``(train_data, test_data, model)``:

        * models whose name starts with ``"ML"`` get one entry per dataset
          other than the test set, trained on that other dataset;
        * every other model gets a single entry with train name ``"---"`` and
          an empty list as training data.
    """
    program = {}
    # itertools is referenced explicitly instead of through an ``it`` alias
    # that this notebook never imports itself.
    for (test_data_name, test_data), (mdl_name, mdl) in itertools.product(datasets, mdls):
        if not mdl_name.startswith("ML"):
            program[(test_data_name, mdl_name, "---")] = ([], test_data, mdl())
            continue

        for train_data_name, train_data in datasets:
            if train_data_name == test_data_name:
                # Never train on the dataset being tested.
                continue
            program[(test_data_name, mdl_name, train_data_name)] = (
                train_data,
                test_data,
                mdl(),
            )
    return program

# Run plan for the main evaluation, keyed by (test set, method, train set).
program = make_program(datasets, base_mdls)

# Empty results table: one row per planned run (sorted by key), one column per metric.
res = pd.DataFrame(
    index=pd.MultiIndex.from_tuples(list(program)),
    columns=all_metrics_names,
).sort_index()

for ind in res.index:
    print(ind, end="")
    score = evaluate(*program[ind], metric=all_metrics)
    res.loc[ind, :] = score
    print(" ", score)
    # The CSV is rewritten after every run, so partial results are on disk
    # even if a later run fails.
    res.to_csv(
        "../results/maineval.csv",
        index_label=["Test Set", "Method", "Train Set"],
    )

res

('ASOIAF', 'First Mentioned', '---')  0.25
('ASOIAF', 'ML Classical Features', 'SOC')  0.95703125
('ASOIAF', 'ML Hybrid Features', 'SOC')  0.94921875
('ASOIAF', 'ML Word Emb. Features', 'SOC')  0.82421875
('ASOIAF', 'Most Commonly Mentioned', '---')  0.9140625
('SOC', 'First Mentioned', '---')  0.42857142857142855
('SOC', 'ML Classical Features', 'ASOIAF')  0.9230769230769231
('SOC', 'ML Hybrid Features', 'ASOIAF')  0.6263736263736264
('SOC', 'ML Word Emb. Features', 'ASOIAF')  0.31868131868131866
('SOC', 'Most Commonly Mentioned', '---')  0.7802197802197802


Unnamed: 0,Unnamed: 1,Unnamed: 2,Acc
ASOIAF,First Mentioned,---,0.25
ASOIAF,ML Classical Features,SOC,0.957031
ASOIAF,ML Hybrid Features,SOC,0.949219
ASOIAF,ML Word Emb. Features,SOC,0.824219
ASOIAF,Most Commonly Mentioned,---,0.914062
SOC,First Mentioned,---,0.428571
SOC,ML Classical Features,ASOIAF,0.923077
SOC,ML Hybrid Features,ASOIAF,0.626374
SOC,ML Word Emb. Features,ASOIAF,0.318681
SOC,Most Commonly Mentioned,---,0.78022


## Cross Evaluation
To test how much the differing writing styles of the datasets affect the results.

In [7]:
def make_program(datasets, mdls):
    """Build the run plan for cross-validation: every model on every dataset.

    Args:
        datasets: iterable of ``(name, data)`` pairs.
        mdls: iterable of ``(name, factory)`` pairs, where ``factory()``
            returns a new model instance.

    Returns:
        A dict mapping ``(dataset_name, model_name)`` to ``(data, model)``,
        with a separate model instance created for every entry.
    """
    # itertools is referenced explicitly instead of through an ``it`` alias
    # that this notebook never imports itself.
    return {
        (data_name, mdl_name): (data, mdl())
        for (data_name, data), (mdl_name, mdl) in itertools.product(datasets, mdls)
    }
# Cross-validate on each test dataset and on the combined annotations.
program = make_program(datasets + [("Combined", ann_comb)], base_mdls)

# Empty results table: one row per (dataset, method) pair (sorted), one column per metric.
res_xval = pd.DataFrame(
    index=pd.MultiIndex.from_tuples(list(program)),
    columns=all_metrics_names,
).sort_index()

for ind in res_xval.index:
    print(ind, end="")
    score = xval_evaluate(*program[ind], metric=all_metrics)
    res_xval.loc[ind, :] = score
    print(" ", score)
    # The CSV is rewritten after every run, so partial results are on disk
    # even if a later run fails.
    res_xval.to_csv(
        "../results/crosseval.csv",
        index_label=["Dataset", "Method"],
    )

res_xval

('ASIAF', 'First Mentioned')  0.25
('ASIAF', 'ML Classical Features')  0.9453846153846154
('ASIAF', 'ML Hybrid Features')  0.9646153846153848
('ASIAF', 'ML Word Emb. Features')  0.944923076923077
('ASIAF', 'Most Commonly Mentioned')  0.9141538461538463
('Combined', 'First Mentioned')  0.23796722990271374
('Combined', 'ML Classical Features')  0.8938812083973374
('Combined', 'ML Hybrid Features')  0.9003072196620583
('Combined', 'ML Word Emb. Features')  0.8908090117767538
('Combined', 'Most Commonly Mentioned')  0.8682539682539682
('SOC', 'First Mentioned')  0.4311111111111111
('SOC', 'ML Classical Features')  0.9133333333333333
('SOC', 'ML Hybrid Features')  0.9455555555555556
('SOC', 'ML Word Emb. Features')  0.9444444444444444
('SOC', 'Most Commonly Mentioned')  0.7811111111111111


Unnamed: 0,Unnamed: 1,Acc
ASIAF,First Mentioned,0.25
ASIAF,ML Classical Features,0.945385
ASIAF,ML Hybrid Features,0.964615
ASIAF,ML Word Emb. Features,0.944923
ASIAF,Most Commonly Mentioned,0.914154
Combined,First Mentioned,0.237967
Combined,ML Classical Features,0.893881
Combined,ML Hybrid Features,0.900307
Combined,ML Word Emb. Features,0.890809
Combined,Most Commonly Mentioned,0.868254


# Supplementary data

In [12]:
def make_program(datasets, supdatasets, mdls):
    """Build the run plan for training on several datasets at once.

    For every test dataset and every ML model, the model is trained on the
    concatenation of all other datasets (test-capable and supplementary).

    Args:
        datasets: sequence of ``(name, data)`` pairs used as test sets (and as
            training data for the other test sets).
        supdatasets: sequence of ``(name, data)`` pairs used only for training.
        mdls: iterable of ``(name, factory)`` pairs, where ``factory()``
            returns a new model instance.

    Returns:
        A dict mapping ``(test_name, model_name, train_names)`` to
        ``(train_data, test_data, model)``, where ``train_names`` is the
        training dataset names joined with ``" and "`` and ``train_data`` is
        ``np.hstack`` of their data.

        Two kinds of run are left out: models whose name does not start with
        ``"ML"``, and cases where fewer than two training datasets remain
        after removing the test set.
    """
    all_datasets = list(datasets)
    all_datasets.extend(supdatasets)

    program = {}
    # itertools is referenced explicitly instead of through an ``it`` alias
    # that this notebook never imports itself.
    for (test_data_name, test_data), (mdl_name, mdl) in itertools.product(datasets, mdls):
        if not mdl_name.startswith("ML"):
            continue

        # Everything except the dataset being tested is training material.
        training_sets = [
            (name, data) for name, data in all_datasets if name != test_data_name
        ]
        if len(training_sets) <= 1:
            continue

        combined_data_names = [name for name, _ in training_sets]
        combined_data = [data for _, data in training_sets]
        train_data_name = " and ".join(combined_data_names)
        program[(test_data_name, mdl_name, train_data_name)] = (
            np.hstack(combined_data),
            test_data,
            mdl(),
        )
    return program

# Run plan: ML models trained on all other datasets plus the supplementary data.
program = make_program(datasets, supdatasets, base_mdls)

# Empty results table: one row per planned run (sorted by key), one column per metric.
res = pd.DataFrame(
    index=pd.MultiIndex.from_tuples(list(program)),
    columns=all_metrics_names,
).sort_index()

for ind in res.index:
    print(ind, end="")

    score = evaluate(*program[ind], metric=all_metrics)
    res.loc[ind, :] = score
    print(" ", score)
    # The CSV is rewritten after every run, so partial results are on disk
    # even if a later run fails.
    res.to_csv(
        "../results/extradata.csv",
        index_label=["Test Set", "Method", "Train Set"],
    )

res

('ASIAF', 'ML Classical Features', 'SOC and SA')  0.953125
('ASIAF', 'ML Hybrid Features', 'SOC and SA')  0.953125
('ASIAF', 'ML Word Emb. Features', 'SOC and SA')  0.921875
('SOC', 'ML Classical Features', 'ASIAF and SA')  0.8241758241758241
('SOC', 'ML Hybrid Features', 'ASIAF and SA')  0.8571428571428571
('SOC', 'ML Word Emb. Features', 'ASIAF and SA')  0.9340659340659341


Unnamed: 0,Unnamed: 1,Unnamed: 2,Acc
ASIAF,ML Classical Features,SOC and SA,0.953125
ASIAF,ML Hybrid Features,SOC and SA,0.953125
ASIAF,ML Word Emb. Features,SOC and SA,0.921875
SOC,ML Classical Features,ASIAF and SA,0.824176
SOC,ML Hybrid Features,ASIAF and SA,0.857143
SOC,ML Word Emb. Features,ASIAF and SA,0.934066


# Save some trained models

In [41]:
def train_and_save_model(ann, filename, model):
    """Train a new solver on an annotated dataset and pickle it to disk.

    Args:
        ann: annotations passed to ``extract_texts_and_characters``; its
            result is unpacked as the arguments of ``train``.
        filename: base name of the output file, without directory or extension.
        model: zero-argument factory returning an untrained solver.

    Returns:
        The trained solver, which has also been written with ``joblib.dump``
        to ``../trained_models/<filename>.pkl``.
    """
    solver = model()
    training_args = extract_texts_and_characters(ann)
    solver.train(*training_args)

    target_path = "../trained_models/" + filename + ".pkl"
    joblib.dump(solver, target_path)
    return solver

In [42]:
# Train the classic-feature solver on the SOC annotations; train_and_save_model
# also writes it to ../trained_models/CL_SOC.pkl.
CL_SOC = train_and_save_model(ann_SOC, "CL_SOC", CL_mdl)
# NOTE(review): the hybrid model is disabled here, but a later cell calls
# feature_importance(HY_SOC); on a fresh kernel that call fails unless this
# line is re-enabled.
#HY_SOC = train_and_save_model(ann_SOC, "HY_SOC", HY_mdl)

In [43]:
# Train the classic-feature solver on the ASOIAF annotations; train_and_save_model
# also writes it to ../trained_models/CL_ASOIAF.pkl.
CL_ASOIAF = train_and_save_model(ann_ASOIAF, "CL_ASOIAF", CL_mdl)
# NOTE(review): the hybrid model is disabled here, but a later cell calls
# feature_importance(HY_ASOIAF); on a fresh kernel that call fails unless this
# line is re-enabled.
#HY_ASOIAF = train_and_save_model(ann_ASOIAF, "HY_ASOIAF", HY_mdl)

## Feature importance stuff

In [11]:
def feature_importance(mdl):
    """Print the strictly positive feature importances of a trained solver, largest first.

    Feature names are the third value returned by ``get_feature_vectors`` for
    the text of ``ann_comb[1]``.
    NOTE(review): this assumes the feature names are the same for every text
    and line up with ``mdl.classifier.feature_importances_`` -- confirm.

    Args:
        mdl: object whose ``classifier`` attribute exposes
            ``feature_importances_``.

    Returns:
        None. Prints the number of non-zero weights, then one
        ``weight, name`` line per feature.
    """
    _, _, vector_keys = get_feature_vectors(ann_comb[1]['text'])

    # Sorting (weight, name) tuples in reverse puts the largest weights first;
    # equal weights are ordered by name, descending.
    ranked = sorted(zip(mdl.classifier.feature_importances_, vector_keys), reverse=True)
    non_zero_weights = [pair for pair in ranked if pair[0] > 0]

    print("Number of nonzeo weights: ", len(non_zero_weights))
    report_lines = [", ".join(map(str, pair)) for pair in non_zero_weights]
    print("\n".join(report_lines))


In [36]:
# Feature importances of CL_SOC, the solver trained on the SOC annotations.
feature_importance(CL_SOC)

Number of nonzeo weights:  31
0.11742424, before_POS_was_percent_,
0.11174242, after_POS_was_percent_VBD
0.09469697, before_POS_was_.
0.09280303, occur_percent
0.071969695, occur_rank
0.054924242, last_occur_percent
0.053030305, after_POS_was_percent_NNP
0.047348484, before_POS_was_percent_NN
0.041666668, occur_rank_percent
0.039772727, after_POS_was_percent_,
0.03787879, before_POS_was_percent_.
0.03787879, after_POS_was_percent_MD
0.035984848, last_occur_position
0.026515152, after_POS_was_MD
0.024621213, before_POS_was_percent_IN
0.017045455, before_POS_was_percent_VBD
0.017045455, before_POS_was_,
0.013257576, after_POS_was_percent_.
0.009469697, before_POS_was_percent_JJ
0.009469697, before_POS_was_percent_CC
0.009469697, after_POS_was_.
0.0056818184, before_POS_was_percent_WRB
0.0056818184, before_POS_was_percent_TO
0.003787879, occur_count
0.003787879, before_POS_was_VBN
0.003787879, before_POS_was_VBD
0.003787879, after_POS_was_percent_VB
0.003787879, after_POS_was_VBD
0.001893

In [46]:
# NOTE(review): HY_SOC is only assigned in a commented-out line of the
# "save trained models" cell, so this call depends on leftover kernel state.
feature_importance(HY_SOC)

Number of nonzeo weights:  14
0.08, occur_percent
0.07157895, occur_rank
0.06526316, before_POS_was_.
0.042105265, after_POS_was_percent_VBD
0.035789475, before_POS_was_percent_,
0.02736842, occur_rank_percent
0.025263159, after_POS_was_MD
0.010526316, before_POS_was_percent_.
0.0063157897, last_occur_percent
0.004210526, after_POS_was_NNP
0.002105263, occur_count
0.002105263, before_POS_was_percent_JJ
0.002105263, after_POS_was_percent_MD
0.002105263, after_POS_was_percent_,


In [12]:
# Feature importances of CL_ASOIAF, the solver trained on the ASOIAF annotations.
feature_importance(CL_ASOIAF)

Number of nonzeo weights:  43
0.09724473, occur_rank_percent
0.09400324, last_occur_percent
0.07779579, before_POS_was_percent_,
0.07131281, after_POS_was_VBD
0.06320908, occur_rank
0.051863857, occur_percent
0.050243113, before_POS_was_,
0.04376013, before_POS_was_percent_RB
0.035656404, after_POS_was_percent_VBD
0.035656404, after_POS_was_percent_IN
0.034035657, before_POS_was_percent_.
0.029173419, before_POS_was_percent_NN
0.029173419, before_POS_was_NNP
0.025931928, last_occur_position
0.022690438, before_POS_was_percent_VBP
0.022690438, after_POS_was_percent_.
0.019448947, before_POS_was_percent_IN
0.019448947, after_POS_was_percent_MD
0.019448947, after_POS_was_percent_,
0.017828202, before_POS_was_percent_CC
0.016207455, before_POS_was_percent_VB
0.016207455, after_POS_was_percent_NNP
0.016207455, after_POS_was_percent_NN
0.014586709, before_POS_was_percent_DT
0.012965964, after_POS_was_percent_VBZ
0.008103727, before_POS_was_RB
0.008103727, before_POS_was_IN
0.006482982, befor

In [13]:
# NOTE(review): HY_ASOIAF is only assigned in a commented-out line of the
# "save trained models" cell, so this call depends on leftover kernel state.
feature_importance(HY_ASOIAF)

Number of nonzeo weights:  23
0.05939005, after_POS_was_VBD
0.043338683, occur_rank
0.030497592, occur_rank_percent
0.030497592, before_POS_was_,
0.028892456, last_occur_percent
0.02247191, occur_percent
0.019261638, before_POS_was_percent_,
0.016051365, before_POS_was_NNP
0.014446228, before_POS_was_percent_RB
0.0048154094, before_POS_was_percent_VBP
0.0048154094, before_POS_was_percent_JJ
0.0048154094, before_POS_was_.
0.0048154094, after_POS_was_percent_VBD
0.0048154094, after_POS_was_percent_,
0.0032102729, occur_count
0.0032102729, after_POS_was_percent_IN
0.0032102729, after_POS_was_percent_.
0.0032102729, after_POS_was_NN
0.0016051364, before_POS_was_percent_IN
0.0016051364, before_POS_was_percent_''
0.0016051364, before_POS_was_RB
0.0016051364, after_POS_was_percent_NN
0.0016051364, after_POS_was_CC
