In [None]:
%load_ext autoreload
%autoreload 2

In [1]:
import numpy as np
from pprint import pprint

from collections import Counter, defaultdict

import json
from xgboost import XGBClassifier
import sklearn.tree
from sklearn.metrics import *
from sklearn.model_selection import cross_val_score
from sklearn.externals import joblib

from sample_chapters import *
from feature_extraction import *
from classify import *
from book import *

In [2]:
from evaluation import *

ann_comb, ann_SOC, ann_ASOIAF, ann_SA
lengths:  [622, 91, 256, 275]
POVs:  30 9 15 6


# Grouping Loss

In [None]:
def relabel_grp_preds(y_true, y_pred):
    if  y_true.sum()<1:
        return #No positive samples, thus no rescaling required
    
    assert (y_true==0).sum()==len(y_true)-1

    target_inds = y_true>0.5 # Actually it is always 0 or 1, but force it to logical
    other_inds = np.logical_not(target_inds)

    target_score = y_pred[target_inds]
    target_penalty = 0 #y_pred[other_inds].max()
    other_penalty = 0#target_score/(len(y_true)-1)

    # Fake out the scores
    y_pred[target_inds] -= target_penalty # increase it's loss
    y_pred[other_inds]  += other_penalty #Decreases their loss

class GroupedMLCharacterSolver(MLCharacterSolver):


    def var_logregobj(y_true, y_pred):

        #oy_pred=y_pred.copy()
        for g_ind in G_inds:
            relabel_grp_preds(y_true[g_ind], y_pred[g_ind]) #Inplace

        #print(np.vstack([oy_pred, y_pred, y_true]).T)
        #print("------------")

        y_pred = 1.0 / (1.0 + np.exp(-y_pred))
        grad = y_pred-y_true
        hess = y_pred * (1.0 - y_true)
        return grad, hess

    def train(self, texts, reference_characters):
        Xs = [] # Feature vectors
        Ys = [] # Binary as to if this feature is the target
        last_ind = 0
        G_inds = []
        for reference_name, raw_text in zip(reference_characters, texts):
            names, vectors, _ = self.feature_extractor(raw_text)
            Xs.extend(vectors)
            y = [(name == reference_name) for name in names]
            Ys.extend(y)
            first_ind = last_ind
            last_ind += len(names)
            G_inds.append(slice(first_ind,last_ind))

        Xs = np.asarray(Xs)
        Ys = np.asarray(Ys)
        assert Xs.shape[0]==Ys.shape[0], (Xs.shape[0], Ys.shape[0])
        assert len(Xs.shape)==2, "Xs.shape = "+str(Xs.shape)
        assert Xs.shape[1]>2, "Xs.shape[1] = "+str(Xs.shape[1])

        # closure over G_inds
        def var_logregobj(y_true, y_pred):
            #for g_ind in G_inds:
            #    relabel_grp_preds(y_true[g_ind], y_pred[g_ind]) #Inplace
            
            y_pred =  1.0 / (1.0 + np.exp(-y_pred))
            grad = y_pred-y_true
            eps=1e-16;
            hess = np.maximum(y_pred * (1.0 - y_pred), eps);
            return grad, hess

        self.classifier.objective = var_logregobj #Over write it to the closure
        self.classifier.fit(Xs,Ys)
        return self

In [None]:
ann_SOC_texts, ann_SOC_chars  = extract_texts_and_characters(ann_SOC)

In [None]:
clmdd = CL_mdl()
evaluate(ann_SOC, ann_SOC, clmdd)
clmdd.choose_character

In [None]:
GML_mdl = GroupedMLCharacterSolver(XGBClassifier(), nicknames2name=nicknames2name_comb)
print(evaluate(ann_SOC, ann_SOC, GML_mdl))
out_chars = list(GML_mdl.choose_characters(ann_SOC_texts) )

In [None]:
inds = ann_SOC_chars!=out_chars
hard_chars = ann_SOC_chars[inds]
hard_texts = ann_SOC_texts[inds]

In [None]:
list(zip(*clmdd.character_scores(hard_texts[5])))

In [None]:
list(zip(*GML_mdl.character_scores(hard_texts[5])))

In [None]:
hard_chars

In [None]:
GML_mdl = GroupedMLCharacterSolver(XGBClassifier(), nicknames2name=nicknames2name_comb)
evaluate(ann_ASOIAF, ann_SOC, GML_mdl) #0.86956521739130432

In [None]:
evaluate(ann_ASOIAF, ann_SOC, CL_mdl())

## Try out aumented Word Emb + Occurrencee

In [None]:
WO_mdl = lambda: MLCharacterSolver(
    XGBClassifier(),
    nicknames2name_comb,
    lambda x: get_embedding_features(x, include_occur_count_statistics=True))

In [None]:
evaluate(ann_ASOIAF, ann_SOC, WO_mdl(), metric=all_metrics) #0.84

In [None]:
evaluate(ann_SOC, ann_ASOIAF, WO_mdl(), metric=all_metrics) #0.921

# More rank features


In [5]:
from collections import OrderedDict

In [11]:
Counter([1,1,1,2,3,4]).most_common()

[(4, 1), (3, 1), (2, 1), (1, 3)]

In [33]:
def get_rank_feature_vectors(raw_text):
    ne_words = 2*[PADDING_TOKEN] + ne_preprocess(raw_text) + 2*[PADDING_TOKEN]
    features = ["occurs"]
    features.extend(["POS_w-1_"+tag for tag in tagset])
    features.extend(["POS_w+1_"+tag for tag in tagset])
    
    feature_counters = OrderedDict((feature, Counter()) for feature in features)
    all_names = set()
    for ii, (before, cur, after) in enumerate(triplewise(ne_words)):
        if type(cur)==nltk.tree.Tree and cur.label()=='NE':
            name = get_name(cur)
            all_names.add(name)
            feature_counters["occurs"][name]+=1
            feature_counters["POS_w-1_" + get_tag(before)][name]+=1
            feature_counters["POS_w+1_" + get_tag(after)][name]+=1
                
    #Final Percent processing, and flattening
    name2vecs = defaultdict(lambda: np.zeros(2*len(features)))
    for feature_ii_base, feature_counts in enumerate(feature_counters.values()):
        feature_ii = 2*feature_ii_base
        for inv_rank,(name, count) in enumerate(feature_counts.most_common()[::-1],1):
            # note: inv_rank, so higher is better, so zero is sane default
            # make it out of 1 so has upper and lower bound
            name2vecs[name][feature_ii] = inv_rank/len(all_names)
            name2vecs[name][feature_ii+1] = count
            
    
    return list(name2vecs.keys()), list(name2vecs.values()), features

In [34]:
ML_mdl = MLCharacterSolver(
    XGBClassifier(),
    nicknames2name=nicknames2name_comb,
    feature_extractor= get_rank_feature_vectors
)
evaluate(ann_ASOIAF, ann_SOC, ML_mdl)

0.86813186813186816

# More POS features


In [40]:
list(range(1,3))

[1, 2]

In [51]:

def ExpandedFeatureVec(half_win):
    vec=OrderedDict()
    vec["occur_count"]=0
    vec["occur_percent"]=0.0
    vec["first_occur_position"]=0
    vec["first_occur_percent"]=0.0
    vec["last_occur_position"]=0
    vec["last_occur_percent"]=0.0
    vec["occur_rank"]=0
    vec["occur_rank_percent"]=0
    for tag in tagset:
        for ii in range(1,half_win+1):
            vec["POS_w-%i_"%ii +tag]=0
            vec["POS_w+%i_"%ii +tag]=0
            vec["POS_w-%i__was_percent_"%ii +tag]=0.0
            vec["POS_w+%i__was_percent_"%ii +tag]=0.0
        
    return vec


"""
Returns a list of names, feature_vectors, and a definition of the feature vector keys
"""
def get_more_feature_vectors(raw_text):
    
    ne_words = 2*[PADDING_TOKEN] + ne_preprocess(raw_text) + 2*[PADDING_TOKEN]
    
    feature_vecs = defaultdict(lambda: ExpandedFeatureVec(2))
    overall_counts = Counter()
    
    for ii, (wn2,wn1,cur,wp1,wp2) in enumerate(nwise(5, ne_words)):
        if type(cur)==nltk.tree.Tree and cur.label()=='NE':
            name = get_name(cur)
            
            overall_counts[name]+=1
            vec = feature_vecs[name]
            vec["occur_count"]+=1 #should be equal to overall_counts

            vec["POS_w-2_" +get_tag(wn2)]+=1
            vec["POS_w-1_" +get_tag(wn1)]+=1
            vec["POS_w+1_" +get_tag(wp1)]+=1
            vec["POS_w+2_" +get_tag(wp2)]+=1
            
            vec["last_occur_position"] = ii #update last seen to this sent
            if not("first_occur_position" in vec): #if not set then this sent must be first time
                vec["first_occur_position"] = ii
    
    ###Basic data collected
    number_named_entities = len(overall_counts)

    #Final Percent processing, and flattening
    vectors=[]
    names=[]
    vector_keys = list(ExpandedFeatureVec(2).keys())
    for rank,(name, count) in enumerate(overall_counts.most_common(),1):
        
        vec = feature_vecs[name]
        assert(count==vec["occur_count"])
        vec["occur_percent"] = 100*count/sum(overall_counts.values())
        
        vec["occur_rank"] = rank
        vec["occur_rank_percent"] = 100*rank/number_named_entities
        vec["first_occur_percent"] = 100*vec["first_occur_position"] / len(ne_words)
        vec["last_occur_percent"] = 100*vec["last_occur_position"] / len(ne_words)
        
        for tag in tagset:
            vec["POS_w-2__was_percent_" + tag]=vec["POS_w-2_"+tag]*100/count
            vec["POS_w-1__was_percent_" + tag]=vec["POS_w-1_"+tag]*100/count
            vec["POS_w+1__was_percent_" + tag]=vec["POS_w+1_"+tag]*100/count
            vec["POS_w+2__was_percent_" + tag]=vec["POS_w+2_"+tag]*100/count
        
        
        vectors.append(list(vec.values()))
        assert len(vectors[-1])==len(vector_keys), "%i != %i" % (len(vectors[-1]), len(vector_keys) )#Make sure I have everything
        names.append(name)
    
    return names, np.asarray(vectors), vector_keys

In [54]:
ML_mdl = MLCharacterSolver(
    XGBClassifier(),
    nicknames2name=nicknames2name_comb,
    feature_extractor= get_more_feature_vectors
)
evaluate(ann_ASOIAF, ann_SOC, ML_mdl)

0.84615384615384615

# Swap NLP Preprocessors

In [1]:
import spacy

ImportError: cannot import name 'GoldCorpus'