In [56]:
import pickle
import numpy as np
import scipy
import scipy.sparse
from scipy.optimize import minimize
import sklearn
from sklearn import metrics
from copy import deepcopy
import os
import sys

# Root of the project's source tree. It is appended to sys.path (only once,
# so re-running the cell does not add duplicates) before the project-local
# `lib` package is imported below.
# NOTE(review): absolute, machine-specific path -- the notebook only runs
# as-is on a machine with this directory layout.
PROJECT_DIR = '/home/michal/diplomka/code'
if PROJECT_DIR not in sys.path:
    sys.path.append(PROJECT_DIR)
from lib.utils import load_csr_matrix

# Names of the feature matrices (file stems under DATA_PATH/<dataset>) that are
# stacked, in this order, into the model input.
FEATURES = [
    'a_hypernyms',
    'a_head_form',
    'a_head_number',
    'a_non_article_det',
    'a_parent',
    'a_pos_after_head',
    'a_pos_before_head',
    'a_words_after_head',
    'a_words_after_np',
    'a_words_before_head',
    'a_words_before_np',
    'b_head_proper',
    'b_head_pos_simple',
    'b_object_form',
    'b_pos_after_head_as_list',
    'b_pos_before_head_as_list',
    'b_pp_object_form',
    'b_postmodification_type',
    'b_referent',
    'b_words_after_head_as_list',
    'b_words_after_np_as_list',
    'b_words_before_head_as_list',
    'b_words_before_np_as_list',
    'c_countability_bnc',
    'd_head_form_embeddings',
    'e_kenlm_ggl_5_lc_nbs',
]
# Dataset sub-directory used for evaluation.
TEST_SET_NAME = 'heldout'
# Directory holding the pickled models.
MODEL_PATH = '/home/michal/diplomka/data/model/'
# Directory holding one sub-directory of feature files per dataset.
DATA_PATH = '/home/michal/diplomka/data/features/penn/postprocessed'

In [13]:
def load_data(features, dataset_name, sample=None):
    """Load the stacked feature matrix and the article labels of one dataset.

    Reads ``Y_article`` and one ``<feature>.npz`` matrix per entry of
    ``features`` from ``DATA_PATH/<dataset_name>`` and stacks the feature
    matrices horizontally in the given order.

    Args:
        features: iterable of feature names (file stems without ``.npz``).
        dataset_name: sub-directory of ``DATA_PATH`` to read from.
        sample: optional number of rows to draw without replacement using a
            fixed seed (42). Falsy values (``None``, ``0``) load every row.

    Returns:
        ``(x, y)``: the sparse matrix returned by ``scipy.sparse.hstack`` and
        the label array, with ``x.shape[0] == len(y)``.
    """
    dataset_dir = os.path.join(DATA_PATH, dataset_name)
    y = np.load(os.path.join(dataset_dir, 'Y_article'))

    sample_indices = None
    if sample:
        np.random.seed(seed=42)
        # Keep the indices as a plain 1-D array. Wrapping them in a list makes
        # current NumPy treat them as a 2-D index, which yields a (1, sample)
        # result instead of selecting `sample` rows.
        sample_indices = np.random.choice(len(y), sample, replace=False)
        y = y[sample_indices]

    feature_matrices = []
    for feature_name in features:
        feature_matrix = load_csr_matrix(
            os.path.join(dataset_dir, feature_name + '.npz')
        )
        if sample_indices is not None:
            # Select rows on the sparse matrix itself; converting to a dense
            # array first needs memory for every zero entry.
            feature_matrix = scipy.sparse.csr_matrix(feature_matrix)[sample_indices]
        feature_matrices.append(feature_matrix)
    x = scipy.sparse.hstack(feature_matrices)
    assert x.shape[0] == len(y)
    return x, y

def load_model(name):
    """Unpickle and return the object stored in ``MODEL_PATH/<name>``.

    Args:
        name: file name of the pickled model inside ``MODEL_PATH``.

    Returns:
        Whatever object was pickled into the file.
    """
    # NOTE: unpickling can execute arbitrary code -- only load trusted files.
    # The context manager closes the file even if unpickling raises.
    with open(os.path.join(MODEL_PATH, name), 'rb') as model_file:
        return pickle.load(model_file)

In [5]:
# Load the complete (no `sample` argument) dataset named by TEST_SET_NAME:
# test_x stacks every matrix listed in FEATURES, test_y holds the labels.
test_x, test_y = load_data(FEATURES, TEST_SET_NAME)

In [36]:
# Size tag that is part of the pickled models' file names.
data_size = 20000
# One binarized model per article class, loaded in the order THE, A, ZERO.
model_the, model_a, model_zero = (
    load_model('logreg_ovr_binarized_{}_allfeatures_{}.pkl'.format(article, data_size))
    for article in ('THE', 'A', 'ZERO')
)

In [26]:
def softmax(z):
    """Row-wise softmax of a 2-D array of scores.

    Args:
        z: array-like of shape (n_rows, n_classes).

    Returns:
        Array of the same shape whose rows are non-negative and sum to 1.
    """
    z = np.asarray(z, dtype=float)
    # Subtracting the row maximum leaves the result unchanged mathematically
    # but keeps np.exp from overflowing to inf (and the ratio from becoming
    # nan) when the scores are large.
    exp_z = np.exp(z - np.max(z, axis=1, keepdims=True))
    return exp_z / np.sum(exp_z, axis=1, keepdims=True)

def cost(Y, T):
    """Cross-entropy between predictions and targets, summed over all entries.

    Args:
        Y: predicted matrix (probabilities).
        T: target matrix (e.g. one-hot or boolean), broadcastable against Y.

    Returns:
        ``-sum(T * log(Y))`` taken over the entries where ``T`` is non-zero.
    """
    targets, predicted = np.broadcast_arrays(
        np.asarray(T, dtype=float), np.asarray(Y, dtype=float)
    )
    # Entries with a zero target contribute nothing. Skipping them avoids the
    # 0 * log(0) = nan that a correct zero-probability prediction for a
    # non-target class would otherwise produce.
    active = targets != 0
    return -np.sum(targets[active] * np.log(predicted[active]))

In [42]:
# Keep a reference to the `coef_` array of the THE model and display it as
# the cell output.
model_the_params = model_the.coef_
model_the_params

array([[-0.01216528,  0.01529098,  0.35410395, ...,  0.31741846,
         0.90595677, -0.16581469]])

In [54]:
# Both matrices have one row per test example and one column per article, in
# the order THE, A, ZERO.
# pred_y: column 0 of each model's predict_proba output.
pred_y = np.array([
    model.predict_proba(test_x)[:, 0]
    for model in (model_the, model_a, model_zero)
]).T
# true_y: boolean indicator of the gold label.
true_y = np.array([test_y == article for article in ('THE', 'A', 'ZERO')]).T

In [140]:
def mycost(coefs):
    print(coefs)
    new_pred = pred_y * coefs
    new_pred = new_pred / new_pred.sum(axis=1, keepdims=True)
    new_pred[new_pred < 0] = 0
    assert (new_pred >= 0).all(), new_pred[new_pred < 0]
    cost = - (1/pred_y.shape[0])*np.multiply(true_y, np.log(new_pred * coefs)).sum()
    print(cost)
    return cost

In [63]:
def mycost_mod(coefs):
    """Penalised mean cross-entropy after shifting the three model intercepts.

    Copies ``model_the``, ``model_a`` and ``model_zero`` (the originals are
    left untouched), adds ``coefs[i]`` to the intercept of the i-th copy,
    re-predicts on ``test_x`` and scores the result against ``true_y``.
    Prints the shifts, the cost and the resulting accuracy on every call.

    Args:
        coefs: three additive intercept shifts, in the order THE, A, ZERO.

    Returns:
        Mean cross-entropy plus the squared-parameter penalty, both divided
        by the number of test rows.
    """
    print(coefs)
    shifted_models = []
    for position, model in enumerate((model_the, model_a, model_zero)):
        shifted = deepcopy(model)
        shifted.intercept_[0] += coefs[position]
        shifted_models.append(shifted)

    new_pred = np.array([m.predict_proba(test_x)[:, 0] for m in shifted_models]).T
    # Use the freshly predicted matrix for the sample count so the function
    # does not depend on the separate global pred_y.
    n_samples = new_pred.shape[0]

    # Clip so that a probability that underflows to 0 cannot turn the cost
    # into inf/nan, which would break the simplex comparisons.
    safe_pred = np.clip(new_pred, np.finfo(float).tiny, None)
    data_term = -(1 / n_samples) * np.multiply(true_y, np.log(safe_pred)).sum()

    # NOTE(review): the penalty adds the scalar intercept to every feature
    # weight before squaring. It is kept numerically as originally written --
    # confirm this is the intended regulariser rather than a plain L2 term on
    # the intercepts.
    penalty = (1 / n_samples) * (
        np.array([m.intercept_[0] + m.coef_ for m in shifted_models]) ** 2
    ).sum()

    total_cost = data_term + penalty
    print("cost:", total_cost)
    print("acc:", metrics.accuracy_score(test_y, [('THE', 'A', 'ZERO')[idx] for idx in np.argmax(new_pred, axis=1)]))
    return total_cost

In [48]:
# Show the intercepts of the three models (order THE, A, ZERO), then display
# the sum of their squares as the cell output.
print(np.array([m.intercept_[0] for m in (model_the, model_a, model_zero)]))
np.square(np.array([m.intercept_[0] for m in (model_the, model_a, model_zero)])).sum()

[ 1.05756054  1.71348778 -1.26151973]


5.645906715738616

In [148]:
# Weight vectors tried earlier, kept for reference; only the last assignment
# is active.
#coefs = [ 1.03333333, 0.9 , 1.16666667]
#coefs = [1.35000000, -7.10542736e-15,   3.35000000e+00]
#coefs = [ 0.3382716,   0.18024691,  5.81666667]
coefs = [ 0.0001, 0.0001, 102876098]
# Store the result under a name other than `cost` so that the cost(Y, T)
# helper defined earlier in the notebook is not overwritten.
cost_value = mycost(coefs)
print('cost: {}'.format(cost_value))
# First ten rows: gold indicators, raw predictions, reweighted predictions.
print(true_y[:10,:])
print(pred_y[:10,:])
npr = pred_y * coefs
npr = npr / npr.sum(axis=1, keepdims=True)
print(npr[:10,:])
# Accuracy before and after reweighting.
print(metrics.accuracy_score(test_y, [('THE', 'A', 'ZERO')[idx] for idx in np.argmax(pred_y, axis=1)]))
print(metrics.accuracy_score(test_y, [('THE', 'A', 'ZERO')[idx] for idx in np.argmax(npr, axis=1)]))

[0.0001, 0.0001, 102876098]
-2.15704466487
cost: -2.1570446648736286
[[False False  True]
 [ True False False]
 [False False  True]
 [ True False False]
 [False False  True]
 [False False  True]
 [False False  True]
 [ True False False]
 [False False  True]
 [ True False False]]
[[  7.81273510e-02   1.56155310e-03   9.45086747e-01]
 [  9.50872875e-01   5.52627824e-05   6.22502417e-02]
 [  6.02076009e-02   4.60515366e-01   4.56935241e-01]
 [  8.88335465e-01   2.78838330e-01   1.88764209e-03]
 [  3.30354280e-01   3.44761399e-02   5.19204413e-01]
 [  4.28446859e-03   4.73908781e-02   9.74992830e-01]
 [  1.39957831e-03   1.69429552e-03   9.99432185e-01]
 [  9.57437879e-01   5.98481863e-03   4.80073175e-02]
 [  5.79297535e-02   2.12585466e-03   8.69109641e-01]
 [  9.08458116e-01   2.01207981e-01   1.05510037e-02]]
[[  8.03557470e-14   1.60609267e-15   1.00000000e+00]
 [  1.48479649e-11   8.62933287e-16   1.00000000e+00]
 [  1.28080255e-13   9.79659126e-13   1.00000000e+00]
 [  4.57449192e-1

In [64]:
# Search for the three intercept shifts with Nelder-Mead, capped at 20
# iterations. `xatol` is the documented name of the absolute tolerance on x
# for this solver; `xtol` is its deprecated spelling.
res = minimize(mycost_mod, [1,1,1], method='nelder-mead', options={'xatol': 1e-2, 'disp': True, 'maxiter': 20})

[ 1.  1.  1.]
cost: 42.1293226869
acc: 0.868201667328
[ 1.05  1.    1.  ]
cost: 42.8697189825
acc: 0.868201667328
[ 1.    1.05  1.  ]
cost: 43.097519977
acc: 0.868598650258
[ 1.    1.    1.05]
cost: 42.0515156126
acc: 0.868102421596
[ 1.03333333  0.95        1.03333333]
cost: 41.6165652267
acc: 0.867804684399
[ 1.05  0.9   1.05]
cost: 40.9084817478
acc: 0.867506947201
[ 0.98333333  0.93333333  1.06666667]
cost: 40.5231968059
acc: 0.868003175863
[ 0.95  0.9   1.1 ]
cost: 39.3853531917
acc: 0.867506947201
[ 1.          0.86666667  1.13333333]
cost: 39.4657634631
acc: 0.867109964272
[ 1.          0.77777778  1.13888889]
cost: 37.8675539431
acc: 0.866812227074
[ 1.          0.66666667  1.18333333]
cost: 35.9274057197
acc: 0.866911472807
[ 0.91666667  0.72222222  1.22777778]
cost: 35.6667153631
acc: 0.867804684399
[ 0.85        0.63333333  1.31666667]
cost: 33.2609918339
acc: 0.866911472807
[ 0.86666667  0.6         1.26666667]
cost: 32.9218178493
acc: 0.866712981342
[ 0.8         0.4666666