In [None]:
### Xgboosted trees as feature transformation

In [53]:
import os
import sys
import scipy
import scipy.sparse
import pandas as pd
import numpy as np
import xgboost as xgb
import time
from xgboost.sklearn import XGBClassifier
from sklearn import cross_validation, metrics
from sklearn.grid_search import GridSearchCV
from sklearn.linear_model import LogisticRegression
import pickle

PROJECT_DIR = '/home/michal/diplomka/code'
if PROJECT_DIR not in sys.path:
    sys.path.append(PROJECT_DIR)
from lib.utils import load_csr_matrix

In [29]:
FEATURES = ['a_hypernyms', 'a_head_form', 'a_head_number', 'a_non_article_det', 'a_parent', 'a_pos_after_head', 'a_pos_before_head', 'a_words_after_head', 'a_words_after_np', 'a_words_before_head', 'a_words_before_np', 'b_head_proper', 'b_head_pos_simple', 'b_object_form', 'b_pos_after_head_as_list', 'b_pos_before_head_as_list', 'b_pp_object_form', 'b_postmodification_type', 'b_referent', 'b_words_after_head_as_list', 'b_words_after_np_as_list', 'b_words_before_head_as_list', 'b_words_before_np_as_list', 'c_countability_bnc', 'd_head_form_embeddings', 'e_kenlm_ggl_5_lc_nbs']
DATA_PATH = os.path.join(PROJECT_DIR, '../data/features/penn/postprocessed')
MODEL_PATH = os.path.join(PROJECT_DIR, '../data/model/xgboost_with_logreg')
CV_RESULTS_LOG_PATH = os.path.join(PROJECT_DIR, 'logs/experiments/model_results/penn/xgboost-with-logreg')  # logovani vysledku z cross-validace
NUM_CLASS = 3
TRAIN_SET_NAME = 'train'
TEST_SET_NAME = 'heldout'

In [88]:
train_x, train_y = load_data(FEATURES, TRAIN_SET_NAME)
test_x, test_y = load_data(FEATURES, TEST_SET_NAME)
assert train_x.shape[1] == test_x.shape[1]
print(train_x.shape, test_x.shape)

(263088, 35516) (10076, 35516)


In [89]:
from sklearn import preprocessing
le = preprocessing.LabelEncoder()
le.fit(train_y)
train_y = le.transform(train_y)
test_y = le.transform(test_y)

In [5]:
XGB_DEFAULT_PARAMS = { 
    'learning_rate': 0.1,
    'n_estimators': 2000,
    'max_depth': 5,
    'min_child_weight': 1,
    'gamma': 0,
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    'objective': 'multi:softmax',
    'nthread': 24,
    'scale_pos_weight': 1,
    'seed': 27
}

In [3]:
def load_data(features, dataset_name, sample=None):
    y = np.load(os.path.join(DATA_PATH, dataset_name, 'Y_article'))
    if sample:
        np.random.seed(seed=42)
        sample_indices = [np.random.choice(len(y), sample, replace=False)]
        y = y[sample_indices]
        
    feature_matrices = []
    for feature_name in features:
        feature_matrix = load_csr_matrix(
            os.path.join(DATA_PATH, dataset_name, feature_name + '.npz')
        )
        if sample:
            feature_matrix = scipy.sparse.csr_matrix(feature_matrix.toarray()[sample_indices])
        feature_matrices.append(feature_matrix)
    x =  scipy.sparse.hstack(feature_matrices)
    assert x.shape[0] == len(y)
    return x, y

In [65]:
def train_and_save(model, xgb_train_x, xgb_train_y, save_file_name):
    start_time = time.time()
    model.fit(xgb_train_x, xgb_train_y)
    pickle.dump(model, open(os.path.join(MODEL_PATH, save_file_name), 'wb'))
    end_time = time.time()
    print("Done in {} minutes".format((end_time - start_time)/60))
    return model

In [8]:
train_x_20000, train_y_20000 = load_data(FEATURES, TRAIN_SET_NAME, sample=20000)

In [17]:
train_x_20000_ = train_x_20000.toarray()
train_x_a, train_y_a = scipy.sparse.csr_matrix(train_x_20000_[:10000]), train_y_20000[:10000]
train_x_b, train_y_b = scipy.sparse.csr_matrix(train_x_20000_[10000:]), train_y_20000[10000:]
print(train_x_a.shape, len(train_y_a))
print(train_x_b.shape, len(train_y_b))

(10000, 35516) 10000
(10000, 35516) 10000


In [66]:
def train_ensemble(xgb_model, logreg_model, xgb_train_x, xgb_train_y, logreg_train_x, logreg_train_y, model_name):
    xgb_model = train_and_save(xgb_model, xgb_train_x, xgb_train_y, save_file_name='xgboost_'+model_name+'.model')
    logreg_train_x = xgb_model.booster().predict(xgb.DMatrix(logreg_train_x), pred_leaf=True)
    train_and_save(logreg_model, logreg_train_x, logreg_train_y, save_file_name='xgbWithLogreg_'+model_name+'.model')

In [116]:
def test_ensemble(model_name, test_x, test_y):
    xgb_model = pickle.load(open(os.path.join(MODEL_PATH, 'xgboost_'+model_name+'.model'), "rb"))
    xgblogreg_model = pickle.load(open(os.path.join(MODEL_PATH, 'xgbWithLogreg_'+model_name+'.model'), "rb"))
    test_predictions = xgblogreg_model.predict(xgb_model.booster().predict(xgb.DMatrix(test_x), pred_leaf=True))
    xgb_test_predictions = xgb_model.predict(test_x)
    print("Accuracy ({}): {:.4f}\n".format(TEST_SET_NAME, metrics.accuracy_score(test_y, test_predictions)))
    print("Accuracy xgb ({}): {:.4f}\n".format(TEST_SET_NAME, metrics.accuracy_score(test_y, xgb_test_predictions)))

In [68]:
model_params = {'n_estimators': 162, 'max_depth':9, 'min_child_weight':1, 'gamma':0.1, 'subsample':1, 'colsample_bytree':0.8}
xgb_model = XGBClassifier(**dict(XGB_DEFAULT_PARAMS, **model_params, nthread=8))
logreg_model = LogisticRegression(penalty='l1', solver='liblinear', C=1)
train_ensemble(xgb_model, logreg_model, train_x_a, train_y_a, train_x_b, train_y_b, '010_000_separate')

Done in 0.6854831536610921 minutes
Done in 3.101623260974884 minutes


In [117]:
test_ensemble('010_000_separate', test_x, le.inverse_transform(test_y))

Accuracy (heldout): 0.8385

Accuracy xgb (heldout): 0.8635



In [109]:
train_x, train_y = load_data(FEATURES, TRAIN_SET_NAME, sample=40000)
train_x = train_x.toarray()
train_x_a, train_y_a = scipy.sparse.csr_matrix(train_x[:20000]), train_y[:20000]
train_x_b, train_y_b = scipy.sparse.csr_matrix(train_x[20000:]), train_y[20000:]
print(train_x_a.shape, len(train_y_a))
print(train_x_b.shape, len(train_y_b))
model_params = {'n_estimators': 198, 'max_depth':7, 'min_child_weight':1, 'gamma': 0.4, 'subsample':0.9, 'colsample_bytree':1}
xgb_model = XGBClassifier(**dict(XGB_DEFAULT_PARAMS, **model_params, nthread=8))
logreg_model = LogisticRegression(penalty='l1', solver='liblinear', C=1)
train_ensemble(xgb_model, logreg_model, train_x_a, train_y_a, train_x_b, train_y_b, '020_000_separate')

(20000, 35516) 20000
(20000, 35516) 20000
Done in 1.4325003147125244 minutes
Done in 12.9325422167778 minutes


In [118]:
test_ensemble('020_000_separate', test_x, le.inverse_transform(test_y))

Accuracy (heldout): 0.8553

Accuracy xgb (heldout): 0.8750



In [111]:
train_x, train_y = load_data(FEATURES, TRAIN_SET_NAME, sample=100000)
train_x = train_x.toarray()
train_x_a, train_y_a = scipy.sparse.csr_matrix(train_x[:50000]), train_y[:50000]
train_x_b, train_y_b = scipy.sparse.csr_matrix(train_x[50000:]), train_y[50000:]
print(train_x_a.shape, len(train_y_a))
print(train_x_b.shape, len(train_y_b))
model_params = {'n_estimators': 354, 'max_depth':9, 'min_child_weight':1, 'gamma':0, 'subsample':0.9, 'colsample_bytree':1}
xgb_model = XGBClassifier(**dict(XGB_DEFAULT_PARAMS, **model_params, nthread=8))
logreg_model = LogisticRegression(penalty='l1', solver='liblinear', C=1)
train_ensemble(xgb_model, logreg_model, train_x_a, train_y_a, train_x_b, train_y_b, '050_000_separate')

(50000, 35516) 50000
(50000, 35516) 50000
Done in 7.208277940750122 minutes
Done in 41.59245844284693 minutes


In [119]:
test_ensemble('050_000_separate', test_x, le.inverse_transform(test_y))

Accuracy (heldout): 0.8632

Accuracy xgb (heldout): 0.8944



In [None]:
train_x, train_y = load_data(FEATURES, TRAIN_SET_NAME, sample=200000)
train_x = train_x.toarray()
train_x_a, train_y_a = scipy.sparse.csr_matrix(train_x[:100000]), train_y[:100000]
train_x_b, train_y_b = scipy.sparse.csr_matrix(train_x[100000:]), train_y[100000:]
print(train_x_a.shape, len(train_y_a))
print(train_x_b.shape, len(train_y_b))
model_params = {'n_estimators': 487, 'max_depth':9, 'min_child_weight':1, 'gamma':0.1, 'subsample':0.8, 'colsample_bytree':1}
xgb_model = XGBClassifier(**dict(XGB_DEFAULT_PARAMS, **model_params, nthread=8))
logreg_model = LogisticRegression(penalty='l1', solver='liblinear', C=1)
train_ensemble(xgb_model, logreg_model, train_x_a, train_y_a, train_x_b, train_y_b, '100_000_separate')

(100000, 35516) 100000
(100000, 35516) 100000
Done in 12.540844289461772 minutes
Done in 69.7520286043485 minutes


In [123]:
test_ensemble('100_000_separate', test_x, le.inverse_transform(test_y))

Accuracy (heldout): 0.8769

Accuracy xgb (heldout): 0.9023



In [92]:
model_params = {'n_estimators': 162, 'max_depth':9, 'min_child_weight':1, 'gamma':0.1, 'subsample':1, 'colsample_bytree':0.8}
xgb_model = XGBClassifier(**dict(XGB_DEFAULT_PARAMS, **model_params, nthread=8))
train_and_save(xgb_model, train_x_a, le.transform(train_y_a), save_file_name='xgboost_010_000.model')

Done in 0.7081761916478475 minutes


XGBClassifier(base_score=0.5, colsample_bylevel=1, colsample_bytree=0.8,
       gamma=0.1, learning_rate=0.1, max_delta_step=0, max_depth=9,
       min_child_weight=1, missing=None, n_estimators=162, nthread=8,
       objective='multi:softprob', reg_alpha=0, reg_lambda=1,
       scale_pos_weight=1, seed=27, silent=True, subsample=1)

In [93]:
xgbmatrix_train_x_b = xgb.DMatrix(train_x_b)

In [94]:
xgbmatrix_test_x = xgb.DMatrix(test_x)

In [95]:
xgb_model = pickle.load(open(os.path.join(MODEL_PATH, 'xgboost_010_000.model'), "rb"))

In [96]:
logregmodel = LogisticRegression(penalty='l1', solver='liblinear', C=1)
train_and_save(logregmodel, train_x_b, le.transform(train_y_b), save_file_name='logreg_010_000.model')

Done in 1.4778326869010925 minutes


LogisticRegression(C=1, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l1', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [97]:
logreg_model = pickle.load(open(os.path.join(MODEL_PATH, 'logreg_010_000.model'), "rb"))

In [98]:
train_x_b_new = xgb_model.booster().predict(xgbmatrix_train_x_b, pred_leaf=True)

In [99]:
xgblogreg_model = LogisticRegression(penalty='l1', solver='liblinear', C=1)
train_and_save(xgblogreg_model, train_x_b_new, le.transform(train_y_b), save_file_name='xgbWithLogreg_010_000.model')

Done in 2.7609941085179646 minutes


LogisticRegression(C=1, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l1', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [100]:
xgblogreg_model = pickle.load(open(os.path.join(MODEL_PATH, 'xgbWithLogreg_010_000.model'), "rb"))

In [101]:
test_predictions = logreg_model.predict(test_x)
print("Accuracy ({}): {:.4f}\n".format(TEST_SET_NAME, metrics.accuracy_score(test_y, test_predictions)))

Accuracy (heldout): 0.8564



In [102]:
test_predictions = xgb_model.predict(test_x)
print("Accuracy ({}): {:.4f}\n".format(TEST_SET_NAME, metrics.accuracy_score(test_y, test_predictions)))

Accuracy (heldout): 0.8635



In [103]:
test_predictions = xgblogreg_model.predict(xgb_model.booster().predict(xgbmatrix_test_x, pred_leaf=True))
print("Accuracy ({}): {:.4f}\n".format(TEST_SET_NAME, metrics.accuracy_score(test_y, test_predictions)))

Accuracy (heldout): 0.8387

