In [None]:
__author__ = 'Nick Sarris (ngs5st)'

import os
import time
import random
import numpy as np
import pandas as pd
import xgboost as xgb

from datetime import datetime
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import KFold
from scipy.stats import skew, boxcox
from sklearn.preprocessing import StandardScaler
import itertools

# Show the entries of the input folder before anything is loaded from it.
data_dir_entries = os.listdir("./data")
print(data_dir_entries)

In [None]:
def seed_everything(seed=1235):
    """Seed the random number sources used by this notebook.

    Writes ``seed`` (as a string) to ``os.environ['PYTHONHASHSEED']`` and
    seeds both the stdlib ``random`` module and NumPy's global generator.

    NOTE(review): PYTHONHASHSEED is normally read at interpreter start-up,
    so setting it here presumably only affects processes launched
    afterwards -- confirm if hash ordering matters.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    for seeder in (random.seed, np.random.seed):
        seeder(seed)

seed_everything(1235)

In [None]:
load_started = time.time()
print("Loading Data ...")

# Both CSVs are read from the same folder; `directory` is reused by a later
# cell when the test ids are read.
directory = "./data/"
train_df, test_df = (
    pd.read_csv(directory + csv_name) for csv_name in ('train.csv', 'test.csv')
)

elapsed = time.time() - load_started
print("--- %s seconds ---" % elapsed)

In [None]:
def encode(charcode):
    """Map a letter code to an integer, treating it as a base-26 number.

    ``charcode`` is converted with ``str()`` first. Every character
    contributes ``ord(ch) - ord('A') + 1`` and the leftmost character is
    the most significant digit, so 'A' -> 1, 'Z' -> 26, 'AA' -> 27.
    An empty string yields 0.
    """
    offset = ord('A') - 1
    total = 0
    # Horner's scheme: multiplying the running total by 26 for each new
    # character equals summing digit * 26 ** position.
    for ch in str(charcode):
        total = total * 26 + (ord(ch) - offset)
    return total

def mungeskewed(train, test, numeric_feats):
    """Stack train and test and Box-Cox transform the skewed numeric columns.

    Parameters
    ----------
    train : pandas.DataFrame
        Training frame; must contain every column in ``numeric_feats``.
    test : pandas.DataFrame
        Test frame. It is left untouched: a copy carrying a placeholder
        ``loss`` column of zeros is what gets stacked below ``train``.
    numeric_feats : list of str
        Names of the columns to check for skewness.

    Returns
    -------
    (pandas.DataFrame, int)
        The combined frame (train rows first, index reset) and the number
        of training rows, so the caller can split the frame again.

    Notes
    -----
    Skewness is measured on ``train`` only, with NaNs dropped. Every column
    whose skew exceeds 0.25 is shifted by +1 and passed through
    ``scipy.stats.boxcox`` using the combined train+test values.
    """
    ntrain = train.shape[0]
    # assign() returns a new frame, so the caller's `test` does not silently
    # gain a `loss` column as a side effect of this call.
    test = test.assign(loss=0)
    train_test = pd.concat((train, test)).reset_index(drop=True)

    skewness = train[numeric_feats].apply(lambda x: skew(x.dropna()))
    skewed_feats = skewness[skewness > 0.25].index

    for feat in skewed_feats:
        # boxcox requires strictly positive data; the +1 shift presumably
        # guarantees that for these features. The fitted lambda is unused.
        train_test[feat], _ = boxcox(train_test[feat] + 1)

    return train_test, ntrain

In [None]:
fe1_started = time.time()
print("Feature Engineering [1] ...")

# Categorical columns whose pairwise combinations become extra features.
COMB_FEATURE = (
    'cat80,cat87,cat57,cat12,cat79,cat10,cat7,cat89,cat2,cat72,'
    'cat81,cat11,cat1,cat13,cat9,cat3,cat16,cat90,cat23,cat36,'
    'cat73,cat103,cat40,cat28,cat111,cat6,cat76,cat50,cat5,'
    'cat4,cat14,cat38,cat24,cat82,cat25'
).split(',')

# Columns with 'cont' in their name, ignoring the first and last column of
# train_df.
numeric_feats = [name for name in train_df.columns[1:-1] if 'cont' in name]
train_test, ntrain = mungeskewed(train_df, test_df, numeric_feats)

# For each unordered pair, add the two columns element-wise and run the
# result through encode(); the new column is named "<first>_<second>".
for first, second in itertools.combinations(COMB_FEATURE, 2):
    pair_values = train_test[first] + train_test[second]
    train_test[first + "_" + second] = pair_values.apply(encode)

elapsed = time.time() - fe1_started
print("--- %s seconds ---" % elapsed)

In [None]:
fe2_started = time.time()
print("Feature Engineering [2] ...")

# Columns with 'cat' in their name, ignoring the first and last column of
# train_df. Each one is replaced in train_test by its encode() value.
categorical_feats = list(
    filter(lambda name: 'cat' in name, train_df.columns[1:-1])
)

for cat_col in categorical_feats:
    encoded_col = train_test[cat_col].apply(encode)
    train_test[cat_col] = encoded_col

elapsed = time.time() - fe2_started
print("--- %s seconds ---" % elapsed)

In [None]:
scale_started = time.time()
print("Scaling Features ...")

# Standardise the numeric columns; the scaler is fitted on the combined
# train+test rows.
ss = StandardScaler()
scaled_numeric = ss.fit_transform(train_test[numeric_feats].values)
train_test[numeric_feats] = scaled_numeric

# Undo the stacking: the first `ntrain` rows are train, the rest are test.
train = train_test.iloc[:ntrain].copy()
test = train_test.iloc[ntrain:].copy()

elapsed = time.time() - scale_started
print("--- %s seconds ---" % elapsed)

In [None]:
start_time = time.time()
print("Splitting Data ...")

# Offset added to the target before the log transform; the same value is
# subtracted again after predictions are exponentiated.
shift = 200

# Only the id column is needed here, so restrict parsing to it instead of
# loading every column of the test CSV a second time.
ids = pd.read_csv(directory + 'test.csv', usecols=['id'])['id']

# Model inputs are plain arrays without the target and the row id.
train_x = np.array(train.drop(['loss', 'id'], axis=1))
labels = np.array(np.log(train['loss'] + shift))
test_x = np.array(test.drop(['loss', 'id'], axis=1))

print("--- %s seconds ---" % (time.time() - start_time))

In [None]:
params_started = time.time()
print("Establishing Parameters ...")

# Booster configuration handed to xgb.train in the training cell (which
# additionally supplies a custom objective and evaluation metric).
xgb_params = {
    "seed": 0,
    "colsample_bytree": 0.7,
    "silent": 1,
    "subsample": 0.7,
    "learning_rate": 0.03,
    "objective": 'reg:linear',
    "max_depth": 12,
    "min_child_weight": 100,
    "booster": 'gbtree',
}

elapsed = time.time() - params_started
print("--- %s seconds ---" % elapsed)

In [None]:
def fair_obj(preds, dtrain):
    """Custom objective: gradient and hessian of the Fair loss with c = 2.

    Parameters
    ----------
    preds : array-like
        Current predictions.
    dtrain : object exposing ``get_label()``
        Source of the true labels.

    Returns
    -------
    (grad, hess)
        With ``r = preds - labels``: ``grad = 2 * r / (|r| + 2)`` and
        ``hess = 4 / (|r| + 2) ** 2``.
    """
    fair_constant = 2
    residual = preds - dtrain.get_label()
    scale = abs(residual) + fair_constant
    grad = fair_constant * residual / scale
    hess = fair_constant ** 2 / (scale * scale)
    return grad, hess

def xg_eval_mae(yhat, dtrain):
    """Evaluation metric: mean absolute error after undoing the log/shift.

    Both the labels from ``dtrain.get_label()`` and the predictions ``yhat``
    are exponentiated and reduced by the module-level ``shift`` before the
    error is computed. Returns the pair ``('mae', value)``.
    """
    actual = np.exp(dtrain.get_label()) - shift
    predicted = np.exp(yhat) - shift
    return 'mae', mean_absolute_error(actual, predicted)

In [None]:
train_started = time.time()
print("Training Model ...")

n_folds = 10
cv_sum = 0
fpred = []
xgb_rounds = []

dtest = xgb.DMatrix(test_x)
kf = KFold(n_splits=n_folds, shuffle=True, random_state=2018).split(train_x)

for i, (fit_idx, val_idx) in enumerate(kf):
    x_fit, y_fit = train_x[fit_idx], labels[fit_idx]
    x_val, y_val = train_x[val_idx], labels[val_idx]

    dtrain = xgb.DMatrix(x_fit, label=y_fit)
    dvalid = xgb.DMatrix(x_val, label=y_val)
    watchlist = [(dtrain, 'train'), (dvalid, 'eval')]

    # Up to 10000 rounds, stopping once the watchlist metric has not
    # improved for 50 rounds.
    clf = xgb.train(xgb_params, dtrain, 10000, watchlist,
                    early_stopping_rounds=50, obj=fair_obj,
                    feval=xg_eval_mae, verbose_eval=20)
    xgb_rounds.append(clf.best_iteration)

    best_trees = clf.best_ntree_limit
    scores_val = clf.predict(dvalid, ntree_limit=best_trees)
    # The shift is not subtracted here; it cancels in the absolute
    # difference, so this is still the MAE on the original loss scale.
    cv_score = mean_absolute_error(np.exp(y_val), np.exp(scores_val))
    y_pred = np.exp(clf.predict(dtest, ntree_limit=best_trees)) - shift
    print("XG-CV: {}".format(cv_score))

    # Running sum of the per-fold test predictions.
    fpred = y_pred if i == 0 else pred + y_pred
    pred = fpred
    cv_sum += cv_score

# Average the summed predictions and fold scores over the folds.
mpred = pred / n_folds
score = cv_sum / n_folds
print('Average XG-CV: {}'.format(score))

elapsed = time.time() - train_started
print("--- %s seconds ---" % elapsed)

In [None]:
submit_started = time.time()
print("Generate Submission ...")

# Two columns: the test ids and the fold-averaged predictions.
submission = pd.DataFrame({'id': ids, 'loss': mpred})
submission.to_csv("output_data.csv", index=False)

elapsed = time.time() - submit_started
print("--- %s seconds ---" % elapsed)