In [1]:
%matplotlib inline
import numpy as np
import pandas as pd
from sklearn.metrics import mean_absolute_error
from sklearn.preprocessing import normalize
import matplotlib.pyplot as plt
from scipy.sparse import csr_matrix

In [2]:
from sys import stdout
from time import time

In [3]:
import xgboost as xgb

In [4]:
# Load the raw table; later cells read its `user`, `day` and `sum` columns.
X = pd.read_csv("data/X.csv")
# Group the rows by user so per-user rows can be selected in the cells below.
data = X.groupby('user')

In [5]:
def remove_zeros_rows(x):
    """Return the rows of the 2-D array `x` that hold at least one non-zero value.

    The relative order of the surviving rows is unchanged; if every row is
    all-zero the result has zero rows and the original number of columns.
    """
    row_has_data = x.any(axis=1)
    kept_rows = row_has_data.nonzero()[0]
    return x[kept_rows, :]

def build_calendar(X):
    """Build one week-by-weekday calendar of `sum` values per user.

    Parameters
    ----------
    X : pandas.DataFrame
        Must provide `user`, `day` and `sum` columns. `user` and `day` are
        used as 1-based integer positions (1 is subtracted from both).
        Several rows for the same (user, day) are added together by the
        sparse-matrix constructor.

    Returns
    -------
    list
        Element 0 is the placeholder ``0`` so that ``result[u]`` belongs to
        user id ``u``. Every other element is a 2-D array with 7 columns
        (one row per 7-day block, day 1 in column 0) from which rows that
        are entirely zero have been removed.
    """
    totals = np.asarray(
        csr_matrix((X['sum'].values, (X.user.values - 1, X.day.values - 1))).todense()
    )
    # Pad with empty days up to the next multiple of 7 so that each user's
    # history can be folded into whole weeks (445 days -> 3 padding days).
    pad = -totals.shape[1] % 7
    if pad:
        # `int` replaces the `np.int` alias, which no longer exists in NumPy.
        filler = np.zeros((totals.shape[0], pad), dtype=int)
        totals = np.concatenate([totals, filler], axis=1)
    return [0] + [remove_zeros_rows(user.reshape(-1, 7)) for user in totals]

In [6]:
# Build all per-user calendars once; `cal[u]` is the calendar of user id `u`
# (index 0 is a placeholder so that the 1-based ids can be used directly).
%time cal = build_calendar(X)

CPU times: user 4.12 s, sys: 147 ms, total: 4.27 s
Wall time: 4.26 s


In [7]:
# Per user, take the last value of every non-key column.
%time data_pred = data.last().values
# Keep column 0 only; later cells treat it as a day number ((day - 1) % 7 -> weekday).
# NOTE(review): relies on `day` being the first non-key column of X.csv — confirm.
data_pred = data_pred[:, 0]

CPU times: user 137 ms, sys: 23.3 ms, total: 160 ms
Wall time: 156 ms


In [8]:
# Per user, pick the second-to-last row as a hold-out example.
# NOTE(review): `df.values[-2]` raises IndexError for a user with a single row.
%time test = data.apply(lambda df: df.values[-2])
# Stack the per-user rows into one 2-D array.
test = np.array(list(test))
# Column 0 is used below as the day number, column 1 as the regression target.
# NOTE(review): presumably columns are (day, sum, ...) — verify against X.csv.
data_test, target_test = test[:, 0], test[:, 1]

CPU times: user 1.06 s, sys: 56.7 ms, total: 1.11 s
Wall time: 1.11 s


In [9]:
def get_features(cal, day, n=15):
    """Build a fixed-length feature vector for one user and one weekday.

    Parameters
    ----------
    cal : numpy.ndarray
        2-D calendar of one user: one row per week, one column per weekday.
    day : int
        Column (weekday) index into `cal`.
    n : int, optional
        Number of weeks used for each of the three history blocks.

    Returns
    -------
    list
        Always ``5 + 3 * n`` values, in this order:

        * mean, sum, max, min and median of the non-zero entries of the
          weekday column (all 0.0 when the column has no non-zero entry);
        * the first `n` entries of the weekday column, zero-padded at the
          end when the calendar has fewer than `n` rows;
        * row means, then row sums, of the `n` rows preceding the last row
          (``cal[-n-1:-1]``), zero-padded at the front when fewer rows exist.
    """
    day_column = cal[:, day]
    nz_day = day_column[day_column.nonzero()]

    if nz_day.size:
        ret = [
            nz_day.mean(),
            nz_day.sum(),
            nz_day.max(),
            nz_day.min(),
            np.median(nz_day),
        ]
    else:
        # max()/min() raise on an empty array; no activity means zero statistics.
        ret = [0.0] * 5

    earliest = day_column[:n].tolist()
    # Pad at the end so that position k always refers to the k-th row.
    ret += earliest + [0.0] * (n - len(earliest))

    recent = cal[-n - 1:-1]
    # Pad at the front so that the newest rows keep their positions.
    missing = [0.0] * (n - len(recent))
    ret += missing + recent.mean(axis=1).tolist()
    ret += missing + recent.sum(axis=1).tolist()

    return ret

#### Test: fit and score on each user's second-to-last row

In [29]:
# Feature matrix for the held-out rows: one row per user, in user-id order
# (position `u` corresponds to user id `u + 1`, hence `cal[u + 1]`).
# The width follows from whatever `get_features` returns instead of a
# hard-coded 50, and the weekday is cast to a plain int because `data_test`
# may be a float array, which cannot be used as an array index.
# NOTE(review): `cal` was built from every row of X, so these features also
# contain the held-out row itself — confirm this is intended.
X_xg_test = np.array(
    [get_features(cal[u + 1], int(day)) for u, day in enumerate((data_test - 1) % 7)],
    dtype=float,
)

In [30]:
# Regression target: column 1 of the held-out rows, copied into its own array.
y_xg_test = np.array(target_test)

In [36]:
# Gradient-boosted regressor with the library's default hyper-parameters,
# fitted on the un-normalized hold-out features.
reg = xgb.XGBRegressor()
reg.fit(X_xg_test, y_xg_test)

XGBRegressor(base_score=0.5, colsample_bylevel=1, colsample_bytree=1, gamma=0,
       learning_rate=0.1, max_delta_step=0, max_depth=3,
       min_child_weight=1, missing=None, n_estimators=100, nthread=-1,
       objective='reg:linear', reg_alpha=0, reg_lambda=1,
       scale_pos_weight=1, seed=0, silent=True, subsample=1)

In [39]:
# NOTE(review): `reg` was fitted on this same X_xg_test / y_xg_test pair, so the
# MAE printed here is an in-sample (training) error, not a held-out estimate.
y_test_pred = reg.predict(X_xg_test)
print(mean_absolute_error(y_xg_test, y_test_pred))

194.673579789


#### Test with row-normalized features

In [40]:
# `normalize` with default arguments rescales each row (sample) to unit L2 norm;
# it does not standardize the individual feature columns.
X_xg_test_norm = normalize(X_xg_test)

In [41]:
# Same default-parameter regressor, fitted on the row-normalized features.
reg_n = xgb.XGBRegressor()
reg_n.fit(X_xg_test_norm, y_xg_test)

XGBRegressor(base_score=0.5, colsample_bylevel=1, colsample_bytree=1, gamma=0,
       learning_rate=0.1, max_delta_step=0, max_depth=3,
       min_child_weight=1, missing=None, n_estimators=100, nthread=-1,
       objective='reg:linear', reg_alpha=0, reg_lambda=1,
       scale_pos_weight=1, seed=0, silent=True, subsample=1)

In [43]:
# NOTE(review): `reg_n` was fitted on this same X_xg_test_norm / y_xg_test pair,
# so this MAE is an in-sample (training) error as well.
y_test_n_pred = reg_n.predict(X_xg_test_norm)
print(mean_absolute_error(y_xg_test, y_test_n_pred))

249.836694441


### Prediction (raw features)

In [47]:
# Load the `sum` column of a previously saved answer file; the cells below
# compare the new predictions against it.
# NOTE(review): presumably an earlier submission, not ground truth — confirm.
ans_275 = pd.read_csv('data/myans_1.csv')['sum'].values

In [35]:
# Feature matrix for the prediction step: one row per user, in user-id order
# (position `u` corresponds to user id `u + 1`, hence `cal[u + 1]`), using
# the weekday of each user's last row.
# The width follows from whatever `get_features` returns instead of a
# hard-coded 50, and the weekday is cast to a plain int because `data_pred`
# may be a float array, which cannot be used as an array index.
X_xg = np.array(
    [get_features(cal[u + 1], int(day)) for u, day in enumerate((data_pred - 1) % 7)],
    dtype=float,
)

In [44]:
# Predictions of the model that was fitted on the un-normalized features.
y_pred = reg.predict(X_xg)

In [56]:
# First 30 users side by side: column 0 = new prediction, column 1 = loaded answer.
np.column_stack((y_pred, ans_275))[:30]

array([[  462.97616577,   330.5547315 ],
       [  172.9669342 ,   227.04705756],
       [ 1009.01287842,   769.15553372],
       [  478.42523193,   466.26335122],
       [  389.00512695,   367.88551994],
       [  639.58251953,   397.60649257],
       [  697.54968262,   736.75948432],
       [  426.81420898,   441.31795502],
       [  530.94073486,   421.29144594],
       [  715.70794678,   677.90108304],
       [  248.63244629,   551.13108757],
       [  321.76077271,   292.19544793],
       [  510.67611694,   528.75781142],
       [ 1336.80285645,   994.87940236],
       [  285.58248901,   267.59554532],
       [  321.27456665,   238.42993707],
       [ 1759.3684082 ,  1356.40336742],
       [ 1100.46337891,   751.53034856],
       [ 1096.09155273,   773.55117081],
       [  952.14801025,   881.39635177],
       [  302.72348022,   364.97081322],
       [  252.94404602,   292.69157895],
       [ 1366.27893066,  1358.05510575],
       [  144.22163391,   239.54147241],
       [  489.09

In [49]:
# Mean absolute difference between the new predictions and the answers loaded
# from data/myans_1.csv.
# NOTE(review): this measures agreement with that file; whether it is an error
# against ground truth depends on what the file contains — confirm.
print(mean_absolute_error(ans_275, y_pred))

162.875797391


In [63]:
# One-column frame of predictions, indexed by 1-based user id.
myres_csv = pd.DataFrame(
    {'sum': y_pred},
    index=pd.RangeIndex(start=1, stop=len(y_pred) + 1, name='user'),
)

In [65]:
# Write the predictions to disk; the `user` index is written as the first column.
myres_csv.to_csv("data/myans_xgb.csv")

### Prediction (row-normalized features)

In [57]:
# Apply the same per-row L2 normalization (the `normalize` default) that was
# applied to the matrix `reg_n` was fitted on.
X_xg_norm = normalize(X_xg)

In [58]:
# Predictions of the model that was fitted on the row-normalized features.
y_pred_n = reg_n.predict(X_xg_norm)

In [60]:
# First 30 users side by side: column 0 = normalized-model prediction,
# column 1 = loaded answer.
np.column_stack((y_pred_n, ans_275))[:30]

array([[  484.16531372,   330.5547315 ],
       [   37.94115448,   227.04705756],
       [  924.29876709,   769.15553372],
       [  479.38037109,   466.26335122],
       [  395.43603516,   367.88551994],
       [  809.86682129,   397.60649257],
       [  771.89898682,   736.75948432],
       [  560.98046875,   441.31795502],
       [  672.96618652,   421.29144594],
       [  701.5489502 ,   677.90108304],
       [   39.63309097,   551.13108757],
       [  415.3480835 ,   292.19544793],
       [  543.12475586,   528.75781142],
       [ 1204.93188477,   994.87940236],
       [  396.54006958,   267.59554532],
       [  365.60836792,   238.42993707],
       [ 1273.00683594,  1356.40336742],
       [ 1040.86962891,   751.53034856],
       [  996.81640625,   773.55117081],
       [  953.31256104,   881.39635177],
       [  352.5062561 ,   364.97081322],
       [  108.71702576,   292.69157895],
       [ 1007.92547607,  1358.05510575],
       [  420.90539551,   239.54147241],
       [  722.94

In [59]:
# Mean absolute difference between the normalized-model predictions and the
# answers loaded from data/myans_1.csv.
# NOTE(review): this measures agreement with that file; whether it is an error
# against ground truth depends on what the file contains — confirm.
print(mean_absolute_error(ans_275, y_pred_n))

194.971463128


In [66]:
# One-column frame of normalized-model predictions, indexed by 1-based user id.
myres_n_csv = pd.DataFrame(
    {'sum': y_pred_n},
    index=pd.RangeIndex(start=1, stop=len(y_pred_n) + 1, name='user'),
)

In [69]:
# Write the normalized-model predictions to disk; the `user` index is written
# as the first column.
myres_n_csv.to_csv("data/myans_xgb_norm.csv")

### Compare raw vs. normalized predictions

In [62]:
# First 30 users side by side: column 0 = raw-feature model,
# column 1 = normalized-feature model.
np.column_stack((y_pred, y_pred_n))[:30]

array([[  462.97616577,   484.16531372],
       [  172.9669342 ,    37.94115448],
       [ 1009.01287842,   924.29876709],
       [  478.42523193,   479.38037109],
       [  389.00512695,   395.43603516],
       [  639.58251953,   809.86682129],
       [  697.54968262,   771.89898682],
       [  426.81420898,   560.98046875],
       [  530.94073486,   672.96618652],
       [  715.70794678,   701.5489502 ],
       [  248.63244629,    39.63309097],
       [  321.76077271,   415.3480835 ],
       [  510.67611694,   543.12475586],
       [ 1336.80285645,  1204.93188477],
       [  285.58248901,   396.54006958],
       [  321.27456665,   365.60836792],
       [ 1759.3684082 ,  1273.00683594],
       [ 1100.46337891,  1040.86962891],
       [ 1096.09155273,   996.81640625],
       [  952.14801025,   953.31256104],
       [  302.72348022,   352.5062561 ],
       [  252.94404602,   108.71702576],
       [ 1366.27893066,  1007.92547607],
       [  144.22163391,   420.90539551],
       [  489.09