In [23]:
train_data_path = '../../data/processed/model_valid/pre_valid_train'
test_data_path = '../../data/processed/model_valid/all_train'
train_data = {
    'knn10': 'KNN_10.hdf',
    'knn3': 'KNN_3.hdf',
    'linreg': 'linreg.hdf',
    'xgb_25': 'xgb_25_feat_11_PCA_valid.hdf',
    'xgb_all': 'xgb_all.hdf'
}
test_data = {
    'knn10': 'knn10_test.hdf',
    'knn3': 'KNN_3_test.hdf',
    'linreg': 'linreg_test.hdf',
    'xgb_25': 'xgb_25_feat_11_PCA_test.hdf',
    'xgb_all': 'xgb_all_test.hdf'
}

In [24]:
import pandas as pd 
import os 
def renamed_series_train(name, path):
    s = pd.read_hdf(os.path.join(train_data_path, path))
    s.name = name 
    return s 
def renamed_series_test(name, path):
    s = pd.read_hdf(os.path.join(test_data_path, path))
    col_name = s.columns[1]
    s.set_index('ID', inplace=True)
    s = s[col_name]
    s.name = name 
    return s 

In [25]:
df_train = pd.DataFrame(data={
    name: renamed_series_train(name, path) for name, path in train_data.items()
})
df_test = pd.DataFrame(data={
    name: renamed_series_test(name, path) for name, path in test_data.items()
})
df_valid_true = pd.read_hdf('../../data/processed/model_valid/pre_valid_train/true.hdf')
df_valid_true = df_valid_true.clip(0, 20)

In [26]:
df_train.head()

Unnamed: 0,knn10,knn3,linreg,xgb_25,xgb_all
2356200,0.9,3.333333,1.314807,2.89096,2.739996
2356201,0.2,3.333333,0.712276,2.718085,2.961049
2356202,1.8,0.0,2.548201,1.61681,1.201391
2356203,0.1,0.0,0.0,0.089291,0.130663
2356204,1.0,0.0,1.502367,1.36049,1.253611


In [45]:
def second_order_df(df):
    df_2o = pd.DataFrame(data={
        '{}_{}'.format(fst_col, snd_col): df[fst_col] * df[snd_col] 
        for fst_col in df.columns
        for snd_col in df.columns
        if fst_col <= snd_col
    })
    df_2o[df.columns] = df
    return df_2o

In [46]:
from itertools import product
def third_order_df(df):
    df_3o = pd.DataFrame(data={
        '{}_{}_{}'.format(fst_col, snd_col,thrd_col): df[fst_col] * df[snd_col] * df[thrd_col]
        for fst_col in df.columns
        for snd_col in df.columns
        for thrd_col in df.columns
        if fst_col <= snd_col
        and snd_col <= thrd_col 
    })
    df_2o = second_order_df(df)
    df_3o[df_2o.columns] = df_2o
    df_3o[df.columns] = df
    return df_2o
        

In [54]:
df_train_2o = second_order_df(df_train)

In [29]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import r2_score, mean_squared_error
def evaluate(preds):
    print('r2_score: ', r2_score(df_valid_true, preds))
    print('mean_squared_error: ', mean_squared_error(df_valid_true, preds))
    
model = LinearRegression()
preds_1st_order = cross_val_predict(model, df_train, df_valid_true).clip(0, 20)
preds_2nd_order = cross_val_predict(model, df_train_2o, df_valid_true).clip(0, 20)

print('1st orrder')
evaluate(preds_1st_order)
print('2nd order')
evaluate(preds_2nd_order)

1st orrder
r2_score:  0.38441198366041485
mean_squared_error:  0.7488210459690807
2nd order
r2_score:  0.38630125250933545
mean_squared_error:  0.746522878626614


In [50]:
model = LinearRegression()
df_train_3o = third_order_df(df_train)

preds_3rd_order = cross_val_predict(model, df_train_3o, df_valid_true).clip(0, 20)
print('3rd order')
evaluate(preds_3rd_order)

3rd order
r2_score:  0.3863144063727556
mean_squared_error:  0.7465068778444295


In [32]:
from sklearn.ensemble import RandomForestRegressor
model = RandomForestRegressor(n_estimators=10, max_depth=4, n_jobs=-1)
preds_rf_2o = cross_val_predict(model, df_train_2o, df_valid_true).clip(0, 20)

In [34]:
evaluate((preds_rf_2o + preds_2nd_order)/2)

r2_score:  0.38605070346042614
mean_squared_error:  0.7468276545414968


In [35]:
from xgboost import XGBRegressor
model = XGBRegressor(max_depth=5, num_round=100)
preds_xgb_2o = cross_val_predict(model, df_train_2o, df_valid_true).clip(0, 20)
evaluate(preds_xgb_2o)

r2_score:  0.38062854881284414
mean_squared_error:  0.7534233377043229


In [36]:
evaluate((preds_xgb_2o + preds_2nd_order)/2)

r2_score:  0.3869204132484314
mean_squared_error:  0.7457697116058689


In [51]:
evaluate((preds_xgb_2o + preds_3rd_order)/2)

r2_score:  0.38691643820586175
mean_squared_error:  0.7457745469753949


In [55]:
model_xgb = XGBRegressor(max_depth=5, num_round=100)
model_xgb.fit(df_train_2o, df_valid_true)

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=5, min_child_weight=1, missing=None, n_estimators=100,
       n_jobs=1, nthread=None, num_round=100, objective='reg:linear',
       random_state=0, reg_alpha=0, reg_lambda=1, scale_pos_weight=1,
       seed=None, silent=True, subsample=1)

In [56]:
model_linreg = LinearRegression()
model_linreg.fit(df_train_2o, df_valid_true)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [None]:
df_train_2o

In [60]:
df_test.drop('item_cnt_month', axis=1, inplace=True)
df_test['pred'] = (
    (model_xgb.predict(second_order_df(df_test)) + model_linreg.predict(second_order_df(df_test)))/2
).clip(0, 20)
df_test.head()

Unnamed: 0_level_0,knn10,knn3,linreg,xgb_25,xgb_all,pred
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,0.4,0.666667,0.823794,0.590379,0.51635,0.485136
2,0.2,0.333333,0.822782,1.113075,1.065165,0.93183
15,1.3,0.333333,4.382483,1.851596,1.720403,1.56966
21,0.0,0.0,0.027899,0.451284,0.377604,0.380837
23,0.3,0.333333,1.164632,0.693333,0.605296,0.575454


In [63]:
df_test.rename(columns={'pred': 'item_cnt_month'}, inplace=True)
df_test['item_cnt_month'].to_csv('../../data/submissions/2nd_order_bagged.csv', header=True)

In [64]:
df_test.head()

Unnamed: 0_level_0,knn10,knn3,linreg,xgb_25,xgb_all,item_cnt_month
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,0.4,0.666667,0.823794,0.590379,0.51635,0.485136
2,0.2,0.333333,0.822782,1.113075,1.065165,0.93183
15,1.3,0.333333,4.382483,1.851596,1.720403,1.56966
21,0.0,0.0,0.027899,0.451284,0.377604,0.380837
23,0.3,0.333333,1.164632,0.693333,0.605296,0.575454
