In [1]:
import pickle
from datetime import datetime

import pandas as pd
import xgboost as xgb
from sklearn import preprocessing
# Aggregated training table.
df = pd.read_hdf('../data/processed/train/aggr_train_zeros_25feat.hdf')

# Rows dated before this go to training; rows on/after it go to validation.
threshold = datetime(2015, 9, 1)

# Build a first-of-month timestamp from the separate year / month columns.
year_text = df['year'].astype(str)
month_text = df['month'].astype(str)
df['date'] = pd.to_datetime(year_text + '-' + month_text + '-1')

In [4]:


def X(df):
    """Build the feature matrix from an aggregated sales frame.

    The result contains, in this order: every column of ``df`` whose name
    starts with ``count``, the calendar columns (including a ``day_of_year``
    column derived from ``df['date']``), the identifier columns and
    ``expected_sales``. Any object-dtype column in the result is replaced by
    integer codes from a ``LabelEncoder`` fitted on that column.

    The input frame is not modified: ``day_of_year`` and the encoded columns
    are written only to the returned copy.

    Args:
        df: DataFrame with a datetime ``date`` column and all of the calendar,
            identifier and ``expected_sales`` columns listed in the body.

    Returns:
        A new DataFrame holding the selected feature columns.

    Raises:
        KeyError: if one of the required columns is missing from ``df``.
    """
    date_cols = [
        'month', 'year',
        'fridays', 'day_of_year', 'days_in_a_month'
    ]
    one_hot_cols = [
        'shop_id', 'mall', 'city',
        'item_id', 'item_category_id', 'first_big_category', 'last_big_category',
    ]
    additional = [
        'expected_sales'
    ]

    count_cols = [col for col in df.columns if col.startswith('count')]
    selected = count_cols + date_cols + one_hot_cols + additional

    # Take an explicit copy of the columns that already exist so that later
    # writes never target (or warn about) a slice of the caller's frame.
    df_X = df[[col for col in selected if col != 'day_of_year']].copy()

    # Add the derived column at the position it has in `selected`, keeping the
    # column order identical to the documented layout.
    df_X.insert(
        selected.index('day_of_year'),
        'day_of_year',
        df['date'].dt.dayofyear,
    )

    # Label-encode string-like columns.
    for f in df_X.columns:
        if df_X[f].dtype == 'object':
            lbl = preprocessing.LabelEncoder()
            values = list(df_X[f].values)
            lbl.fit(values)
            df_X[f] = lbl.transform(values)
    return df_X

def XY(df):
    """Return ``(features, target)`` for ``df``.

    The features come from ``X(df)``; the target is the ``item_cnt_day``
    column of ``df``.
    """
    features = X(df)
    target = df['item_cnt_day']
    return features, target


In [4]:
# Load the stored PCA frame for the training data and preview its first rows.
df_train_pca = pd.read_hdf('../data/processed/train/pca.hdf')
df_train_pca.head()

Unnamed: 0,ID,pca_0,pca_1,pca_2,pca_3
0,0,-35840.121448,-311164.411766,-19230.753314,-45744.62235
1,2,-35840.146211,-311164.281054,-19230.841393,-45744.618013
2,15,-35839.95006,-311165.30203,-19230.138928,-45744.653964
3,21,-35840.146211,-311164.281055,-19230.841393,-45744.618013
4,23,-35840.1245,-311164.38824,-19230.76888,-45744.620433


In [9]:
# Names of the four PCA component columns: pca_0 .. pca_3.
pca_cols = ['pca_' + str(component) for component in range(4)]

# Copy the PCA components onto the training frame.
# NOTE(review): this assignment matches rows by index label - confirm that
# `df` and `df_train_pca` share the same index, otherwise NaNs are introduced.
df[pca_cols] = df_train_pca[pca_cols]

In [7]:
# Display the (rows, columns) shape of the PCA frame.
df_train_pca.shape

(2784600, 5)

In [10]:

# Training split: every row dated strictly before the threshold.
is_before_threshold = df['date'] < threshold
df_train = df[is_before_threshold]

In [11]:


# Features: everything except the target and the helper `date` column.
df_X_train = df_train.drop(columns=['item_cnt_day', 'date'])
# Target column.
df_y_train = df_train['item_cnt_day']

In [29]:



# Booster hyper-parameters and number of boosting rounds.
params = {
    'max_depth': 15,
    'eta': 0.05,
    'silent': 1,
}
num_round = 230

# Wrap the training features and target in xgboost's DMatrix container.
dtrain = xgb.DMatrix(df_X_train, label=df_y_train)

In [31]:
# Fit the booster for `num_round` boosting rounds.
model = xgb.train(params, dtrain, num_boost_round=num_round)

In [33]:
# Reload the persisted booster. This overwrites the in-memory `model`
# produced by the training cell above with whatever is stored on disk.
# NOTE(security): pickle.load can execute arbitrary code - only load model
# files that were written by this project.
with open('../models/model_xgb_25_feat_13_PCA.pck', 'rb') as model_file:
    model = pickle.load(model_file)

In [30]:
import pickle
# Persist the current `model`; the context manager guarantees the file is
# flushed and closed even if pickling fails part-way.
with open('../models/model_xgb_25_feat_13_PCA.pck', 'wb') as model_file:
    pickle.dump(model, model_file)


In [16]:
from sklearn.metrics import r2_score, mean_squared_error
from math import sqrt
def get_preds_trained_on_all(model, df_X):
    """Predict with ``model`` on ``df_X`` and clip the predictions to [0, 20]."""
    features = xgb.DMatrix(df_X)
    raw_preds = model.predict(features)
    return raw_preds.clip(0, 20)


def previous_value_preds(df_X):
    """Baseline prediction taken directly from a lag feature.

    ``df_X`` is first re-indexed to the columns of the global ``df_X_train``
    (missing columns are filled with 0); the
    ``count_aggr_1_month_ago_shop_id_item_id`` column is then returned,
    clipped to [0, 20].
    """
    aligned = df_X.reindex(columns=df_X_train.columns, fill_value=0)
    lag_feature = aligned['count_aggr_1_month_ago_shop_id_item_id']
    return lag_feature.clip(0, 20)


def validate(model, df_y_valid, df_X_valid):
    """Print the R2 score and RMSE of ``model`` on a validation set.

    The true targets are clipped to [0, 20] before scoring; the predictions
    come from ``get_preds_trained_on_all``. Nothing is returned.
    """
    y_true = df_y_valid.copy().clip(0, 20)
    y_pred = get_preds_trained_on_all(model, df_X_valid)
    # To score the baseline instead, use: y_pred = previous_value_preds(df_X_valid)
    r2 = r2_score(y_true, y_pred)
    print('r2 score: ', r2)
    rmse = sqrt(mean_squared_error(y_true, y_pred))
    print('RMSE: ', rmse)

In [17]:
# Validation split: every row dated on or after the threshold.
is_on_or_after_threshold = df['date'] >= threshold
df_valid = df[is_on_or_after_threshold]

In [18]:
# Run notes for the output recorded below:
#   depth: 10
#   PCA
#   25 features
#   100 rounds, 0.1
# Features: all columns except the target and the helper `date` column.
df_X_valid = df_valid.drop(columns=['item_cnt_day', 'date'])
df_y_valid = df_valid['item_cnt_day']
validate(model, df_y_valid, df_X_valid)

r2 score:  0.3592850171848767
RMSE:  0.882828576499819


In [21]:
# Run notes for the output recorded below:
#   depth: 11
#   PCA
#   25 features
#   230 rounds, 0.05
# Features: all columns except the target and the helper `date` column.
df_X_valid = df_valid.drop(columns=['item_cnt_day', 'date'])
df_y_valid = df_valid['item_cnt_day']
validate(model, df_y_valid, df_X_valid)

r2 score:  0.3609760703133814
RMSE:  0.8816627723310763


In [34]:
# Run notes for the output recorded below:
#   depth: 13
#   PCA
#   25 features
#   230 rounds, 0.05
# Features: all columns except the target and the helper `date` column.
df_X_valid = df_valid.drop(columns=['item_cnt_day', 'date'])
df_y_valid = df_valid['item_cnt_day']
validate(model, df_y_valid, df_X_valid)

r2 score:  0.3688473720512039
RMSE:  0.8762159207204084


In [32]:
# Run notes for the output recorded below:
#   depth: 15
#   PCA
#   25 features
#   230 rounds, 0.05
# Features: all columns except the target and the helper `date` column.
df_X_valid = df_valid.drop(columns=['item_cnt_day', 'date'])
df_y_valid = df_valid['item_cnt_day']
validate(model, df_y_valid, df_X_valid)

r2 score:  0.3622632429519045
RMSE:  0.8807743672199442


In [35]:
# Load the test features and attach the PCA component columns.
df_test = pd.read_hdf('../data/processed/test/aggr_test_25feat.hdf')
df_test_pca = pd.read_hdf('../data/processed/test/pca.hdf')
df_test[pca_cols] = df_test_pca[pca_cols]

# Predict on every column except the submission identifier.
test_features = df_test.drop(columns='ID')
df_test['item_cnt_month'] = get_preds_trained_on_all(model, test_features)

# Write the submission file (ID + prediction only).
submission = df_test[['ID', 'item_cnt_month']]
submission.to_csv('../data/submissions/submission_trained_on_year_xgb_13_25feat.csv', index=False)

# Root-mean-squared difference between these predictions and two earlier
# submissions (not an error against ground truth).
linreg_test = pd.read_csv('../data/submissions/if_sells_submission_linreg.csv')
linreg_gap = sqrt(mean_squared_error(linreg_test['item_cnt_month'], df_test['item_cnt_month']))
print('linreg RMSE: ', linreg_gap)

rf_test = pd.read_csv('../data/submissions/if_sells_submission_rf.csv')
rf_gap = sqrt(mean_squared_error(rf_test['item_cnt_month'], df_test['item_cnt_month']))
print('rf RMSE: ', rf_gap)

linreg RMSE:  1.3027673459701847
rf RMSE:  2.099549456906779
