In [1]:
import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import cross_val_score

import matplotlib.pyplot as plt
plt.rcParams["figure.figsize"] = (15, 5)
import seaborn as sns
import xgboost as xgb
from sklearn.model_selection import cross_val_score, KFold
from sklearn.metrics import mean_absolute_error
import catboost as ctb

In [2]:
# Load the labelled (train) and unlabelled (test) parts of the sales data.
df_sales_train = pd.read_hdf("../input/sales_train.h5")
df_sales_test = pd.read_hdf("../input/sales_test.h5")

# Parse the "date" column of both frames using the day/month/year format.
for sales_part in (df_sales_train, df_sales_test):
    sales_part["date"] = pd.to_datetime(sales_part["date"], format="%d/%m/%Y")

# Stack both parts into one frame so features are engineered once;
# later cells tell the parts apart by whether "weekly_sales" is missing.
df_sales = pd.concat([df_sales_train, df_sales_test])

In [3]:
# Summary statistics (count, mean, std, quartiles) of the combined train + test frame.
df_sales.describe()

Unnamed: 0,id,weekly_sales,store,dept
count,421570.0,285089.0,421570.0,421570.0
mean,210784.5,15973.820312,22.200546,44.260317
std,121696.920829,22738.791016,12.785297,30.492054
min,0.0,-4988.939941,1.0,1.0
25%,105392.25,2133.610107,11.0,18.0
50%,210784.5,7676.72998,22.0,37.0
75%,316176.75,20183.179688,33.0,74.0
max,421569.0,693099.375,45.0,99.0


In [5]:
def _iso_week(dates):
    """Return the ISO week number of each timestamp in `dates` as int64.

    `Series.dt.week` was deprecated in pandas 1.1 and removed in 2.0. Its
    replacement, `dt.isocalendar().week`, returns UInt32, so the result is cast
    to the int64 dtype the old accessor produced. The old accessor is only used
    when `isocalendar` is not available (pandas < 1.1). The int64 cast does not
    accept missing dates (NaT).
    """
    if hasattr(dates.dt, "isocalendar"):
        return dates.dt.isocalendar().week.astype("int64")
    return dates.dt.week


# Calendar features derived from the sale date. They are added to the combined
# frame and to the training frame, because the per-week training aggregates
# computed later group `df_sales_train` by "week".
for sales_frame in (df_sales, df_sales_train):
    sales_frame["week"] = _iso_week(sales_frame["date"])
    sales_frame["year"] = sales_frame["date"].dt.year
    sales_frame["dayofweek"] = sales_frame["date"].dt.dayofweek
    sales_frame["dayofyear"] = sales_frame["date"].dt.dayofyear

In [6]:
# Statistics from the starter notebook: per-(store, dept) summary of the
# training target, joined onto every row of the combined frame.
df_store = (
    df_sales_train
    .groupby(["store", "dept"])["weekly_sales"]
    .agg(["mean", "std", "median", "size", "sum"])
    .reset_index()
)

df_sales = df_sales.merge(df_store, how="left", on=["store", "dept"])


In [7]:
# Convert the False/True holiday flag to 0/1 integers.
df_sales["is_holiday"] = df_sales["is_holiday"].astype("int64")

In [8]:
# November and December: Christmas shopping season, flagged as 1 in "xmas".
df_sales["month"] = df_sales["date"].dt.month
df_sales["xmas"] = df_sales["month"].isin([11, 12]).astype("int64")

In [9]:
# Display the combined frame to inspect the columns engineered so far.
df_sales

Unnamed: 0,id,weekly_sales,store,dept,date,is_holiday,week,year,dayofweek,dayofyear,mean,std,median,size,sum,month,xmas
0,1,41272.601562,42,94,2010-07-09,0,27,2010,4,190,34954.531250,3600.595947,34704.488281,97.0,3.390590e+06,7,0
1,3,6212.770020,19,32,2011-08-19,0,33,2011,4,231,7794.950195,4158.146484,7002.959961,97.0,7.561102e+05,8,0
2,4,32530.310547,45,7,2011-06-10,0,23,2011,4,161,24444.597656,21043.888672,17425.750000,97.0,2.371126e+06,6,0
3,7,6199.310059,12,71,2011-05-20,0,20,2011,4,140,7402.809570,1781.085571,7088.459961,97.0,7.180725e+05,5,0
4,9,4262.890137,29,24,2010-09-10,1,36,2010,4,253,4570.558594,1736.986206,4262.890137,97.0,4.433442e+05,9,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
421565,421557,,35,1,2012-05-25,0,21,2012,4,146,17688.738281,7684.835938,15274.030273,97.0,1.715808e+06,5,0
421566,421560,,21,18,2012-08-17,0,33,2012,4,230,8158.437012,11735.409180,2839.270020,69.0,5.629321e+05,8,0
421567,421564,,26,67,2011-12-16,0,50,2011,4,350,6521.995605,2574.423828,5529.770020,97.0,6.326336e+05,12,1
421568,421568,,38,59,2012-04-20,0,16,2012,4,111,67.870926,38.494797,65.370003,97.0,6.583480e+03,4,0


In [10]:
# Additional per-store attributes, read from a CSV file.
# NOTE(review): this rebinds `df_store`, which an earlier cell used for the
# per-(store, dept) sales statistics — consider a distinct name.
df_store = pd.read_csv("../input/stores_data.csv")
df_store

Unnamed: 0,Store,Type,Size
0,1,A,151315
1,2,A,202307
2,3,B,37392
3,4,A,205863
4,5,B,34875
5,6,A,202505
6,7,B,70713
7,8,A,155078
8,9,B,125833
9,10,B,126512


In [11]:
# Attach the store attributes to every sales row. The key is spelled "store"
# in the sales frame and "Store" in the attribute table; the left join keeps
# every sales row.
df_sales = pd.merge(
    df_sales,
    df_store,
    how="left",
    left_on="store",
    right_on="Store",
)

In [12]:
# One-hot encoding. Called without `columns=`, so get_dummies picks the
# columns to encode by dtype rather than by name.
# NOTE(review): presumably only "Type" is encoded here (the feature list uses
# Type_A / Type_B / Type_C) — confirm no other non-numeric column is present.
df_sales=pd.get_dummies(df_sales)

In [13]:
# Starter 3: mean training sales for each (store, dept, week), looked up for
# every row of the combined frame. Rows whose key never occurs in the training
# data get -1.
#
# Only "weekly_sales" is aggregated (rather than averaging every column and
# keeping one), and the lookup is a vectorised reindex instead of a row-wise
# `apply` over a dict built with `to_dict(orient="items")` — "items" is not a
# valid orient and current pandas rejects it.
df_sales_group = (
    df_sales_train
    .groupby(["store", "dept", "week"])["weekly_sales"]
    .mean()
    .reset_index()
)
weekly_mean = df_sales_group.set_index(["store", "dept", "week"])["weekly_sales"]

# Same {(store, dept, week): mean} mapping as before, kept for ad-hoc lookups.
train_dict = weekly_mean.to_dict()

row_keys = pd.MultiIndex.from_frame(df_sales[["store", "dept", "week"]])
df_sales["weekly_sales_prev_y"] = (
    weekly_mean
    .reindex(row_keys, fill_value=-1)  # fill_value applies only to keys absent from training
    .astype("float64")
    .to_numpy()  # positional assignment; the index of df_sales is not involved
)

In [14]:
# Columns fed to the models, in the order the models receive them.
# NOTE(review): "weekly_sales_prev_y" is computed in the previous cell but is
# not listed here — confirm whether that is intentional.
feats = [
    "store", "dept", "is_holiday", "week",
    "Size", "Type_A", "Type_B", "Type_C",
    "xmas",
    "mean", "std", "median", "size",
    "dayofweek", "dayofyear",
    "sum", "month",
]

In [15]:
def check_log_model(df, feats, model):
    """Cross-validate `model` on a shifted-log target and report MAE on the raw scale.

    Only rows of `df` with a non-missing "weekly_sales" are used. The target is
    transformed as ``log(y + shift)`` before fitting, and predictions are mapped
    back with ``exp(pred) - shift`` before scoring.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the columns in `feats` and a "weekly_sales" column.
    feats : list of str
        Names of the feature columns.
    model : estimator with ``fit(X, y)`` and ``predict(X)``
        Refitted on every fold; the same instance is reused across folds.

    Returns
    -------
    tuple
        ``(mean, std)`` of the mean absolute error over 5 shuffled folds
        (``random_state=0``).
    """
    labelled = df[df["weekly_sales"].notna()]
    X = labelled[feats]
    y = labelled["weekly_sales"]

    # The target mean is added to avoid log(0). That only works while
    # min(y) + mean(y) > 0; otherwise fall back to the smallest shift that
    # makes every log argument at least 1, so the transform never yields NaN/-inf.
    shift = y.mean()
    if not (y.min() + shift > 0):
        shift = 1 - y.min()
    y_log = np.log(y + shift)

    cv = KFold(n_splits=5, shuffle=True, random_state=0)
    scores = []
    for train_idx, test_idx in cv.split(X):
        model.fit(X.iloc[train_idx], y_log.iloc[train_idx])
        # Undo the log transform so the error is measured in sales units.
        y_pred = np.exp(model.predict(X.iloc[test_idx])) - shift
        scores.append(mean_absolute_error(y.iloc[test_idx], y_pred))

    return np.mean(scores), np.std(scores)

In [40]:
# Cross-validated MAE (mean, std) for XGBoost with hand-picked parameters.
check_log_model(
    df=df_sales,
    feats=feats,
    model=xgb.XGBRegressor(
        max_depth=10,
        n_estimators=100,
        learning_rate=0.3,
        random_state=0,
    ),
)

(1320.5295, 12.764975)

In [42]:
# Cross-validated MAE (mean, std) for CatBoost with the same depth and tree count.
check_log_model(
    df=df_sales,
    feats=feats,
    model=ctb.CatBoostRegressor(
        max_depth=10,
        n_estimators=100,
        verbose=0,
    ),
)

(1615.3636033674914, 11.021358742602361)

In [22]:
# Cross-validated MAE (mean, std) for a single depth-10 decision tree.
check_log_model(
    df=df_sales,
    feats=feats,
    model=DecisionTreeRegressor(max_depth=10),
)

(2204.3710320522173, 17.63398726067117)

In [43]:
# XGBoost with the standard (hand-picked) parameters: fit on every labelled
# row and write predictions for the unlabelled rows as a submission file.
has_target = df_sales["weekly_sales"].notna()
df_sales_train = df_sales.loc[has_target].copy()
df_sales_test = df_sales.loc[~has_target].copy()

X_train, X_test = df_sales_train[feats], df_sales_test[feats]
y_train = df_sales_train["weekly_sales"]

# Same target transform as check_log_model: log(y + mean), inverted after predicting.
m = y_train.mean()
y_log_train = np.log(y_train + m)

# A CatBoostRegressor(max_depth=10, n_estimators=100, verbose=0) was the
# alternative tried in this position.
model = xgb.XGBRegressor(max_depth=10, n_estimators=100, learning_rate=0.3, random_state=0)
model.fit(X_train, y_log_train)

y_log_pred = model.predict(X_test)
y_pred = np.exp(y_log_pred) - m

df_sales_test["weekly_sales"] = y_pred
df_sales_test.loc[:, ["id", "weekly_sales"]].to_csv("../output/xgb_hp_late_submit.csv", index=False)

# Late submission score — private: 1839.07082, public: 1850.54742

In [16]:
from hyperopt import hp, fmin, tpe, STATUS_OK, Trials
from sklearn.model_selection import train_test_split
from functools import partial

In [18]:
# XGBoost hyperopt: data for the hyper-parameter search.
df_sales_train = df_sales[df_sales["weekly_sales"].notna()].copy()
df_sales_test = df_sales[df_sales["weekly_sales"].isna()].copy()

X_train = df_sales_train[feats]
y_train = df_sales_train["weekly_sales"]

# From here on X_test / y_test are a 30% validation slice of the LABELLED rows
# (read by `objective`), not the competition test rows. The split is seeded so
# the loss of every trial is measured on the same rows from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    X_train, y_train, test_size=0.3, random_state=0
)

def objective(space):
    """Hyperopt objective: validation MAE of an XGBoost model configured from `space`.

    `space` is one sampled point of the search space, a mapping with the keys
    read below. The model is trained on the module-level X_train / y_train
    with a log(y + mean) target and scored on the module-level X_test / y_test
    after inverting the transform.

    Returns a dict with the MAE under 'loss' and STATUS_OK under 'status'.
    """
    # Sampled values arrive as floats; these three are cast to int before use.
    integer_keys = ('max_depth', 'random_state', 'min_child_weight')
    passthrough_keys = ('colsample_bytree', 'learning_rate', 'subsample', 'reg_alpha', 'reg_lambda')

    xgb_params = {key: int(space[key]) for key in integer_keys}
    xgb_params.update((key, space[key]) for key in passthrough_keys)
    xgb_params['n_estimators'] = 100

    offset = y_train.mean()
    regressor = xgb.XGBRegressor(**xgb_params)
    regressor.fit(X_train, np.log(y_train + offset))

    predicted = np.exp(regressor.predict(X_test)) - offset
    return {'loss': mean_absolute_error(y_test, predicted), 'status': STATUS_OK}
    
# Search space sampled by hyperopt; each key matches one read in `objective`.
# NOTE(review): hyperopt documents hp.loguniform(label, low, high) as
# exp(uniform(low, high)), so bounds (0, 1) put reg_alpha in [1, e] — confirm
# that this range, rather than [0, 1], is the intended one.
# NOTE(review): random_state is searched like a hyper-parameter — confirm intent.
space = dict(
    max_depth=hp.quniform('x_max_depth', 5, 20, 1),
    colsample_bytree=hp.uniform('x_colsample_bytree', 0.8, 1.),
    learning_rate=hp.uniform('x_learning_rate', 0.05, 0.3),
    subsample=hp.uniform('x_subsample', 0.7, 1.),
    random_state=hp.quniform('x_random_state', 0, 10000, 50),
    min_child_weight=hp.quniform('x_min_child_weight', 1, 10, 1),
    reg_alpha=hp.loguniform('x_reg_alpha', 0., 1.),
    reg_lambda=hp.uniform('x_reg_lambda', 0.7, 1.),
)


# Minimise `objective` with TPE for 30 evaluations, recording each trial.
# The returned dict is keyed by the hp labels (the "x_..." names), not by the
# keys of `space`.
trials = Trials()
best_params = fmin(
    objective,
    space,
    algo=partial(tpe.suggest, n_startup_jobs=1),
    max_evals=30,
    trials=trials,
)

print("The best params: ", best_params)

100%|██████████| 30/30 [17:23<00:00, 34.78s/trial, best loss: 1305.101806640625] 
The best params:  {'x_colsample_bytree': 0.9492692502473196, 'x_learning_rate': 0.2630861615653215, 'x_max_depth': 16.0, 'x_min_child_weight': 3.0, 'x_random_state': 3350.0, 'x_reg_alpha': 1.1227129777824711, 'x_reg_lambda': 0.7523579745515265, 'x_subsample': 0.9579416319496641}


In [21]:
# Record the fixed tree count next to the tuned values, then display the dict.
best_params.update(n_estimators=100)
best_params

{'x_colsample_bytree': 0.9492692502473196,
 'x_learning_rate': 0.2630861615653215,
 'x_max_depth': 16.0,
 'x_min_child_weight': 3.0,
 'x_random_state': 3350.0,
 'x_reg_alpha': 1.1227129777824711,
 'x_reg_lambda': 0.7523579745515265,
 'x_subsample': 0.9579416319496641,
 'n_estimators': 100}

In [24]:
# Tuned values copied from the hyperopt result, rewritten with XGBRegressor
# argument names (no "x_" prefix) and with max_depth, min_child_weight and
# random_state as integers.
best_params = dict(
    colsample_bytree=0.9492692502473196,
    learning_rate=0.2630861615653215,
    max_depth=16,
    min_child_weight=3,
    random_state=3350,
    reg_alpha=1.1227129777824711,
    reg_lambda=0.7523579745515265,
    subsample=0.9579416319496641,
    n_estimators=100,
)
best_params

{'colsample_bytree': 0.9492692502473196,
 'learning_rate': 0.2630861615653215,
 'max_depth': 16,
 'min_child_weight': 3,
 'random_state': 3350,
 'reg_alpha': 1.1227129777824711,
 'reg_lambda': 0.7523579745515265,
 'subsample': 0.9579416319496641,
 'n_estimators': 100}

In [25]:
# Cross-validated MAE (mean, std) for XGBoost with the tuned parameters.
check_log_model(
    df=df_sales,
    feats=feats,
    model=xgb.XGBRegressor(**best_params),
)

(1279.1025, 14.136362)

In [26]:
# XGBoost with the parameters found by hyperopt: refit on every labelled row
# and write predictions for the unlabelled rows as a submission file.
# Uses the df_sales_train / df_sales_test frames created in an earlier cell.
y_train = df_sales_train["weekly_sales"]
X_train, X_test = df_sales_train[feats], df_sales_test[feats]

# log(y + mean) target, inverted after predicting.
m = y_train.mean()
y_log_train = np.log(y_train + m)

# A CatBoostRegressor(max_depth=10, n_estimators=100, verbose=0) was the
# alternative tried in this position.
model = xgb.XGBRegressor(**best_params)
model.fit(X_train, y_log_train)

y_log_pred = model.predict(X_test)
y_pred = np.exp(y_log_pred) - m

df_sales_test["weekly_sales"] = y_pred
df_sales_test.loc[:, ["id", "weekly_sales"]].to_csv("../output/xgb_hopt_late_submit.csv", index=False)

# Submission score — private: 1833.92833, public: 1851.31115