In [150]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.model_selection import train_test_split, GridSearchCV
from xgboost import XGBRegressor

from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = 'all'
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

In [151]:
train = pd.read_csv('./data/train.csv')
test = pd.read_csv('./data/test.csv')
train_calendar = pd.read_csv('./data/train_calendar.csv')
test_calendar = pd.read_csv('./data/test_calendar.csv')

train['date'] = pd.to_datetime(train['date'])
test['date'] = pd.to_datetime(test['date'])

In [152]:
train.head()

Unnamed: 0,warehouse,date,orders,holiday_name,holiday,shutdown,mini_shutdown,shops_closed,winter_school_holidays,school_holidays,blackout,mov_change,frankfurt_shutdown,precipitation,snow,user_activity_1,user_activity_2,id
0,Prague_1,2020-12-05,6895.0,,0,0,0,0,0,0,0,0.0,0,0.0,0.0,1722.0,32575.0,Prague_1_2020-12-05
1,Prague_1,2020-12-06,6584.0,,0,0,0,0,0,0,0,0.0,0,0.0,0.0,1688.0,32507.0,Prague_1_2020-12-06
2,Prague_1,2020-12-07,7030.0,,0,0,0,0,0,0,0,0.0,0,0.0,0.0,1696.0,32552.0,Prague_1_2020-12-07
3,Prague_1,2020-12-08,6550.0,,0,0,0,0,0,0,0,0.0,0,0.8,0.0,1681.0,32423.0,Prague_1_2020-12-08
4,Prague_1,2020-12-09,6910.0,,0,0,0,0,0,0,0,0.0,0,0.5,0.0,1704.0,32410.0,Prague_1_2020-12-09


In [153]:
test.head()

Unnamed: 0,warehouse,date,holiday_name,holiday,shops_closed,winter_school_holidays,school_holidays,id
0,Prague_1,2024-03-16,,0,0,0,0,Prague_1_2024-03-16
1,Prague_1,2024-03-17,,0,0,0,0,Prague_1_2024-03-17
2,Prague_1,2024-03-18,,0,0,0,0,Prague_1_2024-03-18
3,Prague_1,2024-03-19,,0,0,0,0,Prague_1_2024-03-19
4,Prague_1,2024-03-20,,0,0,0,0,Prague_1_2024-03-20


In [154]:
test_id = test['id']
test.drop(columns=['id'], inplace=True)

In [155]:
train_orders = train['orders']
train.drop(columns=[c for c in train.columns if c not in test.columns], inplace=True)
train.head()

Unnamed: 0,warehouse,date,holiday_name,holiday,shops_closed,winter_school_holidays,school_holidays
0,Prague_1,2020-12-05,,0,0,0,0
1,Prague_1,2020-12-06,,0,0,0,0
2,Prague_1,2020-12-07,,0,0,0,0
3,Prague_1,2020-12-08,,0,0,0,0
4,Prague_1,2020-12-09,,0,0,0,0


In [156]:
first_date = min(train.date.min(), test.date.min())
train['date_id'] = (train.date - first_date).dt.days
train['year'] = train.date.dt.year
train['month'] = train.date.dt.month
train['day'] = train.date.dt.day
train['day_of_week'] = train.date.dt.dayofweek
train['day_of_year'] = train.date.dt.dayofyear
train['quarter'] = train.date.dt.quarter
train.drop('date', axis=1, inplace=True)

test['date_id'] = (test.date - first_date).dt.days
test['year'] = test.date.dt.year
test['month'] = test.date.dt.month
test['day'] = test.date.dt.day
test['day_of_week'] = test.date.dt.dayofweek
test['day_of_year'] = test.date.dt.dayofyear
test['quarter'] = test.date.dt.quarter
test.drop('date', axis=1, inplace=True)

In [157]:
from math import pi

for x in [train, test]:
    x['month_sin'] = np.sin(2 * pi * x['month'])
    x['month_cos'] = np.cos(2 * pi * x['month'])
    x['day_sin'] = np.sin(2 * pi * x['day'])
    x['day_cos'] = np.cos(2 * pi * x['day'])
    x['day_year_sin'] = np.sin(2 * pi * x["day_of_year"])
    x['day_year_cos'] = np.cos(2 * pi * x['day_of_year'])

In [158]:
test.head()

Unnamed: 0,warehouse,holiday_name,holiday,shops_closed,winter_school_holidays,school_holidays,date_id,year,month,day,day_of_week,day_of_year,quarter,month_sin,month_cos,day_sin,day_cos,day_year_sin,day_year_cos
0,Prague_1,,0,0,0,0,1197,2024,3,16,5,76,1,-7.347881e-16,1.0,-3.91887e-15,1.0,-4.703634e-14,1.0
1,Prague_1,,0,0,0,0,1198,2024,3,17,6,77,1,-7.347881e-16,1.0,2.941628e-15,1.0,-1.175413e-14,1.0
2,Prague_1,,0,0,0,0,1199,2024,3,18,0,78,1,-7.347881e-16,1.0,-4.408728e-15,1.0,-3.331534e-14,1.0
3,Prague_1,,0,0,0,0,1200,2024,3,19,1,79,1,-7.347881e-16,1.0,-1.175909e-14,1.0,1.966863e-15,1.0
4,Prague_1,,0,0,0,0,1201,2024,3,20,2,80,1,-7.347881e-16,1.0,-4.898587e-15,1.0,-1.959435e-14,1.0


In [159]:
train.fillna('None', inplace=True)
test.fillna('None', inplace=True)

In [160]:
train.head()

Unnamed: 0,warehouse,holiday_name,holiday,shops_closed,winter_school_holidays,school_holidays,date_id,year,month,day,day_of_week,day_of_year,quarter,month_sin,month_cos,day_sin,day_cos,day_year_sin,day_year_cos
0,Prague_1,,0,0,0,0,0,2020,12,5,5,340,4,-2.939152e-15,1.0,-1.224647e-15,1.0,-5.485427e-14,1.0
1,Prague_1,,0,0,0,0,1,2020,12,6,6,341,4,-2.939152e-15,1.0,-1.469576e-15,1.0,-3.037892e-13,1.0
2,Prague_1,,0,0,0,0,2,2020,12,7,0,342,4,-2.939152e-15,1.0,-1.714506e-15,1.0,-9.79767e-14,1.0
3,Prague_1,,0,0,0,0,3,2020,12,8,1,343,4,-2.939152e-15,1.0,-1.959435e-15,1.0,1.078358e-13,1.0
4,Prague_1,,0,0,0,0,4,2020,12,9,2,344,4,-2.939152e-15,1.0,-2.204364e-15,1.0,-1.410991e-13,1.0


In [161]:
train['is_holiday'] = train['holiday_name']!='None'
test['is_holiday'] = test['holiday_name']!='None'

In [162]:
train['warehouse'].unique()
test['warehouse'].unique()
train['holiday_name'].unique()
test['holiday_name'].unique()

array(['Prague_1', 'Brno_1', 'Prague_2', 'Prague_3', 'Munich_1',
       'Frankfurt_1', 'Budapest_1'], dtype=object)

array(['Prague_1', 'Brno_1', 'Prague_2', 'Prague_3', 'Munich_1',
       'Frankfurt_1', 'Budapest_1'], dtype=object)

array(['None', 'Christmas Eve', '2nd Christmas Day', 'New Years Day',
       'International womens day', 'Good Friday', 'Easter Monday',
       'Labour Day', 'Den osvobozeni', 'Cyrila a Metodej', 'Jan Hus',
       'Den ceske statnosti',
       'Den vzniku samostatneho ceskoslovenskeho statu',
       'Den boje za svobodu a demokracii', 'Peace Festival in Augsburg',
       'Reformation Day', 'Memorial Day of the Republic',
       'Memorial Day for the Victims of the Communist Dictatorships',
       'Memorial Day for the Victims of the Holocaust',
       'National Defense Day', 'Day of National Unity',
       'Independent Hungary Day', 'Memorial Day for the Martyrs of Arad',
       '1848 Revolution Memorial Day (Extra holiday)',
       "All Saints' Day Holiday"], dtype=object)

array(['None', 'Good Friday', 'Easter Monday', 'Labour Day',
       'Den osvobozeni', 'Memorial Day for the Victims of the Holocaust'],
      dtype=object)

In [163]:
le = {}
for col in ['warehouse', 'holiday_name']:
    le[col] = LabelEncoder()
    train[col] = le[col].fit_transform(train[col])
    test[col] = le[col].transform(test[col])

In [164]:
train.head()

Unnamed: 0,warehouse,holiday_name,holiday,shops_closed,winter_school_holidays,school_holidays,date_id,year,month,day,day_of_week,day_of_year,quarter,month_sin,month_cos,day_sin,day_cos,day_year_sin,day_year_cos,is_holiday
0,4,22,0,0,0,0,0,2020,12,5,5,340,4,-2.939152e-15,1.0,-1.224647e-15,1.0,-5.485427e-14,1.0,False
1,4,22,0,0,0,0,1,2020,12,6,6,341,4,-2.939152e-15,1.0,-1.469576e-15,1.0,-3.037892e-13,1.0,False
2,4,22,0,0,0,0,2,2020,12,7,0,342,4,-2.939152e-15,1.0,-1.714506e-15,1.0,-9.79767e-14,1.0,False
3,4,22,0,0,0,0,3,2020,12,8,1,343,4,-2.939152e-15,1.0,-1.959435e-15,1.0,1.078358e-13,1.0,False
4,4,22,0,0,0,0,4,2020,12,9,2,344,4,-2.939152e-15,1.0,-2.204364e-15,1.0,-1.410991e-13,1.0,False


In [165]:
test.head()

Unnamed: 0,warehouse,holiday_name,holiday,shops_closed,winter_school_holidays,school_holidays,date_id,year,month,day,day_of_week,day_of_year,quarter,month_sin,month_cos,day_sin,day_cos,day_year_sin,day_year_cos,is_holiday
0,4,22,0,0,0,0,1197,2024,3,16,5,76,1,-7.347881e-16,1.0,-3.91887e-15,1.0,-4.703634e-14,1.0,False
1,4,22,0,0,0,0,1198,2024,3,17,6,77,1,-7.347881e-16,1.0,2.941628e-15,1.0,-1.175413e-14,1.0,False
2,4,22,0,0,0,0,1199,2024,3,18,0,78,1,-7.347881e-16,1.0,-4.408728e-15,1.0,-3.331534e-14,1.0,False
3,4,22,0,0,0,0,1200,2024,3,19,1,79,1,-7.347881e-16,1.0,-1.175909e-14,1.0,1.966863e-15,1.0,False
4,4,22,0,0,0,0,1201,2024,3,20,2,80,1,-7.347881e-16,1.0,-4.898587e-15,1.0,-1.959435e-14,1.0,False


In [166]:
X = train
y = train_orders

In [132]:
hyper_param = {
    'n_estimators': [200,250,150],
    'max_depth': [7,8,9],
    'learning_rate': [0.05, 0.1, 0.2]
}
grid_search = GridSearchCV(estimator=XGBRegressor(), param_grid=hyper_param, cv=5, scoring='neg_mean_absolute_percentage_error', verbose=1, n_jobs=-1, refit=True)
grid_search.fit(X, y)
best_estimator = grid_search.best_estimator_

Fitting 5 folds for each of 27 candidates, totalling 135 fits


In [71]:
best_params = grid_search.best_params_
print(f"Best parameters: {best_params}")

Best parameters: {'learning_rate': 0.2, 'max_depth': 7, 'n_estimators': 150}


In [168]:
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials
from hyperopt.pyll.base import scope
from xgboost import XGBRegressor
from sklearn.model_selection import cross_val_score

def objective(params):
    model = XGBRegressor(**params)
    score = cross_val_score(model, X, y, cv=5, scoring='neg_mean_absolute_percentage_error', n_jobs=-1).mean()
    return {'loss': -score, 'status': STATUS_OK}

space = {
    'n_estimators': scope.int(hp.quniform('n_estimators', 100, 500, 25)),
    'max_depth': scope.int(hp.quniform('max_depth', 3, 15, 1)),
    'learning_rate': hp.uniform('learning_rate', 0.01, 0.3),
    'subsample': hp.uniform('subsample', 0.5, 1.0),
    'colsample_bytree': hp.uniform('colsample_bytree', 0.5, 1.0),
    'gamma': hp.uniform('gamma', 0, 0.5),
    'min_child_weight': hp.quniform('min_child_weight', 1, 10, 1),
}

trials = Trials()

best_params = fmin(
    fn=objective,
    space=space,
    algo=tpe.suggest,
    max_evals=200,
    trials=trials
)

print("Best Hyperparameters:", best_params)


100%|██████████| 200/200 [01:21<00:00,  2.45trial/s, best loss: 0.2606806882399134]
Best Hyperparameters: {'colsample_bytree': 0.5613444506160323, 'gamma': 0.23853348823019876, 'learning_rate': 0.29375510490170303, 'max_depth': 6.0, 'min_child_weight': 4.0, 'n_estimators': 325.0, 'subsample': 0.7772134298041764}


In [169]:
best_model = XGBRegressor(
    n_estimators=int(best_params['n_estimators']),
    max_depth=int(best_params['max_depth']),
    learning_rate=best_params['learning_rate'],
    subsample=best_params['subsample'],
    colsample_bytree=best_params['colsample_bytree'],
    gamma=best_params['gamma'],
    min_child_weight=int(best_params['min_child_weight'])
)

best_model.fit(X, y)


In [170]:
Y_test = best_model.predict(test)

In [171]:
submission = pd.DataFrame({'id': test_id, 'Target': Y_test})

In [173]:
submission.head()

Unnamed: 0,id,Target
0,Prague_1_2024-03-16,10587.12793
1,Prague_1_2024-03-17,10252.371094
2,Prague_1_2024-03-18,9764.500977
3,Prague_1_2024-03-19,9573.128906
4,Prague_1_2024-03-20,9606.106445


In [174]:
submission.to_csv('./hyperopt_submission.csv', index=False)