# Winning a Kaggle Competition in Python

## Kaggle competitions process

### Explore train data

In [2]:
import pandas as pd

train = pd.read_csv('demand_forecasting_train_1_month.csv')

# Look at the shape of the data
print('Train shape:', train.shape)

train.head()

Train shape: (15500, 5)


Unnamed: 0,id,date,store,item,sales
0,100000,2017-12-01,1,1,19
1,100001,2017-12-02,1,1,16
2,100002,2017-12-03,1,1,31
3,100003,2017-12-04,1,1,7
4,100004,2017-12-05,1,1,20


In [3]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15500 entries, 0 to 15499
Data columns (total 5 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   id      15500 non-null  int64 
 1   date    15500 non-null  object
 2   store   15500 non-null  int64 
 3   item    15500 non-null  int64 
 4   sales   15500 non-null  int64 
dtypes: int64(4), object(1)
memory usage: 605.6+ KB


In [4]:
train.memory_usage(deep=True)

Index        128
id        124000
date     1038500
store     124000
item      124000
sales     124000
dtype: int64

### Explore test data

In [15]:
test = pd.read_csv('demand_forecasting_test.csv')

# Print train and test columns
print('Train columns:', train.columns.tolist())
print('Test columns:', test.columns.tolist())

Train columns: ['id', 'date', 'store', 'item', 'sales']
Test columns: ['id', 'date', 'store', 'item']


In [7]:
sample_submission = pd.read_csv('sample_submission_demand.csv')

sample_submission.head()

Unnamed: 0,id,sales
0,0,52
1,1,52
2,2,52
3,3,52
4,4,52


### Train a simple model

In [8]:
from sklearn.ensemble import RandomForestRegressor

rf = RandomForestRegressor()

# Train a model
rf.fit(X=train[['store', 'item']], y=train['sales'])

RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mse',
                      max_depth=None, max_features='auto', max_leaf_nodes=None,
                      max_samples=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=2, min_weight_fraction_leaf=0.0,
                      n_estimators=100, n_jobs=None, oob_score=False,
                      random_state=None, verbose=0, warm_start=False)

### Prepare a submission

In [9]:
# Get predictions for the test set
test['sales'] = rf.predict(test[['store', 'item']])

# Write test predictions using the sample_submission format
test[['id', 'sales']].to_csv('kaggle_submission.csv', index=False)

### Explore overfitting XGBoost

In [23]:
train = pd.read_csv('train.csv')

In [24]:
train.shape

(913000, 4)

In [25]:
dates =['2017-11-01',
 '2017-11-02',
 '2017-11-03',
 '2017-11-04',
 '2017-11-05',
 '2017-11-06',
 '2017-11-07',
 '2017-11-08',
 '2017-11-09',
 '2017-11-10',
 '2017-11-11',
 '2017-11-12',
 '2017-11-13',
 '2017-11-14',
 '2017-11-15',
 '2017-11-16',
 '2017-11-17',
 '2017-11-18',
 '2017-11-19',
 '2017-11-20',
 '2017-11-21',
 '2017-11-22',
 '2017-11-23',
 '2017-11-24',
 '2017-11-25',
 '2017-11-26',
 '2017-11-27',
 '2017-11-28',
 '2017-11-29',
 '2017-11-30']

In [26]:
trim_train = train[train.date.apply(lambda x: x in dates)]

In [27]:
trim_train.shape

(15000, 4)

In [28]:
trim_train.head()

Unnamed: 0,date,store,item,sales
1765,2017-11-01,1,1,16
1766,2017-11-02,1,1,21
1767,2017-11-03,1,1,18
1768,2017-11-04,1,1,34
1769,2017-11-05,1,1,23


In [29]:
dates_test = ['2017-12-01',
 '2017-12-02',
 '2017-12-03',
 '2017-12-04',
 '2017-12-05',
 '2017-12-06',
 '2017-12-07',
 '2017-12-08',
 '2017-12-09',
 '2017-12-10',
 '2017-12-11',
 '2017-12-12',
 '2017-12-13',
 '2017-12-14',
 '2017-12-15',
 '2017-12-16',
 '2017-12-17',
 '2017-12-18',
 '2017-12-19',
 '2017-12-20',
 '2017-12-21',
 '2017-12-22',
 '2017-12-23',
 '2017-12-24',
 '2017-12-25',
 '2017-12-26',
 '2017-12-27',
 '2017-12-28',
 '2017-12-29',
 '2017-12-30',
 '2017-12-31']

In [30]:
trim_test = train[train.date.apply(lambda x: x in dates_test)]

In [31]:
trim_test.shape

(15500, 4)

In [32]:
trim_test.head()

Unnamed: 0,date,store,item,sales
1795,2017-12-01,1,1,19
1796,2017-12-02,1,1,16
1797,2017-12-03,1,1,31
1798,2017-12-04,1,1,7
1799,2017-12-05,1,1,20


### Train XGBoost models

In [33]:
import xgboost as xgb

# Create DMatrix on train data
dtrain = xgb.DMatrix(data=trim_train[['store', 'item']],
                     label=trim_train['sales'])

# Define xgboost parameters
params = {'objective': 'reg:linear',
          'max_depth': 2,
          'silent': 1}

# Train xgboost model
xg_depth_2 = xgb.train(params=params, dtrain=dtrain)

In [34]:
# Define xgboost parameters
params = {'objective': 'reg:linear',
          'max_depth': 8,
          'silent': 1}

# Train xgboost model
xg_depth_8 = xgb.train(params=params, dtrain=dtrain)

In [35]:
# Define xgboost parameters
params = {'objective': 'reg:linear',
          'max_depth': 15,
          'silent': 1}

# Train xgboost model
xg_depth_15 = xgb.train(params=params, dtrain=dtrain)

In [37]:
from sklearn.metrics import mean_squared_error

dtest = xgb.DMatrix(data=trim_test[['store', 'item']])

# For each of 3 trained models
for model in [xg_depth_2, xg_depth_8, xg_depth_15]:
    # Make predictions
    train_pred = model.predict(dtrain)     
    test_pred = model.predict(dtest)          
    
    # Calculate metrics
    mse_train = mean_squared_error(trim_train['sales'], train_pred)                  
    mse_test = mean_squared_error(trim_test['sales'], test_pred)
    print('MSE Train: {:.3f}. MSE Test: {:.3f}. Diff:  {:.3f} '.format(mse_train, mse_test, mse_train - mse_test))

MSE Train: 631.275. MSE Test: 558.522. Diff:  72.753 
MSE Train: 183.771. MSE Test: 337.337. Diff:  -153.566 
MSE Train: 134.984. MSE Test: 355.534. Diff:  -220.550 
