In [1]:
import os
import sys
from datetime import datetime

import pandas as pd
import numpy as np

# Data Viz
import dtale
import matplotlib.pyplot as plt
import seaborn as sns

# Models
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.metrics import r2_score, mean_squared_error, mean_squared_log_error
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler

from catboost import CatBoostRegressor, Pool
import xgboost as xgb


### Constants and Configurations

In [2]:
BASE_DIR = os.path.dirname(os.path.dirname(os.path.realpath("__file__")))
DATA_DIR = os.path.join(BASE_DIR, "data/17092021")
MODEL_DIR = os.path.join(BASE_DIR, "model/17092021")

### Data Cleaning & Transformation

In [3]:
train = pd.read_csv(f"{DATA_DIR}/TRAIN.csv")
test = pd.read_csv(f"{DATA_DIR}/TEST_FINAL.csv")

def clean_data(df):

    if 'ID' in df.columns:
        df.set_index(['ID', 'Store_id'], inplace=True)

    # Holiday preprocess
    df.loc[(df['Holiday'] == 1), 'Holiday'] = "Yes"
    df.loc[(df['Holiday'] == 0), 'Holiday'] = "No"

    # # Sales Log
    # try:
    #     df['SalesLog'] = np.log(df['Sales'])
    # except:
    #     pass

    # Convert datetimes
    df['Date'] = pd.to_datetime(df['Date'], errors='coerce')
    df['Year'] = df['Date'].dt.year
    df['Month'] = df['Date'].dt.month
    df['Day'] = df['Date'].dt.day
    df['DayOfWeek'] = df['Date'].dt.dayofweek
    df['DateOrdinal'] = df['Date'].map(datetime.toordinal)

    return df


def transform_data(df):

    # Store_Type
    df.loc[(df['Store_Type'] == 'S2'), 'Store_Type'] = 1
    df.loc[(df['Store_Type'] == 'S3'), 'Store_Type'] = 2
    df.loc[(df['Store_Type'] == 'S4'), 'Store_Type'] = 3
    df.loc[(df['Store_Type'] == 'S1'), 'Store_Type'] = 4
    df['Store_Type'] = df['Store_Type'].astype(int)

    # Location_Type
    df.loc[(df['Location_Type'] == 'L4'), 'Location_Type'] = 1
    df.loc[(df['Location_Type'] == 'L5'), 'Location_Type'] = 2
    df.loc[(df['Location_Type'] == 'L3'), 'Location_Type'] = 3
    df.loc[(df['Location_Type'] == 'L2'), 'Location_Type'] = 4
    df.loc[(df['Location_Type'] == 'L1'), 'Location_Type'] = 5
    df['Location_Type'] = df['Location_Type'].astype(int)

    # Region_Code
    df.loc[(df['Region_Code'] == 'R4'), 'Region_Code'] = 1
    df.loc[(df['Region_Code'] == 'R3'), 'Region_Code'] = 2
    df.loc[(df['Region_Code'] == 'R2'), 'Region_Code'] = 3
    df.loc[(df['Region_Code'] == 'R1'), 'Region_Code'] = 4
    df['Region_Code'] = df['Region_Code'].astype(int)

    # Holiday
    df.loc[(df['Holiday'] == 'Yes'), 'Holiday'] = 1
    df.loc[(df['Holiday'] == 'No'), 'Holiday'] = 0
    df['Holiday'] = df['Holiday'].astype(int)

    # Discount
    df.loc[(df['Discount'] == 'Yes'), 'Discount'] = 1
    df.loc[(df['Discount'] == 'No'), 'Discount'] = 0
    df['Discount'] = df['Discount'].astype(int)

    return df
    

print("Cleaning Training DataSet")
train = clean_data(train)
train = transform_data(train)
print("Cleaning Testing DataSet")
test = clean_data(test)
test = transform_data(test)

dtale.show(train)

Cleaning Training DataSet
Cleaning Testing DataSet




### CATBOOST-ORDER&SALES

In [4]:
train = train.reset_index().set_index(['ID','Sales'])
test = test.reset_index().set_index(['ID'])
train

Unnamed: 0_level_0,Unnamed: 1_level_0,Store_id,Store_Type,Location_Type,Region_Code,Date,Holiday,Discount,#Order,Year,Month,Day,DayOfWeek,DateOrdinal
ID,Sales,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
T1000001,7011.84,1,4,3,4,2018-01-01,1,1,9,2018,1,1,0,736695
T1000002,51789.12,253,3,4,4,2018-01-01,1,1,60,2018,1,1,0,736695
T1000003,36868.20,252,2,4,4,2018-01-01,1,1,42,2018,1,1,0,736695
T1000004,19715.16,251,1,3,4,2018-01-01,1,1,23,2018,1,1,0,736695
T1000005,45614.52,250,1,3,1,2018-01-01,1,1,62,2018,1,1,0,736695
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
T1188336,37272.00,149,1,3,3,2019-05-31,1,1,51,2019,5,31,4,737210
T1188337,54572.64,153,3,4,4,2019-05-31,1,0,90,2019,5,31,4,737210
T1188338,31624.56,154,4,3,3,2019-05-31,1,0,56,2019,5,31,4,737210
T1188339,49162.41,155,2,5,3,2019-05-31,1,1,70,2019,5,31,4,737210


In [5]:
# # Outlier Removal
# s = train['#Order']
# q1 = s.quantile(0.25)
# q3 = s.quantile(0.75)
# iqr = q3 - q1
# iqr_lower = q1 - 1.5 * iqr
# iqr_upper = q3 + 1.5 * iqr
# outliers = train.loc[(train['#Order'] < iqr_lower) | (train['#Order'] > iqr_upper)]

# train = train.loc[train.index.difference(outliers.index)]



In [6]:
# train_pivot = pd.pivot_table(train.reset_index(),
#                index=['Store_id'],
#                values=['Date', 'Sales', '#Order'],
#                aggfunc={'Date':[min, max],
#                         'Sales':np.mean,
#                         '#Order':np.mean
#                        }
#               ).reset_index()

# train_pivot.columns = train_pivot.columns.get_level_values(1).values + train_pivot.columns.get_level_values(0).values
# train_pivot

In [7]:
# test_pivot = pd.pivot_table(test.reset_index(),
#                index=['Store_id'],
#                values=['Date'],
#                aggfunc=[min, max]
#               ).reset_index()

# test_pivot.columns = test_pivot.columns.get_level_values(1).values + test_pivot.columns.get_level_values(0).values
# test_pivot

In [8]:
# test = test.reset_index()
# test.loc[(test['Store_id'] == 1)]

In [9]:
# train = train.reset_index()
# train.loc[(train['Store_id'] == 1)]

In [10]:
# _train=train.reset_index()
# _train['SalesPerOrder'] = _train['Sales'] / _train['#Order']
# dtale.show(_train)

In [11]:
# Defining Features & Target for Catboost

target = ['#Order'] # Sales
# features = train.columns.difference(target + ['#Order', 'DateOrdinal'])
features = [
            'Store_id', 'Store_Type', 'Region_Code', 'Location_Type', 'DateOrdinal', 'Holiday', 'Discount',
            'DayOfWeek'
           ] # , '#Order', 'Year', 'Date', 'Day', 'Month'

X = train[features]
y = train[target]

categorical_features = [x for x in X.select_dtypes(include='object').columns.tolist() if x not in ['ID']]
print(categorical_features)

X

[]


Unnamed: 0_level_0,Unnamed: 1_level_0,Store_id,Store_Type,Region_Code,Location_Type,DateOrdinal,Holiday,Discount,DayOfWeek
ID,Sales,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
T1000001,7011.84,1,4,4,3,736695,1,1,0
T1000002,51789.12,253,3,4,4,736695,1,1,0
T1000003,36868.20,252,2,4,4,736695,1,1,0
T1000004,19715.16,251,1,4,3,736695,1,1,0
T1000005,45614.52,250,1,1,3,736695,1,1,0
...,...,...,...,...,...,...,...,...,...
T1188336,37272.00,149,1,3,3,737210,1,1,4
T1188337,54572.64,153,3,4,4,737210,1,0,4
T1188338,31624.56,154,4,3,3,737210,1,0,4
T1188339,49162.41,155,2,3,5,737210,1,1,4


In [12]:
# Train Valid Split

# Random Sampling
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Store Sampling
# store_ids = sorted(train['Store_id'].unique().tolist())
# train_size=0.9
# train_store_ids = store_ids[:round(len(store_ids)*train_size)]
# test_store_ids = store_ids[round(len(store_ids)*train_size)+1:]

# X_train = train.loc[(train['Store_id'].isin(train_store_ids)), features]
# y_train = train.loc[(train['Store_id'].isin(train_store_ids)), target]
# X_test = train.loc[(train['Store_id'].isin(test_store_ids)), features]
# y_test = train.loc[(train['Store_id'].isin(test_store_ids)), target]


print(f"X_train: {X_train.shape}\nX_test: {X_test.shape}\ny_train: {y_train.shape}\ny_test: {y_test.shape}")

# initialize Pool
train_pool = Pool(X_train,
                  y_train,
                  cat_features=categorical_features
                 )
test_pool = Pool(X_test,
                 y_test,
                 cat_features=categorical_features
                )


# Model Training

############################ CATBOOST ############################

params = {
    'iterations': 100000,
    'learning_rate': 0.01,
    'max_depth': 8,
    'eval_metric': 'R2',
    'random_seed': 42,
    'early_stopping_rounds': 200,
    'use_best_model': True
    # 'cat_features': categorical_features,
}

# Initialize CatBoostClassifier
model = CatBoostRegressor(**params)
# Fit model
model.fit(train_pool,
          eval_set=test_pool,
          # cat_features=categorical_features,
          verbose=False,
          plot=True,
         )

X_train: (150672, 8)
X_test: (37668, 8)
y_train: (150672, 1)
y_test: (37668, 1)


MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

<catboost.core.CatBoostRegressor at 0x7fdf2adb20f0>

In [13]:
# Get predictions
preds = model.predict(X_test)
preds = np.where(preds<0, 0, preds)


# Evaluate Model
print(f'mean_squared_error: {mean_squared_error(y_test, preds):.2f}')
print(f'mean_squared_log_error: {mean_squared_log_error(y_test, preds):.5f}')
print(f'r2_score: {r2_score(y_test, preds):.5f}')
print(f'RMSE: {np.sqrt(mean_squared_error(y_test, preds)):.2f}')

pd.DataFrame({'feature_importance': model.get_feature_importance(), 
              'feature_names': X_test.columns}).sort_values(by=['feature_importance'], 
                                                           ascending=False)

mean_squared_error: 122.65
mean_squared_log_error: 0.03694
r2_score: 0.86803
RMSE: 11.07


Unnamed: 0,feature_importance,feature_names
1,28.895003,Store_Type
0,19.683053,Store_id
4,14.73107,DateOrdinal
6,9.20669,Discount
7,8.025717,DayOfWeek
3,7.657331,Location_Type
2,6.464089,Region_Code
5,5.337046,Holiday


In [14]:
final_test_df = pd.concat([X_test.reset_index(), y_test.reset_index(drop=True), pd.Series(preds, name='#Orderpred')], axis=1)
final_test_df

Unnamed: 0,ID,Sales,Store_id,Store_Type,Region_Code,Location_Type,DateOrdinal,Holiday,Discount,DayOfWeek,#Order,#Orderpred
0,T1002394,32667.00,287,4,4,1,736701,0,0,6,54,55.231313
1,T1028080,70352.25,261,3,4,4,736771,1,1,6,111,113.429332
2,T1031425,34824.00,112,4,2,5,736781,0,1,2,59,59.370704
3,T1093671,34365.00,6,3,4,5,736951,0,0,4,67,56.544722
4,T1080508,37107.00,341,4,3,5,736915,0,0,3,60,63.006858
...,...,...,...,...,...,...,...,...,...,...,...,...
37663,T1141883,50208.00,90,4,2,5,737083,0,1,3,75,75.386483
37664,T1036656,45321.00,359,1,3,3,736795,0,1,2,58,56.679796
37665,T1057070,24900.00,29,4,3,1,736851,0,0,2,40,33.705984
37666,T1158015,40329.00,30,4,3,4,737127,0,0,5,69,65.982454


In [15]:
test = test.reindex(X_train.columns, axis=1)

preds = model.predict(test)

test = pd.concat([test.reset_index(), pd.Series(preds, name='#Orderpred')], axis=1).set_index(['ID'])
test


Unnamed: 0_level_0,Store_id,Store_Type,Region_Code,Location_Type,DateOrdinal,Holiday,Discount,DayOfWeek,#Orderpred
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
T1188341,171,3,2,4,737211,0,0,5,98.220390
T1188342,172,4,4,5,737211,0,0,5,68.363392
T1188343,173,3,4,4,737211,0,0,5,131.058924
T1188344,174,4,1,5,737211,0,0,5,60.290593
T1188345,170,4,3,5,737211,0,0,5,62.863613
...,...,...,...,...,...,...,...,...,...
T1210601,186,1,3,2,737271,0,0,2,25.338540
T1210602,11,3,4,4,737271,0,0,2,95.670111
T1210603,185,4,2,5,737271,0,1,2,65.092039
T1210604,69,4,1,5,737271,0,0,2,44.273068


In [16]:
# Get predictions
preds = model.predict(X)
preds = np.where(preds<0, 0, preds)


train = pd.concat([X.reset_index(), y.reset_index(drop=True), pd.Series(preds, name='#Orderpred')], axis=1)
train = train.set_index(['ID'])
train

Unnamed: 0_level_0,Sales,Store_id,Store_Type,Region_Code,Location_Type,DateOrdinal,Holiday,Discount,DayOfWeek,#Order,#Orderpred
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
T1000001,7011.84,1,4,4,3,736695,1,1,0,9,64.309698
T1000002,51789.12,253,3,4,4,736695,1,1,0,60,49.905361
T1000003,36868.20,252,2,4,4,736695,1,1,0,42,43.572812
T1000004,19715.16,251,1,4,3,736695,1,1,0,23,39.384591
T1000005,45614.52,250,1,1,3,736695,1,1,0,62,52.834829
...,...,...,...,...,...,...,...,...,...,...,...
T1188336,37272.00,149,1,3,3,737210,1,1,4,51,47.875411
T1188337,54572.64,153,3,4,4,737210,1,0,4,90,108.885850
T1188338,31624.56,154,4,3,3,737210,1,0,4,56,49.279134
T1188339,49162.41,155,2,3,5,737210,1,1,4,70,72.167423


In [17]:
# Defining Features & Target for Catboost

target = ['Sales']
# features = train.columns.difference(target + ['#Order', 'DateOrdinal'])
features = [
            '#Orderpred', 'Store_id', 'DateOrdinal', 'DayOfWeek'
           ] # , '#Order', 'Year', 'Date', 'Month', 'Day', 'DateOrdinal', 
             # 'Holiday', 'Store_Type', 'Region_Code', 'Location_Type', 'Discount',

X = train[features]
y = train[target]

categorical_features = [x for x in X.select_dtypes(include='object').columns.tolist() if x not in ['ID']]
print(categorical_features)

X

[]


Unnamed: 0_level_0,#Orderpred,Store_id,DateOrdinal,DayOfWeek
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
T1000001,64.309698,1,736695,0
T1000002,49.905361,253,736695,0
T1000003,43.572812,252,736695,0
T1000004,39.384591,251,736695,0
T1000005,52.834829,250,736695,0
...,...,...,...,...
T1188336,47.875411,149,737210,4
T1188337,108.885850,153,737210,4
T1188338,49.279134,154,737210,4
T1188339,72.167423,155,737210,4


In [18]:
# Train Valid Split

# Random Sampling
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

print(f"X_train: {X_train.shape}\nX_test: {X_test.shape}\ny_train: {y_train.shape}\ny_test: {y_test.shape}")

# initialize Pool
train_pool = Pool(X_train,
                  y_train,
                  cat_features=categorical_features
                 )
test_pool = Pool(X_test,
                 y_test,
                 cat_features=categorical_features
                )


# Model Training

############################ CATBOOST ############################

params = {
    'iterations': 100000,
    'learning_rate': 0.01,
    'max_depth': 6,
    'eval_metric': 'R2',
    'random_seed': 42,
    'early_stopping_rounds': 100,
    'use_best_model': True
    # 'cat_features': categorical_features,
}

# Initialize CatBoostClassifier
model = CatBoostRegressor(**params)
# Fit model
model.fit(train_pool,
          eval_set=test_pool,
          # cat_features=categorical_features,
          verbose=False,
          plot=True,
         )

X_train: (131838, 4)
X_test: (56502, 4)
y_train: (131838, 1)
y_test: (56502, 1)


MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

<catboost.core.CatBoostRegressor at 0x7fdf14b2d278>

In [19]:
# Get predictions
preds = model.predict(X_test)
preds = np.where(preds<0, 0, preds)


# Evaluate Model
print(f'mean_squared_error: {mean_squared_error(y_test, preds):.2f}')
print(f'mean_squared_log_error: {mean_squared_log_error(y_test, preds):.5f}')
print(f'r2_score: {r2_score(y_test, preds):.5f}')
print(f'RMSE: {np.sqrt(mean_squared_error(y_test, preds)):.2f}')

pd.DataFrame({'feature_importance': model.get_feature_importance(), 
              'feature_names': X_test.columns}).sort_values(by=['feature_importance'], 
                                                           ascending=False)

mean_squared_error: 52391383.41
mean_squared_log_error: 0.03650
r2_score: 0.84543
RMSE: 7238.19


Unnamed: 0,feature_importance,feature_names
0,60.136764,#Orderpred
1,19.689177,Store_id
2,18.111705,DateOrdinal
3,2.062353,DayOfWeek


In [20]:
final_test_df = pd.concat([X_test.reset_index(), y_test.reset_index(drop=True), pd.Series(preds, name='#Salespred')], axis=1)
final_test_df

Unnamed: 0,ID,#Orderpred,Store_id,DateOrdinal,DayOfWeek,Sales,#Salespred
0,T1002394,55.231313,287,736701,6,32667.00,32277.403110
1,T1028080,113.429332,261,736771,6,70352.25,72005.497073
2,T1031425,59.370704,112,736781,2,34824.00,36538.626478
3,T1093671,56.544722,6,736951,4,34365.00,26368.378546
4,T1080508,63.006858,341,736915,3,37107.00,38338.617935
...,...,...,...,...,...,...,...
56497,T1133835,91.168420,11,737061,2,53619.00,61271.457945
56498,T1178140,79.628752,110,737183,5,62964.00,57318.741237
56499,T1125451,93.064267,275,737038,0,41610.00,54601.043334
56500,T1147858,66.445654,26,737100,6,58940.82,39690.953357


In [77]:
submission_df = test.reindex(X_train.columns, axis=1)

preds = model.predict(submission_df)

submission_df = pd.concat([submission_df.reset_index()['ID'], pd.Series(preds, name='Sales')], axis=1)
submission_df

Unnamed: 0,ID,Sales
0,T1188341,57931.942783
1,T1188342,40045.545702
2,T1188343,81118.241047
3,T1188344,35838.231150
4,T1188345,41201.534183
...,...,...
22260,T1210601,21159.017271
22261,T1210602,58737.797681
22262,T1210603,46212.128072
22263,T1210604,30436.344625


In [80]:
submission_df.to_csv(f"{DATA_DIR}/catboost_v6_orders_sales_model.csv", index=False)

### CATBOOST-DATAIMPUTATIONS

In [49]:
train = pd.read_csv(f"{DATA_DIR}/TRAIN.csv")
test = pd.read_csv(f"{DATA_DIR}/TEST_FINAL.csv")

print("Cleaning Training DataSet")
train = clean_data(train)
train = transform_data(train)
print("Cleaning Testing DataSet")
test = clean_data(test)
test = transform_data(test)


train = train.reset_index().set_index(['ID'])
test = test.reset_index().set_index(['ID'])

dtale.show(train)

Cleaning Training DataSet
Cleaning Testing DataSet




In [50]:
# Outlier Removal
s = train['Sales']
q1 = s.quantile(0.25)
q3 = s.quantile(0.75)
iqr = q3 - q1
iqr_lower = q1 - 1.5 * iqr
iqr_upper = q3 + 1.5 * iqr
outliers = train.loc[(train['Sales'] < iqr_lower) | (train['Sales'] > iqr_upper)]

train = train.loc[train.index.difference(outliers.index)]

In [51]:
# Defining Features & Target for Catboost

target = ['Sales']
# features = train.columns.difference(target + ['#Order', 'DateOrdinal'])
features = [
            'Store_Type', 'Region_Code', 'Location_Type', 'DateOrdinal', 'Holiday', 'Discount',
            'Day', 'Store_id'
           ] # , '#Order', 'Year', 'Date', 'Month'


X = train[features]
y = train[target]

categorical_features = [x for x in X.select_dtypes(include='object').columns.tolist() if x not in ['ID']]

In [52]:
# Train Valid Split

# Random Sampling
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Store Sampling
# store_ids = sorted(train['Store_id'].unique().tolist())
# train_size=0.9
# train_store_ids = store_ids[:round(len(store_ids)*train_size)]
# test_store_ids = store_ids[round(len(store_ids)*train_size)+1:]

# X_train = train.loc[(train['Store_id'].isin(train_store_ids)), features]
# y_train = train.loc[(train['Store_id'].isin(train_store_ids)), target]
# X_test = train.loc[(train['Store_id'].isin(test_store_ids)), features]
# y_test = train.loc[(train['Store_id'].isin(test_store_ids)), target]


print(f"X_train: {X_train.shape}\nX_test: {X_test.shape}\ny_train: {y_train.shape}\ny_test: {y_test.shape}")

# initialize Pool
train_pool = Pool(X_train,
                  y_train,
                  cat_features=categorical_features
                 )
test_pool = Pool(X_test,
                 y_test,
                 cat_features=categorical_features
                )


# Model Training

############################ CATBOOST ############################

params = {
    'iterations': 10000,
    'learning_rate': 0.1,
    'max_depth': 6,
    'eval_metric': 'R2',
    'random_seed': 42,
    'early_stopping_rounds': 20,
    'use_best_model': True
    # 'cat_features': categorical_features,
}

# Initialize CatBoostClassifier
model = CatBoostRegressor(**params)
# Fit model
model.fit(train_pool,
          eval_set=test_pool,
          # cat_features=categorical_features,
          verbose=False,
          plot=True,
         )

X_train: (127747, 8)
X_test: (54750, 8)
y_train: (127747, 1)
y_test: (54750, 1)


MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

<catboost.core.CatBoostRegressor at 0x7f8c917eaeb8>

In [53]:
# Get predictions
preds = model.predict(X_test)
preds = np.where(preds<0, 0, preds)


# Evaluate Model
print(f'mean_squared_error: {mean_squared_error(y_test, preds):.2f}')
print(f'mean_squared_log_error: {mean_squared_log_error(y_test, preds):.5f}')
print(f'r2_score: {r2_score(y_test, preds):.5f}')
print(f'RMSE: {np.sqrt(mean_squared_error(y_test, preds)):.2f}')

mean_squared_error: 76459891.05
mean_squared_log_error: 0.07687
r2_score: 0.67369
RMSE: 8744.13


In [54]:
final_test_df = pd.concat([X_test.reset_index(), y_test.reset_index(drop=True), pd.Series(preds, name='pred')], axis=1)
final_test_df

Unnamed: 0,ID,Store_Type,Region_Code,Location_Type,DateOrdinal,Holiday,Discount,Day,Store_id,Sales,pred
0,T1065059,4,3,3,736873,0,0,28,350,39369.00,32583.279749
1,T1023698,1,3,1,736759,0,0,6,265,31101.00,23795.415136
2,T1116161,1,3,1,737013,0,0,15,203,12852.00,21649.486423
3,T1083253,4,4,5,736923,1,0,17,138,25281.30,25325.120745
4,T1045739,2,4,4,736820,0,0,6,215,66885.00,49936.788508
...,...,...,...,...,...,...,...,...,...,...,...
54745,T1007766,4,1,5,736716,1,0,22,174,34352.64,34585.407724
54746,T1050475,2,3,5,736833,0,0,19,325,50724.00,40380.167162
54747,T1119170,2,2,5,737021,1,1,23,7,28948.59,42197.772168
54748,T1087993,1,1,3,736936,0,1,30,273,33486.00,32387.949240


In [55]:
pd.DataFrame({'feature_importance': model.get_feature_importance(), 
              'feature_names': X_test.columns}).sort_values(by=['feature_importance'], 
                                                           ascending=False)

Unnamed: 0,feature_importance,feature_names
5,21.661498,Discount
2,20.849726,Location_Type
3,18.569098,DateOrdinal
0,18.050317,Store_Type
4,10.201738,Holiday
6,7.274171,Day
7,2.32378,Store_id
1,1.069671,Region_Code


In [None]:
test = test.reindex(X_train.columns, axis=1)

preds = model.predict(test)

submission_df = pd.concat([test.reset_index()['ID'], pd.Series(preds, name='Sales')], axis=1)
submission_df.to_csv(f"{DATA_DIR}/catboost_v2_outlier_removal.csv", index=False)

### CATBOOST

In [4]:
train = pd.read_csv(f"{DATA_DIR}/TRAIN.csv")
test = pd.read_csv(f"{DATA_DIR}/TEST_FINAL.csv")


# Defining Features & Target for Catboost

target = ['Sales']
# features = train.columns.difference(target + ['#Order', 'DateOrdinal'])
features = [
            'Store_Type', 'Region_Code', 'Location_Type', 'DateOrdinal', 'Holiday', 'Discount'
           ] # , '#Order', 'Sales', 'SalesLog', 'Date', 'Year', 'Month', 'Day'

X = train[features]
y = train[target]

categorical_features = [x for x in X.select_dtypes(include='object').columns.tolist() if x not in ['ID']]

In [25]:
# Train Valid Split

# Random Sampling
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Store Sampling
# store_ids = sorted(train['Store_id'].unique().tolist())
# train_size=0.9
# train_store_ids = store_ids[:round(len(store_ids)*train_size)]
# test_store_ids = store_ids[round(len(store_ids)*train_size)+1:]

# X_train = train.loc[(train['Store_id'].isin(train_store_ids)), features]
# y_train = train.loc[(train['Store_id'].isin(train_store_ids)), target]
# X_test = train.loc[(train['Store_id'].isin(test_store_ids)), features]
# y_test = train.loc[(train['Store_id'].isin(test_store_ids)), target]


print(f"X_train: {X_train.shape}\nX_test: {X_test.shape}\ny_train: {y_train.shape}\ny_test: {y_test.shape}")

# initialize Pool
train_pool = Pool(X_train,
                  y_train,
                  cat_features=categorical_features
                 )
test_pool = Pool(X_test,
                 y_test,
                 cat_features=categorical_features
                )


# Model Training

############################ CATBOOST ############################

params = {
    'iterations': 5000,
    'learning_rate': 0.08861923728929248,
    'depth': 6,
    # 'cat_features': categorical_features,
    'eval_metric': 'R2',
    'random_seed': 42,
    'early_stopping_rounds': 20
}

# Initialize CatBoostClassifier
model = CatBoostRegressor(**params)
# Fit model
model.fit(train_pool,
          eval_set=test_pool,
          # cat_features=categorical_features,
          verbose=False,
          plot=True)

X_train: (131838, 6)
X_test: (56502, 6)
y_train: (131838, 1)
y_test: (56502, 1)


MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

<catboost.core.CatBoostRegressor at 0x7fa1eaa70a90>

In [78]:
# params = {
#             'depth':[8, 10],
# #           'iterations':[1000, 5000, 10000],
#             'learning_rate':[0.1], # ,0.01,0.001
# #           'l2_leaf_reg':[3,1,5,10,100],
# #           'border_count':[32,5,10,20,50,100,200],
# #           'bagging_temperature':[0.03,0.09,0.25,0.75],
# #           'random_strength':[0.2,0.5,0.8],
# #           'max_ctr_complexity':[1,2,3,4,5]
#          }


# model = CatBoostRegressor(cat_features=categorical_features)
# grid_search_result = model.grid_search(params,
#                                        X=X_train,
#                                        y=y_train,
#                                        cv=5,
#                                        partition_random_seed=42,
#                                        stratified=True)

# grid_search_result

In [26]:
# Get predictions
preds = model.predict(X_test)
preds = np.where(preds<0, 0, preds)


# Evaluate Model
print(f'mean_squared_error: {mean_squared_error(y_test, preds)}')
print(f'mean_squared_log_error: {mean_squared_log_error(y_test, preds)}')
print(f'r2_score: {r2_score(y_test, preds)}')

mean_squared_error: 93724714.34322993
mean_squared_log_error: 0.08054008839958912
r2_score: 0.7234771531179283


In [27]:
final_test_df = pd.concat([X_test.reset_index(), y_test.reset_index(drop=True), pd.Series(preds, name='pred')], axis=1)
final_test_df

Unnamed: 0,ID,Store_id,Store_Type,Region_Code,Location_Type,DateOrdinal,Holiday,Discount,Sales,pred
0,T1002394,287,S1,R1,L4,736701,No,No,32667.00,33857.969920
1,T1028080,261,S4,R1,L2,736771,Yes,Yes,70352.25,86393.218102
2,T1031425,112,S1,R3,L1,736781,No,Yes,34824.00,42331.069239
3,T1093671,6,S4,R1,L1,736951,No,No,34365.00,41236.997980
4,T1080508,341,S1,R2,L1,736915,No,No,37107.00,33321.788233
...,...,...,...,...,...,...,...,...,...,...
56497,T1133835,11,S4,R1,L2,737061,No,No,53619.00,66523.594119
56498,T1178140,110,S2,R3,L3,737183,No,Yes,62964.00,64380.442285
56499,T1125451,275,S4,R1,L1,737038,No,Yes,41610.00,61216.354986
56500,T1147858,26,S1,R3,L1,737100,Yes,No,58940.82,37025.850856


In [28]:
pd.DataFrame({'feature_importance': model.get_feature_importance(), 
              'feature_names': X_test.columns}).sort_values(by=['feature_importance'], 
                                                           ascending=False)

Unnamed: 0,feature_importance,feature_names
3,39.09885,DateOrdinal
5,21.454436,Discount
0,14.394313,Store_Type
2,13.225919,Location_Type
4,9.293336,Holiday
1,2.533147,Region_Code


In [29]:
test = test.reindex(X_train.columns, axis=1)

preds = model.predict(test)

submission_df = pd.concat([test.reset_index()['ID'], pd.Series(preds, name='Sales')], axis=1)
submission_df.to_csv(f"{DATA_DIR}/catboost_v13_draft_submission.csv", index=False)

### CATBOOST - OPTUNA

In [23]:
import optuna


def objective(trial):

    train_x, valid_x, train_y, valid_y = train_test_split(X, y, test_size=0.3, random_state=42)

    param = {
        "eval_metric": trial.suggest_categorical("eval_metric", ["MSLE"]),
        "depth": trial.suggest_int("depth", low=6, high=12, step=2),
        "learning_rate": trial.suggest_float("learning_rate", 1e-3, 1e-1),
    }

#     if param["bootstrap_type"] == "Bayesian":
#         param["bagging_temperature"] = trial.suggest_float("bagging_temperature", 0, 10)
#     elif param["bootstrap_type"] == "Bernoulli":
#         param["subsample"] = trial.suggest_float("subsample", 0.1, 1)

    gbm = CatBoostRegressor(**param)

    gbm.fit(train_x,
            train_y,
            eval_set=[(valid_x, valid_y)],
            cat_features=categorical_features,
            verbose=0,
            early_stopping_rounds=100)

    preds = gbm.predict(valid_x)
    pred_labels = np.rint(preds)
    msle = mean_squared_log_error(valid_y, pred_labels)

    return msle


study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=50, timeout=600)


[32m[I 2021-09-18 22:52:18,528][0m A new study created in memory with name: no-name-e22e86ad-9b3a-4b98-8453-3b927503b4e9[0m
[32m[I 2021-09-18 22:52:46,297][0m Trial 0 finished with value: 0.06808010837016243 and parameters: {'eval_metric': 'MSLE', 'depth': 6, 'learning_rate': 0.020493709403457606}. Best is trial 0 with value: 0.06808010837016243.[0m
[32m[I 2021-09-18 22:53:23,648][0m Trial 1 finished with value: 0.06206859191009042 and parameters: {'eval_metric': 'MSLE', 'depth': 8, 'learning_rate': 0.06464191533400633}. Best is trial 0 with value: 0.06808010837016243.[0m
[32m[I 2021-09-18 22:53:50,958][0m Trial 2 finished with value: 0.06893888800329508 and parameters: {'eval_metric': 'MSLE', 'depth': 6, 'learning_rate': 0.06367106276502076}. Best is trial 2 with value: 0.06893888800329508.[0m
[32m[I 2021-09-18 22:55:04,004][0m Trial 3 finished with value: 0.055727911752371934 and parameters: {'eval_metric': 'MSLE', 'depth': 12, 'learning_rate': 0.09634440890306176}. Bes

In [24]:
print("Number of finished trials: {}".format(len(study.trials)))

print("Best trial:")
trial = study.best_trial

print("  Value: {}".format(trial.value))

print("  Params: ")
for key, value in trial.params.items():
    print("    {}: {}".format(key, value))


Number of finished trials: 10
Best trial:
  Value: 0.07064219803056218
  Params: 
    eval_metric: MSLE
    depth: 6
    learning_rate: 0.08861923728929248


### CATBOOST - ONEHOTENCODING

In [5]:
# Defining Features & Target for Catboost

target = ['Sales'] # , 'SalesLog'
# features = train.columns.difference(target + ['#Order', 'DateOrdinal'])
features = [
            'Store_Type', 'Region_Code', 'Location_Type', 'DateOrdinal', 'Holiday', 'Discount'
           ] # , '#Order', 'Date', 'Year', 'Month', 'Day'

X = train[features]
y = train[target]

categorical_features = [x for x in X.select_dtypes(include='object').columns.tolist() if x not in ['ID']]


# y dataset preprocess
# define min max scaler
scaler = MinMaxScaler()
# transform data
y = scaler.fit_transform(y)
y = pd.DataFrame(y.reshape(-1), columns=['Sales'], index=X.index)


# X dataset preprocess
categorical_features = [x for x in X.select_dtypes(include='object').columns.tolist() if x not in ['ID', 'Store_id']]

enc = OneHotEncoder(sparse=False)
# enc.fit(X[categorical_features])
# enc.categories_

X_ohe = enc.fit_transform(X[categorical_features])
one_hot_encoded_frame = pd.DataFrame(X_ohe, columns=enc.get_feature_names(categorical_features))

numerical_features = list(set(features) - set(categorical_features))
numerical_frame = X[numerical_features]

X = pd.concat([numerical_frame.reset_index(), one_hot_encoded_frame], axis=1).set_index(['ID', 'Store_id'])
X

Unnamed: 0_level_0,Unnamed: 1_level_0,DateOrdinal,Store_Type_S1,Store_Type_S2,Store_Type_S3,Store_Type_S4,Region_Code_R1,Region_Code_R2,Region_Code_R3,Region_Code_R4,Location_Type_L1,Location_Type_L2,Location_Type_L3,Location_Type_L4,Location_Type_L5,Holiday_No,Holiday_Yes,Discount_No,Discount_Yes
ID,Store_id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
T1000001,1,736695,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0
T1000002,253,736695,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0
T1000003,252,736695,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0
T1000004,251,736695,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0
T1000005,250,736695,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
T1188336,149,737210,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0
T1188337,153,737210,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0
T1188338,154,737210,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0
T1188339,155,737210,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0


In [80]:
# Train Valid Split

# Random Sampling
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Store Sampling
# store_ids = sorted(train['Store_id'].unique().tolist())
# train_size=0.9
# train_store_ids = store_ids[:round(len(store_ids)*train_size)]
# test_store_ids = store_ids[round(len(store_ids)*train_size)+1:]

# X_train = train.loc[(train['Store_id'].isin(train_store_ids)), features]
# y_train = train.loc[(train['Store_id'].isin(train_store_ids)), target]
# X_test = train.loc[(train['Store_id'].isin(test_store_ids)), features]
# y_test = train.loc[(train['Store_id'].isin(test_store_ids)), target]


print(f"X_train: {X_train.shape}\nX_test: {X_test.shape}\ny_train: {y_train.shape}\ny_test: {y_test.shape}")

# initialize Pool
train_pool = Pool(X_train,
                  y_train,
                 )
test_pool = Pool(X_test,
                 y_test,
                )


# Model Training

############################ CATBOOST ############################

params = {
    'iterations': 5000,
    'learning_rate': 0.05,
    'depth': 6,
    'eval_metric': 'MSLE',
    'random_seed': 42,
    'early_stopping_rounds': 100
}

# Initialize CatBoostClassifier
model = CatBoostRegressor(**params)
# Fit model
model.fit(train_pool,
          eval_set=test_pool,
          verbose=False,
          plot=True)

X_train: (131838, 18)
X_test: (56502, 18)
y_train: (131838, 1)
y_test: (56502, 1)


MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

<catboost.core.CatBoostRegressor at 0x7fa1eaa70c50>

In [81]:
# Get predictions
preds = model.predict(X_test)
preds = np.where(preds<0, 0, preds)

# # Inverse Transform
preds = pd.DataFrame(scaler.inverse_transform(preds.reshape(-1,1)).reshape(-1), columns=['preds'])
y_test = pd.DataFrame(scaler.inverse_transform(y_test).reshape(-1), columns=['Sale'])


# Evaluate Model
print(f'mean_squared_error: {mean_squared_error(y_test, preds)}')
print(f'mean_squared_log_error: {mean_squared_log_error(y_test, preds)}')
print(f'r2_score: {r2_score(y_test, preds)}')

mean_squared_error: 93189236.4736356
mean_squared_log_error: 0.06917776653043593
r2_score: 0.7250570124536455


### LIGHTGBM

In [63]:
train = pd.read_csv(f"{DATA_DIR}/TRAIN.csv")
test = pd.read_csv(f"{DATA_DIR}/TEST_FINAL.csv")

print("Cleaning Training DataSet")
train = clean_data(train)
print("Cleaning Testing DataSet")
test = clean_data(test)

dtale.show(train)


Cleaning Training DataSet
Cleaning Testing DataSet




In [64]:
# Defining Features & Target for Catboost

target = ['Sales']
# features = train.columns.difference(target + ['#Order', 'DateOrdinal'])
features = [
            'Store_Type', 'Region_Code', 'Location_Type', 'DateOrdinal', 'Holiday', 'Discount'
           ] # , '#Order', 'Sales', 'SalesLog', 'Date', 'Year', 'Month', 'Day'

X = train[features]
y = train[target]

categorical_features = [x for x in X.select_dtypes(include='object').columns.tolist() if x not in ['ID']]

In [67]:
for feature in categorical_features:
    X[feature] = pd.Series(X[feature], dtype="category")


# Train Valid Split

# Random Sampling
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Store Sampling
# store_ids = sorted(train['Store_id'].unique().tolist())
# train_size=0.7
# train_store_ids = store_ids[:round(len(store_ids)*train_size)]
# test_store_ids = store_ids[round(len(store_ids)*train_size)+1:]


# X_train_idx = train.loc[(train['Store_id'].isin(train_store_ids)), features].index
# y_train_idx = train.loc[(train['Store_id'].isin(train_store_ids)), target].index
# X_test_idx = train.loc[(train['Store_id'].isin(test_store_ids)), features].index
# y_test_idx = train.loc[(train['Store_id'].isin(test_store_ids)), target].index

# X_train = X.loc[X_train_idx]
# y_train = pd.DataFrame(y, columns=['Sales']).loc[y_train_idx]
# X_test = X.loc[X_test_idx]
# y_test = pd.DataFrame(y, columns=['Sales']).loc[y_test_idx]


print(f"X_train: {X_train.shape}\nX_test: {X_test.shape}\ny_train: {y_train.shape}\ny_test: {y_test.shape}")


train_data = lgb.Dataset(X_train,
                         label=y_train,
                         feature_name=X.columns.tolist(),
                         categorical_feature=categorical_features
                        )

validation_data = lgb.Dataset(X_test,
                              label=y_test,
                              feature_name=X.columns.tolist(),
                              categorical_feature=categorical_features
                             )


params = {
    'objective': 'regression',
    'seed': 42,
    'max_depth': 10,
    'learning_rate': 0.1
}

num_round = 100
model = lgb.train(params,
                train_data,
                num_round,
                valid_sets=[validation_data])



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



X_train: (131838, 6)
X_test: (56502, 6)
y_train: (131838, 1)
y_test: (56502, 1)
[1]	valid_0's l2: 3.02614e+08
[2]	valid_0's l2: 2.73122e+08
[3]	valid_0's l2: 2.49158e+08
[4]	valid_0's l2: 2.2971e+08
[5]	valid_0's l2: 2.13831e+08
[6]	valid_0's l2: 2.00919e+08
[7]	valid_0's l2: 1.9032e+08
[8]	valid_0's l2: 1.81471e+08
[9]	valid_0's l2: 1.73933e+08
[10]	valid_0's l2: 1.67617e+08
[11]	valid_0's l2: 1.62484e+08
[12]	valid_0's l2: 1.58109e+08
[13]	valid_0's l2: 1.54125e+08
[14]	valid_0's l2: 1.50773e+08
[15]	valid_0's l2: 1.47995e+08
[16]	valid_0's l2: 1.45464e+08
[17]	valid_0's l2: 1.42827e+08
[18]	valid_0's l2: 1.40871e+08
[19]	valid_0's l2: 1.39183e+08
[20]	valid_0's l2: 1.37262e+08
[21]	valid_0's l2: 1.359e+08
[22]	valid_0's l2: 1.34348e+08
[23]	valid_0's l2: 1.33203e+08
[24]	valid_0's l2: 1.32029e+08
[25]	valid_0's l2: 1.31043e+08
[26]	valid_0's l2: 1.30079e+08
[27]	valid_0's l2: 1.28968e+08
[28]	valid_0's l2: 1.28084e+08
[29]	valid_0's l2: 1.27375e+08
[30]	valid_0's l2: 1.26532e+08
[31

In [68]:
# Get predictions
preds = model.predict(X_test)
preds = np.where(preds<0, 0, preds)

# # Inverse Transform
preds = pd.DataFrame(scaler.inverse_transform(preds.reshape(-1,1)).reshape(-1), columns=['preds'])
y_test = pd.DataFrame(scaler.inverse_transform(y_test).reshape(-1), columns=['Sale'])


# Evaluate Model
print(f'mean_squared_error: {mean_squared_error(y_test, preds)}')
print(f'mean_squared_log_error: {mean_squared_log_error(y_test, preds)}')
print(f'r2_score: {r2_score(y_test, preds)}')

mean_squared_error: 6.415829737812693e+18
mean_squared_log_error: 0.08478027411469494
r2_score: 0.6902722716328039


### LIGHTGBM-ONEHOTENCODING

In [36]:
# Defining Features & Target for Catboost

target = ['Sales'] # , 'SalesLog'
# features = train.columns.difference(target + ['#Order', 'DateOrdinal'])
features = [
            'Store_Type', 'Region_Code', 'Location_Type', 'DateOrdinal', 'Holiday', 'Discount'
           ] # , '#Order', 'Date', 'Year', 'Month', 'Day'

X = train[features]
y = train[target]

categorical_features = [x for x in X.select_dtypes(include='object').columns.tolist() if x not in ['ID']]


# y dataset preprocess
# define min max scaler
scaler = MinMaxScaler()
# transform data
y = scaler.fit_transform(y)
y = pd.DataFrame(y.reshape(-1), columns=['Sales'], index=X.index)


# X dataset preprocess
categorical_features = [x for x in X.select_dtypes(include='object').columns.tolist() if x not in ['ID', 'Store_id']]

enc = OneHotEncoder(sparse=False)
# enc.fit(X[categorical_features])
# enc.categories_

X_ohe = enc.fit_transform(X[categorical_features])
one_hot_encoded_frame = pd.DataFrame(X_ohe, columns=enc.get_feature_names(categorical_features))

numerical_features = list(set(features) - set(categorical_features))
numerical_frame = X[numerical_features]

X = pd.concat([numerical_frame.reset_index(), one_hot_encoded_frame], axis=1).set_index(['ID', 'Store_id'])


# Train Valid Split


In [51]:
import lightgbm as lgb


# Random Sampling
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Store Sampling
# store_ids = sorted(train['Store_id'].unique().tolist())
# train_size=0.9
# train_store_ids = store_ids[:round(len(store_ids)*train_size)]
# test_store_ids = store_ids[round(len(store_ids)*train_size)+1:]

# X_train = train.loc[(train['Store_id'].isin(train_store_ids)), features]
# y_train = train.loc[(train['Store_id'].isin(train_store_ids)), target]
# X_test = train.loc[(train['Store_id'].isin(test_store_ids)), features]
# y_test = train.loc[(train['Store_id'].isin(test_store_ids)), target]


print(f"X_train: {X_train.shape}\nX_test: {X_test.shape}\ny_train: {y_train.shape}\ny_test: {y_test.shape}")


train_data = lgb.Dataset(X_train,
                         label=y_train,
                         feature_name=X.columns.tolist(),
                         # categorical_feature=categorical_features
                        )

validation_data = lgb.Dataset(X_test,
                              label=y_test,
                              feature_name=X.columns.tolist(),
                              # categorical_feature=categorical_features
                             )


params = {
    'objective': 'regression',
    'seed': 42,
    'max_depth': 10,
    'learning_rate': 0.01
}

num_round = 1000
model = lgb.train(params,
                train_data,
                num_round,
                valid_sets=[validation_data])

X_train: (131838, 18)
X_test: (56502, 18)
y_train: (131838, 1)
y_test: (56502, 1)
[1]	valid_0's l2: 0.00548358
[2]	valid_0's l2: 0.00542247
[3]	valid_0's l2: 0.00536258
[4]	valid_0's l2: 0.00530379
[5]	valid_0's l2: 0.00524619
[6]	valid_0's l2: 0.00518961
[7]	valid_0's l2: 0.00513427
[8]	valid_0's l2: 0.00507992
[9]	valid_0's l2: 0.00502674
[10]	valid_0's l2: 0.00497453
[11]	valid_0's l2: 0.00492336
[12]	valid_0's l2: 0.00487321
[13]	valid_0's l2: 0.00482405
[14]	valid_0's l2: 0.00477589
[15]	valid_0's l2: 0.00472867
[16]	valid_0's l2: 0.00468239
[17]	valid_0's l2: 0.00463695
[18]	valid_0's l2: 0.00459252
[19]	valid_0's l2: 0.00454876
[20]	valid_0's l2: 0.00450601
[21]	valid_0's l2: 0.00446411
[22]	valid_0's l2: 0.00442306
[23]	valid_0's l2: 0.00438278
[24]	valid_0's l2: 0.00434334
[25]	valid_0's l2: 0.00430457
[26]	valid_0's l2: 0.00426659
[27]	valid_0's l2: 0.00422936
[28]	valid_0's l2: 0.00419292
[29]	valid_0's l2: 0.0041572
[30]	valid_0's l2: 0.00412218
[31]	valid_0's l2: 0.0040878

[291]	valid_0's l2: 0.00210269
[292]	valid_0's l2: 0.00210114
[293]	valid_0's l2: 0.00209986
[294]	valid_0's l2: 0.00209856
[295]	valid_0's l2: 0.00209708
[296]	valid_0's l2: 0.0020961
[297]	valid_0's l2: 0.0020947
[298]	valid_0's l2: 0.00209372
[299]	valid_0's l2: 0.00209246
[300]	valid_0's l2: 0.00209123
[301]	valid_0's l2: 0.00209028
[302]	valid_0's l2: 0.00208908
[303]	valid_0's l2: 0.00208786
[304]	valid_0's l2: 0.00208667
[305]	valid_0's l2: 0.00208516
[306]	valid_0's l2: 0.00208412
[307]	valid_0's l2: 0.00208258
[308]	valid_0's l2: 0.00208112
[309]	valid_0's l2: 0.00208011
[310]	valid_0's l2: 0.00207914
[311]	valid_0's l2: 0.00207801
[312]	valid_0's l2: 0.00207656
[313]	valid_0's l2: 0.00207518
[314]	valid_0's l2: 0.00207405
[315]	valid_0's l2: 0.00207297
[316]	valid_0's l2: 0.00207188
[317]	valid_0's l2: 0.0020705
[318]	valid_0's l2: 0.00206964
[319]	valid_0's l2: 0.00206853
[320]	valid_0's l2: 0.00206735
[321]	valid_0's l2: 0.00206611
[322]	valid_0's l2: 0.00206478
[323]	valid

[602]	valid_0's l2: 0.00183358
[603]	valid_0's l2: 0.00183345
[604]	valid_0's l2: 0.00183309
[605]	valid_0's l2: 0.00183268
[606]	valid_0's l2: 0.00183251
[607]	valid_0's l2: 0.00183174
[608]	valid_0's l2: 0.00183134
[609]	valid_0's l2: 0.00183093
[610]	valid_0's l2: 0.00183054
[611]	valid_0's l2: 0.00182979
[612]	valid_0's l2: 0.00182931
[613]	valid_0's l2: 0.00182916
[614]	valid_0's l2: 0.00182877
[615]	valid_0's l2: 0.0018284
[616]	valid_0's l2: 0.00182826
[617]	valid_0's l2: 0.00182762
[618]	valid_0's l2: 0.00182714
[619]	valid_0's l2: 0.00182676
[620]	valid_0's l2: 0.00182617
[621]	valid_0's l2: 0.00182581
[622]	valid_0's l2: 0.00182544
[623]	valid_0's l2: 0.0018253
[624]	valid_0's l2: 0.00182494
[625]	valid_0's l2: 0.00182422
[626]	valid_0's l2: 0.00182387
[627]	valid_0's l2: 0.00182376
[628]	valid_0's l2: 0.00182341
[629]	valid_0's l2: 0.00182298
[630]	valid_0's l2: 0.00182236
[631]	valid_0's l2: 0.00182193
[632]	valid_0's l2: 0.00182159
[633]	valid_0's l2: 0.00182087
[634]	vali

[900]	valid_0's l2: 0.00174626
[901]	valid_0's l2: 0.00174621
[902]	valid_0's l2: 0.00174588
[903]	valid_0's l2: 0.00174572
[904]	valid_0's l2: 0.00174562
[905]	valid_0's l2: 0.00174531
[906]	valid_0's l2: 0.00174521
[907]	valid_0's l2: 0.00174495
[908]	valid_0's l2: 0.0017447
[909]	valid_0's l2: 0.00174461
[910]	valid_0's l2: 0.00174429
[911]	valid_0's l2: 0.00174415
[912]	valid_0's l2: 0.00174411
[913]	valid_0's l2: 0.00174403
[914]	valid_0's l2: 0.00174373
[915]	valid_0's l2: 0.00174344
[916]	valid_0's l2: 0.00174332
[917]	valid_0's l2: 0.00174321
[918]	valid_0's l2: 0.00174292
[919]	valid_0's l2: 0.00174283
[920]	valid_0's l2: 0.00174275
[921]	valid_0's l2: 0.00174261
[922]	valid_0's l2: 0.00174235
[923]	valid_0's l2: 0.00174222
[924]	valid_0's l2: 0.00174191
[925]	valid_0's l2: 0.0017416
[926]	valid_0's l2: 0.00174152
[927]	valid_0's l2: 0.00174142
[928]	valid_0's l2: 0.00174134
[929]	valid_0's l2: 0.00174106
[930]	valid_0's l2: 0.00174067
[931]	valid_0's l2: 0.0017406
[932]	valid

In [52]:
# Get predictions
preds = model.predict(X_test)
preds = np.where(preds<0, 0, preds)

# # Inverse Transform
preds = pd.DataFrame(scaler.inverse_transform(preds.reshape(-1,1)).reshape(-1), columns=['preds'])
y_test = pd.DataFrame(scaler.inverse_transform(y_test).reshape(-1), columns=['Sale'])


# Evaluate Model
print(f'mean_squared_error: {mean_squared_error(y_test, preds)}')
print(f'mean_squared_log_error: {mean_squared_log_error(y_test, preds)}')
print(f'r2_score: {r2_score(y_test, preds)}')

mean_squared_error: 105729571.02557364
mean_squared_log_error: 0.07036915330171957
r2_score: 0.6880583506230373


### XGBOOST

In [93]:
train = pd.read_csv(f"{DATA_DIR}/TRAIN.csv")
test = pd.read_csv(f"{DATA_DIR}/TEST_FINAL.csv")

print("Cleaning Training DataSet")
train = clean_data(train)
print("Cleaning Testing DataSet")
test = clean_data(test)


target = ['Sales']
# features = train.columns.difference(target + ['#Order', 'Date'])
features = [
            'Store_Type', 'Region_Code', 'Location_Type', 'DateOrdinal', 'Holiday', 'Discount'
           ] # , '#Order', 'Date', 'Year', 'Month', 'Day'


train = train.reset_index(drop=True)

X = train[features]
y = train[target]


# y dataset preprocess
# define min max scaler
scaler = MinMaxScaler()
# transform data
y = scaler.fit_transform(y)


# X dataset preprocess
categorical_features = [x for x in X.select_dtypes(include='object').columns.tolist() if x not in ['ID', 'Store_id']]

enc = OneHotEncoder(sparse=False)
# enc.fit(X[categorical_features])
# enc.categories_

X_ohe = enc.fit_transform(X[categorical_features])
one_hot_encoded_frame = pd.DataFrame(X_ohe, columns=enc.get_feature_names(categorical_features))

numerical_features = list(set(features) - set(categorical_features))
numerical_frame = X[numerical_features]

X = pd.concat([numerical_frame, one_hot_encoded_frame], axis=1)#.set_index('ID')
X

Cleaning Training DataSet
Cleaning Testing DataSet


Unnamed: 0,DateOrdinal,Store_Type_S1,Store_Type_S2,Store_Type_S3,Store_Type_S4,Region_Code_R1,Region_Code_R2,Region_Code_R3,Region_Code_R4,Location_Type_L1,Location_Type_L2,Location_Type_L3,Location_Type_L4,Location_Type_L5,Holiday_No,Holiday_Yes,Discount_No,Discount_Yes
0,736695,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0
1,736695,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0
2,736695,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0
3,736695,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0
4,736695,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
188335,737210,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0
188336,737210,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0
188337,737210,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0
188338,737210,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0


In [94]:
# Train Valid Split

# Random Sampling
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Store Sampling
# store_ids = sorted(train['Store_id'].unique().tolist())
# train_size=0.7
# train_store_ids = store_ids[:round(len(store_ids)*train_size)]
# test_store_ids = store_ids[round(len(store_ids)*train_size)+1:]


# X_train_idx = train.loc[(train['Store_id'].isin(train_store_ids)), features].index
# y_train_idx = train.loc[(train['Store_id'].isin(train_store_ids)), target].index
# X_test_idx = train.loc[(train['Store_id'].isin(test_store_ids)), features].index
# y_test_idx = train.loc[(train['Store_id'].isin(test_store_ids)), target].index

# X_train = X.loc[X_train_idx]
# y_train = pd.DataFrame(y, columns=['Sales']).loc[y_train_idx]
# X_test = X.loc[X_test_idx]
# y_test = pd.DataFrame(y, columns=['Sales']).loc[y_test_idx]


print(f"X_train: {X_train.shape}\nX_test: {X_test.shape}\ny_train: {y_train.shape}\ny_test: {y_test.shape}")

X_train: (131838, 18)
X_test: (56502, 18)
y_train: (131838, 1)
y_test: (56502, 1)


In [95]:
# dtrain = xgb.DMatrix(X_train, label=y_train)
# dtest = xgb.DMatrix(X_test, label=y_test)

# param = {'max_depth': 6, 'eta': 1, 'objective': 'reg:squaredlogerror'}
# param['nthread'] = 4
# param['eval_metric'] = 'rmsle'

# evallist = [(dtest, 'eval'), (dtrain, 'train')]

# num_round = 1000
# bst = xgb.train(param, dtrain, num_round, evallist)


# Training
model = xgb.XGBRegressor(
    max_depth=6,
    n_estimators=10000,
    learning_rate=0.1,
    # min_child_weight=0.5, 
    # colsample_bytree=0.8, 
    # subsample=0.8, 
    # eta=0.1,
    seed=42)

model.fit(
    X_train, 
    y_train, 
    eval_metric="rmsle", 
    eval_set=[(X_test, y_test)], 
    verbose=True, 
    early_stopping_rounds = 20)

[0]	validation_0-rmsle:0.23320
Will train until validation_0-rmsle hasn't improved in 20 rounds.
[1]	validation_0-rmsle:0.21295
[2]	validation_0-rmsle:0.19440
[3]	validation_0-rmsle:0.17746
[4]	validation_0-rmsle:0.16202
[5]	validation_0-rmsle:0.14798
[6]	validation_0-rmsle:0.13524
[7]	validation_0-rmsle:0.12369
[8]	validation_0-rmsle:0.11327
[9]	validation_0-rmsle:0.10389
[10]	validation_0-rmsle:0.09548
[11]	validation_0-rmsle:0.08796
[12]	validation_0-rmsle:0.08126
[13]	validation_0-rmsle:0.07530
[14]	validation_0-rmsle:0.06997
[15]	validation_0-rmsle:0.06534
[16]	validation_0-rmsle:0.06126
[17]	validation_0-rmsle:0.05769
[18]	validation_0-rmsle:0.05455
[19]	validation_0-rmsle:0.05181
[20]	validation_0-rmsle:0.04947
[21]	validation_0-rmsle:0.04743
[22]	validation_0-rmsle:0.04570
[23]	validation_0-rmsle:0.04424
[24]	validation_0-rmsle:0.04296
[25]	validation_0-rmsle:0.04182
[26]	validation_0-rmsle:0.04089
[27]	validation_0-rmsle:0.04015
[28]	validation_0-rmsle:0.03949
[29]	validation_

[250]	validation_0-rmsle:0.03066
[251]	validation_0-rmsle:0.03066
[252]	validation_0-rmsle:0.03065
[253]	validation_0-rmsle:0.03064
[254]	validation_0-rmsle:0.03064
[255]	validation_0-rmsle:0.03063
[256]	validation_0-rmsle:0.03062
[257]	validation_0-rmsle:0.03062
[258]	validation_0-rmsle:0.03061
[259]	validation_0-rmsle:0.03061
[260]	validation_0-rmsle:0.03061
[261]	validation_0-rmsle:0.03061
[262]	validation_0-rmsle:0.03060
[263]	validation_0-rmsle:0.03060
[264]	validation_0-rmsle:0.03060
[265]	validation_0-rmsle:0.03059
[266]	validation_0-rmsle:0.03059
[267]	validation_0-rmsle:0.03058
[268]	validation_0-rmsle:0.03058
[269]	validation_0-rmsle:0.03057
[270]	validation_0-rmsle:0.03056
[271]	validation_0-rmsle:0.03056
[272]	validation_0-rmsle:0.03056
[273]	validation_0-rmsle:0.03055
[274]	validation_0-rmsle:0.03055
[275]	validation_0-rmsle:0.03054
[276]	validation_0-rmsle:0.03054
[277]	validation_0-rmsle:0.03054
[278]	validation_0-rmsle:0.03054
[279]	validation_0-rmsle:0.03053
[280]	vali

[499]	validation_0-rmsle:0.03032
[500]	validation_0-rmsle:0.03032
[501]	validation_0-rmsle:0.03032
Stopping. Best iteration:
[481]	validation_0-rmsle:0.03032



XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
             importance_type='gain', interaction_constraints='',
             learning_rate=0.1, max_delta_step=0, max_depth=6,
             min_child_weight=1, missing=nan, monotone_constraints='()',
             n_estimators=10000, n_jobs=0, num_parallel_tree=1, random_state=42,
             reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=42,
             subsample=1, tree_method='exact', validate_parameters=1,
             verbosity=None)

In [96]:
# Get predictions
preds = model.predict(X_test)
preds = np.where(preds<0, 0, preds)

# # Inverse Transform
preds = pd.DataFrame(scaler.inverse_transform(preds.reshape(-1,1)).reshape(-1), columns=['preds'])
y_test = pd.DataFrame(scaler.inverse_transform(y_test).reshape(-1), columns=['Sale'])


# Evaluate Model
print(f'mean_squared_error: {mean_squared_error(y_test, preds)}')
print(f'mean_squared_log_error: {mean_squared_log_error(y_test, preds)}')
print(f'r2_score: {r2_score(y_test, preds)}')

mean_squared_error: 86478726.34589025
mean_squared_log_error: 0.053181023750529514
r2_score: 0.7448555189367879


In [97]:
final_test_df = pd.concat([X_test.reset_index(),
                           y_test,
                           preds
                          ],
                          axis=1)
final_test_df

Unnamed: 0,index,DateOrdinal,Store_Type_S1,Store_Type_S2,Store_Type_S3,Store_Type_S4,Region_Code_R1,Region_Code_R2,Region_Code_R3,Region_Code_R4,...,Location_Type_L2,Location_Type_L3,Location_Type_L4,Location_Type_L5,Holiday_No,Holiday_Yes,Discount_No,Discount_Yes,Sale,preds
0,2393,736701,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,32667.00,36732.925781
1,28079,736771,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,70352.25,89644.554688
2,31424,736781,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,34824.00,41917.582031
3,93670,736951,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,34365.00,39136.562500
4,80507,736915,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,37107.00,33330.449219
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
56497,133834,737061,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,53619.00,63053.246094
56498,178139,737183,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,62964.00,59868.406250
56499,125450,737038,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,41610.00,59393.453125
56500,147857,737100,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,58940.82,36581.750000


In [98]:
test = test[features]

categorical_features = [x for x in test.select_dtypes(include='object').columns.tolist() if x not in ['ID', 'Store_id']]

enc = OneHotEncoder(sparse=False)
# enc.fit(test[categorical_features])
# enc.categories_

test_ohe = enc.fit_transform(test[categorical_features])
one_hot_encoded_frame = pd.DataFrame(test_ohe, columns=enc.get_feature_names(categorical_features))

numerical_features = list(set(features) - set(categorical_features))
numerical_frame = test[numerical_features]

test = pd.concat([numerical_frame.reset_index(), one_hot_encoded_frame], axis=1).set_index('ID')
test

Unnamed: 0_level_0,Store_id,DateOrdinal,Store_Type_S1,Store_Type_S2,Store_Type_S3,Store_Type_S4,Region_Code_R1,Region_Code_R2,Region_Code_R3,Region_Code_R4,Location_Type_L1,Location_Type_L2,Location_Type_L3,Location_Type_L4,Location_Type_L5,Holiday_No,Holiday_Yes,Discount_No,Discount_Yes
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
T1188341,171,737211,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
T1188342,172,737211,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
T1188343,173,737211,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
T1188344,174,737211,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
T1188345,170,737211,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
T1210601,186,737271,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0
T1210602,11,737271,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
T1210603,185,737271,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
T1210604,69,737271,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0


In [99]:
test = test.reindex(X_train.columns, axis=1)

preds = scaler.inverse_transform(model.predict(test).reshape(-1,1)).reshape(-1)

submission_df = pd.concat([test.reset_index()['ID'], pd.Series(preds, name='Sales')], axis=1)
submission_df.to_csv(f"{DATA_DIR}/xgboost_v3_draft_submission.csv", index=False)

### Experiments

In [4]:
from lazypredict.Supervised import (  # pip install lazypredict
    LazyRegressor
)


train = pd.read_csv(f"{DATA_DIR}/TRAIN.csv")
test = pd.read_csv(f"{DATA_DIR}/TEST_FINAL.csv")

print("Cleaning Training DataSet")
train = clean_data(train)
print("Cleaning Testing DataSet")
test = clean_data(test)

train = train.sample(30000)


target = ['Sales']
# features = train.columns.difference(target + ['#Order', 'Date'])
features = [
            'Store_Type', 'Region_Code', 'Location_Type', 'DateOrdinal', 'Holiday', 'Discount'
           ] # , '#Order', 'Date', 'Year', 'Month', 'Day'


train = train.reset_index(drop=True)

X = train[features]
y = train[target]


# y dataset preprocess
# define min max scaler
scaler = MinMaxScaler()
# transform data
y = scaler.fit_transform(y)


# X dataset preprocess
categorical_features = [x for x in X.select_dtypes(include='object').columns.tolist() if x not in ['ID', 'Store_id']]

enc = OneHotEncoder(sparse=False)
# enc.fit(X[categorical_features])
# enc.categories_

X_ohe = enc.fit_transform(X[categorical_features])
one_hot_encoded_frame = pd.DataFrame(X_ohe, columns=enc.get_feature_names(categorical_features))

numerical_features = list(set(features) - set(categorical_features))
numerical_frame = X[numerical_features]

X = pd.concat([numerical_frame, one_hot_encoded_frame], axis=1)#.set_index('ID')

# Random Sampling
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)


# Fit LazyRegressor
reg = LazyRegressor(
    ignore_warnings=True,
    random_state=42,
    verbose=False,
    custom_metric=None
  )
models, predictions = reg.fit(X_train, X_test, y_train, y_test)  # pass all sets


The sklearn.utils.testing module is  deprecated in version 0.22 and will be removed in version 0.24. The corresponding classes / functions should instead be imported from sklearn.utils. Anything that cannot be imported from sklearn.utils is now part of the private API.



Cleaning Training DataSet


  0%|          | 0/42 [00:00<?, ?it/s]

Cleaning Testing DataSet


100%|██████████| 42/42 [08:58<00:00, 12.81s/it]


In [5]:
models # , predictions

Unnamed: 0_level_0,Adjusted R-Squared,R-Squared,RMSE,Time Taken
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
XGBRegressor,0.72,0.72,0.05,0.64
HistGradientBoostingRegressor,0.69,0.69,0.06,1.14
LGBMRegressor,0.69,0.69,0.06,0.25
GradientBoostingRegressor,0.65,0.65,0.06,1.1
RandomForestRegressor,0.64,0.64,0.06,2.95
BaggingRegressor,0.62,0.62,0.06,0.34
KNeighborsRegressor,0.6,0.6,0.07,1.52
NuSVR,0.58,0.58,0.07,32.43
ExtraTreesRegressor,0.56,0.57,0.07,2.26
MLPRegressor,0.56,0.56,0.07,1.04
