In [3]:
import pandas as pd
import numpy as np
from sklearn import tree
from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, StandardScaler, MinMaxScaler

from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor, GradientBoostingRegressor
from xgboost import XGBRegressor
import lightgbm as lgb
from catboost import CatBoostRegressor

import matplotlib.pylab as plt

In [4]:
df_train = pd.read_csv("data/stores_train.csv")
df_test = pd.read_csv("data/stores_test.csv")
df_bus_stops = pd.read_csv("data/busstops_norway_fixed.csv")
df_grunnkrets_age_dist = pd.read_csv("data/grunnkrets_age_distribution.csv")
df_grunnkrets_house_pers = pd.read_csv("data/grunnkrets_households_num_persons.csv")
df_grunnkrets_income_house = pd.read_csv("data/grunnkrets_income_households.csv")
df_grunnkrets_stripped = pd.read_csv("data/grunnkrets_norway_stripped.csv")
df_plaace_hierarchy = pd.read_csv("data/plaace_hierarchy.csv")
df_extra = pd.read_csv("data/stores_extra.csv")

In [5]:
df_grunnkrets_stripped = df_grunnkrets_stripped[df_grunnkrets_stripped.year == 2016]
df_grunnkrets_age_dist = df_grunnkrets_age_dist[df_grunnkrets_age_dist.year == 2016]
df_grunnkrets_house_pers = df_grunnkrets_house_pers[df_grunnkrets_house_pers.year == 2016]
df_grunnkrets_income_house = df_grunnkrets_income_house[df_grunnkrets_income_house.year == 2016]

In [6]:
def rmsle(y_true, y_pred):
    """
    Computes the Root Mean Squared Logarithmic Error 
    
    Args:
        y_true (np.array): n-dimensional vector of ground-truth values 
        y_pred (np.array): n-dimensional vecotr of predicted values 
    
    Returns:
        A scalar float with the rmsle value 
    
    Note: You can alternatively use sklearn and just do: 
        `sklearn.metrics.mean_squared_log_error(y_true, y_pred) ** 0.5`
    """
    assert (y_true >= 0).all(), 'Received negative y_true values'
    assert (y_pred >= 0).all(), 'Received negative y_pred values'
    assert y_true.shape == y_pred.shape, 'y_true and y_pred have different shapes'
    y_true_log1p = np.log1p(y_true)  # log(1 + y_true)
    y_pred_log1p = np.log1p(y_pred)  # log(1 + y_pred)
    return np.sqrt(np.mean(np.square(y_pred_log1p - y_true_log1p)))

In [7]:
def label_uniformier(array_train, array_test):
    """
    Take the unique values from the train and test part to combine it in a single array.
    Useful to fit the label encoder and don't do a mess during the transform (previously fit_transform that was confusing)
    """
    label_encoder = LabelEncoder()
    labels = np.asarray(list(array_train.unique()) + list(set(array_test.unique()) - set(array_train.unique())))
    label_encoder.fit(labels)
    return label_encoder

In [8]:
#train
X_train = df_train.loc[:, ['grunnkrets_id', 'lat', 'lon', 'sales_channel_name', 'chain_name', 'plaace_hierarchy_id']]
X_test = df_test.loc[:, ['grunnkrets_id', 'lat', 'lon', 'sales_channel_name', 'chain_name', 'plaace_hierarchy_id']]

X_train['lat'] = X_train.lat * 11.112
X_train['lon'] = X_train.lon * 6.4757
X_test['lat'] = X_test.lat * 11.112
X_test['lon'] = X_test.lon * 6.4757


X_train = pd.merge(X_train, df_grunnkrets_stripped[['grunnkrets_id', 'area_km2']], how="left", on="grunnkrets_id")
X_test = pd.merge(X_test, df_grunnkrets_stripped[['grunnkrets_id', 'area_km2']], how="left", on="grunnkrets_id")

X_train = pd.merge(X_train, df_plaace_hierarchy[['plaace_hierarchy_id', 'lv1_desc', 'lv2_desc', 'lv3_desc', 'lv4_desc']], how='left', on='plaace_hierarchy_id')
X_test = pd.merge(X_test, df_plaace_hierarchy[['plaace_hierarchy_id', 'lv1_desc', 'lv2_desc', 'lv3_desc', 'lv4_desc']], how='left', on='plaace_hierarchy_id')

""" TEST """
# X_train = pd.merge(X_train, create_population_repartition(), how='left', on='grunnkrets_id')
# X_test = pd.merge(X_test, create_population_repartition(), how='left', on='grunnkrets_id')

# X_train = pd.merge(X_train, create_population_age(), how='left', on='grunnkrets_id')
# X_test = pd.merge(X_test, create_population_age(), how='left', on='grunnkrets_id')

# X_train = pd.merge(X_train, create_income_repartition(), how='left', on='grunnkrets_id')
# X_test = pd.merge(X_test, create_income_repartition(), how='left', on='grunnkrets_id')
""" TEST """

le = label_uniformier(X_train['sales_channel_name'], X_test['sales_channel_name'])
X_train['encoded_channel_name'] = le.transform(X_train['sales_channel_name'])
X_test['encoded_channel_name'] = le.transform(X_test['sales_channel_name'])

X_train['chain_name'] = X_train['chain_name'].fillna('0')
X_test['chain_name'] = X_test['chain_name'].fillna('0')

le = label_uniformier(X_train['chain_name'], X_test['chain_name'])
X_train['encoded_chain'] = le.transform(X_train['chain_name'])
X_test['encoded_chain'] = le.transform(X_test['chain_name'])

le = label_uniformier(df_plaace_hierarchy['lv3_desc'], df_plaace_hierarchy['lv3_desc'])
X_train['encoded_lv3'] = le.transform(X_train['lv3_desc'])
X_test['encoded_lv3'] = le.transform(X_test['lv3_desc'])

le = label_uniformier(df_plaace_hierarchy['lv4_desc'], df_plaace_hierarchy['lv4_desc'])
X_train['encoded_lv4'] = le.transform(X_train['lv4_desc'])
X_test['encoded_lv4'] = le.transform(X_test['lv4_desc'])

X_train = X_train.drop(['sales_channel_name', 'chain_name', 'plaace_hierarchy_id', 'lv1_desc', 'lv2_desc', 'lv3_desc', 'lv4_desc'], axis=1)
X_test = X_test.drop(['sales_channel_name', 'chain_name', 'plaace_hierarchy_id', 'lv1_desc', 'lv2_desc', 'lv3_desc', 'lv4_desc'], axis=1)

X_train = X_train.apply(lambda x: x.fillna(x.mean()),axis=0)
X_test = X_test.apply(lambda x: x.fillna(x.mean()),axis=0)

scaler = StandardScaler()
X_train[X_train.columns] = scaler.fit_transform(X_train)
X_test[X_test.columns] = scaler.transform(X_test)

Y_train = df_train.loc[:, ['revenue', ]]
Y_train = scaler.fit_transform(Y_train)

In [9]:
X_train, X_train_test, Y_train, Y_train_test = train_test_split(X_train, Y_train)

In [15]:
ntrain = X_train.shape[0]
# ntest = X_train_test.shape[0]
ntest = X_test.shape[0]


SEED = 42 # for reproducibility
NFOLDS = 5 # set number of folds for out-of-fold prediction
kf = KFold(
    n_splits=NFOLDS,
    shuffle=True,
    random_state=SEED
) # K-Folds cross-validator

# oof = out of fold
def get_oof(clf, x_train, y_train, x_test):
    """
    Popular function on Kaggle.
    
    Trains a classifier on 4/5 of the training data and
    predicts the rest (1/5). This procedure is repeated for all 5 folds,
    thus we have predictions for all training set. This prediction is one
    column of meta-data, later on used as a feature column by a meta-algorithm.
    We predict the test part and average predictions across all 5 models.
    
    Keyword arguments:
    clf -- classifier
    x_train -- 4/5 of training data
    y_train -- corresponding labels
    x_test -- all test data
    
    """
    oof_train = np.zeros((ntrain,))
    oof_test = np.zeros((ntest,))
    oof_test_skf = np.empty((NFOLDS, ntest))

    for i, (train_index, test_index) in enumerate(kf.split(x_train)):
        x_tr = x_train.iloc[train_index, :]
        y_tr = y_train[train_index]
        x_te = x_train.iloc[test_index, :]

        clf.fit(x_tr, y_tr)

        oof_train[test_index] = clf.predict(x_te)
        oof_test_skf[i, :] = clf.predict(x_test)

    oof_test[:] = oof_test_skf.mean(axis=0)
    return oof_train.reshape(-1, 1), oof_test.reshape(-1, 1)

In [16]:
model1 = RandomForestRegressor(
    n_estimators=150,
    criterion='absolute_error',
    max_depth=None,
    min_samples_split=10,
    min_samples_leaf=2,
    min_weight_fraction_leaf=0.0,
    max_features=None,
    max_leaf_nodes=None,
    min_impurity_decrease=0.0,
    bootstrap=True,
    oob_score=False,
    n_jobs=None,
    random_state=42,
    verbose=0,
    warm_start=False,
    ccp_alpha=0.0,
    max_samples=None
)

model2 = DecisionTreeRegressor(
    criterion = 'absolute_error',
    splitter = 'random',
    max_depth = None,
    min_samples_split=40,
    min_samples_leaf=2,
    min_weight_fraction_leaf=0.0,
    min_impurity_decrease=0.0,
    max_features=None,
    max_leaf_nodes=None,
    random_state=42
)

model3 = XGBRegressor(
    objective='reg:squaredlogerror', 
    n_estimators=300, 
    random_state=42,
    colsample_bytree=0.8958238323555624, 
    gamma=0.11909139052336326,
    learning_rate=0.05983241782780355,
    subsample=0.8889067727422637,
    max_depth=5,
)

In [17]:
# rfr_oof_train, rfr_oof_test = get_oof(model1, X_train, np.ravel(Y_train), X_train_test)
# dtr_oof_train, dtr_oof_test = get_oof(model2, X_train, np.ravel(Y_train), X_train_test)
# xgb_oof_train, xgb_oof_test = get_oof(model3, X_train, np.ravel(Y_train), X_train_test)

rfr_oof_train, rfr_oof_test = get_oof(model1, X_train, np.ravel(Y_train), X_test)
dtr_oof_train, dtr_oof_test = get_oof(model2, X_train, np.ravel(Y_train), X_test)
xgb_oof_train, xgb_oof_test = get_oof(model3, X_train, np.ravel(Y_train), X_test)

In [18]:
x_train = np.concatenate((
    rfr_oof_train,
    dtr_oof_train,
    xgb_oof_train,
), axis=1)

x_test = np.concatenate((
    rfr_oof_test,
    dtr_oof_test,
    xgb_oof_test,
), axis=1)

In [19]:
# OOF predictions
meta_df = pd.DataFrame(x_train, columns=['RFR', 'DTR', 'XGB'])
meta_df['label'] = Y_train
meta_df

Unnamed: 0,RFR,DTR,XGB,label
0,0.035907,0.024278,0.339458,0.635860
1,0.164726,-0.231794,0.403550,1.015009
2,-0.156508,-0.391485,0.017641,0.512360
3,-0.124710,-0.098311,-0.011699,0.069932
4,-0.058426,-0.068623,0.034573,-0.240151
...,...,...,...,...
12854,-0.290921,-0.348823,-0.335171,-0.528902
12855,0.323017,-0.234102,0.069039,-0.416523
12856,-0.149905,-0.160939,-0.210401,1.951306
12857,-0.306418,-0.355521,-0.415134,-0.297771


In [32]:
META_MODEL = lgb.LGBMRegressor(
    max_depth=3, 
    random_state=SEED, 
    silent=True,
    metric='mse',
    n_jobs=-1, 
    n_estimators=1050,
    subsample=0.9,
    learning_rate=0.005

)

META_MODEL.fit(x_train, Y_train)
final_predictions = META_MODEL.predict(x_test)

# meta_mse = rmsle(lgb_final_predictions, np.ravel(Y_train_test))
# print(f'est MSE Stacked by LGBM: {rmsle(scaler.inverse_transform(Y_train_test),scaler.inverse_transform([[elmt] for elmt in lgb_final_predictions])):.4f}')

  y = column_or_1d(y, warn=True)


In [29]:
from sklearn.linear_model import LinearRegression

META_MODEL = LinearRegression()

META_MODEL.fit(x_train, Y_train)
final_predictions = META_MODEL.predict(x_test)


# meta_mse = rmsle(scaler.inverse_transform(Y_train_test),scaler.inverse_transform(final_predictions))
# meta_mse = rmsle(final_predictions, Y_train_test)
# print('Test MSE Stacked by Linear Regression:', meta_mse)

In [27]:
META_MODEL = CatBoostRegressor(
    depth=3, 
    random_state=SEED, 
    silent=True,
#     eval_metric='RMLSE',
    iterations=600,
    l2_leaf_reg=1,
    min_child_samples=2,
    learning_rate=0.025
)

META_MODEL.fit(x_train, Y_train)
final_predictions = META_MODEL.predict(x_test)

# meta_mse = rmsle(scaler.inverse_transform(Y_train_test),scaler.inverse_transform([[elmt] for elmt in cat_final_predictions]))
# print('Test MSE Stacked by Catboost:', meta_mse)

In [34]:
# Generate submission dataframe 
# NOTE: It is important that the ID and predicted values match
submission = pd.DataFrame()
submission['id'] = df_test.store_id 
submission['predicted'] = np.asarray(scaler.inverse_transform([[elmt] for elmt in final_predictions]))
# submission['predicted'] = np.asarray(scaler.inverse_transform(final_predictions))


# Save it to disk (`index=False` means don't save the index in the csv)
submission.to_csv('submission.csv', index=False)
submission

Unnamed: 0,id,predicted
0,914206820-914239427-717245,4.676835
1,916789157-916823770-824309,9.904663
2,913341082-977479363-2948,7.611649
3,889682582-889697172-28720,8.517198
4,997991699-998006945-417222,10.145025
...,...,...
8572,917323003-917383529-844309,7.823506
8573,917353379-917411824-845904,4.839260
8574,917072302-917089248-833647,10.437889
8575,916960557-916993161-829908,3.151334
