In [20]:
%load_ext autoreload

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [21]:
%autoreload 

# Setup right folder path
from os.path import abspath
from sys import path
path.append(abspath('../../'))


from preprocessing.functions import *
from features_engineering.functions import *
    
# basics
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# sklearn part
from sklearn.preprocessing import LabelEncoder, MinMaxScaler, StandardScaler 
from sklearn.metrics import mean_squared_log_error
from sklearn.model_selection import train_test_split, KFold, RandomizedSearchCV
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor, GradientBoostingRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import LinearRegression

# other models
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor

# others
# from scipy.stats import uniform, randint


In [22]:
SEED = 42

# Pipeline

In [23]:
df_train = pd.read_csv("../../data/stores_train.csv")
df_test = pd.read_csv("../../data/stores_test.csv")

# Preprocessing
# df_train = remove_outliers(df_train)
df_train = fix_lat_lon(df_train) 
df_test = fix_lat_lon(df_test)

# Features engineering
X_train, X_test, Y_train, scaler_revenue = features_engineering(df_train, df_test)

In [24]:
X_train.head(3)

Unnamed: 0,grunnkrets_id,lat,lon,lat_processed,lon_processed,SI_p1,SI_p2,SI_p3,SI_all,lat_reduced,...,BS_mean_1km_regionalt_knutepunkt,BS_closest_annen_viktig_holdeplass,BS_mean_1km_annen_viktig_holdeplass,encoded_mall_name,encoded_chain_name,encoded_sales_channel_name,encoded_lv3_desc,encoded_lv2_desc,encoded_lv1_desc,encoded_municipality_name
0,-0.573259,-0.495375,0.055228,-0.495375,0.055228,0.762237,0.266067,-0.117901,-0.119785,-0.552008,...,0.0,-0.149048,0.0,2.07642,1.639203,-0.353536,-0.486383,1.258967,-1.427478,-1.410067
1,-1.093241,-0.437829,0.194707,-0.437829,0.194707,0.839556,0.258357,-0.117901,-0.119782,-0.552008,...,0.0,-0.306458,0.0,-0.38128,1.639203,-0.353536,-0.486383,1.258967,-1.427478,0.322596
2,-0.568111,-0.353193,0.068337,-0.353193,0.068337,0.79177,0.316278,-0.11772,-0.119784,-0.218552,...,0.0,-0.156032,0.0,1.756919,0.014829,-0.353536,-0.486383,1.258967,-1.427478,0.45916


In [25]:
Y_train

Unnamed: 0,revenue
0,1.278708
1,1.394942
2,1.232971
3,1.012669
4,0.742568
...,...
12854,0.036629
12855,0.449633
12856,1.593563
12857,0.666705


# Model

In [26]:
rf_model = RandomForestRegressor(
    n_estimators=180,
    criterion='squared_error',
    max_depth=None,
    min_samples_split=14,
    min_samples_leaf=11,
    min_weight_fraction_leaf=0.0,
    max_features=None,
    max_leaf_nodes=300,
    min_impurity_decrease=0.0,
    bootstrap=True,
    oob_score=False,
    n_jobs=None,
    verbose=0,
    warm_start=False,
    ccp_alpha=0.0,
    max_samples=None,
    random_state=SEED,
)

lgbm_model = LGBMRegressor(
    num_leaves=70,
    max_depth=7, 
    n_estimators=2000,
    min_data_in_leaf = 400,
    learning_rate=0.05,
    random_state=SEED,  
)

xgb_model = XGBRegressor(
    objective='reg:squarederror', 
    n_estimators=300, 
    colsample_bytree=0.8958238323555624, 
    gamma=0.11909139052336326,
    learning_rate=0.05983241782780355,
    subsample=0.8889067727422637,
    max_depth=5,
    random_state=SEED,
)

gb_model = GradientBoostingRegressor(
    random_state=SEED,
    learning_rate=0.005,
    n_estimators=2000,
    subsample=1.0, 
    criterion='squared_error', 
    min_samples_split=4, 
    min_samples_leaf=2, 
    min_weight_fraction_leaf=0.0, 
    max_depth=9, 
    min_impurity_decrease=0.0, 
    init=None, 
    max_features=None,
    alpha=0.9,
    verbose=0,
    max_leaf_nodes=None,
    warm_start=False,
    validation_fraction=0.1,
    n_iter_no_change=None,
    tol=0.0001,
    ccp_alpha=0.0
)

cb_model = CatBoostRegressor(
    n_estimators=2000,
    learning_rate=0.05,
    thread_count=-1,
    max_depth=6,
    silent=True,
    loss_function='RMSE',
    bagging_temperature=0.3,
    od_type="Iter",
    random_state=SEED,
)

***resect the same order for the features and models***

In [9]:
features =   [
    'grunnkrets_id',
    'SI_p1',
    'SI_p2', 
    'SI_p3', 
    'SI_all',
    'latxlat',
    'population_density',
    'ih_all_households',
    'BS_closest_mangler_viktighetsnivå',
    'BS_closest_lokalt_knutepunkt',
    'BS_closest_nasjonalt_knutepunkt',
    'BS_closest_regionalt_knutepunkt',
    'BS_closest_annen_viktig_holdeplass',
    'encoded_lv3_desc',
    'encoded_sales_channel_name',
    'encoded_chain_name',
    'encoded_mall_name',
    'encoded_municipality_name',
]

X_train = keep_only_use_features(X_train, features)
X_test = keep_only_use_features(X_test, features)

In [84]:
ntrain = X_train.shape[0]
ntest = X_test.shape[0]

NFOLDS = 5 # set number of folds for out-of-fold prediction
kf = KFold(
    n_splits=NFOLDS,
    shuffle=True,
    random_state=SEED
) # K-Folds cross-validator

# oof = out of fold
def get_oof(clf, x_train, y_train, x_test):
    """
    Trains a classifier on 4/5 of the training data and
    predicts the rest (1/5). This procedure is repeated for all 5 folds,
    thus we have predictions for all training set. This prediction is one
    column of meta-data, later on used as a feature column by a meta-algorithm.
    We predict the test part and average predictions across all 3 models.
    
    Keyword arguments:
    clf -- classifier
    x_train -- 4/5 of training data
    y_train -- corresponding labels
    x_test -- all test data
    
    """
    oof_train = np.zeros((ntrain,))
    oof_test = np.zeros((ntest,))
    oof_test_skf = np.empty((NFOLDS, ntest))

    for i, (train_index, test_index) in enumerate(kf.split(x_train)):
        x_tr = x_train.iloc[train_index, :]
        y_tr = y_train[train_index]
        x_te = x_train.iloc[test_index, :]

        clf.fit(x_tr, y_tr)

        oof_train[test_index] = clf.predict(x_te)
        oof_test_skf[i, :] = clf.predict(x_test)

    oof_test[:] = oof_test_skf.mean(axis=0)
    return oof_train.reshape(-1, 1), oof_test.reshape(-1, 1)

In [85]:
rf_oof_train, rf_oof_test = get_oof(rf_model, X_train, np.ravel(Y_train), X_test)
lgbm_oof_train, lgbm_oof_test = get_oof(lgbm_model, X_train, np.ravel(Y_train), X_test)
xgb_oof_train, xgb_oof_test = get_oof(xgb_model, X_train, np.ravel(Y_train), X_test)
# gb_oof_train, gb_oof_test = get_oof(gb_model, X_train, np.ravel(Y_train), X_test)
# cb_oof_train, cb_oof_test = get_oof(cb_model, X_train, np.ravel(Y_train), X_test)



In [86]:
x_train = np.concatenate((
    rf_oof_train,
    lgbm_oof_train,
    xgb_oof_train,
#     gb_oof_train,
#     cb_oof_train,
), axis=1)

x_test = np.concatenate((
    rf_oof_test,
    lgbm_oof_test,
    xgb_oof_test,
#     gb_oof_test,
#     cb_oof_test,
), axis=1)

In [87]:
# OOF predictions
meta_df = pd.DataFrame(x_train, columns=[
    'RF',
    'LGBM',
    'XGB',
#     'GB', 
#     'CB'
])
meta_df['label'] = Y_train
meta_df

Unnamed: 0,RF,LGBM,XGB,label
0,2.788373,4.205947,3.487751,4.247776
1,3.250180,4.762509,3.949633,4.633896
2,2.718632,2.664637,2.968262,4.095840
3,2.373959,2.635670,2.278232,3.364012
4,2.056517,3.180060,2.798694,2.466758
...,...,...,...,...
12854,1.816041,1.288297,1.630714,0.121679
12855,3.012450,1.767319,1.735754,1.493647
12856,2.390106,1.954661,2.392383,5.293702
12857,1.960225,2.070201,2.044152,2.214747


In [91]:
META_MODEL = LGBMRegressor(
    max_depth=3, 
    random_state=SEED, 
    silent=True,
    metric='mse',
    n_jobs=-1, 
    n_estimators=1050,
    subsample=0.9,
    learning_rate=0.005

)

META_MODEL.fit(x_train, Y_train)
Y_Pred = META_MODEL.predict(x_test)

  y = column_or_1d(y, warn=True)


In [93]:
META_MODEL = LinearRegression()

META_MODEL.fit(x_train, Y_train)
Y_Pred = META_MODEL.predict(x_test)

In [215]:
META_MODEL = CatBoostRegressor(
    depth=3, 
    random_state=SEED, 
    silent=True,
#     eval_metric='RMLSE',
    iterations=600,
    l2_leaf_reg=1,
    min_child_samples=2,
    learning_rate=0.025
)

META_MODEL.fit(x_train, Y_train)
Y_Pred = META_MODEL.predict(x_test)

In [241]:
# Running model indivudally to make mean
rf_model.fit(X_train, Y_train)
lgbm_model.fit(X_train, Y_train)
xgb_model.fit(X_train, Y_train)
gb_model.fit(X_train, Y_train)
cb_model.fit(X_train, Y_train)

rf_prediction = rf_model.predict(X_test)
lgbm_prediction = lgbm_model.predict(X_test)
xgb_prediction = xgb_model.predict(X_test)
gb_prediction = gb_model.predict(X_test)
cb_prediction = cb_model.predict(X_test)

avg_prediction = np.mean([
    rf_prediction,
    lgbm_prediction,
    xgb_prediction,
    gb_prediction,
    cb_prediction,
], axis=0)

  rf_model.fit(X_train, Y_train)


In [243]:
# Mean of mean
Y_Pred = np.mean([np.ravel(Y_Pred), avg_prediction], axis=0)

In [27]:
lgbm_model.fit(X_train, np.ravel(Y_train))
Y_Pred = lgbm_model.predict(X_test)



# Submission

In [28]:
submission = pd.DataFrame()
submission['id'] = df_test.store_id 
submission['predicted'] = np.asarray(10 ** Y_Pred - 1)

submission.to_csv('submission.csv', index=False)
submission.head()

Unnamed: 0,id,predicted
0,914206820-914239427-717245,3.354104
1,916789157-916823770-824309,4.831055
2,913341082-977479363-2948,6.166362
3,889682582-889697172-28720,9.654043
4,997991699-998006945-417222,4.250768


In [91]:
from sklearn.model_selection import KFold #for K-fold cross validation
from sklearn.model_selection import cross_val_score #score evaluation
from sklearn.model_selection import cross_val_predict #prediction

def prediction_with_kfold(model,X_train,Y_train):
    for model in models:
        kfold = KFold(n_splits=5, shuffle=True, random_state=SEED) # k=10, split the data into 10 equal parts
        xyz=[]
        accuracy=[]
        std=[]
        cv_result = cross_val_score(model,X_train, np.ravel(Y_train), cv = kfold)
        cv_result=cv_result
        xyz.append(cv_result.mean())
        std.append(cv_result.std())
        accuracy.append(cv_result)                                  
        new_models_dataframe2=pd.DataFrame({'CV Mean':xyz,'Std':std}) 
        print(new_models_dataframe2)

In [92]:
prediction_with_kfold(models, X_train, Y_train)

    CV Mean       Std
0  0.452727  0.012716
    CV Mean       Std
0  0.446343  0.014254
    CV Mean      Std
0  0.477386  0.01168
