In [1]:
import datetime
import gc
import lightgbm as lgb
import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd
import seaborn as sns
import time
import warnings
import scipy
from contextlib import contextmanager
from pandas.core.common import SettingWithCopyWarning
from sklearn.metrics import mean_squared_error, roc_auc_score, accuracy_score, confusion_matrix, recall_score, precision_score, f1_score
from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.model_selection import KFold
from math import sqrt
from sklearn.metrics import mean_squared_error
from math import sqrt

warnings.simplefilter(action='ignore', category=SettingWithCopyWarning)
warnings.simplefilter(action='ignore', category=FutureWarning)

pd.set_option('display.max_columns', 500)
pd.set_option('display.max_rows', 500)

FEATS_EXCLUDED = ["Gun","MagazaID","MerchAltGrupID","UrunKlasmanID","SatisAdet","MerchGrup","MerchGrupID","MerchMarkaYasGrupID","BuyerGrupID","KlasmanGrupID","VucutBolge","KlasmanIklimStatu"]

In [2]:
@contextmanager
def timer(title):
    t0 = time.time()
    yield
    print("{} - done in {:.0f}s".format(title, time.time() - t0))
    
# Display/plot feature importance
def display_importances(feature_importance_df_):
    cols = feature_importance_df_[["feature", "importance"]].groupby("feature").mean().sort_values(by="importance", ascending=False)[:40].index
    best_features = feature_importance_df_.loc[feature_importance_df_.feature.isin(cols)]

    plt.figure(figsize=(8, 10))
    sns.barplot(x="importance", y="feature", data=best_features.sort_values(by="importance", ascending=False))
    plt.title('LightGBM Features (avg over folds)')
    plt.tight_layout()
    plt.savefig('lgbm_importances.png')

def thr_to_accuracy(thr, Y_test, predictions):
    return -accuracy_score(Y_test, np.array(predictions>thr, dtype=np.int))

#from sklearn.utils import check_arrays
from sklearn.metrics import mean_absolute_error,mean_squared_error

def mape(y_true, y_pred): 
    y_true, y_pred = np.array(y_true), np.array(y_pred)
    return np.mean(np.abs((y_true - y_pred) / (y_true+0.00000000001))) * 100

# LightGBM GBDT with KFold or Stratified KFold
def kfold_lightgbm(params, train_df, test_df, num_folds,FEATS_EXCLUDED):
    print("Starting LightGBM. Train shape: {}".format(train_df.shape))
    folds = KFold(n_splits= num_folds, shuffle=True, random_state=326)

    # Create arrays and dataframes to store results
    oof_preds = np.zeros(train_df.shape[0])
    sub_preds = np.zeros(test_df.shape[0])
    feature_importance_df = pd.DataFrame()
    final_rmse = 0
    final_mape = 0
    final_mae = 0
    feats = [f for f in train_df.columns if f not in FEATS_EXCLUDED]
    clfs = []
    # k-fold
    for n_fold, (train_idx, valid_idx) in enumerate(folds.split(train_df[feats], train_df['SatisAdet'])):
        train_x, train_y = train_df[feats].iloc[train_idx], train_df['SatisAdet'].iloc[train_idx]
        valid_x, valid_y = train_df[feats].iloc[valid_idx], train_df['SatisAdet'].iloc[valid_idx]

        # set data structure
        lgb_train = lgb.Dataset(train_x,label=train_y,free_raw_data=False)
        lgb_test = lgb.Dataset(valid_x,label=valid_y,free_raw_data=False)

        reg = lgb.train(params,lgb_train,valid_sets=[lgb_train, lgb_test],valid_names=['train', 'valid'],
                        num_boost_round=100000,early_stopping_rounds= 100,verbose_eval=200)

        oof_preds[valid_idx] = reg.predict(valid_x, num_iteration=reg.best_iteration)
        sub_preds += reg.predict(test_df[feats], num_iteration=reg.best_iteration) / folds.n_splits

        fold_importance_df = pd.DataFrame()
        fold_importance_df["feature"] = feats
        fold_importance_df["importance"] = np.log1p(reg.feature_importance(importance_type='gain', iteration=reg.best_iteration))
        fold_importance_df["fold"] = n_fold + 1
        feature_importance_df = pd.concat([feature_importance_df, fold_importance_df], axis=0)
        curr_rmse = sqrt(mean_squared_error(valid_y, oof_preds[valid_idx]))
        curr_mape = mape(valid_y, oof_preds[valid_idx]) 
        curr_mae = mean_absolute_error(valid_y, oof_preds[valid_idx])
        
        #curr_rmse = roc_auc_score(valid_y, oof_preds[valid_idx])
        final_rmse += curr_rmse/num_folds
        final_mape += curr_mape/num_folds
        final_mae += curr_mae/num_folds
        print('Fold %2d rmse : %.6f' % (n_fold + 1, curr_rmse))
        print('Fold %2d mape : %.6f' % (n_fold + 1, curr_mape))
        print('Fold %2d mae : %.6f' % (n_fold + 1, curr_mae))

        del reg, train_x, train_y, valid_x, valid_y
        gc.collect()
              
    print('Averall RMSE : %.6f' % (final_rmse))
    print('Averall MAPE : %.6f' % (final_mape))
    print('Averall MAE : %.6f' % (final_mae))

    # save submission file
    #test_df.loc[:,'SatisAdet'] = sub_preds
    #test_df = test_df.reset_index()
    #test_df = test_df[['card_id', 'SatisAdet']]
    #submission_file_name = "lgbm_"+str(np.round(final_rmse,4))+".csv"
    #test_df.to_csv(DATAPATH+submission_file_name, index=False)    
        
    return sub_preds, oof_preds, feature_importance_df, final_rmse    

def add_noise(series, noise_level):
    return series * (1 + noise_level * np.random.randn(len(series)))

def target_encode(trn_series=None, 
                  #tst_series=None, 
                  target=None, 
                  min_samples_leaf=1, 
                  smoothing=1,
                  noise_level=0):
    assert len(trn_series) == len(target)
    #assert trn_series.name == tst_series.name
    temp = pd.concat([trn_series, target], axis=1)
    # Compute target mean 
    averages = temp.groupby(by=trn_series.name)[target.name].agg(["mean", "count"])
    # Compute smoothing
    smoothing = 1 / (1 + np.exp(-(averages["count"] - min_samples_leaf) / smoothing))
    # Apply average function to all target data
    prior = target.mean()
    # The bigger the count the less full_avg is taken into account
    averages[target.name] = prior * (1 - smoothing) + averages["mean"] * smoothing
    averages.drop(["mean", "count"], axis=1, inplace=True)
    # Apply averages to trn and tst series
    ft_trn_series = pd.merge(
        trn_series.to_frame(trn_series.name),
        averages.reset_index().rename(columns={'index': target.name, target.name: 'average'}),
        on=trn_series.name,
        how='left')['average'].rename(trn_series.name + '_mean').fillna(prior)
    # pd.merge does not keep the index so restore it
    ft_trn_series.index = trn_series.index 
    """
    ft_tst_series = pd.merge(
        tst_series.to_frame(tst_series.name),
        averages.reset_index().rename(columns={'index': target.name, target.name: 'average'}),
        on=tst_series.name,
        how='left')['average'].rename(trn_series.name + '_mean').fillna(prior)
    ft_tst_series.index = tst_series.index
    """
    return add_noise(ft_trn_series, noise_level) #, add_noise(ft_tst_series, noise_level)

def missing_fun(data):
    missing_value_df = data.dtypes.to_frame("type").reset_index()
    percent_missing = data.isnull().sum() * 100 / len(data)
    nunique = data.nunique(dropna=False).values
    missing_value_df["percent_missing"] = percent_missing.values
    missing_value_df["nunique"] = nunique
    missing_value_df = missing_value_df.loc[missing_value_df.percent_missing !=0]
    missing_value_df = missing_value_df.sort_values(by="percent_missing", ascending=False)
    return missing_value_df

In [None]:
train = pd.read_csv("data/SatisiKesfet_TrainData.csv")

In [None]:
train["Markup"] = 1-train["Markup"]
train["SezonGrup"] = train["SezonGrup"].apply(lambda x: x.strip())
train["SezonGrup"] = train["SezonGrup"].map({'Y': 1, 'K': 0})

In [None]:
klasman = pd.read_csv("data/Dim_Klasman.csv")
magaza = pd.read_csv("data/Dim_Magaza.csv")
ozelguntanimlari = pd.read_csv("data/Dim_OzelGunTanimlari.csv")
tarih = pd.read_csv("data/Dim_Tarih.csv")
meteoroloji = pd.read_csv("data/MeteorolojiDegerleri.csv")

In [None]:
train = train.merge(tarih[["Gun","Yil","Ay","YilHafta"]],"left",["Gun"])

In [None]:
train = train.merge(magaza[["MagazaID","SehirID","OutletMi"]],"left",["MagazaID"])

In [None]:
train = train.merge(meteoroloji,"left",["Gun","SehirID"])

In [None]:
train = train.drop("SehirID", axis=1)

In [None]:
for col in ["MinimumSicaklik","OrtalamaSicaklik","MaksimumSicaklik","YagisMiktari","KarKalinligi","Yagmur","Kar"]:
    train[col] = train.groupby("YilHafta")[col].transform(lambda x: x.fillna(x.mean()))

In [None]:
FEATS_EXCLUDED = ["Gun","MagazaID","MerchAltGrupID","UrunKlasmanID","SatisAdet","MerchGrup","MerchGrupID","MerchMarkaYasGrupID","BuyerGrupID","KlasmanGrupID","VucutBolge","KlasmanIklimStatu"]

In [None]:
train["GunSonuDepoStok"] = train["GunSonuToplamStok"] - train["GunSonuReyonStok"]

In [None]:
cols = ["OrtBirimFiyat","OrtBirimMaliyet","IndirimOrani","KarMarji"]
mask = (train.OrtBirimFiyat == 0)&(train.OrtBirimMaliyet == 0)&(train.IndirimOrani == 0)&(train.KarMarji == 0)
train.loc[mask, cols] = np.nan
train[cols] = train.groupby(['MerchAltGrupID', 'UrunKlasmanID'])[cols].transform(lambda x: x.fillna(x.mean()))
train[cols] = train[cols].fillna(0)

### Sezon Bazında

In [None]:
season_sum_cols = ['GunSonuReyonStok', 'GunSonuToplamStok','ModelSayisi','GunSonuDepoStok',"SatisAdet"]
season_mean_cols = ["OrtBirimFiyat","OrtBirimMaliyet","IndirimOrani","KarMarji","IndirimOrani","IlkFiyattanSatisOrani",'Markup']

In [None]:
for col in train.columns:
    if col not in FEATS_EXCLUDED+["SezonGrup"]:
        if col in season_mean_cols:
            train[col] = train.groupby(["MagazaID","MerchAltGrupID","UrunKlasmanID","Gun"])[col].transform("mean")
        if col in season_sum_cols:
            train[col] = train.groupby(["MagazaID","MerchAltGrupID","UrunKlasmanID","Gun"])[col].transform("sum")

In [None]:
train = train[train.duplicated(["Gun","UrunKlasmanID","MerchAltGrupID","MagazaID"])==False]
train = train.drop(["Gun","SezonGrup","Hafta"],axis=1)

In [None]:
season_sum_cols = ["SatisAdet"]
week_mean_cols = ["OrtBirimFiyat","OrtBirimMaliyet","IndirimOrani","KarMarji","IndirimOrani","IlkFiyattanSatisOrani",
                  'Markup','GunSonuReyonStok', 'GunSonuToplamStok','ModelSayisi','GunSonuDepoStok',
                 'MinimumSicaklik', 'OrtalamaSicaklik', 'MaksimumSicaklik','YagisMiktari', 'KarKalinligi', 'Yagmur', 'Kar']

In [None]:
for col in train.columns:
    if col not in FEATS_EXCLUDED+["SezonGrup"]:
        if col in season_mean_cols:
            train[col] = train.groupby(["MagazaID","MerchAltGrupID","UrunKlasmanID","YilHafta"])[col].transform("mean")
        if col in season_sum_cols:
            train[col] = train.groupby(["MagazaID","MerchAltGrupID","UrunKlasmanID","YilHafta"])[col].transform("sum")

In [None]:
train.shape

In [None]:
train = train.drop_duplicates(["UrunKlasmanID","MerchAltGrupID","MagazaID","YilHafta"])

In [None]:
train.to_csv("SatisiKesfet_TrainData_Week.csv", index = False)

### Feature Engineering

In [3]:
train = pd.read_csv("SatisiKesfet_TrainData_Week.csv")

In [4]:
klasman = pd.read_csv("data/Dim_Klasman.csv")
train = train.merge(klasman,"left",["MerchAltGrupID","UrunKlasmanID"])

#### Aggregations

In [5]:
FEATS_EXCLUDED = ["Gun","MagazaID","MerchAltGrupID","UrunKlasmanID","SatisAdet","MerchGrup","MerchGrupID","MerchMarkaYasGrupID",
                  "BuyerGrupID","KlasmanGrupID","VucutBolge","KlasmanIklimStatu", "SezonGrup","YilHafta"]

In [6]:
# MerchGrup Based Aggregations
aggs = {}
for col in train.columns:
    if col not in FEATS_EXCLUDED:
        aggs[col] = ['sum','mean']
        #aggs[col] = ['sum','max','min','mean','median']

aggs["MerchMarkaYasGrupID"] = ["nunique"]        
aggs["BuyerGrupID"] = ["nunique"]
aggs["KlasmanGrupID"] = ["nunique"]
aggs["VucutBolge"] = ["nunique"]
aggs["KlasmanIklimStatu"] = ["nunique"]

aggs["MerchAltGrupID"] = ["nunique"]
aggs["UrunKlasmanID"] = ["nunique"]
        
agg1 = train.reset_index().groupby(["MerchGrup","YilHafta"]).agg(aggs)
agg1.columns = pd.Index([e[0] + "_" + e[1] for e in agg1.columns.tolist()])
agg1.columns = ['MerchGrup_Hafta_'+ c for c in agg1.columns]
agg1 = agg1.reset_index()

In [7]:
# MerchGrup and MerchMarkaYasGrupID Based Aggregations
aggs = {}
for col in train.columns:
    if col not in FEATS_EXCLUDED:
        aggs[col] = ['sum','mean']

aggs["BuyerGrupID"] = ["nunique"]
aggs["KlasmanGrupID"] = ["nunique"]
aggs["VucutBolge"] = ["nunique"]
aggs["KlasmanIklimStatu"] = ["nunique"]

aggs["MerchAltGrupID"] = ["nunique"]
aggs["UrunKlasmanID"] = ["nunique"]
        
agg2 = train.reset_index().groupby(["MerchGrup","MerchMarkaYasGrupID","YilHafta"]).agg(aggs)
agg2.columns = pd.Index([e[0] + "_" + e[1] for e in agg2.columns.tolist()])
agg2.columns = ['MerchMarkaYasGrupID_Hafta_'+ c for c in agg2.columns]
agg2 = agg2.reset_index()

In [8]:
# MerchGrup, MerchMarkaYasGrupID, MerchAltGrupID Based Aggregations
aggs = {}
for col in train.columns:
    if col not in FEATS_EXCLUDED:
        aggs[col] = ['sum','mean']

aggs["BuyerGrupID"] = ["nunique"]
aggs["KlasmanGrupID"] = ["nunique"]
aggs["VucutBolge"] = ["nunique"]
aggs["KlasmanIklimStatu"] = ["nunique"]

aggs["UrunKlasmanID"] = ["nunique"]
        
agg3 = train.reset_index().groupby(["MerchGrup","MerchMarkaYasGrupID", "MerchAltGrupID","YilHafta"]).agg(aggs)
agg3.columns = pd.Index([e[0] + "_" + e[1] for e in agg3.columns.tolist()])
agg3.columns = ['MerchAltGrupID_Hafta_'+ c for c in agg3.columns]
agg3 = agg3.reset_index()

In [9]:
# MerchGrup, MerchMarkaYasGrupID, MerchAltGrupID, BuyerGrupID Based Aggregations
aggs = {}
for col in train.columns:
    if col not in FEATS_EXCLUDED:
        aggs[col] = ['sum','mean']

aggs["KlasmanGrupID"] = ["nunique"]
aggs["VucutBolge"] = ["nunique"]
aggs["KlasmanIklimStatu"] = ["nunique"]

aggs["UrunKlasmanID"] = ["nunique"]
        
agg4 = train.reset_index().groupby(["MerchGrup","MerchMarkaYasGrupID", "MerchAltGrupID","BuyerGrupID","YilHafta"]).agg(aggs)
agg4.columns = pd.Index([e[0] + "_" + e[1] for e in agg4.columns.tolist()])
agg4.columns = ['BuyerGrupID_Hafta_'+ c for c in agg4.columns]
agg4 = agg4.reset_index()

In [10]:
# MerchGrup, MerchMarkaYasGrupID, MerchAltGrupID, BuyerGrupID, KlasmanGrupID Based Aggregations
aggs = {}
for col in train.columns:
    if col not in FEATS_EXCLUDED:
        aggs[col] = ['sum','mean']

aggs["VucutBolge"] = ["nunique"]
aggs["KlasmanIklimStatu"] = ["nunique"]

aggs["UrunKlasmanID"] = ["nunique"]
        
agg5 = train.reset_index().groupby(["MerchGrup","MerchMarkaYasGrupID", "MerchAltGrupID","BuyerGrupID","KlasmanGrupID","YilHafta"]).agg(aggs)
agg5.columns = pd.Index([e[0] + "_" + e[1] for e in agg5.columns.tolist()])
agg5.columns = ['KlasmanGrupID_Hafta_'+ c for c in agg5.columns]
agg5 = agg5.reset_index()

In [11]:
train = train.merge(agg1,"left",["MerchGrup","YilHafta"]).\
    merge(agg2,"left",["MerchGrup","MerchMarkaYasGrupID","YilHafta"]).\
    merge(agg3,"left",["MerchGrup","MerchMarkaYasGrupID","MerchAltGrupID","YilHafta"]).\
    merge(agg4,"left",["MerchGrup","MerchMarkaYasGrupID","MerchAltGrupID","BuyerGrupID","YilHafta"]).\
    merge(agg5,"left",["MerchGrup","MerchMarkaYasGrupID","MerchAltGrupID","BuyerGrupID","KlasmanGrupID","YilHafta"])

In [12]:
#train.to_csv("SatisiKesfet_TrainData_Week_Final.csv", index=False)

In [13]:
train.shape

(10817221, 257)

### Model

In [14]:
# Dropping columns with highly frequent values
cols_to_drop = []
drop_values = []
for col in train.columns:
    temp = train[col].value_counts(dropna=False, normalize=1)
    if len(temp[temp>0.99].index)>0:
        cols_to_drop.append(col)
        drop_values.append(temp)  
#train.drop(cols_to_drop, axis = 1, inplace = True)
print("These columns were dropped: {} \n Since they have some values covering 99.9% of its all values".format(cols_to_drop))

These columns were dropped: ['MerchGrup_Hafta_KlasmanIklimStatu_nunique', 'MerchGrup_Hafta_VucutBolge_nunique', 'MerchMarkaYasGrupID_Hafta_KlasmanIklimStatu_nunique', 'KlasmanGrupID_Hafta_KlasmanIklimStatu_nunique'] 
 Since they have some values covering 99.9% of its all values


In [15]:
train.drop(cols_to_drop, axis = 1, inplace = True)

In [16]:
FEATS_EXCLUDED = ["Gun","MagazaID","MerchAltGrupID","UrunKlasmanID","SatisAdet","MerchGrup","MerchGrupID","MerchMarkaYasGrupID",
                  "BuyerGrupID","KlasmanGrupID","VucutBolge","KlasmanIklimStatu","YilHafta"]

### LGBM

In [28]:
# params optimized by optuna
params = {'task':'train','objective': 'regression','metric': 'rmse','learning_rate': 0.05,'verbose': -1,'nthread':-1,
          'num_leaves': 10, 'min_data': 50, 'max_depth': 7, 'min_data_in_leaf': 50, 'feature_fraction': 0.8,'bagging_fraction': 0.8,'bagging_freq': 2 }

In [29]:
def kfold_lightgbm(params, train_df, num_folds,FEATS_EXCLUDED):
    print("Starting LightGBM. Train shape: {}".format(train_df.shape))
    folds = KFold(n_splits= num_folds, shuffle=True, random_state=326)

    # Create arrays and dataframes to store results
    oof_preds = np.zeros(train_df.shape[0])
    feature_importance_df = pd.DataFrame()
    final_rmse = 0
    final_mape = 0
    final_mae = 0
    feats = [f for f in train_df.columns if f not in FEATS_EXCLUDED]
    clfs = []
    # k-fold
    for n_fold, (train_idx, valid_idx) in enumerate(folds.split(train_df[feats], train_df['SatisAdet'])):
        train_x, train_y = train_df[feats].iloc[train_idx], train_df['SatisAdet'].iloc[train_idx]
        valid_x, valid_y = train_df[feats].iloc[valid_idx], train_df['SatisAdet'].iloc[valid_idx]

        # set data structure
        lgb_train = lgb.Dataset(train_x,label=train_y,free_raw_data=False)
        lgb_test = lgb.Dataset(valid_x,label=valid_y,free_raw_data=False)

        reg = lgb.train(params,lgb_train,valid_sets=[lgb_train, lgb_test],valid_names=['train', 'valid'],
                        num_boost_round=100000,early_stopping_rounds= 100,verbose_eval=200)
        clfs.append(reg)
        oof_preds[valid_idx] = reg.predict(valid_x, num_iteration=reg.best_iteration)
        #sub_preds += reg.predict(test_df[feats], num_iteration=reg.best_iteration) / folds.n_splits

        fold_importance_df = pd.DataFrame()
        fold_importance_df["feature"] = feats
        fold_importance_df["importance"] = np.log1p(reg.feature_importance(importance_type='gain', iteration=reg.best_iteration))
        fold_importance_df["fold"] = n_fold + 1
        feature_importance_df = pd.concat([feature_importance_df, fold_importance_df], axis=0)
        curr_rmse = sqrt(mean_squared_error(valid_y, oof_preds[valid_idx]))
        curr_mape = mape(valid_y, oof_preds[valid_idx]) 
        curr_mae = mean_absolute_error(valid_y, oof_preds[valid_idx])
        
        #curr_rmse = roc_auc_score(valid_y, oof_preds[valid_idx])
        final_rmse += curr_rmse/num_folds
        final_mape += curr_mape/num_folds
        final_mae += curr_mae/num_folds
        print('Fold %2d rmse : %.6f' % (n_fold + 1, curr_rmse))
        print('Fold %2d mae : %.6f' % (n_fold + 1, curr_mae))

        del reg, train_x, train_y, valid_x, valid_y
        gc.collect()
              
    print('Averall RMSE : %.6f' % (final_rmse))
    print('Averall MAE : %.6f' % (final_mae))

    # save submission file
    #test_df.loc[:,'SatisAdet'] = sub_preds
    #test_df = test_df.reset_index()
    #test_df = test_df[['card_id', 'SatisAdet']]
    #submission_file_name = "lgbm_"+str(np.round(final_rmse,4))+".csv"
    #test_df.to_csv(DATAPATH+submission_file_name, index=False)    
        
    return clfs, oof_preds, feature_importance_df, final_rmse    

In [32]:
clfs, oof_preds, feature_importance_df, final_rmse = kfold_lightgbm(params,train, num_folds = 3, FEATS_EXCLUDED=FEATS_EXCLUDED)

Starting LightGBM. Train shape: (10817221, 253)
Training until validation scores don't improve for 100 rounds.
[200]	train's rmse: 6.06294	valid's rmse: 6.1268
[400]	train's rmse: 5.92368	valid's rmse: 6.01224
[600]	train's rmse: 5.83773	valid's rmse: 5.95535
[800]	train's rmse: 5.77433	valid's rmse: 5.91347
[1000]	train's rmse: 5.72504	valid's rmse: 5.88611
[1200]	train's rmse: 5.6823	valid's rmse: 5.86097
[1400]	train's rmse: 5.64424	valid's rmse: 5.84105
[1600]	train's rmse: 5.61152	valid's rmse: 5.82434
[1800]	train's rmse: 5.58052	valid's rmse: 5.81172
[2000]	train's rmse: 5.55385	valid's rmse: 5.80363
[2200]	train's rmse: 5.52851	valid's rmse: 5.79414
[2400]	train's rmse: 5.50286	valid's rmse: 5.78383
[2600]	train's rmse: 5.48112	valid's rmse: 5.77636
[2800]	train's rmse: 5.4606	valid's rmse: 5.76968
[3000]	train's rmse: 5.44145	valid's rmse: 5.76475
[3200]	train's rmse: 5.42386	valid's rmse: 5.76029
[3400]	train's rmse: 5.40459	valid's rmse: 5.75377
[3600]	train's rmse: 5.38594	

In [34]:
import pickle
with open('weekly_lgbm.pkl', 'wb') as f:
    pickle.dump(clfs, f)

In [37]:
pred_df = train[["YilHafta","MagazaID","MerchAltGrupID","UrunKlasmanID","SatisAdet"]].copy()
pred_df["pred"] = oof_preds
pred_df.to_csv("weekly_oof.csv", index = False)

In [50]:
pred_df.loc[pred_df.SatisAdet==0, "pred"] = 0
pred_df.to_csv("weekly_oof_post.csv", index = False)

In [38]:
test = pd.read_csv("SatisiKesfet_TestData_Week_Final.csv")
feats = [f for f in train.columns if f not in FEATS_EXCLUDED]
sub_preds = np.zeros(train.shape[0])
for clf in clfs:
    sub_preds += clf.predict(test[feats], num_iteration=reg.best_iteration) / len(clfs)
sub_df = test[["YilHafta","MagazaID","MerchAltGrupID","UrunKlasmanID"]].copy()
sub_df["pred"] = sub_preds
sub_df.to_csv("weekly_preds.csv", index = False)

NameError: name 'train_df' is not defined

In [41]:
len(clfs)

3

In [43]:
sub_preds = np.zeros(test.shape[0])
for clf in clfs:
    sub_preds += clf.predict(test[feats], num_iteration=clf.best_iteration) / len(clfs)
sub_df = test[["YilHafta","MagazaID","MerchAltGrupID","UrunKlasmanID"]].copy()
sub_df["pred"] = sub_preds
sub_df.to_csv("weekly_preds.csv", index = False)

In [57]:
feature_importance_df.to_csv("feature_importance_weekly.csv",index = False)

In [59]:
!free -m

              total        used        free      shared  buff/cache   available
Mem:         380166      282983       87953           0        9228       95150
Swap:             0           0           0


In [None]:
#sub_df.loc[sub_df.SatisAdet==0, "pred"] = 0
#sub_df.to_csv("weekly_oof_post.csv", index = False)1

In [47]:
mape_valid[mape_valid.OrtBirimFiyat==0].head()

Unnamed: 0,OrtBirimFiyat,SatisAdet,pred
15562,0.0,0,0.0
15666,0.0,0,0.0
16097,0.0,0,0.0
46323,0.0,0,0.0
53235,0.0,0,0.0


In [48]:
mape_valid = train[["OrtBirimFiyat","SatisAdet"]].copy()
mape_valid["pred"] = oof_preds
mape_valid.loc[mape_valid["OrtBirimFiyat"]==0, "pred"] = 0
mape_valid.loc[mape_valid["SatisAdet"]==0, "pred"] = 0
mape_valid2 = mape_valid[mape_valid.SatisAdet!=0]
curr_rmse = sqrt(mean_squared_error(mape_valid.SatisAdet, mape_valid.pred))
curr_mape = mape(mape_valid2.SatisAdet, mape_valid2.pred)
curr_mae = mean_absolute_error(mape_valid.SatisAdet, mape_valid.pred)
print('Averall RMSE : %.6f' % (curr_rmse))
print('Averall MAPE : %.6f' % (curr_mape))
print('Averall MAE : %.6f' % (curr_mae))

Averall RMSE : 5.399932
Averall MAPE : 80.196364
Averall MAE : 1.617424


### Ridge

In [None]:
from sklearn.linear_model import Ridge, Lasso
def kfold_ridge(train_df, num_folds,FEATS_EXCLUDED):
    print("Starting LightGBM. Train shape: {}".format(train_df.shape))
    folds = KFold(n_splits= num_folds, shuffle=True, random_state=326)

    # Create arrays and dataframes to store results
    oof_preds = np.zeros(train_df.shape[0])
    final_rmse = 0
    final_mae = 0
    feats = [f for f in train_df.columns if f not in FEATS_EXCLUDED]
    clfs = []
    # k-fold
    for n_fold, (train_idx, valid_idx) in enumerate(folds.split(train_df[feats], train_df['SatisAdet'])):
        train_x, train_y = train_df[feats].iloc[train_idx], train_df['SatisAdet'].iloc[train_idx]
        valid_x, valid_y = train_df[feats].iloc[valid_idx], train_df['SatisAdet'].iloc[valid_idx]

        reg = Lasso(alpha=0) # higher the alpha value, more restriction on the coefficients; low alpha > more generalization, coefficients are barely
        reg.fit(train_x, train_y)
        clfs.append(reg)
        oof_preds[valid_idx] = reg.predict(valid_x)
        #sub_preds += reg.predict(test_df[feats], num_iteration=reg.best_iteration) / folds.n_splits
        curr_rmse = sqrt(mean_squared_error(valid_y, oof_preds[valid_idx]))
        curr_mae = mean_absolute_error(valid_y, oof_preds[valid_idx])
        
        #curr_rmse = roc_auc_score(valid_y, oof_preds[valid_idx])
        final_rmse += curr_rmse/num_folds
        final_mae += curr_mae/num_folds
        print('Fold %2d rmse : %.6f' % (n_fold + 1, curr_rmse))
        print('Fold %2d mae : %.6f' % (n_fold + 1, curr_mae))

        del reg, train_x, train_y, valid_x, valid_y
        gc.collect()
              
    print('Averall RMSE : %.6f' % (final_rmse))
    print('Averall MAE : %.6f' % (final_mae))
    
    return clfs, oof_preds, final_rmse    

In [None]:
gc.collect()

In [None]:
clfs, oof_preds, final_rmse = kfold_ridge(train, num_folds=5, FEATS_EXCLUDED=FEATS_EXCLUDED)

In [None]:
with open('weekly_ridge.pkl', 'wb') as f:
    pickle.dump(clfs, f)

### Lasso & Ridge

In [None]:
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
feats = [f for f in train.columns if f not in FEATS_EXCLUDED]
rr = Ridge(alpha=0.01) # higher the alpha value, more restriction on the coefficients; low alpha > more generalization, coefficients are barely
rr.fit(train.loc[train.Yil<2017, feats], train.loc[train.Yil<2017, "SatisAdet"])
preds = rr.predict(train.loc[train.Yil==2017, feats])
curr_rmse = sqrt(mean_squared_error(train.loc[train.Yil==2017,"SatisAdet"], preds))
curr_mae = mean_absolute_error(train.loc[train.Yil==2017,"SatisAdet"], preds)
print('Averall RMSE : %.6f' % (curr_rmse))
print('Averall MAE : %.6f' % (curr_mae))

In [None]:
mape_valid = train.loc[train.Yil==2017, ["OrtBirimFiyat","SatisAdet"]].copy()
mape_valid["pred"] = preds
mape_valid.loc[mape_valid["OrtBirimFiyat"]==0, "pred"] = 0
mape_valid2 = mape_valid[mape_valid.SatisAdet!=0]
curr_rmse = sqrt(mean_squared_error(mape_valid.SatisAdet, mape_valid.pred))
curr_mape = mape(mape_valid2.SatisAdet, mape_valid2.pred)
curr_mae = mean_absolute_error(mape_valid.SatisAdet, mape_valid.pred)
print('Averall RMSE : %.6f' % (curr_rmse))
print('Averall MAPE : %.6f' % (curr_mape))
print('Averall MAE : %.6f' % (curr_mae))

### Random Forest

In [None]:
from sklearn.ensemble import RandomForestRegressor
feats = [f for f in train.columns if f not in FEATS_EXCLUDED]
rf = RandomForestRegressor(n_estimators = 1000, random_state = 42)
rf.fit(train.loc[train.Yil<2017, feats], train.loc[train.Yil<2017, "SatisAdet"])
preds = rf.predict(train.loc[train.Yil==2017, feats])
curr_rmse = sqrt(mean_squared_error(train.loc[train.Yil==2017,"SatisAdet"], preds))
curr_mae = mean_absolute_error(train.loc[train.Yil==2017,"SatisAdet"], preds)
print('Averall RMSE : %.6f' % (curr_rmse))
print('Averall MAE : %.6f' % (curr_mae))