In [1]:
import datetime
import gc
import lightgbm as lgb
import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd
import seaborn as sns
import time
import warnings
import scipy
from contextlib import contextmanager
from pandas.core.common import SettingWithCopyWarning
from sklearn.metrics import mean_squared_error, roc_auc_score, accuracy_score, confusion_matrix, recall_score, precision_score, f1_score
#from sklearn.model_selection import KFold, StratifiedKFold, test_test_split
from math import sqrt
from sklearn.metrics import mean_squared_error
from math import sqrt

# Silence pandas' SettingWithCopyWarning and every FutureWarning for the rest of the session.
warnings.simplefilter(action='ignore', category=SettingWithCopyWarning)
warnings.simplefilter(action='ignore', category=FutureWarning)

# Allow frames to render with up to 500 columns and 500 rows.
for display_option in ('display.max_columns', 'display.max_rows'):
    pd.set_option(display_option, 500)

# Identifier, target and categorical columns that are not used as model features.
FEATS_EXCLUDED = [
    "Gun",
    "MagazaID",
    "MerchAltGrupID",
    "UrunKlasmanID",
    "SatisAdet",
    "MerchGrup",
    "MerchGrupID",
    "MerchMarkaYasGrupID",
    "BuyerGrupID",
    "KlasmanGrupID",
    "VucutBolge",
    "KlasmanIklimStatu",
]

In [2]:
@contextmanager
def timer(title):
    """Context manager that prints how long the wrapped block took.

    Args:
        title: Label printed in front of the elapsed time.

    The message is printed only when the block finishes normally; if the
    block raises, the exception propagates and nothing is printed.
    """
    started_at = time.time()
    yield
    elapsed = time.time() - started_at
    print("{} - done in {:.0f}s".format(title, elapsed))
    
# Display/plot feature importance
def display_importances(feature_importance_df_):
    """Plot the 40 features with the highest mean importance and save the figure.

    Args:
        feature_importance_df_: DataFrame with at least a "feature" and an
            "importance" column; a feature may appear on several rows
            (the plot title suggests one row per fold).

    Side effects:
        Draws a horizontal bar plot and writes it to 'lgbm_importances.png'.
    """
    # Rank features by their mean importance and keep the top 40 names.
    cols = (
        feature_importance_df_[["feature", "importance"]]
        .groupby("feature")
        .mean()
        .sort_values(by="importance", ascending=False)[:40]
        .index
    )
    best_features = feature_importance_df_.loc[feature_importance_df_.feature.isin(cols)]

    plt.figure(figsize=(8, 10))
    # seaborn takes the frame through `data=`; the previous `test=` keyword is not a barplot argument.
    sns.barplot(x="importance", y="feature", data=best_features.sort_values(by="importance", ascending=False))
    plt.title('LightGBM Features (avg over folds)')
    plt.tight_layout()
    plt.savefig('lgbm_importances.png')

def thr_to_accuracy(thr, Y_test, predictions):
    """Return the negated accuracy obtained by thresholding `predictions` at `thr`.

    Args:
        thr: Scalar cut-off; scores strictly greater than it become class 1.
        Y_test: True labels.
        predictions: Array-like of scores.

    Returns:
        ``-accuracy_score(Y_test, labels)``. The sign is flipped, presumably so
        the function can be handed to a minimizer -- confirm against the caller.
    """
    # `np.int` was removed from NumPy; the builtin `int` is the equivalent dtype.
    # `np.asarray` also lets plain lists be compared against the threshold.
    labels = (np.asarray(predictions) > thr).astype(int)
    return -accuracy_score(Y_test, labels)

#from sklearn.utils import check_arrays
from sklearn.metrics import mean_absolute_error,mean_squared_error

def mape(y_true, y_pred):
    """Mean absolute percentage error, in percent.

    Args:
        y_true: Array-like of actual values.
        y_pred: Array-like of predicted values, same length as `y_true`.

    Returns:
        Mean of ``|y_true - y_pred| / (y_true + 1e-11)`` multiplied by 100.
        The small constant in the denominator avoids division by exactly zero.
    """
    actual = np.array(y_true)
    forecast = np.array(y_pred)
    relative_error = (actual - forecast) / (actual + 0.00000000001)
    return np.mean(np.abs(relative_error)) * 100

def add_noise(series, noise_level):
    """Scale each value by ``1 + noise_level * z`` with ``z`` drawn from a standard normal.

    Args:
        series: Values to perturb; one random draw is made per element.
        noise_level: Multiplier applied to the random draws (0 leaves values unchanged).

    Returns:
        The perturbed series. Draws come from NumPy's global random state.
    """
    jitter = np.random.randn(len(series))
    scale = 1 + noise_level * jitter
    return series * scale

def target_encode(trn_series=None,
                  target=None,
                  min_samples_leaf=1,
                  smoothing=1,
                  noise_level=0):
    """Smoothed target (mean) encoding of a categorical series.

    Args:
        trn_series: Named categorical pandas Series to encode.
        target: Named target Series of the same length as `trn_series`.
        min_samples_leaf: Category count at which the category mean and the
            global mean are weighted equally (the sigmoid weight is 0.5).
        smoothing: Divisor inside the sigmoid; controls how quickly the weight
            moves from the global mean to the category mean as the count grows.
        noise_level: Passed to `add_noise`; 0 disables the perturbation.

    Returns:
        Series named ``<trn_series.name>_mean`` carrying the index of
        `trn_series`, with the blended mean for each row's category.
    """
    assert len(trn_series) == len(target)
    joined = pd.concat([trn_series, target], axis=1)

    # Per-category mean and count of the target.
    group_stats = joined.groupby(by=trn_series.name)[target.name].agg(["mean", "count"])

    # Sigmoid weight: the larger the category count, the less the global mean matters.
    weight = 1 / (1 + np.exp(-(group_stats["count"] - min_samples_leaf) / smoothing))
    prior = target.mean()
    blended = prior * (1 - weight) + group_stats["mean"] * weight

    # Lookup table: one row per category with its blended mean in 'average'.
    lookup = blended.rename('average').reset_index()

    encoded = pd.merge(
        trn_series.to_frame(trn_series.name),
        lookup,
        on=trn_series.name,
        how='left')['average']
    # Rows without a computed category average fall back to the global mean.
    encoded = encoded.rename(trn_series.name + '_mean').fillna(prior)
    # pd.merge does not keep the index, so restore the original one.
    encoded.index = trn_series.index

    # The matching encoding of a separate test series (tst_series) was disabled
    # in the original code; only the training series is encoded and returned.
    return add_noise(encoded, noise_level)

def missing_fun(test):
    """Summarise the columns of a DataFrame that contain missing values.

    Args:
        test: DataFrame to inspect.

    Returns:
        DataFrame with one row per column that has at least one null, with
        columns ``index`` (column name), ``type`` (dtype), ``percent_missing``
        and ``nunique`` (distinct values, NaN counted), sorted by
        ``percent_missing`` in descending order.
    """
    report = test.dtypes.to_frame("type").reset_index()
    report["percent_missing"] = (test.isnull().sum() * 100 / len(test)).values
    report["nunique"] = test.nunique(dropna=False).values

    has_gaps = report["percent_missing"] != 0
    return report.loc[has_gaps].sort_values(by="percent_missing", ascending=False)

In [4]:
# Load only the key columns and the unit price from the raw test file.
test = pd.read_csv(
    "data/SatisiKesfet_TestData.csv",
    usecols=["Gun", "MagazaID", "MerchAltGrupID", "UrunKlasmanID", "OrtBirimFiyat"],
)

In [7]:
# `tarih` is otherwise only loaded by a cell further down the notebook, so this
# cell failed with a NameError on a fresh top-to-bottom run. Load the calendar
# dimension here so the cell is self-contained.
tarih = pd.read_csv("data/Dim_Tarih.csv")

# Attach the year-week key (YilHafta) of each day (Gun) to the test rows.
test = test.merge(tarih[["Gun", "YilHafta"]], how="left", on=["Gun"])

In [8]:
# Preview the test rows after the year-week key has been attached.
test.head()

Unnamed: 0,Gun,MagazaID,MerchAltGrupID,UrunKlasmanID,OrtBirimFiyat,YilHafta
0,20180613,544,172,3073,0.0,201824
1,20180605,420,134,3599,0.0,201823
2,20181126,477,164,3557,39.99,201848
3,20180827,490,16,3787,39.95,201835
4,20181226,463,166,3771,29.99,201852


In [9]:
# Replace each row's unit price with the mean price of its
# store / sub-group / class / year-week combination.
test["OrtBirimFiyat"] = (
    test
    .groupby(by=["MagazaID", "MerchAltGrupID", "UrunKlasmanID", "YilHafta"])["OrtBirimFiyat"]
    .transform("mean")
)

In [17]:
# Load the weekly model predictions; they are merged onto `test` in a later cell.
preds = pd.read_csv("weekly_preds.csv")

In [14]:
# The recorded outputs of the following cells show a 5-column frame without "Gun",
# but no visible cell removes it. With the day column still present the rows stay
# distinct per day and never collapse to one row per week, so drop it here.
# errors="ignore" keeps the cell re-runnable once the column is gone.
test = test.drop(columns=["Gun"], errors="ignore").drop_duplicates()

In [15]:
# Row/column count after de-duplication.
test.shape

(3967187, 5)

In [16]:
# Preview the de-duplicated frame.
test.head()

Unnamed: 0,MagazaID,MerchAltGrupID,UrunKlasmanID,OrtBirimFiyat,YilHafta
0,544,172,3073,29.852857,201824
1,420,134,3599,0.0,201823
2,477,164,3557,17.138571,201848
3,490,16,3787,28.498571,201835
4,463,166,3771,12.778571,201852


In [22]:
# Preview the loaded predictions before merging them onto `test`.
preds.head()

Unnamed: 0,YilHafta,MagazaID,MerchAltGrupID,UrunKlasmanID,pred
0,201824,544,172,3073,2.259339
1,201823,420,134,3599,4.928994
2,201848,477,164,3557,8.383911
3,201835,490,16,3787,55.305679
4,201852,463,166,3771,26.594463


In [23]:
# Attach the weekly prediction to every store / sub-group / class / week row.
test = test.merge(
    preds,
    how="left",
    on=["MagazaID", "MerchAltGrupID", "UrunKlasmanID", "YilHafta"],
)

In [25]:
# Force the prediction to zero wherever the (weekly mean) unit price is zero.
zero_price = test["OrtBirimFiyat"].eq(0)
test.loc[zero_price, "pred"] = 0

In [26]:
# Negative predictions are floored at zero.
negative_pred = test["pred"].lt(0)
test.loc[negative_pred, "pred"] = 0

Unnamed: 0,MagazaID,MerchAltGrupID,UrunKlasmanID,OrtBirimFiyat,YilHafta,pred
11,420,130,3656,6.271429,201815,-2.536657
34,237,206,3097,35.678571,201836,-0.668234
110,199,172,3080,14.282857,201839,-1.665960
134,399,164,3793,13.346429,201836,-0.906874
148,544,134,3667,4.589286,201826,-5.609657
209,490,130,3594,20.875714,201826,-1.030760
212,150,54,3898,1.850000,201815,-0.555533
267,150,160,4214,7.702857,201839,-1.655967
286,406,188,3415,22.835714,201821,-5.049982
293,150,166,3557,8.486429,201844,-5.299428


In [29]:
# Persist the post-processed weekly predictions (index not written).
test.to_csv("weekly_preds_post.csv", index=False)

In [None]:
# Replace Markup by its complement (1 - Markup).
# NOTE: re-running this cell flips the column back to its original values.
test["Markup"] = 1 - test["Markup"]

# Trim surrounding whitespace from the season code and encode it as 'Y' -> 1, 'K' -> 0.
# The vectorized .str.strip() passes missing values through as NaN, whereas the
# previous apply(lambda x: x.strip()) raised on them; codes other than 'Y'/'K' map to NaN.
test["SezonGrup"] = test["SezonGrup"].str.strip().map({'Y': 1, 'K': 0})

In [6]:
# Load the dimension / lookup tables, in this order:
# klasman, magaza, ozelguntanimlari, tarih, meteoroloji.
klasman, magaza, ozelguntanimlari, tarih, meteoroloji = map(
    pd.read_csv,
    (
        "data/Dim_Klasman.csv",
        "data/Dim_Magaza.csv",
        "data/Dim_OzelGunTanimlari.csv",
        "data/Dim_Tarih.csv",
        "data/MeteorolojiDegerleri.csv",
    ),
)

In [None]:
# Attach year, month and year-week of each day from the calendar table.
calendar_cols = tarih[["Gun", "Yil", "Ay", "YilHafta"]]
test = test.merge(calendar_cols, how="left", on=["Gun"])

In [None]:
# Attach the city id and outlet flag of each store.
store_cols = magaza[["MagazaID", "SehirID", "OutletMi"]]
test = test.merge(store_cols, how="left", on=["MagazaID"])

In [None]:
# Attach the weather measurements of each day / city.
test = test.merge(meteoroloji, how="left", on=["Gun", "SehirID"])

In [None]:
def _fill_with_group_mean(values):
    """Replace the missing entries of one group by that group's mean."""
    return values.fillna(values.mean())


# Fill missing weather measurements with the mean of the same year-week.
for col in (
    "MinimumSicaklik",
    "OrtalamaSicaklik",
    "MaksimumSicaklik",
    "YagisMiktari",
    "KarKalinligi",
    "Yagmur",
    "Kar",
):
    test[col] = test.groupby("YilHafta")[col].transform(_fill_with_group_mean)

In [None]:
# Identifier, target and categorical columns that are skipped by the aggregation loops.
FEATS_EXCLUDED = [
    "Gun",
    "MagazaID",
    "MerchAltGrupID",
    "UrunKlasmanID",
    "SatisAdet",
    "MerchGrup",
    "MerchGrupID",
    "MerchMarkaYasGrupID",
    "BuyerGrupID",
    "KlasmanGrupID",
    "VucutBolge",
    "KlasmanIklimStatu",
]

In [None]:
# GunSonuDepoStok = total end-of-day stock minus the GunSonuReyonStok part.
test["GunSonuDepoStok"] = test["GunSonuToplamStok"].sub(test["GunSonuReyonStok"])

In [None]:
cols = ["OrtBirimFiyat", "OrtBirimMaliyet", "IndirimOrani", "KarMarji"]

# Rows where all four price-related columns are exactly zero are treated as missing.
mask = test[cols].eq(0).all(axis=1)
test.loc[mask, cols] = np.nan

# Fill the gaps with the mean of the same sub-group / class;
# groups with no value at all fall back to 0.
test[cols] = (
    test
    .groupby(['MerchAltGrupID', 'UrunKlasmanID'])[cols]
    .transform(lambda x: x.fillna(x.mean()))
    .fillna(0)
)

### Sezon Bazında

In [None]:
# Columns that are summed when rows are collapsed to one per store / sub-group / class / day.
season_sum_cols = ['GunSonuReyonStok', 'GunSonuToplamStok', 'ModelSayisi', 'GunSonuDepoStok']

# Columns that are averaged in the same collapse.
# "IndirimOrani" was listed twice; the duplicate entry is removed (membership checks are unaffected).
season_mean_cols = [
    "OrtBirimFiyat",
    "OrtBirimMaliyet",
    "IndirimOrani",
    "KarMarji",
    "IlkFiyattanSatisOrani",
    'Markup',
]

In [None]:
# Broadcast the per store / sub-group / class / day aggregate onto every row:
# mean for the columns in season_mean_cols, sum for those in season_sum_cols.
daily_keys = ["MagazaID", "MerchAltGrupID", "UrunKlasmanID", "Gun"]
skipped_cols = FEATS_EXCLUDED + ["SezonGrup"]

for col in test.columns:
    if col in skipped_cols:
        continue
    if col in season_mean_cols:
        test[col] = test.groupby(daily_keys)[col].transform("mean")
    if col in season_sum_cols:
        test[col] = test.groupby(daily_keys)[col].transform("sum")

In [None]:
# Keep one row per day / class / sub-group / store, then discard the day and season columns.
test = (
    test
    .drop_duplicates(subset=["Gun", "UrunKlasmanID", "MerchAltGrupID", "MagazaID"])
    .drop(columns=["Gun", "SezonGrup"])
)

In [None]:
# Columns intended to be averaged per week.
# "IndirimOrani" was listed twice; the duplicate entry is removed.
# NOTE(review): no visible cell reads this list -- the weekly loop that follows only
# sums `season_sum_cols`. Confirm whether a weekly mean step is missing.
week_mean_cols = [
    "OrtBirimFiyat", "OrtBirimMaliyet", "IndirimOrani", "KarMarji", "IlkFiyattanSatisOrani",
    'Markup', 'GunSonuReyonStok', 'GunSonuToplamStok', 'ModelSayisi', 'GunSonuDepoStok',
    'MinimumSicaklik', 'OrtalamaSicaklik', 'MaksimumSicaklik', 'YagisMiktari', 'KarKalinligi', 'Yagmur', 'Kar',
]

In [None]:
# Broadcast the weekly sum of the stock / model-count columns onto every row of
# the same store / sub-group / class / year-week.
weekly_keys = ["MagazaID", "MerchAltGrupID", "UrunKlasmanID", "YilHafta"]

for col in test.columns:
    is_feature = col not in FEATS_EXCLUDED + ["SezonGrup"]
    if is_feature and col in season_sum_cols:
        test[col] = test.groupby(weekly_keys)[col].transform("sum")

In [None]:
# Keep the first row of every class / sub-group / store / year-week combination.
test = test.drop_duplicates(
    subset=["UrunKlasmanID", "MerchAltGrupID", "MagazaID", "YilHafta"],
)

In [None]:
# Persist the weekly-level frame (index not written).
test.to_csv("SatisiKesfet_TestData_Week.csv", index = False)

### Feature Engineering

In [None]:
# Attach the class attributes from the klasman dimension table.
test = test.merge(klasman, how="left", on=["MerchAltGrupID", "UrunKlasmanID"])

#### Aggregations

In [None]:
# Columns left out of the sum/mean aggregations below; this version also
# excludes "SezonGrup" and the year-week key "YilHafta".
FEATS_EXCLUDED = [
    "Gun",
    "MagazaID",
    "MerchAltGrupID",
    "UrunKlasmanID",
    "SatisAdet",
    "MerchGrup",
    "MerchGrupID",
    "MerchMarkaYasGrupID",
    "BuyerGrupID",
    "KlasmanGrupID",
    "VucutBolge",
    "KlasmanIklimStatu",
    "SezonGrup",
    "YilHafta",
]

In [None]:
# Weekly aggregates per MerchGrup: sum and mean of every feature column, plus the
# number of distinct values of each finer-grained identifier.
aggs = {col: ['sum', 'mean'] for col in test.columns if col not in FEATS_EXCLUDED}
for id_col in (
    "MerchMarkaYasGrupID",
    "BuyerGrupID",
    "KlasmanGrupID",
    "VucutBolge",
    "KlasmanIklimStatu",
    "MerchAltGrupID",
    "UrunKlasmanID",
):
    aggs[id_col] = ["nunique"]

agg1 = test.reset_index().groupby(["MerchGrup", "YilHafta"]).agg(aggs)
# Flatten the (column, function) pairs into prefixed "<column>_<function>" names.
agg1.columns = ['MerchGrup_Hafta_' + name + "_" + func for name, func in agg1.columns.tolist()]
agg1 = agg1.reset_index()

In [None]:
# Weekly aggregates per MerchGrup / MerchMarkaYasGrupID: sum and mean of every
# feature column, plus distinct counts of the finer-grained identifiers.
aggs = {col: ['sum', 'mean'] for col in test.columns if col not in FEATS_EXCLUDED}
for id_col in (
    "BuyerGrupID",
    "KlasmanGrupID",
    "VucutBolge",
    "KlasmanIklimStatu",
    "MerchAltGrupID",
    "UrunKlasmanID",
):
    aggs[id_col] = ["nunique"]

agg2 = test.reset_index().groupby(["MerchGrup", "MerchMarkaYasGrupID", "YilHafta"]).agg(aggs)
# Flatten the (column, function) pairs into prefixed "<column>_<function>" names.
agg2.columns = ['MerchMarkaYasGrupID_Hafta_' + name + "_" + func for name, func in agg2.columns.tolist()]
agg2 = agg2.reset_index()

In [None]:
# Weekly aggregates per MerchGrup / MerchMarkaYasGrupID / MerchAltGrupID: sum and
# mean of every feature column, plus distinct counts of the finer-grained identifiers.
aggs = {col: ['sum', 'mean'] for col in test.columns if col not in FEATS_EXCLUDED}
for id_col in (
    "BuyerGrupID",
    "KlasmanGrupID",
    "VucutBolge",
    "KlasmanIklimStatu",
    "UrunKlasmanID",
):
    aggs[id_col] = ["nunique"]

agg3 = (
    test
    .reset_index()
    .groupby(["MerchGrup", "MerchMarkaYasGrupID", "MerchAltGrupID", "YilHafta"])
    .agg(aggs)
)
# Flatten the (column, function) pairs into prefixed "<column>_<function>" names.
agg3.columns = ['MerchAltGrupID_Hafta_' + name + "_" + func for name, func in agg3.columns.tolist()]
agg3 = agg3.reset_index()

In [None]:
# Weekly aggregates per MerchGrup / MerchMarkaYasGrupID / MerchAltGrupID / BuyerGrupID:
# sum and mean of every feature column, plus distinct counts of the finer-grained identifiers.
aggs = {col: ['sum', 'mean'] for col in test.columns if col not in FEATS_EXCLUDED}
for id_col in (
    "KlasmanGrupID",
    "VucutBolge",
    "KlasmanIklimStatu",
    "UrunKlasmanID",
):
    aggs[id_col] = ["nunique"]

agg4 = (
    test
    .reset_index()
    .groupby(["MerchGrup", "MerchMarkaYasGrupID", "MerchAltGrupID", "BuyerGrupID", "YilHafta"])
    .agg(aggs)
)
# Flatten the (column, function) pairs into prefixed "<column>_<function>" names.
agg4.columns = ['BuyerGrupID_Hafta_' + name + "_" + func for name, func in agg4.columns.tolist()]
agg4 = agg4.reset_index()

In [None]:
# Weekly aggregates per MerchGrup / MerchMarkaYasGrupID / MerchAltGrupID / BuyerGrupID /
# KlasmanGrupID: sum and mean of every feature column, plus distinct counts of the
# remaining finer-grained identifiers.
aggs = {col: ['sum', 'mean'] for col in test.columns if col not in FEATS_EXCLUDED}
for id_col in (
    "VucutBolge",
    "KlasmanIklimStatu",
    "UrunKlasmanID",
):
    aggs[id_col] = ["nunique"]

agg5 = (
    test
    .reset_index()
    .groupby(["MerchGrup", "MerchMarkaYasGrupID", "MerchAltGrupID", "BuyerGrupID", "KlasmanGrupID", "YilHafta"])
    .agg(aggs)
)
# Flatten the (column, function) pairs into prefixed "<column>_<function>" names.
agg5.columns = ['KlasmanGrupID_Hafta_' + name + "_" + func for name, func in agg5.columns.tolist()]
agg5 = agg5.reset_index()

In [None]:
# Join the five aggregate tables back onto the weekly rows, from the coarsest
# grouping (agg1) to the finest (agg5); each join uses that table's grouping keys.
test = (
    test
    .merge(agg1, how="left", on=["MerchGrup", "YilHafta"])
    .merge(agg2, how="left", on=["MerchGrup", "MerchMarkaYasGrupID", "YilHafta"])
    .merge(agg3, how="left", on=["MerchGrup", "MerchMarkaYasGrupID", "MerchAltGrupID", "YilHafta"])
    .merge(agg4, how="left", on=["MerchGrup", "MerchMarkaYasGrupID", "MerchAltGrupID", "BuyerGrupID", "YilHafta"])
    .merge(agg5, how="left", on=["MerchGrup", "MerchMarkaYasGrupID", "MerchAltGrupID", "BuyerGrupID", "KlasmanGrupID", "YilHafta"])
)

In [None]:
# Persist the weekly frame with all aggregate features attached (index not written).
test.to_csv("SatisiKesfet_TestData_Week_Final.csv", index=False)

### Model

In [None]:
# Columns that are not fed to the models: identifiers, the target (SatisAdet),
# categorical descriptors and the year-week key.
FEATS_EXCLUDED = [
    "Gun",
    "MagazaID",
    "MerchAltGrupID",
    "UrunKlasmanID",
    "SatisAdet",
    "MerchGrup",
    "MerchGrupID",
    "MerchMarkaYasGrupID",
    "BuyerGrupID",
    "KlasmanGrupID",
    "VucutBolge",
    "KlasmanIklimStatu",
    "YilHafta",
]

In [None]:
# Display the frame that the models below are trained on.
test

In [None]:
from sklearn.linear_model import Lasso, Ridge

# Every column that is not explicitly excluded is used as a feature.
feats = [f for f in test.columns if f not in FEATS_EXCLUDED]

# Time-based split: fit on the years before 2017, validate on 2017.
is_train = test.Yil < 2017
is_valid = test.Yil == 2017

# alpha is the L2 penalty strength: larger values shrink the coefficients more,
# while a small alpha leaves them barely restricted.
rr = Ridge(alpha=0.01)
rr.fit(test.loc[is_train, feats], test.loc[is_train, "SatisAdet"])
preds = rr.predict(test.loc[is_valid, feats])

y_valid = test.loc[is_valid, "SatisAdet"]
curr_rmse = sqrt(mean_squared_error(y_valid, preds))
curr_mae = mean_absolute_error(y_valid, preds)
print('Averall RMSE : %.6f' % (curr_rmse))
print('Averall MAE : %.6f' % (curr_mae))

In [None]:
# Re-score the 2017 validation rows after forcing the prediction to zero
# wherever the unit price is zero.
mape_valid = test.loc[test.Yil == 2017, ["OrtBirimFiyat", "SatisAdet"]].copy()
mape_valid["pred"] = preds
mape_valid.loc[mape_valid["OrtBirimFiyat"].eq(0), "pred"] = 0

# MAPE is evaluated only on rows with non-zero sales; RMSE and MAE use every row.
mape_valid2 = mape_valid.loc[mape_valid["SatisAdet"].ne(0)]

curr_rmse = sqrt(mean_squared_error(mape_valid["SatisAdet"], mape_valid["pred"]))
curr_mape = mape(mape_valid2["SatisAdet"], mape_valid2["pred"])
curr_mae = mean_absolute_error(mape_valid["SatisAdet"], mape_valid["pred"])
print('Averall RMSE : %.6f' % (curr_rmse))
print('Averall MAPE : %.6f' % (curr_mape))
print('Averall MAE : %.6f' % (curr_mae))

### Random Forest

In [None]:
from sklearn.ensemble import RandomForestRegressor

# Every column that is not explicitly excluded is used as a feature.
feats = [f for f in test.columns if f not in FEATS_EXCLUDED]

# Time-based split: fit on the years before 2017, validate on 2017.
is_train = test.Yil < 2017
is_valid = test.Yil == 2017

# 1000 trees with a fixed seed so repeated runs give the same forest.
rf = RandomForestRegressor(n_estimators = 1000, random_state = 42)
rf.fit(test.loc[is_train, feats], test.loc[is_train, "SatisAdet"])
preds = rf.predict(test.loc[is_valid, feats])

y_valid = test.loc[is_valid, "SatisAdet"]
curr_rmse = sqrt(mean_squared_error(y_valid, preds))
curr_mae = mean_absolute_error(y_valid, preds)
print('Averall RMSE : %.6f' % (curr_rmse))
print('Averall MAE : %.6f' % (curr_mae))