In [1]:
import datetime
import gc
import lightgbm as lgb
import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd
import seaborn as sns
import time
import warnings
import scipy
from contextlib import contextmanager
from pandas.core.common import SettingWithCopyWarning
from sklearn.metrics import mean_squared_error, roc_auc_score, accuracy_score, confusion_matrix, recall_score, precision_score, f1_score
from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.metrics import mean_squared_error
from math import sqrt
from sklearn.metrics import mean_squared_error
from math import sqrt

warnings.simplefilter(action='ignore', category=SettingWithCopyWarning)
warnings.simplefilter(action='ignore', category=FutureWarning)

pd.set_option('display.max_columns', 500)
pd.set_option('display.max_rows', 500)

FEATS_EXCLUDED = ["Gun","MagazaID","MerchAltGrupID","UrunKlasmanID","SatisAdet","MerchGrup","MerchGrupID","MerchMarkaYasGrupID","BuyerGrupID","KlasmanGrupID","VucutBolge","KlasmanIklimStatu"]

In [2]:
@contextmanager
def timer(title):
    t0 = time.time()
    yield
    print("{} - done in {:.0f}s".format(title, time.time() - t0))
    
# Display/plot feature importance
def display_importances(feature_importance_df_):
    cols = feature_importance_df_[["feature", "importance"]].groupby("feature").mean().sort_values(by="importance", ascending=False)[:40].index
    best_features = feature_importance_df_.loc[feature_importance_df_.feature.isin(cols)]

    plt.figure(figsize=(8, 10))
    sns.barplot(x="importance", y="feature", data=best_features.sort_values(by="importance", ascending=False))
    plt.title('LightGBM Features (avg over folds)')
    plt.tight_layout()
    plt.savefig('lgbm_importances.png')

def thr_to_accuracy(thr, Y_test, predictions):
    return -accuracy_score(Y_test, np.array(predictions>thr, dtype=np.int))

#from sklearn.utils import check_arrays
from sklearn.metrics import mean_absolute_error

def mape(y_true, y_pred): 
    y_true, y_pred = np.array(y_true), np.array(y_pred)
    return np.mean(np.abs((y_true - y_pred) / (y_true+0.00000000001))) * 100
def missing_fun(data):
    missing_value_df = data.dtypes.to_frame("type").reset_index()
    percent_missing = data.isnull().sum() * 100 / len(data)
    nunique = data.nunique(dropna=False).values
    missing_value_df["percent_missing"] = percent_missing.values
    missing_value_df["nunique"] = nunique
    missing_value_df = missing_value_df.loc[missing_value_df.percent_missing !=0]
    missing_value_df = missing_value_df.sort_values(by="percent_missing", ascending=False)
    return missing_value_df

In [78]:
test = pd.read_csv("data/SatisiKesfet_TestData.csv")

In [6]:
test["Markup"] = 1-test["Markup"]

In [7]:
test["SezonGrup"] = test["SezonGrup"].apply(lambda x: x.strip())
test["SezonGrup"] = test["SezonGrup"].map({'Y': 1, 'K': 0})

In [8]:
test["GunSonuDepoStok"] = test["GunSonuToplamStok"] - test["GunSonuReyonStok"]

In [80]:
klasman = pd.read_csv("data/Dim_Klasman.csv")
magaza = pd.read_csv("data/Dim_Magaza.csv")
ozelguntanimlari = pd.read_csv("data/Dim_OzelGunTanimlari.csv")
tarih = pd.read_csv("data/Dim_Tarih.csv")
meteoroloji = pd.read_csv("data/MeteorolojiDegerleri.csv")

In [10]:
test = test.merge(tarih[["Gun","Yil","Ay","Hafta","HaftaninGunu"]],"left",["Gun"])
test = test.merge(magaza[["MagazaID","SehirID"]],"left",["MagazaID"])
test = test.merge(meteoroloji,"left",["Gun","SehirID"])

In [11]:
for col in ["MinimumSicaklik","OrtalamaSicaklik","MaksimumSicaklik","YagisMiktari","KarKalinligi","Yagmur","Kar"]:
    test[col] = test.groupby("Hafta")[col].transform(lambda x: x.fillna(x.mean()))

### Feature Engineering

In [12]:
celcius = test[["SehirID","Gun","OrtalamaSicaklik"]].drop_duplicates().sort_values(["SehirID","Gun"])
celcius["FarkSicaklik"] = celcius.groupby("SehirID")["OrtalamaSicaklik"].shift(1)
celcius["FarkSicaklik"] = celcius["OrtalamaSicaklik"] - celcius["FarkSicaklik"]
celcius.loc[celcius["FarkSicaklik"].isnull(),"FarkSicaklik"] = 0

In [13]:
test = test.merge(celcius[["SehirID","Gun","FarkSicaklik"]],"left",["SehirID","Gun"])

### Categorical

In [81]:
test = test.merge(klasman,"left",["MerchAltGrupID","UrunKlasmanID"])

In [83]:
test[["Gun","MagazaID","MerchAltGrupID","UrunKlasmanID","MerchGrup","MerchGrupID","MerchMarkaYasGrupID",
                  "BuyerGrupID","KlasmanGrupID","VucutBolge","KlasmanIklimStatu"]].nunique()

AttributeError: 'DataFrame' object has no attribute 'nunique'

In [15]:
UrunKlasmanID = pd.read_csv("burak/UrunKlasmanID-SatisAdet_num.csv")
BuyerGrupID = pd.read_csv("burak/BuyerGrupID-SatisAdet_num.csv")
KlasmanGrupID = pd.read_csv("burak/KlasmanGrupID-SatisAdet_num.csv")
MagazaID = pd.read_csv("burak/MagazaID-SatisAdet_num.csv")
klasman = pd.read_csv("burak/klasman_num.csv")
magaza = pd.read_csv("burak/magaza_num.csv")
tarih = pd.read_csv("burak/tarih_num.csv")

In [16]:
test = test.merge(UrunKlasmanID,"left","UrunKlasmanID")
test = test.merge(BuyerGrupID,"left","BuyerGrupID")
test.drop("BuyerGrupID",axis=1,inplace=True)
test = test.merge(KlasmanGrupID,"left","KlasmanGrupID")
test.drop("KlasmanGrupID",axis=1,inplace=True)
test = test.merge(MagazaID,"left","MagazaID") 
test = test.merge(klasman,"left",["MerchAltGrupID","UrunKlasmanID"]) 
test = test.drop("KlasmanIklimStatu_nan",axis=1)
test = test.merge(magaza,"left","MagazaID") 
test = test.drop("CaddeAVM_Tanımsız",axis=1)

In [None]:
prophet = pd.read_csv("burak/prophet_satis.csv")

#### Ozel Gunler

In [18]:
ozel = pd.read_csv("data/ozelgun_fark.csv")

In [19]:
test = test.merge(ozel,"left","Gun")

### Check Nulls

In [17]:
test = test.select_dtypes(exclude=["object"])

In [21]:
null_df = test.isnull().sum().to_frame("null").reset_index()
null_cols = null_df.loc[null_df["null"]>0,"index"].values
for col in null_cols:
    test[col] = test[col].fillna(test[col].mean())

In [22]:
test.to_csv("SatisiKesfet_TestData_Daily.csv", index = False)

### Prediction

In [3]:
test = pd.read_csv("SatisiKesfet_TestData_Daily.csv")

In [7]:
tarih = pd.read_csv("data/Dim_Tarih.csv")

In [8]:
test = test.merge(tarih[["Gun","YilHafta"]],"left",["Gun"])

In [9]:
gc.collect()

14

In [10]:
weekly = pd.read_csv("weekly_preds_post.csv")

In [11]:
test = test.merge(weekly[["YilHafta","MagazaID","MerchAltGrupID","UrunKlasmanID"]],"left",["YilHafta","MagazaID","MerchAltGrupID","UrunKlasmanID"])

In [12]:
del test["YilHafta"]
gc.collect()

28

In [13]:
FEATS_EXCLUDED = ["Gun","MagazaID","MerchAltGrupID","UrunKlasmanID","SatisAdet","MerchGrup","MerchGrupID","MerchMarkaYasGrupID",
                  "BuyerGrupID","KlasmanGrupID","VucutBolge","KlasmanIklimStatu","gun_agg","SehirID","YilHafta"]
feats = [f for f in test.columns if f not in FEATS_EXCLUDED]

In [14]:
import pickle
with open(r"daily_lgbm_2018.pkl", "rb") as input_file:
    reg = pickle.load(input_file)

In [15]:
preds = reg.predict(test[feats], num_iteration=reg.best_iteration)

In [21]:
mape_valid = test[["Gun","MagazaID","MerchAltGrupID","UrunKlasmanID","OrtBirimFiyat","SezonGrup"]].copy()
mape_valid["pred"] = preds
mape_valid.loc[mape_valid["OrtBirimFiyat"]==0, "pred"] = 0
mape_valid.loc[mape_valid["pred"]<0, "pred"] = 0

In [22]:
del mape_valid["OrtBirimFiyat"]
gc.collect()

14

In [23]:
mape_valid.shape

(34464612, 6)

In [26]:
mape_valid.head()

Unnamed: 0,Gun,MagazaID,MerchAltGrupID,UrunKlasmanID,SezonGrup,pred
0,20180613,544,172,3073,Y,0.0
1,20180605,420,134,3599,K,0.0
2,20181126,477,164,3557,K,2.914444
3,20180827,490,16,3787,Y,2.643522
4,20181226,463,166,3771,K,3.744163


In [25]:
mape_valid["SezonGrup"] = mape_valid["SezonGrup"].map({1: "Y", 0: "K"})

In [27]:
mape_valid.to_csv("submission_2018.csv",index=False)

In [77]:
test[["Gun","MagazaID","MerchAltGrupID","UrunKlasmanID","SatisAdet","MerchGrup","MerchGrupID","MerchMarkaYasGrupID",
                  "BuyerGrupID","KlasmanGrupID","VucutBolge","KlasmanIklimStatu"]].nunique()

KeyError: "['SatisAdet' 'MerchGrup' 'BuyerGrupID' 'KlasmanGrupID' 'VucutBolge'\n 'KlasmanIklimStatu'] not in index"

In [59]:
imp_df = pd.read_csv("feature_importance_weekly.csv")
imp_df["gain"] = imp_df[["feature", "importance"]].groupby("feature").transform("mean")
imp_df = imp_df.drop(["fold","importance"],axis=1).drop_duplicates()

In [85]:
mape_valid.pred.sum()

106492161.02914762

In [76]:
test

Unnamed: 0,Gun,MagazaID,MerchAltGrupID,UrunKlasmanID,GunSonuReyonStok,GunSonuToplamStok,OrtBirimFiyat,OrtBirimMaliyet,IndirimOrani,IlkFiyattanSatisOrani,KarMarji,Markup,ModelSayisi,SezonGrup,GunSonuDepoStok,Yil,Ay,Hafta,HaftaninGunu,SehirID_x,MinimumSicaklik,OrtalamaSicaklik,MaksimumSicaklik,YagisMiktari,KarKalinligi,Yagmur,Kar,FarkSicaklik,MerchGrupID,MerchMarkaYasGrupID,UrunKlasmanID_num,BuyerGrupID_num,KlasmanGrupID_num,MagazaID_num,MerchGrupID_11,MerchGrupID_12,MerchGrupID_14,MerchGrupID_15,MerchMarkaYasGrupID_12,MerchMarkaYasGrupID_15,MerchMarkaYasGrupID_31,MerchMarkaYasGrupID_40,MerchMarkaYasGrupID_53,MerchMarkaYasGrupID_62,VucutBolge_Alt,VucutBolge_Tan?ms?z,VucutBolge_?st,VucutBolge_?stAlt,KlasmanIklimStatu_Baharl?k/D?? Giyim,KlasmanIklimStatu_Baharl?k/Mevsimlik,KlasmanIklimStatu_Di?er,KlasmanIklimStatu_K??l?k/Di?er,KlasmanIklimStatu_K??l?k/D?? Giyim,KlasmanIklimStatu_Yazl?k/K.Kol,KlasmanIklimStatu_Yazl?k/S?cak Yaz,OutletMi,SehirID_y,SehirID_100,SehirID_105,SehirID_110,SehirID_113,SehirID_114,SehirID_118,SehirID_126,SehirID_127,SehirID_132,SehirID_133,SehirID_154,SehirID_156,CografiBolge_AKDZ,CografiBolge_DAND,CografiBolge_EGEB,CografiBolge_GAND,CografiBolge_IAND,CografiBolge_KRDZ,CografiBolge_MARM,CografiBolge_TNSZ,CaddeAVM_Cadde,CaddeAVM_Mall- ?st? Kapal?,CaddeAVM_Mall-?st? A??k,bas_ANNELER GUNU,bas_BABALAR GUNU,bas_Black Friday,bas_DUNYA KADINLAR GUNU,bas_KURBAN BAYRAMI,bas_OKUL ACILISI,bas_OKUL KAPANISI,bas_RAMAZAN AYI,bas_RAMAZAN BAYRAMI,bas_SEVGILILER GUNU,bas_YARI YIL TATILI,bas_YIL BASI,bitis_KURBAN BAYRAMI,bitis_RAMAZAN AYI,bitis_RAMAZAN BAYRAMI,bitis_YARI YIL TATILI
0,20180613,544,172,3073,10,10,0.00,0.00,0.00,0.00,0.00,1.00,3,1,0,2018,6,24,3,110,17.620000,24.170000,28.740000,0.000000,0.000000,0.000000,0.000000,0.080000,15,62,0.421229,0.805046,1.727717,3.629945,0,0,0,1,0,0,0,0,0,1,1,0,0,0,0,0,1,0,0,0,0,0,110,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,-31,4,163,-97,69,96,-5,-28,2,-119,-144,-163,72,1,4,-129
1,20180605,420,134,3599,4,4,0.00,0.00,0.00,0.00,0.00,1.00,2,0,0,2018,6,23,2,100,18.300000,25.940000,31.360000,0.000000,0.000000,0.000000,0.000000,0.690000,14,40,1.230324,1.645025,0.823724,1.986201,0,0,1,0,0,0,0,1,0,0,0,0,0,1,0,1,0,0,0,0,0,1,100,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,-23,12,171,-89,77,104,3,-20,10,-111,-136,-155,80,9,12,-121
2,20181126,477,164,3557,35,35,39.99,16.52,0.20,0.00,0.55,-1.42,4,0,0,2018,11,48,1,127,-4.490000,4.620000,11.100000,0.000000,0.000000,1.000000,0.000000,4.020000,15,53,2.931385,3.245879,2.733727,1.951087,0,0,0,1,0,0,0,0,1,0,0,0,1,0,0,1,0,0,0,0,0,0,127,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,-197,-162,-3,-263,-97,-70,-171,-194,-164,-285,-310,-329,-94,-165,-162,-295
3,20180827,490,16,3787,104,105,39.95,21.55,0.11,0.50,0.42,-0.85,22,1,1,2018,8,35,1,110,22.080000,27.260000,30.600000,0.470000,0.000000,0.000000,0.000000,0.450000,12,12,4.552435,2.707800,3.283117,4.268838,0,1,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,110,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,-106,-71,88,-172,-6,21,-80,-103,-73,-194,-219,-238,-3,-74,-71,-204
4,20181226,463,166,3771,26,27,29.99,26.27,0.50,0.00,0.05,-0.14,11,0,1,2018,12,52,3,126,9.240000,12.910000,15.390000,28.450000,22.350000,0.000000,0.000000,-1.110000,15,53,1.849366,2.707800,2.626767,2.020010,0,0,0,1,0,0,0,0,1,0,0,0,1,0,0,1,0,0,0,0,0,1,126,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,-227,-192,-33,-293,-127,-100,-201,-224,-194,-315,-340,-359,-124,-195,-192,-325
5,20180703,409,134,3840,6,6,0.00,0.00,0.00,0.00,0.00,1.00,4,1,0,2018,7,27,2,110,20.310000,27.790000,31.600000,0.000000,0.000000,0.000000,0.000000,1.880000,14,40,2.803847,2.754867,2.813736,2.043310,0,0,1,0,0,0,0,1,0,0,1,0,0,0,0,0,1,0,0,0,0,0,110,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,-51,-16,143,-117,49,76,-25,-48,-18,-139,-164,-183,52,-19,-16,-149
6,20181118,150,14,3665,5,5,59.99,34.05,0.14,0.00,0.39,-0.76,2,0,0,2018,11,46,7,126,13.470000,16.940000,19.190000,6.910000,0.000000,1.000000,0.000000,5.617303,12,12,1.457138,4.904707,1.361794,3.341547,0,1,0,0,1,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,1,126,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,-189,-154,5,-255,-89,-62,-163,-186,-156,-277,-302,-321,-86,-157,-154,-287
7,20180919,400,166,3541,136,137,47.46,18.90,0.00,1.00,0.57,-1.51,13,0,1,2018,9,38,3,110,19.480000,22.890000,26.310000,0.000000,0.000000,0.000000,0.000000,0.070000,15,53,6.855783,3.245879,5.861113,3.171203,0,0,0,1,0,0,0,0,1,0,0,0,1,0,0,0,0,1,0,0,0,0,110,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,-129,-94,65,-195,-29,-2,-103,-126,-96,-217,-242,-261,-26,-97,-94,-227
8,20180904,338,164,3796,1,1,0.00,0.00,0.00,0.00,0.00,1.00,1,1,0,2018,9,36,2,127,15.690000,25.640000,33.540000,0.000000,0.000000,0.000000,0.000000,0.570000,15,53,1.128613,2.707800,2.490138,1.364397,0,0,0,1,0,0,0,0,1,0,0,0,1,0,0,1,0,0,0,0,0,0,127,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,-114,-79,80,-180,-14,13,-88,-111,-81,-202,-227,-246,-11,-82,-79,-212
9,20180530,409,164,3899,80,80,22.45,10.38,0.10,0.50,0.50,-1.16,15,1,0,2018,5,22,3,110,18.040000,21.890000,24.990000,0.510000,0.000000,0.000000,0.000000,1.200000,15,53,11.303794,4.904707,9.044559,2.043310,0,0,0,1,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,1,0,0,110,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,-17,18,177,-83,83,110,9,-14,16,-105,-130,-149,86,15,18,-115


In [70]:
test.MagazaID.nunique()

206

In [67]:
plt.figure(figsize=(14,25))
sns.barplot(x="gain", y="feature", data=imp_df.sort_values(by="gain",ascending=False)[:10])
plt.title('LightGBM Features (avg over folds)')
plt.tight_layout()
#plt.savefig('lgbm_importances_weekly.png')

In [35]:
gc.collect()

447

In [34]:
del weekly


In [33]:
whos

Variable                 Type                          Data/Info
----------------------------------------------------------------
FEATS_EXCLUDED           list                          n=15
KFold                    ABCMeta                       <class 'sklearn.model_selection._split.KFold'>
StratifiedKFold          ABCMeta                       <class 'sklearn.model_sel<...>._split.StratifiedKFold'>
accuracy_score           function                      <function accuracy_score at 0x7f15249e50d0>
confusion_matrix         function                      <function confusion_matrix at 0x7f15249e5158>
contextmanager           function                      <function contextmanager at 0x7f155d9ad730>
datetime                 module                        <module 'datetime' from '<...>b/python3.5/datetime.py'>
display_importances      function                      <function display_importances at 0x7f1520585ae8>
f1_score                 function                      <function f1_score at 0x7f15

### Model

In [None]:
# params optimized by optuna
params = {'task': 'test','objective': 'regression','metric': 'rmse','learning_rate': 0.05,'verbose': -1,'nthread':-1,
          'num_leaves': 10, 'min_data': 50, 'max_depth': 10, 'num_leaves': 31, 'min_data_in_leaf': 50, 'feature_fraction': 0.8,'bagging_fraction': 0.8,'bagging_freq': 2 }

In [None]:
FEATS_EXCLUDED = ["Gun","MagazaID","MerchAltGrupID","UrunKlasmanID","SatisAdet","MerchGrup","MerchGrupID","MerchMarkaYasGrupID",
                  "BuyerGrupID","KlasmanGrupID","VucutBolge","KlasmanIklimStatu","gun_agg","SehirID"]

In [None]:
feats = [f for f in test.columns if f not in FEATS_EXCLUDED]
lgb_test = lgb.Dataset(test.loc[test.Yil<2017, feats], label=test.loc[test.Yil<2017,"SatisAdet"],free_raw_data=False)
lgb_test = lgb.Dataset(test.loc[test.Yil==2017, feats], label=test.loc[test.Yil==2017,"SatisAdet"],free_raw_data=False)
reg = lgb.test(params,lgb_test,valid_sets=[lgb_test, lgb_test],valid_names=['test', 'valid'],num_boost_round=100000,early_stopping_rounds= 100,verbose_eval=100)
preds = reg.predict(test.loc[test.Yil==2017, feats], num_iteration=reg.best_iteration)
curr_rmse = sqrt(mean_squared_error(test.loc[test.Yil==2017,"SatisAdet"], preds))
curr_mae = mean_absolute_error(test.loc[test.Yil==2017,"SatisAdet"], preds)
print('Averall RMSE : %.6f' % (curr_rmse))
print('Averall MAE : %.6f' % (curr_mae))

In [None]:
mape_valid = test.loc[test.Yil==2017, ["OrtBirimFiyat","SatisAdet"]].copy()
mape_valid["pred"] = preds
mape_valid.loc[mape_valid["OrtBirimFiyat"]==0, "pred"] = 0
mape_valid2 = mape_valid[mape_valid.SatisAdet!=0]
curr_rmse = sqrt(mean_squared_error(mape_valid.SatisAdet, mape_valid.pred))
curr_mape = mape(mape_valid2.SatisAdet, mape_valid2.pred)
curr_mae = mean_absolute_error(mape_valid.SatisAdet, mape_valid.pred)
print('Averall RMSE : %.6f' % (curr_rmse))
print('Averall MAPE : %.6f' % (curr_mape))
print('Averall MAE : %.6f' % (curr_mae))

In [None]:
curr_mape = mape(sum(mape_valid.SatisAdet), sum(mape_valid.pred))
print('Averall MAPE : %.6f' % (curr_mape))

In [None]:
importance_df = pd.DataFrame()
importance_df["feature"] = feats
importance_df["importance"] = np.log1p(reg.feature_importance(importance_type='gain', iteration=reg.best_iteration))
importance_df.sort_values("importance",ascending=False)

### K-Fold

In [None]:
sub_preds, oof_preds, feature_importance_df, final_rmse = kfold_lightgbm(params,test_main, test_valid, num_folds=3, FEATS_EXCLUDED=FEATS_EXCLUDED)

In [None]:
mape_df = test_main[["OrtBirimFiyat","SatisAdet"]].copy()
mape_df["pred"] = oof_preds
mape_df.loc[mape_df["OrtBirimFiyat"]<=0, "pred"] = 0

In [None]:
sqrt(mean_squared_error(mape_df.SatisAdet, mape_df.pred))

In [None]:
mean_absolute_error(mape_df.SatisAdet, mape_df.pred)

In [None]:
mape_df2 = mape_df[mape_df.SatisAdet!=0]
mape(mape_df2.SatisAdet, mape_df2.pred)

In [None]:
mape_valid = test_valid[["OrtBirimFiyat","SatisAdet"]].copy()
mape_valid["pred"] = sub_preds
mape_valid.loc[mape_valid["OrtBirimFiyat"]<=0, "pred"] = 0

In [None]:
sqrt(mean_squared_error(mape_valid.SatisAdet, mape_valid.pred))

In [None]:
mean_absolute_error(mape_df.SatisAdet, mape_df.pred)

In [None]:
mape_valid2 = mape_valid[mape_valid.SatisAdet!=0]
mape(mape_valid2.SatisAdet, mape_valid2.pred)

In [None]:
feature_importance_df.head()

In [None]:
imp_select = feature_importance_df[["feature", "importance"]].groupby("feature").mean().sort_values(by="importance",ascending=False)
imp_select

In [None]:
# Dropping columns with highly frequent values
cols_to_drop = []
drop_values = []
for col in data_test.columns:
    temp = data_test[col].value_counts(dropna=False, normalize=1)
    if len(temp[temp>0.999].index)>0:
        cols_to_drop.append(col)
        drop_values.append(temp)  
data_test.drop(cols_to_drop, axis = 1, inplace = True)
print(f"These columns were dropped: {cols_to_drop} \n Since they have some values covering 99.9% of its all values")

In [None]:
corr_matrix = data_test.corr().abs()
corr=corr_matrix.unstack().sort_values(kind="quicksort")
corr=pd.DataFrame(data=corr,columns=['correlation'])
corr=corr[(corr.correlation>0.7) & (corr.correlation<1)].reset_index()
corr.drop_duplicates(subset=['correlation']).sort_values("correlation", ascending = False)

In [None]:
sub_df = test[["Month_of_Year","ISO_Week_of_ISO_Year","CV_UserID"]].copy()
sub_df["target"] = sub_preds_int

In [None]:
sub_df.head()

In [None]:
sub_df.to_csv("Dummies.csv",header=False,sep=";",index=False)