The aim is to predict ARTIS_DURUMU, which is a categorical variable with levels 0 and 1.

Target is imbalanced, so

* weights can be introduced
* a proper cutoff can be selected
* oversampling or undersampling can be applied

# Load Libraries & Read Data

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import os
# List every file found under the Kaggle input directory.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# Import at top level: when nested inside the loop above, `plt` is only
# bound if at least one input file exists.
import matplotlib.pyplot as plt
from geopy import distance
import seaborn as sns
%matplotlib inline
import lightgbm as lgbm
import xgboost as xgb
from sklearn import neighbors

from sklearn.model_selection import train_test_split, GridSearchCV, KFold
from sklearn.metrics import mean_squared_error, roc_auc_score, accuracy_score, recall_score, precision_score, f1_score
from sklearn.preprocessing import StandardScaler
import warnings
import json
import gc
pd.options.display.max_rows=300

In [None]:
# Load the sample submission, the test table and the train table from the
# competition input folder.
sample = pd.read_csv("../input/anadolu-hayat-emeklilik-datathon-coderspace/samplesubmission.csv")
test = pd.read_csv("../input/anadolu-hayat-emeklilik-datathon-coderspace/test.csv")
train = pd.read_csv("../input/anadolu-hayat-emeklilik-datathon-coderspace/train.csv")

In [None]:
# Combine the data sets so feature engineering is applied to both at once.
# ignore_index=True gives every row its own label; a plain concat keeps the
# labels of both inputs, so label-based selection/assignment becomes ambiguous.
dt = pd.concat([train, test], ignore_index=True)

# Lower-case the column names for convenience.
cols = [name.lower() for name in dt.columns]
dt.columns = cols

dt.info()

## Check Data _ preliminaries

* Check target distribution
* Check duplicates
* Check missing values
* Check unique values of categorical columns

In [None]:
### check target distribution
# value_counts() leaves out NaN by default, so rows without a target
# (presumably the appended test rows -- verify) are not counted here.
dt['artis_durumu'].value_counts()

In [None]:
### Duplicate check: row count versus number of distinct policy ids
n_rows = dt.shape[0]
print(n_rows)
# distinct policy ids (a missing id, if any, counts as one value)
n = dt['policy_id'].nunique(dropna=False)
print(n)

# data is not duplicated , good

In [None]:
### Check missing values
# Are columns entirely missing in either set, or complete in training but
# missing in test?

# Same columns have missing values in both training and test sets.
# Both results are printed: a notebook cell only displays its last expression,
# so the train result was previously computed and thrown away.
print(train.isna().any()[lambda x: x])
print(test.isna().any()[lambda x: x])

In [None]:
### Inspect the levels of every string (object-dtype) column
string_cols = dt.columns[(dt.dtypes == object).to_numpy()]
string_cols

for ix in string_cols:
    column_values = dt[ix]
    print(ix)
    print(pd.unique(column_values))      # distinct levels
    print(column_values.value_counts())  # level frequencies
    print("-------------")

**Suggestions for categorical features**
* sozlesme_kokeni: 'NEW' dominates
    * sozlesme_kokeni_NEW: boolean
    * frequency encoding
    
    
* sozlesme_kokeni_detay: 'NEW' dominates
    * sozlesme_kokeni_detay_NEW: boolean
    * frequency encoding


* baslangic_tarihi : look for distributions over years and months (obtain year-month col)

* kapsam_tipi: group low density pensions types / also check for its numeric
    * frequency encoding
    
    
* kapsam_grubu : gruplanmamis dominates
    * 
    
    
* dagitim_kanali : group most frequent ones
    * 
    
    
* police_sehir: group most frequent ones
    * 
    
    
* uyruk:
    * uyruk_TR:boolean
    
    
* meslek & meslek_kirilim
    * 
    
    
* yatirim karakteri: nice to have <3, but lots of na
    * one-hot encoding
    
    
* medeni_hal: group single vs married / or also add widowed
    * one-hot encoding
    
    
* egitim durumu: ok , but maybe group higher education
    * ordinal encoding


# Playing with Columns

In [None]:
# Clean the 'gelir' column.
# Entries containing a comma have every non-word character stripped and are
# then scaled by 100; finally the whole column is cast to float.
char_gelir = dt['gelir'].str.contains(",", regex=False, na=False)

# regex=True with a raw pattern: the default of `regex` changed between pandas
# versions, and with regex=False '\W' would be searched for literally.
cleaned_gelir = dt.loc[char_gelir, 'gelir'].str.replace(r'\W', '', regex=True)

# Convert to numbers BEFORE multiplying: on strings `* 100` repeats the text
# 100 times ('15' -> '1515...15') instead of scaling the value.
# .to_numpy() makes the assignment positional (no index alignment needed).
dt.loc[char_gelir, 'gelir'] = (pd.to_numeric(cleaned_gelir) * 100).to_numpy()

dt['gelir'] = dt['gelir'].astype(float)

dt["gelir"].describe([0.0001, 0.0005, 0.01, 0.05, 0.20, 0.25,0.4,0.5,0.6,0.75, 0.8, 0.95, 0.96, 0.97, 0.98, 0.99, 0.9995, 0.9999]).astype(float)

In [None]:
# inf values
#dt['gelir'].idxmax()
#dt.iloc[323636]

In [None]:
# Tidy 'gelir' in three ordered steps: floor at 0, cap at 21000, then
# multiply whatever is still below 100 by 10 (probably wrong records).
gelir = dt['gelir']
gelir = gelir.mask(gelir < 0, 0)            # nonnegative
gelir = gelir.mask(gelir > 21000, 21000)    # upper cap
dt['gelir'] = gelir.mask(gelir < 100, gelir * 10)

dt["gelir"].describe([0.0001, 0.0005, 0.01, 0.05, 0.20, 0.25,0.4,0.5,0.6,0.75, 0.8, 0.95, 0.96, 0.97, 0.98, 0.99, 0.9995, 0.9999]).astype(float)

In [None]:
# cocuk_sayisi: values above 5 are capped at 5
#plt.hist(dt["cocuk_sayisi"])
dt['cocuk_sayisi'] = dt['cocuk_sayisi'].mask(dt['cocuk_sayisi'] > 5, 5)

In [None]:
# Features derived from baslangic_tarihi.
start_date = pd.to_datetime(dt["baslangic_tarihi"])  # parse once, reuse below
dt["year"] = start_date.dt.year
dt["month"] = start_date.dt.month

# Integer made of the date text with every non-word character removed.
# regex=True with a raw pattern: the default of `regex` changed between pandas
# versions, and with regex=False '\W' would be searched for literally.
dt['yearmonth'] = dt['baslangic_tarihi'].str.replace(r'\W', '', regex=True)
dt['yearmonth'] = dt['yearmonth'].astype('int')

In [None]:
# month: January is recoded from 1 to 13 (month 1 & 12 are popular months)
dt['month'] = dt['month'].where(dt['month'] != 1, 13)

In [None]:
# Give the truncated February payment column the same suffix as the other months.
dt = dt.rename(columns={'subat_odenen_tu': 'subat_odenen_tutar'})

In [None]:
# define columns
months = ["ocak","subat","mart","nisan","mayis","haziran","temmuz","agustos","eylul","ekim","kasim","aralik"]
vade_cols = [str(i)+'_vade_tutari' for i in months]
odenen_cols = [str(i)+'_odenen_tutar' for i in months]

# Monthly payment ratio: amount paid / amount due.
# NOTE(review): a due amount of 0 gives inf or NaN here; left unchanged.
for ix in months:
    dt[ix + '_odeme_orani'] = 1.00*dt[ix + '_odenen_tutar'] / dt[ix + '_vade_tutari']

# Row-wise summary of the twelve due amounts.
dt['average_vade_tutari'] = dt[vade_cols].mean(axis=1)
dt['min_vade_tutari'] = dt[vade_cols].min(axis=1)
dt['max_vade_tutari'] = dt[vade_cols].max(axis=1)

dt['change_in_vade_tutari'] = 1.00*(dt['max_vade_tutari'] - dt['min_vade_tutari'])/dt['min_vade_tutari']

# Income divided by the average due amount, missing values filled with the mean.
# +/-inf (division by a zero average) is turned into NaN first; otherwise the
# mean used for filling would itself be infinite.
# The result is assigned back instead of calling fillna(inplace=True) on
# dt[col], which acts on a temporary object under pandas copy-on-write.
vade_vs_gelir = (1.00*dt['gelir']/dt['average_vade_tutari']).replace([np.inf, -np.inf], np.nan)
dt['vade_vs_gelir'] = vade_vs_gelir.fillna(vade_vs_gelir.mean())

# A month counts as paid when the payment exceeds 99% of the due amount.
for ix in months:
    ispaid = np.where(((dt[ix + '_vade_tutari']*0.01 + dt[ix + '_odenen_tutar']) - dt[ix + '_vade_tutari']) > 0, 1, 0)
    dt[ix + '_ispaid'] = ispaid

# number of months not paid
ispaid_cols = [col for col in dt.columns if 'ispaid' in col]
dt['toplam_notpaid'] = 12 - dt[ispaid_cols].sum(axis=1)

In [None]:
## PART 2 _ If you are bothered with outliers, then
# vade cols correction 1
# correct very large values: ex : policy_id 7981587
# correct ex :5177519
# Amounts above 100000 are divided by 1000.
dt[vade_cols] = np.where(dt[vade_cols] > 100000, dt[vade_cols]/1000, dt[vade_cols])
dt[odenen_cols] = np.where(dt[odenen_cols] > 100000, dt[odenen_cols]/1000, dt[odenen_cols])

# IQR outlier handling for the odenen_tutar columns.
# Quartiles are taken per row (across the twelve monthly payments); axis=1
# gives the same Series as transposing first, without building the transpose.
Q1 = dt[odenen_cols].quantile(0.25, axis=1)
Q3 = dt[odenen_cols].quantile(0.75, axis=1)
IQR = Q3-Q1

lower = np.where((Q1 - 1.5*IQR)<0,0,Q1 - 1.5*IQR)  # lower fence, floored at 0
upper = Q3 + 1.5*IQR

# Clamp every payment column to the row's [lower, upper] fences.
for j in odenen_cols:
    dt[j] = np.where(dt[j]< lower, lower,dt[j])
    dt[j] = np.where(dt[j]> upper, upper,dt[j])

# odenen correction after the steps above:
# a payment more than 10% above the due amount is set to the due amount
for j in range(12):
    dt[odenen_cols[j]] = np.where(dt[odenen_cols[j]]>(1.1*dt[vade_cols[j]]),dt[vade_cols[j]],dt[odenen_cols[j]])

#update statistical columns
dt['average_vade_tutari'] = dt[vade_cols].mean(axis=1)
dt['min_vade_tutari'] = dt[vade_cols].min(axis=1)
dt['max_vade_tutari'] = dt[vade_cols].max(axis=1)

dt['change_in_vade_tutari'] = 1.00*(dt['max_vade_tutari'] - dt['min_vade_tutari'])/dt['min_vade_tutari']

# Income divided by the average due amount, missing values filled with the mean.
# +/-inf is turned into NaN first so the mean stays finite, and the result is
# assigned back instead of calling fillna(inplace=True) on dt[col], which acts
# on a temporary object under pandas copy-on-write.
vade_vs_gelir = (1.00*dt['gelir']/dt['average_vade_tutari']).replace([np.inf, -np.inf], np.nan)
dt['vade_vs_gelir'] = vade_vs_gelir.fillna(vade_vs_gelir.mean())

# add new columns
dt['total_vade_tutari'] = dt[vade_cols].sum(axis=1)
dt['sd_vade_tutari']= dt[vade_cols].std(axis=1)
dt['total_odenen_tutar'] = dt[odenen_cols].sum(axis=1)
dt['sd_odenen_tutar']= dt[odenen_cols].std(axis=1)
#dt[dt['total_odenen_tutar']>dt['total_vade_tutari']]
#dt[dt['total_odenen_tutar']== 0]['artis_durumu'].value_counts()

dt['odeme_ratio'] = 1.00*dt['total_odenen_tutar']/dt['total_vade_tutari']

In [None]:
## General behavior of the policy owner
# For every month-to-month step, count how the due amount moved
# (PD: decreased, PY: increased, PA: unchanged) combined with whether that
# month's due amount was covered (ODEDI) or not (OMEDI).
k = vade_cols + odenen_cols + ['policy_id']
sub_dt = dt[k].copy()

# one integer counter per (movement, payment) combination
n_rows = sub_dt.shape[0]
PD_OMEDI = np.zeros(n_rows, dtype=int)
PD_ODEDI = np.zeros(n_rows, dtype=int)
PY_OMEDI = np.zeros(n_rows, dtype=int)
PY_ODEDI = np.zeros(n_rows, dtype=int)
PA_OMEDI = np.zeros(n_rows, dtype=int)
PA_ODEDI = np.zeros(n_rows, dtype=int)

for prev_vade, this_vade, this_odenen in zip(vade_cols, vade_cols[1:], odenen_cols[1:]):
    step = sub_dt[this_vade] - sub_dt[prev_vade]
    # -1: due amount fell, 1: rose, 0: otherwise
    vade = np.select([step < 0, step > 0], [-1, 1], default=0)
    # -1: due amount exceeds 1.01 x the payment (not covered), 1: covered
    odeme = np.where(sub_dt[this_vade] - (1.01*sub_dt[this_odenen]) > 0, -1, 1)

    not_covered = odeme == -1
    covered = odeme == 1

    PD_OMEDI += (vade == -1) & not_covered
    PD_ODEDI += (vade == -1) & covered

    PY_OMEDI += (vade == 1) & not_covered
    PY_ODEDI += (vade == 1) & covered

    PA_OMEDI += (vade == 0) & not_covered
    PA_ODEDI += (vade == 0) & covered

# keep only the id and the six counters, then join them back onto dt
sub_dt = sub_dt[['policy_id']].assign(
    PD_OMEDI=PD_OMEDI,
    PD_ODEDI=PD_ODEDI,
    PY_OMEDI=PY_OMEDI,
    PY_ODEDI=PY_ODEDI,
    PA_OMEDI=PA_OMEDI,
    PA_ODEDI=PA_ODEDI,
)
dt = dt.merge(sub_dt, on="policy_id", how='left')

In [None]:
# Numeric part of kapsam_tipi: the text with 'PENSION' removed, cast to int.
dt['kapsam_tipi_numeric'] = dt['kapsam_tipi'].str.replace('PENSION', '', regex=False)
dt['kapsam_tipi_numeric'] = dt['kapsam_tipi_numeric'].astype('int')

# Absolute and relative change of the account value over the year.
dt['hesap_degeri_degisimi'] = dt['sene_sonu_hesap_degeri'] - dt['sene_basi_hesap_degeri']
dt['hesap_degeri_degisimi_perc'] = (dt['sene_sonu_hesap_degeri'] - dt['sene_basi_hesap_degeri']) / dt['sene_basi_hesap_degeri']

# Assigned back instead of fillna(inplace=True) on dt[col], which acts on a
# temporary object under pandas copy-on-write.
# NOTE(review): a zero opening value with a non-zero change gives +/-inf,
# which fillna does not touch.
dt['hesap_degeri_degisimi_perc'] = dt['hesap_degeri_degisimi_perc'].fillna(0)

**Represent the trend in odenen tutar and vade tutari over months**

In [None]:
from sklearn.linear_model import LinearRegression

def find_trend(ys):
    """Return the least-squares slope of *ys* against 0, 1, ..., len(ys) - 1.

    This is the slope an ordinary linear regression with intercept yields,
    computed in closed form (cov(x, y) / var(x)) so no estimator has to be
    fitted for every row.

    Parameters
    ----------
    ys : sequence of numbers
        Equally spaced observations. Any length of two or more is accepted;
        the monthly callers pass twelve values.

    Returns
    -------
    float
        The slope per step. NaN if *ys* contains NaN.

    Raises
    ------
    ValueError
        If *ys* is not one-dimensional or has fewer than two values.
    """
    ys = np.asarray(ys, dtype=float)
    if ys.ndim != 1 or ys.size < 2:
        raise ValueError("find_trend needs a 1-D sequence with at least two values")
    xs = np.arange(ys.size, dtype=float)
    x_centered = xs - xs.mean()
    # slope = sum((x - mean_x) * (y - mean_y)) / sum((x - mean_x) ** 2)
    return float(x_centered @ (ys - ys.mean()) / (x_centered @ x_centered))

In [None]:
#1 slope of the due amounts across the months
dt['trend_vade'] = dt[vade_cols].apply(find_trend, axis=1)

#2 slope of the gap between due amount and payment
new_cols = [str(i)+'_vade_odenen_fark' for i in months]
for fark_col, vade_col, odenen_col in zip(new_cols, vade_cols, odenen_cols):
    dt[fark_col] = dt[vade_col] - dt[odenen_col]

dt['trend_vade_odenen_fark'] = dt[new_cols].apply(find_trend, axis=1)

In [None]:
# office score: a policy's average due amount relative to its office's average
office_dt = dt.groupby('office_id')[vade_cols].mean().reset_index()
# Average the vade columns only: after reset_index the frame also holds
# office_id, which a bare .mean(axis=1) would fold into the average.
office_dt['office_mean_vade_tutari'] = office_dt[vade_cols].mean(axis=1)
office_dt = office_dt[["office_id",'office_mean_vade_tutari']].copy()
dt = dt.merge(office_dt, on="office_id", how='left')
dt['vade_in_office'] = dt['average_vade_tutari']/dt['office_mean_vade_tutari']

In [None]:
#dt.info()

# Imputation & Encoding

In [None]:
# Columns that still contain at least one missing value
has_na = dt.isna().any()
has_na[has_na]
#dt.head().T

In [None]:
def _most_common(values):
    """Return the most frequent non-null value of *values*, or NaN if there is none."""
    counts = values.value_counts()
    return counts.index[0] if len(counts) else np.nan


# Every fill below is assigned back to the column. Calling
# fillna/replace(inplace=True) on dt[col] acts on a temporary object under
# pandas copy-on-write and leaves dt unchanged.

#dt[(dt['sozlesme_kokeni'] == "NEW") & (dt['sozlesme_kokeni_detay'].isnull == True)]
## empty, then
dt['sozlesme_kokeni_detay'] = dt['sozlesme_kokeni_detay'].fillna("other")

## only a few missing - use the dominant category
dt['dagitim_kanali'] = dt['dagitim_kanali'].fillna("Kanal4 + Kanal2")
dt['uyruk'] = dt['uyruk'].fillna("TR")
dt['musteri_segmenti'] = dt['musteri_segmenti'].fillna(106)

## highly missing
dt['yatirim_karakteri'] = dt['yatirim_karakteri'].fillna("other")

## median impute
dt['cocuk_sayisi'] = dt['cocuk_sayisi'].fillna(dt['cocuk_sayisi'].median())

## reasoning
#1 police_sehir: most common city of the same office, then the overall most common city
# _most_common returns NaN for an office without any known city instead of
# failing on an empty value_counts().
office_vs_sehir = dt.groupby("office_id")["police_sehir"].agg(_most_common).reset_index()
office_vs_sehir = office_vs_sehir.rename(columns={"police_sehir":"most_common_sehir"})

dt = dt.merge(office_vs_sehir, how="left",on="office_id")
dt['police_sehir'] = dt['police_sehir'].fillna(dt['most_common_sehir'])

most_common_sehir = dt["police_sehir"].value_counts().index[0]
dt['police_sehir'] = dt['police_sehir'].fillna(most_common_sehir)

#2 meslek: overall most common value
most_common_meslek = dt["meslek"].value_counts().index[0]
dt['meslek'] = dt['meslek'].fillna(most_common_meslek)

#3 meslek_kirilim: most common value within the same meslek, then overall
meslek_vs_kirilim = dt.groupby("meslek")["meslek_kirilim"].agg(_most_common).reset_index()
meslek_vs_kirilim = meslek_vs_kirilim.rename(columns={"meslek_kirilim":"most_common_kirilim"})

dt = dt.merge(meslek_vs_kirilim, how="left",on="meslek")
dt['meslek_kirilim'] = dt['meslek_kirilim'].fillna(dt['most_common_kirilim'])

most_common_kirilim = dt["meslek_kirilim"].value_counts().index[0]
dt['meslek_kirilim'] = dt['meslek_kirilim'].fillna(most_common_kirilim)

## combining levels
#1 medeni_hal -> 0 / 1 / 2, with "Other" turned into a missing value
medeni_hal_upd = {"Divorced": "2",
                  "Marriage Cancelled": "0",
                  "Married": "1",
                  "Other": np.nan,  # np.NaN was removed in NumPy 2.0
                  "Single":"0",
                  "Widowed":"2"}
dt['medeni_hal'] = dt['medeni_hal'].replace(medeni_hal_upd).astype('float')

#2 egitim_durum -> ordinal codes 0..6
egitim_durum_upd = {"(Di?er)": "0",
                    "?lkö?retim": "1",
                    "Lise": "2",
                    "Önlisans": "3",
                    "Lisans": "4",
                    "Yüksek Lisans": "5",
                    "Doktora":"6"}
dt['egitim_durum'] = dt['egitim_durum'].replace(egitim_durum_upd).astype('float')

In [None]:
#from fancyimpute import MICE #MICE() is deprecated

from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

# Columns handed to the iterative imputer (the ones to fill plus the ones
# used as predictors).
selected_columns_for_imputation = ['year','dogum_tarihi','cinsiyet','memleket','medeni_hal','musteri_segmenti','egitim_durum','gelir','cocuk_sayisi','sene_basi_hesap_degeri','sene_sonu_hesap_degeri']
# Work on a deep copy so dt itself is not modified in this cell.
MiceImputed = dt[selected_columns_for_imputation].copy(deep=True)
mice_imputer = IterativeImputer()
# Overwrite the copy with the imputed matrix, keeping its labels.
# NOTE(review): imputed medeni_hal / egitim_durum values can be fractional.
MiceImputed.iloc[:, :]= mice_imputer.fit_transform(MiceImputed)

In [None]:
# Show the first rows of the imputed copy.
MiceImputed.head()

In [None]:
# NA columns
#dt.isna().any()[lambda x: x]
# Take the imputed value only where the original one is missing. The result is
# assigned back: fillna(inplace=True) on dt[col] acts on a temporary object
# under pandas copy-on-write and would leave dt unchanged.
for imputed_col in ('medeni_hal', 'egitim_durum', 'gelir'):
    dt[imputed_col] = dt[imputed_col].fillna(MiceImputed[imputed_col])

## Other Encoding

### Boolean Encoding

In [None]:
#1 flag: contract origin equals "NEW" (1.0) or anything else (0.0)
dt['sozlesme_kokeni_NEW'] = (dt['sozlesme_kokeni'] == "NEW").astype(float)

pd.crosstab(dt['sozlesme_kokeni_NEW'],dt['artis_durumu'])

In [None]:
#2 flag: contract origin detail equals "NEW" (1.0) or anything else (0.0)
dt['sozlesme_kokeni_detay_NEW'] = (dt['sozlesme_kokeni_detay'] == "NEW").astype(float)

pd.crosstab(dt['sozlesme_kokeni_detay_NEW'],dt['artis_durumu'])

In [None]:
#3 flag: nationality equals "TR" (1.0) or anything else (0.0)
dt['uyruk_TR'] = (dt['uyruk'] == "TR").astype(float)

pd.crosstab(dt['uyruk_TR'],dt['artis_durumu'])

In [None]:
# Stacked histogram of 'year', one series per target level.
dt.pivot(columns='artis_durumu', values='year').plot(kind = 'hist', stacked=True)

In [None]:
# Stacked histogram of 'month', one series per target level.
dt.pivot(columns='artis_durumu', values='month').plot(kind = 'hist', stacked=True)

In [None]:
# Display the education-level -> code mapping defined earlier.
egitim_durum_upd

### one-hot encodings

* medeni_hal
* yatirim_karakteri (without other)

In [None]:
# medeni_hal holds the codes 0/1/2 plus imputed values that may be fractional.
# Round to the nearest code and keep it inside 0..2 before the integer cast;
# a bare astype(int) truncates toward zero (e.g. 1.9 -> 1).
dt['medeni_hal'] = dt['medeni_hal'].round().clip(0, 2).astype(int)

In [None]:
from sklearn.preprocessing import OneHotEncoder
onehot_cols = ['medeni_hal', 'yatirim_karakteri']

# Fit the encoder and build the dense indicator matrix in one step.
enc = OneHotEncoder()
onehot_matrix = enc.fit_transform(dt[onehot_cols]).toarray()

# Name the indicator columns after the encoder's own feature names.
onehot_names = enc.get_feature_names_out(['medeni_hal', 'yatirim_karakteri'])
encoded_colm = pd.DataFrame(onehot_matrix, columns=onehot_names)

# append the indicator columns to dt
dt = pd.concat([dt, encoded_colm], axis = 1)

In [None]:
#drop columns
#dt.drop(columns=['medeni_hal_0', 'medeni_hal_1'] , inplace =True)

### Hash encodings with high cardinality

https://towardsdatascience.com/smarter-ways-to-encode-categorical-data-for-machine-learning-part-1-of-3-6dca2f71b159

In [None]:
import category_encoders as ce

In [None]:
# kapsam_tipi: hash-encode into 12 components
ce_hash = ce.HashingEncoder(cols = ['kapsam_tipi'],n_components=12)
hashing_output = ce_hash.fit(dt).transform(dt)

# keep only the generated 'col_*' columns and give them readable names
output_cols = [name for name in hashing_output.columns if 'col_' in name]
hash_cols = ["Kapsam_Tipi_" + str(i) for i in range(1,13)]
hashing_output = hashing_output[output_cols].set_axis(hash_cols, axis=1)

dt = pd.concat([dt, hashing_output], axis = 1)

In [None]:
# police_sehir: hash-encode into 12 components
ce_hash = ce.HashingEncoder(cols = ['police_sehir'],n_components=12)
hashing_output = ce_hash.fit(dt).transform(dt)

# keep only the generated 'col_*' columns and give them readable names
output_cols = [name for name in hashing_output.columns if 'col_' in name]
hash_cols = ["police_sehir_" + str(i) for i in range(1,13)]
hashing_output = hashing_output[output_cols].set_axis(hash_cols, axis=1)

dt = pd.concat([dt, hashing_output], axis = 1)

### Clustering for Meslek

In [None]:
# Per-occupation profile: mean of five money columns plus the policy count.
mean_cols = [
    "gelir",
    "sene_basi_hesap_degeri",
    "sene_sonu_hesap_degeri",
    "average_vade_tutari",
    "max_vade_tutari",
]
agg_spec = {name: "mean" for name in mean_cols}
agg_spec["policy_id"] = "count"
meslek_based = dt.groupby("meslek").agg(agg_spec).reset_index()

In [None]:
# Show the first rows of the per-occupation profile.
#meslek_based['policy_id'].describe()
meslek_based.head()
# (disabled) filter on the policy count and sort by income:
#meslek_based = meslek_based[meslek_based["policy_id"] > meslek_based["policy_id"].quantile(0)].sort_values(by="gelir", ascending = False)

In [None]:
from sklearn import preprocessing
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

In [None]:
# Feature matrix: every profile column except the key and the policy count.
X = meslek_based.drop(columns=['meslek', 'policy_id'])
scaler = preprocessing.StandardScaler().fit(X)
X_scaled = scaler.transform(X)

kmeans_kwargs = dict(init="random", n_init=10, max_iter=300, random_state=42)

# SSE (inertia) for k = 1..20, fitted on the standardised features
sse = []
for k in range(1, 21):
    kmeans = KMeans(n_clusters=k, **kmeans_kwargs).fit(X_scaled)
    sse.append(kmeans.inertia_)

In [None]:
# Elbow plot: SSE against the number of clusters.
plt.style.use("fivethirtyeight")
cluster_counts = range(1, 21)
plt.plot(cluster_counts, sse)
plt.xticks(cluster_counts)
plt.xlabel("Number of Clusters")
plt.ylabel("SSE")
plt.show()

In [None]:
# Final clustering with 7 clusters.
# Fit on the standardised matrix, the same one the elbow scan used; on the
# raw X the column with the largest scale dominates the distances.
# Consequence: kmeans.cluster_centers_ is expressed in standardised units.
kmeans = KMeans(
    init="random",
    n_clusters=7,
    n_init=10,
    max_iter=300,
    random_state=42
).fit(X_scaled)

In [None]:
#cluster_summary
# Per cluster: the count of the 'cluster' column and the sum of policy_id,
# where policy_id is the per-meslek policy count held in meslek_based.
# NOTE(review): the labels and meslek_based are joined by position/default
# index -- assumes meslek_based still has the 0..n-1 index from reset_index.
pd.concat([pd.DataFrame(kmeans.labels_, columns = ["cluster"]),meslek_based["policy_id"]], axis=1).groupby("cluster").agg({
    "cluster": "count",
    "policy_id":"sum"
})

In [None]:
# cluster means
# One row per cluster centre, labelled with the column names of X; the
# values are in the units of the matrix the model was fitted on.
meslek_clusters = pd.DataFrame(kmeans.cluster_centers_, columns = X.columns)
# Displayed sorted by the 'gelir' coordinate; meslek_clusters keeps its order.
meslek_clusters.sort_values("gelir",ascending = False)

In [None]:
# Pair every occupation with its cluster label and join the label onto dt.
meslek_clusters = meslek_based[["meslek"]].assign(meslek_cluster=kmeans.labels_)
dt = dt.merge(meslek_clusters, on = "meslek", how="left")

In [None]:
#meslek_clusters.sort_values("meslek_cluster")

### frequency encodings

In [None]:
# share of all rows taken by each meslek_kirilim level
fq = dt['meslek_kirilim'].value_counts(sort=False) / len(dt)
# replace each level by its share
dt["{}_encoded".format('meslek_kirilim')] = dt['meslek_kirilim'].map(fq)
## drop original column.
#df = df.drop(['nom_0'], axis = 1)
#fq.plot.bar(stacked = True) 
#df.head(10)

### target encodings

In [None]:
from category_encoders import TargetEncoder

In [None]:
# dagitim_kanali: target encoding fitted on the rows that have a target
labelled = dt['artis_durumu'].notna()
target_X = dt.loc[labelled, 'dagitim_kanali']
target_y = dt.loc[labelled, 'artis_durumu']
ce_target = TargetEncoder(cols=['dagitim_kanali'], smoothing=8, min_samples_leaf=5)
ce_target.fit(target_X, target_y)

# encode every row (train and test) with the fitted encoder
ce_output = ce_target.transform(dt['dagitim_kanali'].reset_index(drop=True))
dt['dagitim_kanali_encoded'] = ce_output['dagitim_kanali']

#ce_output.value_counts()
pd.crosstab(dt['kapsam_grubu'],dt['artis_durumu'])

In [None]:
# kapsam_grubu: target encoding fitted on the rows that have a target
labelled = dt['artis_durumu'].notna()
target_X = dt.loc[labelled, 'kapsam_grubu']
target_y = dt.loc[labelled, 'artis_durumu']
ce_target = TargetEncoder(cols=['kapsam_grubu'], smoothing=8, min_samples_leaf=5)
ce_target.fit(target_X, target_y)

# encode every row (train and test) with the fitted encoder
ce_output = ce_target.transform(dt['kapsam_grubu'].reset_index(drop=True))
dt['kapsam_grubu_encoded'] = ce_output['kapsam_grubu']

In [None]:
# kapsam tipi
# Keep the 10 most frequent levels as they are and label the rest "other".
# The levels are read from the value_counts index: the former
# .to_frame().reset_index()['index'] depends on a column name that pandas 2
# no longer produces.
focus_kapsam = dt['kapsam_tipi'].value_counts().index[:10].to_list()
dt['kapsam_tipi_grouped'] = dt['kapsam_tipi'].where(dt['kapsam_tipi'].isin(focus_kapsam), "other")

from sklearn.preprocessing import OneHotEncoder
onehot_cols = ['kapsam_tipi_grouped']

enc = OneHotEncoder()
encoded_colm = enc.fit_transform(dt[onehot_cols]).toarray()

# One name per encoded column. The count follows the encoder output instead
# of a hard-coded 11, which fails when fewer than 11 levels exist.
col_ = ["kapsam_tipi_grouped_" + str(i) for i in range(encoded_colm.shape[1])]
encoded_colm = pd.DataFrame(encoded_colm, columns = col_)

# append the indicator columns to dt
dt = pd.concat([dt, encoded_colm], axis = 1)

# one-hot encoding


In [None]:
# Keep the 50 most frequent offices as they are and put the rest in group 0.
# The ids are read from the value_counts index: the former
# .to_frame().reset_index()['index'] depends on a column name that pandas 2
# no longer produces.
focus_kapsam = dt['office_id'].value_counts().index[:50].to_list()
dt['office_id_grouped'] = dt['office_id'].copy()
dt.loc[~dt['office_id_grouped'].isin(focus_kapsam),'office_id_grouped'] = 0

from category_encoders import TargetEncoder
# Target-encode the grouped office id, fitted on the rows that have a target.
target_X = dt[dt['artis_durumu'].isnull() == False]['office_id_grouped']
target_y = dt[dt['artis_durumu'].isnull() == False]['artis_durumu']
ce_target = TargetEncoder(cols=['office_id_grouped'], smoothing=8, min_samples_leaf=5).fit(target_X,target_y)

# encode every row (train and test) with the fitted encoder
ce_output = ce_target.transform(dt['office_id_grouped'].reset_index(drop=True))
dt['office_id_grouped_encoded'] = ce_output['office_id_grouped']

# target encoding

### Updating Numerical columns

* sigorta_tip
* dogum_tarihi
* musteri_segmenti

In [None]:
# Cast the segment code to integer (missing values were filled with 106 earlier).
dt['musteri_segmenti'] = dt['musteri_segmenti'].astype(int)

In [None]:
onehot_cols = ['sigorta_tip', 'musteri_segmenti']

# Fit the encoder and build the dense indicator matrix in one step.
enc = OneHotEncoder()
onehot_matrix = enc.fit_transform(dt[onehot_cols]).toarray()
encoded_colm = pd.DataFrame(onehot_matrix, columns=enc.get_feature_names_out(onehot_cols))

# append the indicator columns to dt
dt = pd.concat([dt, encoded_colm], axis = 1)

**Check Distributions**

In [None]:
# Distribution of sigorta_tip: histogram, level counts, and counts by target.
print(plt.hist(dt["sigorta_tip"]))
# printed explicitly: a bare expression in the middle of a cell shows nothing
print(dt.sigorta_tip.value_counts())
print(pd.crosstab(dt['sigorta_tip'],dt['artis_durumu']))

In [None]:
# Distribution of musteri_segmenti: histogram, level counts, and counts by target.
print(plt.hist(dt["musteri_segmenti"]))
# printed explicitly: a bare expression in the middle of a cell shows nothing
print(dt.musteri_segmenti.value_counts())
print(pd.crosstab(dt['musteri_segmenti'],dt['artis_durumu']))

In [None]:
# Draw a histogram of dogum_tarihi and print the tuple plt.hist returns.
print(plt.hist(dt["dogum_tarihi"]))

In [None]:
# Overlaid density histograms of dogum_tarihi, one per target level.
kwargs = dict(alpha=0.5, bins=100, density=True, stacked=True)

x1 = dt.loc[dt['artis_durumu'] == 1, 'dogum_tarihi']
x2 = dt.loc[dt['artis_durumu'] == 0, 'dogum_tarihi']

for values, colour, level in ((x1, 'g', '1'), (x2, 'b', '0')):
    plt.hist(values, color=colour, label=level, **kwargs)

plt.gca().set(title='Frequency Histogram ', ylabel='Frequency')
plt.xlim(1919,2002)
plt.legend();

In [None]:
#dt.to_csv("processed_data.csv", index=False)

# Modeling

In [None]:
#from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from lightgbm import early_stopping,log_evaluation
#from imblearn.over_sampling import SMOTE,KMeansSMOTE
from sklearn.preprocessing import StandardScaler


# Random seed constant (where it is used is not visible in this part of the file).
SEED=22

In [None]:
# Reload the sample submission and replace dt with a previously saved,
# already processed table.
sample = pd.read_csv("../input/anadolu-hayat-emeklilik-datathon-coderspace/samplesubmission.csv")
dt = pd.read_csv("../input/processed-data2/processed_data2.csv") # already prepared processed data

In [None]:
# new features about artis_durumu

# Did the due amount rise by more than 15% within the first three months?
dt['artis_durumu_2020'] = np.where(
    (1.15*dt['ocak_vade_tutari'] < dt['subat_vade_tutari'])
    | (1.15*dt['ocak_vade_tutari'] < dt['mart_vade_tutari'])
    | (1.15*dt['subat_vade_tutari'] < dt['mart_vade_tutari']),
    1, 0)

# number of month-over-month increases above 20%
months = ["ocak","subat","mart","nisan","mayis","haziran","temmuz","agustos","eylul","ekim","kasim","aralik"]
vade_cols = [str(i)+'_vade_tutari' for i in months]

artis_number = np.zeros(dt.shape[0], dtype=int)
# Start at 1: for j = 0, vade_cols[j-1] is vade_cols[-1], i.e. the December
# column, so January would be compared against December.
for j in range(1, 12):
    artis_number = artis_number + np.where(dt[vade_cols[j]]>(1.2*dt[vade_cols[j-1]]),1,0)

dt['num_of_artis'] = artis_number

# correct values: account values below 1 are set to 0
dt['sene_sonu_hesap_degeri']= np.where(dt['sene_sonu_hesap_degeri'] <1,0,dt['sene_sonu_hesap_degeri'])
dt['sene_basi_hesap_degeri']= np.where(dt['sene_basi_hesap_degeri'] <1,0,dt['sene_basi_hesap_degeri'])

# (An IQR capping of sene_basi_hesap_degeri and sene_sonu_hesap_degeri, built
# like the one below, was present here in commented-out form.)

# Relative change of the account value; +/-inf and NaN are replaced by the median.
# The results are assigned back: replace/fillna(inplace=True) on dt[col] acts
# on a temporary object under pandas copy-on-write.
hesap_perc = (dt['sene_sonu_hesap_degeri'] - dt['sene_basi_hesap_degeri']) / dt['sene_basi_hesap_degeri']
hesap_perc = hesap_perc.replace([np.inf, -np.inf], np.nan)
#dt['hesap_degeri_degisimi_perc'].fillna(1, inplace =True)
dt['hesap_degeri_degisimi_perc'] = hesap_perc.fillna(hesap_perc.median())

# IQR capping of the relative change.
Q1 = dt['hesap_degeri_degisimi_perc'].quantile(0.25)
Q3 = dt['hesap_degeri_degisimi_perc'].quantile(0.75)
IQR = Q3-Q1

# NOTE(review): the lower fence is floored at 0, so every negative change is
# raised to at least 0 -- confirm this is intended for a signed ratio.
lower = np.where((Q1 - 1.5*IQR)<0,0,Q1 - 1.5*IQR)
upper = Q3 + 1.5*IQR

dt['hesap_degeri_degisimi_perc'] = np.where(dt['hesap_degeri_degisimi_perc']< lower, lower,dt['hesap_degeri_degisimi_perc'])
dt['hesap_degeri_degisimi_perc'] = np.where(dt['hesap_degeri_degisimi_perc']> upper, upper,dt['hesap_degeri_degisimi_perc'])


## Functions

In [None]:
# takes model as an input
def get_optimal_cutoff(model,X_test,y_test):
    """Return the probability cutoff in 0.20, 0.25, ..., 0.75 with the best F1.

    Parameters
    ----------
    model : fitted classifier exposing ``predict_proba``
    X_test : features passed to ``model.predict_proba``
    y_test : true labels compared against the thresholded predictions

    Returns
    -------
    float
        The cutoff with the highest F1 score; the lowest such cutoff on ties.
        A row is predicted 1 when its class-1 probability is above the cutoff.
    """
    # Score once: the probabilities do not depend on the cutoff.
    positive_prob = pd.DataFrame(model.predict_proba(X_test))[1]

    f1_by_cutoff = {}
    for i in range(20,80,5):
        cutoff = i/100
        predicted = np.where(positive_prob <= cutoff, 0, 1)
        f1_by_cutoff[cutoff] = f1_score(y_test, predicted)

    # idxmax returns the first (lowest) cutoff among equal maxima
    return pd.Series(f1_by_cutoff).idxmax()

# takes probability as an input
def get_optimal_cutoff2(prob,X_test,y_test):
    """Return the cutoff in 0.10, 0.15, ..., 0.90 that maximises F1.

    ``prob`` holds class probabilities whose column 1 is thresholded: a row is
    predicted 1 when that probability is above the cutoff. ``X_test`` is not
    used. On ties the lowest cutoff is returned.
    """
    positive_prob = pd.DataFrame(prob)[1]
    cutoffs = [pct/100 for pct in range(10,91,5)]
    scores = [
        f1_score(y_test, np.where(positive_prob <= cutoff, 0, 1))
        for cutoff in cutoffs
    ]
    return pd.Series(scores, index=cutoffs).idxmax()


In [None]:
# optimal ensemble for two models
def optimal_ensemble(prob1,prob2,y):
    """Return the blend weights (c1, c2) with c1 + c2 = 1 that maximise ROC AUC.

    c1 runs over 0.0, 0.1, ..., 1.0 and the blend ``prob1*c1 + prob2*c2`` is
    scored against ``y``. On ties the first (smallest c1) pair is returned.

    The candidates are scanned with a running best instead of growing a
    DataFrame with ``DataFrame.append``, which pandas 2.0 removed.
    """
    best_roc = None
    best_weights = None
    for c1 in np.linspace(0, 1, 11):
        c2 = 1 - c1
        roc = roc_auc_score(y, prob1*c1 + prob2*c2)
        # strict '>' keeps the first maximum, as argmax did
        if best_roc is None or roc > best_roc:
            best_roc = roc
            best_weights = (c1, c2)

    c1, c2 = best_weights
    return c1,c2

# optimal ensemble for three models
def optimal_ensemble3(prob1,prob2,prob3,y):
    """Return the blend weights (c1, c2, c3) summing to 1 that maximise ROC AUC.

    Every weight is a multiple of 0.1. The blend
    ``prob1*c1 + prob2*c2 + prob3*c3`` is scored against ``y``; on ties the
    first combination in (c1, c2) ascending order is returned.

    The grid is enumerated in integer tenths. Summing float grid values loses
    valid combinations (0.30000000000000004 + 0.7000000000000001 > 1) and
    yields tiny negative c3 values. The candidates are scanned with a running
    best because ``DataFrame.append`` was removed in pandas 2.0.
    """
    steps = 10
    best_roc = None
    best_weights = None
    for a in range(steps + 1):
        for b in range(steps + 1 - a):
            c1, c2, c3 = a/steps, b/steps, (steps - a - b)/steps
            roc = roc_auc_score(y, prob1*c1 + prob2*c2 + prob3*c3)
            # strict '>' keeps the first maximum, as argmax did
            if best_roc is None or roc > best_roc:
                best_roc = roc
                best_weights = (c1, c2, c3)

    c1, c2, c3 = best_weights
    return c1,c2,c3

In [None]:
def percentile(n):
    """Build an aggregation function that returns the n-th percentile.

    The returned callable is named ``percentile_<n>`` so that it can be told
    apart from other percentile functions by its ``__name__``.
    """
    def _nth_percentile(values):
        return np.percentile(values, n)

    _nth_percentile.__name__ = 'percentile_%s' % n
    return _nth_percentile

## Modeling Approaches

In [None]:
from plotnine import *
from mizani.formatters import percent_format

# Share of each target level within the 15 most frequent kapsam_tipi values.
# The levels are read from the value_counts index: the former
# .to_frame().reset_index()['index'] depends on a column name that pandas 2
# no longer produces.
focus_kapsam = dt['kapsam_tipi'].value_counts().index[:15].to_list()

# rows with a known target only
focus_dt = dt[dt['kapsam_tipi'].isin(focus_kapsam) & dt['artis_durumu'].notna()]
(ggplot(focus_dt)
    + aes(x=focus_dt['kapsam_tipi'], fill = focus_dt['artis_durumu'].astype('category'))
    + geom_bar(position = "fill")
    + scale_y_continuous(labels=percent_format())
    + theme(figure_size=(20, 6))
)

#p = ggplot(mpg) + geom_bar(aes(x='manufacturer', fill='class'), position='fill')


In [None]:
# Share of each target level within the 10 most frequent office_id values.
# The ids are read from the value_counts index: the former
# .to_frame().reset_index()['index'] depends on a column name that pandas 2
# no longer produces.
focus_kapsam = dt['office_id'].value_counts().index[:10].to_list()

# rows with a known target only
focus_dt = dt[dt['office_id'].isin(focus_kapsam) & dt['artis_durumu'].notna()]
(ggplot(focus_dt)
    + aes(x=focus_dt['office_id'], fill = focus_dt['artis_durumu'].astype('category'))
    + geom_bar(position = "fill")
    + scale_y_continuous(labels=percent_format())
    + theme(figure_size=(20, 6))
)

In [None]:
# Share of each target level within the 15 most frequent meslek values.
# The levels are read from the value_counts index: the former
# .to_frame().reset_index()['index'] depends on a column name that pandas 2
# no longer produces.
focus_kapsam = dt['meslek'].value_counts().index[:15].to_list()

# rows with a known target only
focus_dt = dt[dt['meslek'].isin(focus_kapsam) & dt['artis_durumu'].notna()]
(ggplot(focus_dt)
    + aes(x=focus_dt['meslek'], fill = focus_dt['artis_durumu'].astype('category'))
    + geom_bar(position = "fill")
    + scale_y_continuous(labels=percent_format())
    + theme(figure_size=(20, 6))
)

## Model Index _ Submodeling

Construct different models for sigorta_tip & musteri_segmenti pairs with sufficient number of observations

In [None]:
# Rows with a target form the training set.
train = dt.loc[dt['artis_durumu'].isnull() == False]
# Drop every column that contains a missing value. .copy() makes train an
# independent frame, so the column assignment below is not applied to a slice.
train = train.dropna(axis=1, how='any').copy()

train['artis_durumu'] = train['artis_durumu'].astype('category')

# Rows without a target form the test set, restricted to the train columns.
test = dt.loc[dt['artis_durumu'].isnull() == True]
test = test[train.columns].copy()

# One row per (musteri_segmenti, sigorta_tip) pair with its policy count; the
# second reset_index adds the 'index' column that is overwritten below.
modeling_codes = pd.DataFrame(train.groupby(['musteri_segmenti','sigorta_tip'])['policy_id'].count()).reset_index().reset_index()
#modeling_codes.sort_values('sigorta_tip')

# sigorta_tip = 1 -- keep musteri_segmenti
# sigorta_tip = 4 -- do not detail on musteri_segmenti / use musteri_segmenti= 102
# sigorta_tip = 6 --  exclude 105 and 101
# sigorta_tip = 7 -- ok
# sigorta_tip = 8 -- do not detail on musteri_segmenti/ combine them

modeling_codes['index'] = np.nan  # np.NaN was removed in NumPy 2.0
# sigorta_tip 1 and 7: one model index per pair
modeling_codes.loc[modeling_codes['sigorta_tip'].isin([1,7]), 'index']= range(1, 1+ len(modeling_codes[modeling_codes['sigorta_tip'].isin([1,7])]))
# sigorta_tip 4, sigorta_tip 8, and sigorta_tip 6 with segments 101/105:
# one shared index per group, each taking the next free number
modeling_codes.loc[modeling_codes['sigorta_tip'].isin([4]), 'index'] = 1 + modeling_codes['index'].max()
modeling_codes.loc[modeling_codes['sigorta_tip'].isin([8]), 'index'] = 1 + modeling_codes['index'].max()
modeling_codes.loc[(modeling_codes['sigorta_tip'].isin([6])) & (modeling_codes['musteri_segmenti'].isin([101,105])), 'index'] = 1 + modeling_codes['index'].max()
# NOTE(review): range(16,20) requires exactly four pairs to be unassigned at
# this point; any other count raises a length-mismatch error.
modeling_codes.loc[modeling_codes['index'].isnull() == True, 'index'] = range(16,20)
modeling_codes.rename(columns = {'index':'model_index'}, inplace = True)

# merge model 15 into 16 and model 2 into 4
modeling_codes.loc[modeling_codes['model_index'] == 15,'model_index'] = 16
modeling_codes.loc[modeling_codes['model_index'] == 2,'model_index'] = 4


# attach the model index to both sets
train = train.merge(modeling_codes[['sigorta_tip','musteri_segmenti','model_index']], on = ['sigorta_tip','musteri_segmenti'], how = 'left')
test = test.merge(modeling_codes[['sigorta_tip','musteri_segmenti','model_index']], on = ['sigorta_tip','musteri_segmenti'], how = 'left')


## Feature Sets

In [None]:
# trial 1 (new): best-performing feature set
# Month prefixes, in calendar order, shared by the three monthly column families.
_months = ['ocak', 'subat', 'mart', 'nisan', 'mayis', 'haziran',
           'temmuz', 'agustos', 'eylul', 'ekim', 'kasim', 'aralik']

features = (
    ['dogum_tarihi', 'cinsiyet', 'memleket', 'gelir', 'cocuk_sayisi',
     'sene_basi_hesap_degeri', 'sene_sonu_hesap_degeri',
     'year', 'month']
    # one column per month for each of the three monthly series
    + [m + '_odenen_tutar' for m in _months]
    + [m + '_vade_tutari' for m in _months]
    + [m + '_odeme_orani' for m in _months]
    + ['average_vade_tutari', 'min_vade_tutari', 'max_vade_tutari',
       'change_in_vade_tutari', 'vade_vs_gelir',
       'toplam_notpaid', 'hesap_degeri_degisimi', 'hesap_degeri_degisimi_perc',
       'sozlesme_kokeni_NEW', 'sozlesme_kokeni_detay_NEW',  # 'uyruk_TR' left out
       'medeni_hal_0', 'medeni_hal_1']
    + ['yatirim_karakteri_' + k
       for k in ('Bilge', 'Cesur', 'Dengeli', 'Dikkatli', 'Temkinli', 'Yetkin')]
    # numbered dummy columns 1..12
    + ['police_sehir_' + str(n) for n in range(1, 13)]
    + ['Kapsam_Tipi_' + str(n) for n in range(1, 13)]
    + ['meslek_kirilim_encoded', 'dagitim_kanali_encoded', 'kapsam_grubu_encoded',
       'odeme_ratio', 'trend_vade', 'trend_vade_odenen_fark']
    + [p + '_' + s for p in ('PD', 'PY', 'PA') for s in ('OMEDI', 'ODEDI')]
    + ['office_id_grouped_encoded']
    # numbered dummy columns 0..10
    + ['kapsam_tipi_grouped_' + str(n) for n in range(11)]
    + ['office_mean_vade_tutari', 'vade_in_office',
       'num_of_artis']
)

## old best
#features = ['dogum_tarihi', 'cinsiyet', 'memleket', 'gelir', 'cocuk_sayisi','sene_basi_hesap_degeri', 'sene_sonu_hesap_degeri',
#            'year', 'month', 
#            'ocak_odeme_orani', 'subat_odeme_orani', 'mart_odeme_orani', 'nisan_odeme_orani', 'mayis_odeme_orani', 'haziran_odeme_orani', 'temmuz_odeme_orani', 'agustos_odeme_orani', 'eylul_odeme_orani', 'ekim_odeme_orani', 'kasim_odeme_orani', 'aralik_odeme_orani', 
#            'average_vade_tutari', 'min_vade_tutari', 'max_vade_tutari', 'change_in_vade_tutari', 'vade_vs_gelir', 
#            'toplam_notpaid', #'hesap_degeri_degisimi_perc', 
#            'sozlesme_kokeni_NEW', 'sozlesme_kokeni_detay_NEW', #'uyruk_TR',
#            'medeni_hal_0', 'medeni_hal_1', 
#            'yatirim_karakteri_Bilge', 'yatirim_karakteri_Cesur', 'yatirim_karakteri_Dengeli', 'yatirim_karakteri_Dikkatli', 'yatirim_karakteri_Temkinli', 'yatirim_karakteri_Yetkin',
#            'police_sehir_1', 'police_sehir_2', 'police_sehir_3', 'police_sehir_4', 'police_sehir_5', 'police_sehir_6', 'police_sehir_7', 'police_sehir_8', 'police_sehir_9', 'police_sehir_10', 'police_sehir_11', 'police_sehir_12', 
#            'Kapsam_Tipi_1', 'Kapsam_Tipi_2', 'Kapsam_Tipi_3', 'Kapsam_Tipi_4', 'Kapsam_Tipi_5', 'Kapsam_Tipi_6', 'Kapsam_Tipi_7', 'Kapsam_Tipi_8', 'Kapsam_Tipi_9', 'Kapsam_Tipi_10', 'Kapsam_Tipi_11', 'Kapsam_Tipi_12', 
#            'meslek_kirilim_encoded', 'dagitim_kanali_encoded', 'kapsam_grubu_encoded' 
#            #'sigorta_tip_1', 'sigorta_tip_4', 'sigorta_tip_6', 'sigorta_tip_7', 'sigorta_tip_8', 
#            #'musteri_segmenti_101', 'musteri_segmenti_102', 'musteri_segmenti_103', 'musteri_segmenti_104', 'musteri_segmenti_105', 'musteri_segmenti_106'
#           ]

# worse
#features = [#'dogum_tarihi', 'cinsiyet', 'memleket', 
#            'gelir', 'cocuk_sayisi','sene_basi_hesap_degeri', 'sene_sonu_hesap_degeri',
#            'year', 'month', 
#            #'ocak_odeme_orani', 'subat_odeme_orani', 'mart_odeme_orani', 'nisan_odeme_orani', 'mayis_odeme_orani', 'haziran_odeme_orani', 'temmuz_odeme_orani', 'agustos_odeme_orani', 'eylul_odeme_orani', 'ekim_odeme_orani', 'kasim_odeme_orani', 'aralik_odeme_orani', 
#            'aralik_odeme_orani',
#            'average_vade_tutari', 'min_vade_tutari', 'max_vade_tutari', 'change_in_vade_tutari', 'vade_vs_gelir', 
#            'toplam_notpaid', 'hesap_degeri_degisimi',#'hesap_degeri_degisimi_perc', 
#            'sozlesme_kokeni_NEW', 'sozlesme_kokeni_detay_NEW', #'uyruk_TR',
#            'medeni_hal_0', 'medeni_hal_1', 
#            #'yatirim_karakteri_Bilge', 'yatirim_karakteri_Cesur', 'yatirim_karakteri_Dengeli', 'yatirim_karakteri_Dikkatli', 'yatirim_karakteri_Temkinli', 'yatirim_karakteri_Yetkin',
#            #'police_sehir_1', 'police_sehir_2', 'police_sehir_3', 'police_sehir_4', 'police_sehir_5', 'police_sehir_6', 'police_sehir_7', 'police_sehir_8', 'police_sehir_9', 'police_sehir_10', 'police_sehir_11', 'police_sehir_12', 
#            'Kapsam_Tipi_1', 'Kapsam_Tipi_2', 'Kapsam_Tipi_3', 'Kapsam_Tipi_4', 'Kapsam_Tipi_5', 'Kapsam_Tipi_6', 'Kapsam_Tipi_7', 'Kapsam_Tipi_8', 'Kapsam_Tipi_9', 'Kapsam_Tipi_10', 'Kapsam_Tipi_11', 'Kapsam_Tipi_12', 
#            #'meslek_kirilim_encoded', 'dagitim_kanali_encoded', 'kapsam_grubu_encoded' ,
#            'odeme_ratio','trend_vade','trend_vade_odenen_fark','PD_OMEDI','PD_ODEDI','PY_OMEDI','PY_ODEDI','PA_OMEDI','PA_ODEDI',
#            #'office_id_grouped_encoded',
#            #'kapsam_tipi_grouped_0','kapsam_tipi_grouped_1','kapsam_tipi_grouped_2','kapsam_tipi_grouped_3','kapsam_tipi_grouped_4','kapsam_tipi_grouped_5','kapsam_tipi_grouped_6','kapsam_tipi_grouped_7','kapsam_tipi_grouped_8','kapsam_tipi_grouped_9','kapsam_tipi_grouped_10',
#            'office_mean_vade_tutari','vade_in_office',
#            'num_of_artis', 'artis_durumu_2020'
#            # the last two made the performance worse
#            #'sigorta_tip_1', 'sigorta_tip_4', 'sigorta_tip_6', 'sigorta_tip_7', 'sigorta_tip_8', 
#            #'musteri_segmenti_101', 'musteri_segmenti_102', 'musteri_segmenti_103', 'musteri_segmenti_104', 'musteri_segmenti_105', 'musteri_segmenti_106'
#           ]

In [None]:
#print(train.loc[:, (train.dtypes == int) | (train.dtypes == float)].columns.tolist())

## Best Model Config

In [None]:
## best config
# LightGBM + random forest, blended with optimised weights, on the new feature set.
# A separate model pair is fitted for every model_index segment.
test['pred'] = np.nan  # np.nan rather than np.NaN: the alias was removed in NumPy 2.0
test_preds = pd.DataFrame()  # pooled hold-out actuals/predictions of all segments

for i in pd.unique(train.model_index):
    print(i)
    sub_train = train[(train['model_index'] == i)]
    sub_test  = test[(test['model_index'] == i)]
    
    # Stratified 80/20 split inside the segment; the 20% part is the hold-out set.
    X_train, X_test, y_train, y_test = train_test_split(sub_train[features],
                                                        sub_train['artis_durumu'],
                                                        test_size=0.2,
                                                        stratify = sub_train['artis_durumu'],
                                                        random_state=0)
    
    # NOTE(review): 'f1_score' does not look like a built-in LightGBM metric name --
    # confirm it is not silently ignored (eval_metric="auc" is what gets logged).
    lgbm_fit = lgbm.LGBMClassifier(boosting_type='gbdt', 
                                   objective='binary', 
                                   metric='f1_score',
                                   feature_fraction = 0.4,
                                   bagging_fraction = 0.6,
                                   n_estimators = 200,
                                   max_depth = 3
                                  )
    # log_evaluation is reached through the lgbm alias, the only LightGBM
    # import visible at the top of the notebook. Early stopping stays disabled.
    lgbm_fit.fit(X_train,y_train,eval_metric = "auc",
                 eval_set=[(X_train,y_train),(X_test,y_test)],
                 callbacks=[
                            lgbm.log_evaluation(period=10)]) #early_stopping(stopping_rounds=50, first_metric_only=True),
    
    # lr_fit is a random forest despite its name; the name is kept because
    # the feature-importance cell reads it.
    lr_fit = RandomForestClassifier(random_state=SEED,
                                   n_estimators = 400) #class_weight
    lr_fit.fit(X_train,y_train)
    
    # Hold-out probabilities; column 1 of predict_proba is kept from each model.
    prob1 = lgbm_fit.predict_proba(X_test)
    prob2 = lr_fit.predict_proba(X_test)
    
    prob1 = pd.DataFrame(prob1)[1]
    prob2 = pd.DataFrame(prob2)[1]
    
    # NOTE(review): the blend weights and the cutoff are both chosen using y_test,
    # the same hold-out labels the F1 below is computed on, so that score is
    # likely optimistic.
    c1,c2 = optimal_ensemble(prob1,prob2,y_test)
    prob = c1*prob1+c2*prob2
    
    cutoff = get_optimal_cutoff2(pd.DataFrame(prob)[1], X_test, y_test)
    y_test_pred= np.where(pd.DataFrame(prob)[1]<=cutoff, 0, 1)
    test_f1_cutoff = f1_score(y_test,y_test_pred)
    print("model_index " + str(i) + " : test_f1_score= " + str(test_f1_cutoff))
    
    test_preds = pd.concat([test_preds,
                       pd.DataFrame({ 
                           'model_index': i,
                           'actual':y_test,
                           'preds' :y_test_pred})]
                      , ignore_index=True)
    
    #train performance (same weights and cutoff, applied to the fitting data)
    prob1 = lgbm_fit.predict_proba(X_train)
    prob2 = lr_fit.predict_proba(X_train)
    
    prob1 = pd.DataFrame(prob1)[1]
    prob2 = pd.DataFrame(prob2)[1]
    prob = c1*prob1+c2*prob2
    
    y_train_pred= np.where(pd.DataFrame(prob)[1]<=cutoff, 0, 1)
    train_f1_cutoff = f1_score(y_train,y_train_pred)
    print("model_index " + str(i) + " : train_f1_score= " + str(train_f1_cutoff))


    ## submission preds
    # A segment may have no unlabelled rows; predict_proba would fail on an empty frame.
    if sub_test.empty:
        continue

    # Here prob stays a two-column array, so column 1 is selected after blending.
    prob1 = lgbm_fit.predict_proba(sub_test[features])
    prob2 = lr_fit.predict_proba(sub_test[features]) 
    prob = c1*prob1+c2*prob2
    
    subm_pred = np.where(pd.DataFrame(prob)[1]<=cutoff, 0, 1)
    test.loc[(test['model_index'] == i), 'pred'] = subm_pred

In [None]:
# F1 over the hold-out predictions pooled from all segment models.
f1_score(test_preds['actual'], test_preds['preds'])

In [None]:
# Bar chart of the random forest's feature importances, largest first.
# lr_fit and X_train are presumably the ones left by the last loop iteration.
importances = lr_fit.feature_importances_
sorted_indices = np.argsort(importances)[::-1]  # positions in descending importance

n_features = X_train.shape[1]
bar_positions = range(n_features)

fig = plt.gcf()
fig.set_size_inches(18, 8)

plt.title('Feature Importance')
plt.bar(bar_positions, importances[sorted_indices], align='center')
plt.xticks(bar_positions, X_train.columns[sorted_indices], rotation=90)
plt.tight_layout()
plt.show()

## Best model config but with third model

In [None]:
# Remove any model_index left by an earlier experiment before rebuilding it.
# errors='ignore' keeps the cell runnable when the column is not there.
dt.drop(columns = 'model_index', inplace = True, errors = 'ignore')

# One row per (musteri_segmenti, sigorta_tip) pair with its policy count, this time
# over all of dt. The second reset_index() only creates an 'index' column, which is
# renamed to 'model_index' and overwritten below.
modeling_codes = pd.DataFrame(dt.groupby(['musteri_segmenti','sigorta_tip'])['policy_id'].count()).reset_index().reset_index()
#modeling_codes.sort_values('sigorta_tip')

# sigorta_tip = 1 -- keep musteri_segmenti
# sigorta_tip = 4 -- do not detail on musteri_segmenti / use musteri_segmenti= 102
# sigorta_tip = 6 --  exclude 105 and 101
# sigorta_tip = 7 -- ok
# sigorta_tip = 8 -- do not detail on musteri_segmenti/ combine them
modeling_codes.rename(columns = {'index':'model_index'}, inplace = True)

# np.nan rather than np.NaN: the NaN alias was removed in NumPy 2.0.
modeling_codes['model_index'] = np.nan

# sigorta_tip 1 and 7: one model per pair, numbered 1..k in row order.
is_tip_1_or_7 = modeling_codes['sigorta_tip'].isin([1,7])
modeling_codes.loc[is_tip_1_or_7, 'model_index'] = range(1, 1 + int(is_tip_1_or_7.sum()))
# sigorta_tip 4, sigorta_tip 8 and (sigorta_tip 6 with segment 101/105) each share
# one model; max() skips NaN, so each line takes the next free number.
modeling_codes.loc[modeling_codes['sigorta_tip'].isin([4]), 'model_index'] = 1 + modeling_codes['model_index'].max()
modeling_codes.loc[modeling_codes['sigorta_tip'].isin([8]), 'model_index'] = 1 + modeling_codes['model_index'].max()
modeling_codes.loc[(modeling_codes['sigorta_tip'].isin([6])) & (modeling_codes['musteri_segmenti'].isin([101,105])), 'model_index'] = 1 + modeling_codes['model_index'].max()

# Remaining pairs get their own number from 16 upwards. The length is computed
# instead of hard-coding range(16, 20), which raised unless exactly four were left.
unassigned = modeling_codes['model_index'].isna()
modeling_codes.loc[unassigned, 'model_index'] = range(16, 16 + int(unassigned.sum()))

# Manual merges of models: 15 is folded into 16 and 2 into 4.
modeling_codes.loc[modeling_codes['model_index'] == 15,'model_index'] = 16
modeling_codes.loc[modeling_codes['model_index'] == 2,'model_index'] = 4

##
# Default (inner) join on the pair columns attaches model_index to every row of dt.
dt = dt.merge(modeling_codes[['sigorta_tip','musteri_segmenti','model_index']], on = ['sigorta_tip','musteri_segmenti'])

In [None]:
# Rows with a known target form train; rows without one form test.
labelled = dt['artis_durumu'].notna()

# Labelled rows; any column that contains a missing value is dropped entirely.
train = dt.loc[labelled].dropna(axis=1, how='any')
train['artis_durumu'] = train['artis_durumu'].astype('category')

# Unlabelled rows, restricted to the columns that survived in train.
test = dt.loc[~labelled]
test = test.loc[:, train.columns].copy()

gc.collect()

In [None]:
## best config
# LightGBM + random forest + class-weighted XGBoost, blended with optimised weights.
# A separate model triple is fitted for every model_index segment.
test['pred'] = np.nan  # np.nan rather than np.NaN: the alias was removed in NumPy 2.0
test_preds = pd.DataFrame()  # pooled hold-out actuals/predictions of all segments

for i in pd.unique(train.model_index):
    print(i)
    sub_train = train[(train['model_index'] == i)]
    sub_test  = test[(test['model_index'] == i)]
    
    # Stratified 80/20 split inside the segment; the 20% part is the hold-out set.
    X_train, X_test, y_train, y_test = train_test_split(sub_train[features],
                                                        sub_train['artis_durumu'],
                                                        test_size=0.2,
                                                        stratify = sub_train['artis_durumu'],
                                                        random_state=0)
    
    # Sample weights for XGBoost: rows with y == 1 get (rounded ratio of the most
    # frequent to the second most frequent class share) - 1, all other rows get 1.
    # NOTE(review): when that ratio rounds to 1 the weight becomes 0, which removes
    # the y == 1 rows from the XGBoost fit -- confirm this is intended.
    percc = pd.DataFrame(y_train.value_counts(normalize = True)).reset_index()
    w = round(percc.iloc[0,1]/percc.iloc[1,1],0)
    w = max(0, w-1)
    weights_train = np.where(y_train == 1,w,1)
    
    # Reached through the xgb alias, the only XGBoost import visible at the top
    # of the notebook.
    xgb_fit = xgb.XGBClassifier(objective='binary:logistic',
                                max_depth= 3,
                                learning_rate= 0.01,
                                n_estimators=1000,
                                eval_metric='auc',random_state=0)
    
    xgb_fit.fit(X_train, y_train,sample_weight=weights_train)
    
    # NOTE(review): 'f1_score' does not look like a built-in LightGBM metric name --
    # confirm it is not silently ignored.
    lgbm_fit = lgbm.LGBMClassifier(boosting_type='gbdt', 
                                   objective='binary', 
                                   metric='f1_score',
                                   feature_fraction = 0.4,
                                   bagging_fraction = 0.6,
                                   n_estimators = 200,
                                   max_depth = 3
                                  )
    lgbm_fit.fit(X_train,y_train) #early_stopping(stopping_rounds=50, first_metric_only=True),
    
    # lr_fit is a random forest despite its name.
    lr_fit = RandomForestClassifier(random_state=SEED,
                                   n_estimators = 400) #class_weight
    lr_fit.fit(X_train,y_train)
    
    
    
    # Hold-out probabilities; column 1 of predict_proba is kept from each model.
    prob1 = lgbm_fit.predict_proba(X_test)
    prob2 = lr_fit.predict_proba(X_test)
    prob3 = xgb_fit.predict_proba(X_test)
    
    prob1 = pd.DataFrame(prob1)[1]
    prob2 = pd.DataFrame(prob2)[1]
    prob3 = pd.DataFrame(prob3)[1]
    
    # NOTE(review): the blend weights and the cutoff are both chosen using y_test,
    # the same hold-out labels the F1 below is computed on, so that score is
    # likely optimistic.
    c1,c2,c3 = optimal_ensemble3(prob1,prob2,prob3,y_test)
    prob = c1*prob1+c2*prob2+c3*prob3
    
    cutoff = get_optimal_cutoff2(pd.DataFrame(prob)[1], X_test, y_test)
    y_test_pred= np.where(pd.DataFrame(prob)[1]<=cutoff, 0, 1)
    test_f1_cutoff = f1_score(y_test,y_test_pred)
    print("model_index " + str(i) + " : test_f1_score= " + str(test_f1_cutoff))
    
    test_preds = pd.concat([test_preds,
                       pd.DataFrame({ 
                           'model_index': i,
                           'actual':y_test,
                           'preds' :y_test_pred})]
                      , ignore_index=True)
    
    #train performance (same weights and cutoff, applied to the fitting data)
    prob1 = lgbm_fit.predict_proba(X_train)
    prob2 = lr_fit.predict_proba(X_train)
    # Fixed: the third component must come from xgb_fit. The original reused
    # lr_fit here, so the reported train F1 blended the random forest twice.
    prob3 = xgb_fit.predict_proba(X_train)
    
    prob1 = pd.DataFrame(prob1)[1]
    prob2 = pd.DataFrame(prob2)[1]
    prob3 = pd.DataFrame(prob3)[1]
    prob = c1*prob1+c2*prob2+c3*prob3
    
    y_train_pred= np.where(pd.DataFrame(prob)[1]<=cutoff, 0, 1)
    train_f1_cutoff = f1_score(y_train,y_train_pred)
    print("model_index " + str(i) + " : train_f1_score= " + str(train_f1_cutoff))


    ## submission preds
    # A segment may have no unlabelled rows; predict_proba would fail on an empty frame.
    if sub_test.empty:
        continue

    # Here prob stays a two-column array, so column 1 is selected after blending.
    prob1 = lgbm_fit.predict_proba(sub_test[features])
    prob2 = lr_fit.predict_proba(sub_test[features]) 
    prob3 = xgb_fit.predict_proba(sub_test[features]) 
    prob = c1*prob1+c2*prob2+c3*prob3
    
    subm_pred = np.where(pd.DataFrame(prob)[1]<=cutoff, 0, 1)
    test.loc[(test['model_index'] == i), 'pred'] = subm_pred

In [None]:
# F1 over the hold-out predictions pooled from all segment models (three-model blend).
f1_score(test_preds['actual'], test_preds['preds'])

## Best model config with a special treatment for index 9 and 11 

with a deeper level: max_vade_tutarı groups

In [None]:
# gc is already imported at the top of the notebook; the import is repeated here.
import gc
# Force a garbage-collection pass before starting the next experiment.
gc.collect()

In [None]:
# Remove the model_index of the previous experiment before rebuilding it.
# errors='ignore' keeps the cell runnable when the column is not there.
dt.drop(columns = 'model_index', inplace = True, errors = 'ignore')

# One row per (musteri_segmenti, sigorta_tip) pair with its policy count over all
# of dt. The second reset_index() only creates an 'index' column, which is renamed
# to 'model_index' and overwritten below.
modeling_codes = pd.DataFrame(dt.groupby(['musteri_segmenti','sigorta_tip'])['policy_id'].count()).reset_index().reset_index()
#modeling_codes.sort_values('sigorta_tip')

# sigorta_tip = 1 -- keep musteri_segmenti
# sigorta_tip = 4 -- do not detail on musteri_segmenti / use musteri_segmenti= 102
# sigorta_tip = 6 --  exclude 105 and 101
# sigorta_tip = 7 -- ok
# sigorta_tip = 8 -- do not detail on musteri_segmenti/ combine them
modeling_codes.rename(columns = {'index':'model_index'}, inplace = True)

# np.nan rather than np.NaN: the NaN alias was removed in NumPy 2.0.
modeling_codes['model_index'] = np.nan

# sigorta_tip 1 and 7: one model per pair, numbered 1..k in row order.
is_tip_1_or_7 = modeling_codes['sigorta_tip'].isin([1,7])
modeling_codes.loc[is_tip_1_or_7, 'model_index'] = range(1, 1 + int(is_tip_1_or_7.sum()))
# sigorta_tip 4, sigorta_tip 8 and (sigorta_tip 6 with segment 101/105) each share
# one model; max() skips NaN, so each line takes the next free number.
modeling_codes.loc[modeling_codes['sigorta_tip'].isin([4]), 'model_index'] = 1 + modeling_codes['model_index'].max()
modeling_codes.loc[modeling_codes['sigorta_tip'].isin([8]), 'model_index'] = 1 + modeling_codes['model_index'].max()
modeling_codes.loc[(modeling_codes['sigorta_tip'].isin([6])) & (modeling_codes['musteri_segmenti'].isin([101,105])), 'model_index'] = 1 + modeling_codes['model_index'].max()

# Remaining pairs get their own number from 16 upwards. The length is computed
# instead of hard-coding range(16, 20), which raised unless exactly four were left.
unassigned = modeling_codes['model_index'].isna()
modeling_codes.loc[unassigned, 'model_index'] = range(16, 16 + int(unassigned.sum()))

# Manual merges of models: 15 is folded into 16 and 2 into 4.
modeling_codes.loc[modeling_codes['model_index'] == 15,'model_index'] = 16
modeling_codes.loc[modeling_codes['model_index'] == 2,'model_index'] = 4

##
# Default (inner) join on the pair columns attaches model_index to every row of dt.
dt = dt.merge(modeling_codes[['sigorta_tip','musteri_segmenti','model_index']], on = ['sigorta_tip','musteri_segmenti'])

In [None]:
def _tercile_codes(values):
    """Bin *values* into at most three quantile buckets and return the integer codes."""
    return pd.qcut(values, 3, duplicates = 'drop', labels=False)


# Quantile bucket of every row within its (sigorta_tip, musteri_segmenti) pair.
# NOTE(review): the column is called max_vade_group, but the values being binned
# are hesap_degeri_degisimi -- confirm which column was intended.
dt['max_vade_group'] = (
    dt.groupby(['sigorta_tip','musteri_segmenti'])['hesap_degeri_degisimi']
      .transform(_tercile_codes)
)

In [None]:
# Bucket sizes inside model 9.
dt.loc[dt['model_index'] == 9, 'max_vade_group'].value_counts()

In [None]:
# Give each max_vade_group bucket of two sigorta_tip == 1 segments its own model:
# segment 106 -> 20, 21, 22 and segment 105 -> 30, 31, 32 (buckets 0, 1, 2).
# Buckets 3 and 4 (indices 23/24 and 33/34) were tried earlier and are left unused.
for segment, base_index in ((106, 20), (105, 30)):
    in_segment = (dt.sigorta_tip == 1) & (dt.musteri_segmenti == segment)
    for group in range(3):
        dt.loc[in_segment & (dt.max_vade_group == group), 'model_index'] = base_index + group

In [None]:
# Number of rows per model after the extra split.
dt.model_index.value_counts()

In [None]:
# Rows with a known target form train; rows without one form test.
labelled = dt['artis_durumu'].notna()

# Labelled rows; any column that contains a missing value is dropped entirely.
train = dt.loc[labelled].dropna(axis=1, how='any')
train['artis_durumu'] = train['artis_durumu'].astype('category')

# Unlabelled rows, restricted to the columns that survived in train.
test = dt.loc[~labelled]
test = test.loc[:, train.columns].copy()

In [None]:
## best config
# LightGBM + random forest, blended with optimised weights, on the new feature set.
# A separate model pair is fitted for every model_index segment.
test['pred'] = np.nan  # np.nan rather than np.NaN: the alias was removed in NumPy 2.0
test_preds = pd.DataFrame()  # pooled hold-out actuals/predictions of all segments

for i in pd.unique(train.model_index):
    print(i)
    sub_train = train[(train['model_index'] == i)]
    sub_test  = test[(test['model_index'] == i)]
    
    # Stratified 80/20 split inside the segment; the 20% part is the hold-out set.
    X_train, X_test, y_train, y_test = train_test_split(sub_train[features],
                                                        sub_train['artis_durumu'],
                                                        test_size=0.2,
                                                        stratify = sub_train['artis_durumu'],
                                                        random_state=0)
    
    # NOTE(review): 'f1_score' does not look like a built-in LightGBM metric name --
    # confirm it is not silently ignored.
    lgbm_fit = lgbm.LGBMClassifier(boosting_type='gbdt', 
                                   objective='binary', 
                                   metric='f1_score',
                                   feature_fraction = 0.4,
                                   bagging_fraction = 0.6,
                                   n_estimators = 200,
                                   max_depth = 3
                                  )
    lgbm_fit.fit(X_train,y_train) #early_stopping(stopping_rounds=50, first_metric_only=True),
    
    # lr_fit is a random forest despite its name.
    lr_fit = RandomForestClassifier(random_state=SEED,
                                   n_estimators = 400) #class_weight
    lr_fit.fit(X_train,y_train)
    
    # Hold-out probabilities; column 1 of predict_proba is kept from each model.
    prob1 = lgbm_fit.predict_proba(X_test)
    prob2 = lr_fit.predict_proba(X_test)
    
    prob1 = pd.DataFrame(prob1)[1]
    prob2 = pd.DataFrame(prob2)[1]
    
    # NOTE(review): the blend weights and the cutoff are both chosen using y_test,
    # the same hold-out labels the F1 below is computed on, so that score is
    # likely optimistic.
    c1,c2 = optimal_ensemble(prob1,prob2,y_test)
    prob = c1*prob1+c2*prob2
    
    cutoff = get_optimal_cutoff2(pd.DataFrame(prob)[1], X_test, y_test)
    y_test_pred= np.where(pd.DataFrame(prob)[1]<=cutoff, 0, 1)
    test_f1_cutoff = f1_score(y_test,y_test_pred)
    print("model_index " + str(i) + " : test_f1_score= " + str(test_f1_cutoff))
    
    test_preds = pd.concat([test_preds,
                       pd.DataFrame({ 
                           'model_index': i,
                           'actual':y_test,
                           'preds' :y_test_pred})]
                      , ignore_index=True)
    
    #train performance (same weights and cutoff, applied to the fitting data)
    prob1 = lgbm_fit.predict_proba(X_train)
    prob2 = lr_fit.predict_proba(X_train)
    
    prob1 = pd.DataFrame(prob1)[1]
    prob2 = pd.DataFrame(prob2)[1]
    prob = c1*prob1+c2*prob2
    
    y_train_pred= np.where(pd.DataFrame(prob)[1]<=cutoff, 0, 1)
    train_f1_cutoff = f1_score(y_train,y_train_pred)
    print("model_index " + str(i) + " : train_f1_score= " + str(train_f1_cutoff))


    ## submission preds
    # A segment may have no unlabelled rows; predict_proba would fail on an empty frame.
    if sub_test.empty:
        continue

    # Here prob stays a two-column array, so column 1 is selected after blending.
    prob1 = lgbm_fit.predict_proba(sub_test[features])
    prob2 = lr_fit.predict_proba(sub_test[features]) 
    prob = c1*prob1+c2*prob2
    
    subm_pred = np.where(pd.DataFrame(prob)[1]<=cutoff, 0, 1)
    test.loc[(test['model_index'] == i), 'pred'] = subm_pred

In [None]:
# F1 over the hold-out predictions pooled from all segment models (with the extra split).
f1_score(test_preds['actual'], test_preds['preds'])

## Focus on another column for submodels 

### Based on Hesap degisim tutarı

In [None]:
# Row counts of the 20 quantile bins of hesap_degeri_degisimi, labelled 0..19.
pd.qcut(dt['hesap_degeri_degisimi'], 20, labels=range(20)).value_counts()

In [None]:
# Splitting on the account-value change did not help, so the sub-models are
# defined by the decile (labels 0..9) of max_vade_tutari instead.
dt['model_index'] = pd.qcut(dt['max_vade_tutari'], 10, labels=range(10))

# Rows with a known target form train; rows without one form test.
labelled = dt['artis_durumu'].notna()

# Labelled rows; any column that contains a missing value is dropped entirely.
train = dt.loc[labelled].dropna(axis=1, how='any')
train['artis_durumu'] = train['artis_durumu'].astype('category')

# Unlabelled rows, restricted to the columns that survived in train.
test = dt.loc[~labelled]
test = test.loc[:, train.columns].copy()

In [None]:
## best config
# LightGBM + random forest, blended with optimised weights, on the new feature set.
# A separate model pair is fitted for every model_index value.
test['pred'] = np.nan  # np.nan rather than np.NaN: the alias was removed in NumPy 2.0
test_preds = pd.DataFrame()  # pooled hold-out actuals/predictions of all sub-models

for i in pd.unique(train.model_index):
    print(i)
    sub_train = train[(train['model_index'] == i)]
    sub_test  = test[(test['model_index'] == i)]
    
    # Stratified 80/20 split inside the group; the 20% part is the hold-out set.
    X_train, X_test, y_train, y_test = train_test_split(sub_train[features],
                                                        sub_train['artis_durumu'],
                                                        test_size=0.2,
                                                        stratify = sub_train['artis_durumu'],
                                                        random_state=0)
    
    # NOTE(review): 'f1_score' does not look like a built-in LightGBM metric name --
    # confirm it is not silently ignored.
    lgbm_fit = lgbm.LGBMClassifier(boosting_type='gbdt', 
                                   objective='binary', 
                                   metric='f1_score',
                                   feature_fraction = 0.4,
                                   bagging_fraction = 0.6,
                                   n_estimators = 200,
                                   max_depth = 3
                                  )
    lgbm_fit.fit(X_train,y_train) #early_stopping(stopping_rounds=50, first_metric_only=True),
    
    # lr_fit is a random forest despite its name.
    lr_fit = RandomForestClassifier(random_state=SEED,
                                   n_estimators = 400) #class_weight
    lr_fit.fit(X_train,y_train)
    
    # Hold-out probabilities; column 1 of predict_proba is kept from each model.
    prob1 = lgbm_fit.predict_proba(X_test)
    prob2 = lr_fit.predict_proba(X_test)
    
    prob1 = pd.DataFrame(prob1)[1]
    prob2 = pd.DataFrame(prob2)[1]
    
    # NOTE(review): the blend weights and the cutoff are both chosen using y_test,
    # the same hold-out labels the F1 below is computed on, so that score is
    # likely optimistic.
    c1,c2 = optimal_ensemble(prob1,prob2,y_test)
    prob = c1*prob1+c2*prob2
    
    cutoff = get_optimal_cutoff2(pd.DataFrame(prob)[1], X_test, y_test)
    y_test_pred= np.where(pd.DataFrame(prob)[1]<=cutoff, 0, 1)
    test_f1_cutoff = f1_score(y_test,y_test_pred)
    print("model_index " + str(i) + " : test_f1_score= " + str(test_f1_cutoff))
    
    test_preds = pd.concat([test_preds,
                       pd.DataFrame({ 
                           'model_index': i,
                           'actual':y_test,
                           'preds' :y_test_pred})]
                      , ignore_index=True)
    
    #train performance (same weights and cutoff, applied to the fitting data)
    prob1 = lgbm_fit.predict_proba(X_train)
    prob2 = lr_fit.predict_proba(X_train)
    
    prob1 = pd.DataFrame(prob1)[1]
    prob2 = pd.DataFrame(prob2)[1]
    prob = c1*prob1+c2*prob2
    
    y_train_pred= np.where(pd.DataFrame(prob)[1]<=cutoff, 0, 1)
    train_f1_cutoff = f1_score(y_train,y_train_pred)
    print("model_index " + str(i) + " : train_f1_score= " + str(train_f1_cutoff))


    ## submission preds
    # A group may have no unlabelled rows; predict_proba would fail on an empty frame.
    if sub_test.empty:
        continue

    # Here prob stays a two-column array, so column 1 is selected after blending.
    prob1 = lgbm_fit.predict_proba(sub_test[features])
    prob2 = lr_fit.predict_proba(sub_test[features]) 
    prob = c1*prob1+c2*prob2
    
    subm_pred = np.where(pd.DataFrame(prob)[1]<=cutoff, 0, 1)
    test.loc[(test['model_index'] == i), 'pred'] = subm_pred

In [None]:
# F1 over the hold-out predictions pooled from all decile sub-models.
f1_score(test_preds['actual'], test_preds['preds'])

### Based on Hesap degisim perc group

There is no clear difference between the reduced feature set and the full feature set.

In [None]:
# Median of hesap_degeri_degisimi within each (sigorta_tip, musteri_segmenti) pair,
# broadcast back to every row.
# NOTE(review): the column is named median_perc, but it is computed from
# hesap_degeri_degisimi rather than hesap_degeri_degisimi_perc -- confirm.
segment_keys = ['sigorta_tip','musteri_segmenti']
dt['median_perc'] = dt.groupby(segment_keys)['hesap_degeri_degisimi'].transform('median')

# 0 = strictly below the pair's median, 1 = everything else.
below_median = dt['hesap_degeri_degisimi'] < dt['median_perc']
dt['hesap_degisim_group'] = np.where(below_median, 0, 1)

In [None]:
# Split the combined frame back into labelled (train) and unlabelled (test) rows.
train = dt.loc[dt['artis_durumu'].notna()]
# Drop every column that contains at least one missing value in the labelled rows.
# The explicit copy makes the column assignment below act on an independent frame.
train = train.dropna(axis=1, how='any').copy()

train['artis_durumu'] = train['artis_durumu'].astype('category')

# Unlabelled rows, restricted to the columns that survived in train.
test = dt.loc[dt['artis_durumu'].isna()]
test = test[train.columns].copy()

# One row per (musteri_segmenti, sigorta_tip) pair with its policy count.
# The second reset_index() only creates an 'index' column, which is renamed to
# 'model_index' and overwritten below.
modeling_codes = pd.DataFrame(train.groupby(['musteri_segmenti','sigorta_tip'])['policy_id'].count()).reset_index().reset_index()
#modeling_codes.sort_values('sigorta_tip')

# sigorta_tip = 1 -- keep musteri_segmenti
# sigorta_tip = 4 -- do not detail on musteri_segmenti / use musteri_segmenti= 102
# sigorta_tip = 6 --  exclude 105 and 101
# sigorta_tip = 7 -- ok
# sigorta_tip = 8 -- do not detail on musteri_segmenti/ combine them
modeling_codes.rename(columns = {'index':'model_index'}, inplace = True)

# np.nan rather than np.NaN: the NaN alias was removed in NumPy 2.0.
modeling_codes['model_index'] = np.nan

# sigorta_tip 1 and 7: one model per pair, numbered 1..k in row order.
is_tip_1_or_7 = modeling_codes['sigorta_tip'].isin([1,7])
modeling_codes.loc[is_tip_1_or_7, 'model_index'] = range(1, 1 + int(is_tip_1_or_7.sum()))
# sigorta_tip 4, sigorta_tip 8 and (sigorta_tip 6 with segment 101/105) each share
# one model; max() skips NaN, so each line takes the next free number.
modeling_codes.loc[modeling_codes['sigorta_tip'].isin([4]), 'model_index'] = 1 + modeling_codes['model_index'].max()
modeling_codes.loc[modeling_codes['sigorta_tip'].isin([8]), 'model_index'] = 1 + modeling_codes['model_index'].max()
modeling_codes.loc[(modeling_codes['sigorta_tip'].isin([6])) & (modeling_codes['musteri_segmenti'].isin([101,105])), 'model_index'] = 1 + modeling_codes['model_index'].max()

# Remaining pairs get their own number from 16 upwards. The length is computed
# instead of hard-coding range(16, 20), which raised unless exactly four were left.
unassigned = modeling_codes['model_index'].isna()
modeling_codes.loc[unassigned, 'model_index'] = range(16, 16 + int(unassigned.sum()))

# Manual merges of models: 15 is folded into 16 and 2 into 4.
modeling_codes.loc[modeling_codes['model_index'] == 15,'model_index'] = 16
modeling_codes.loc[modeling_codes['model_index'] == 2,'model_index'] = 4
##
modeling_codes2 = pd.DataFrame(train.groupby(['sigorta_tip','musteri_segmenti','hesap_degisim_group'])['policy_id'].count()).reset_index().reset_index()
modeling_codes2 = modeling_codes2.sort_values('policy_id', ascending = False)
modeling_codes2 = modeling_codes2.merge(modeling_codes[['sigorta_tip','musteri_segmenti','model_index']], on = ['sigorta_tip','musteri_segmenti'])

modeling_codes2.loc[(modeling_codes2['model_index'] == 11)& (modeling_codes2['hesap_degisim_group'] == 1), 'model_index'] = 1 + max(modeling_codes2['model_index'])
modeling_codes2.loc[(modeling_codes2['model_index'] == 7) & (modeling_codes2['hesap_degisim_group'] == 1), 'model_index'] = 1 + max(modeling_codes2['model_index'])
modeling_codes2.loc[(modeling_codes2['model_index'] == 9) & (modeling_codes2['hesap_degisim_group'] == 1), 'model_index'] = 1 + max(modeling_codes2['model_index'])
modeling_codes2.loc[(modeling_codes2['model_index'] == 5) & (modeling_codes2['hesap_degisim_group'] == 1), 'model_index'] = 1 + max(modeling_codes2['model_index'])
modeling_codes2.loc[(modeling_codes2['model_index'] == 3) & (modeling_codes2['hesap_degisim_group'] == 1), 'model_index'] = 1 + max(modeling_codes2['model_index'])

train = train.merge(modeling_codes2[['sigorta_tip','musteri_segmenti','hesap_degisim_group','model_index']], on = ['sigorta_tip','musteri_segmenti','hesap_degisim_group'], how = 'left')
test = test.merge(modeling_codes2[['sigorta_tip','musteri_segmenti','hesap_degisim_group','model_index']], on = ['sigorta_tip','musteri_segmenti','hesap_degisim_group'], how = 'left')

In [None]:
# Model input columns. The order is significant: it fixes the column order of
# every design matrix built as frame[features].
_MONTHS = ['ocak', 'subat', 'mart', 'nisan', 'mayis', 'haziran',
           'temmuz', 'agustos', 'eylul', 'ekim', 'kasim', 'aralik']

features = (
    # customer / account basics
    ['dogum_tarihi', 'cinsiyet', 'memleket', 'gelir', 'cocuk_sayisi',
     'sene_basi_hesap_degeri', 'sene_sonu_hesap_degeri',
     'year', 'month']
    # monthly paid amount, due amount and payment ratio
    + [month + '_odenen_tutar' for month in _MONTHS]
    + [month + '_vade_tutari' for month in _MONTHS]
    + [month + '_odeme_orani' for month in _MONTHS]
    # aggregates over the year
    + ['average_vade_tutari', 'min_vade_tutari', 'max_vade_tutari', 'change_in_vade_tutari', 'vade_vs_gelir',
       'toplam_notpaid', 'hesap_degeri_degisimi', 'hesap_degeri_degisimi_perc',
       'sozlesme_kokeni_NEW', 'sozlesme_kokeni_detay_NEW',  # 'uyruk_TR' left out
       'medeni_hal_0', 'medeni_hal_1']
    # one-hot / numbered indicator families
    + ['yatirim_karakteri_' + level
       for level in ('Bilge', 'Cesur', 'Dengeli', 'Dikkatli', 'Temkinli', 'Yetkin')]
    + ['police_sehir_' + str(k) for k in range(1, 13)]
    + ['Kapsam_Tipi_' + str(k) for k in range(1, 13)]
    + ['meslek_kirilim_encoded', 'dagitim_kanali_encoded', 'kapsam_grubu_encoded',
       'odeme_ratio', 'trend_vade', 'trend_vade_odenen_fark']
    + [prefix + '_' + suffix
       for prefix in ('PD', 'PY', 'PA')
       for suffix in ('OMEDI', 'ODEDI')]
    + ['office_id_grouped_encoded']
    + ['kapsam_tipi_grouped_' + str(k) for k in range(11)]
    + ['office_mean_vade_tutari', 'vade_in_office',
       'num_of_artis']
)

In [None]:
## best config
# lgbm & rf & opt ensemble.. new features .. 
# For every model_index segment: fit LightGBM and a random forest on an 80/20
# stratified split, blend their positive-class probabilities with weights from
# optimal_ensemble, pick a cutoff with get_optimal_cutoff2, then label the
# submission rows of that segment.
test['pred'] = np.nan  # np.NaN alias was removed in NumPy 2.0
test_preds = pd.DataFrame()

for i in pd.unique(train.model_index):
    print(i)
    sub_train = train[(train['model_index'] == i)]
    sub_test  = test[(test['model_index'] == i)]
    
    X_train, X_test, y_train, y_test = train_test_split(sub_train[features],
                                                        sub_train['artis_durumu'],
                                                        test_size=0.2,
                                                        stratify = sub_train['artis_durumu'],
                                                        random_state=0)
    
    # NOTE(review): 'f1_score' does not look like a built-in LightGBM metric
    # name -- confirm it is not silently ignored.
    lgbm_fit = lgbm.LGBMClassifier(boosting_type='gbdt', 
                                   objective='binary', 
                                   metric='f1_score',
                                   feature_fraction = 0.4,
                                   bagging_fraction = 0.6,
                                   n_estimators = 200,
                                   max_depth = 3
                                  )
    lgbm_fit.fit(X_train,y_train)
                 #,eval_metric = "auc",
                 #eval_set=[(X_train,y_train),(X_test,y_test)],
                 #callbacks=[log_evaluation(period=10)]) #early_stopping(stopping_rounds=50, first_metric_only=True),
    
    # despite the lr_ prefix this is the random-forest member of the ensemble
    lr_fit = RandomForestClassifier(random_state=SEED,
                                   n_estimators = 400) #class_weight
    lr_fit.fit(X_train,y_train)
    
    prob1 = lgbm_fit.predict_proba(X_test)
    prob2 = lr_fit.predict_proba(X_test)
    
    # keep only the positive-class column (as a Series named 1)
    prob1 = pd.DataFrame(prob1)[1]
    prob2 = pd.DataFrame(prob2)[1]
    
    # Blend weights and cutoff are both chosen using the hold-out labels, so the
    # hold-out F1 printed below is presumably optimistic.
    c1,c2 = optimal_ensemble(prob1,prob2,y_test)
    prob = c1*prob1+c2*prob2
    
    cutoff = get_optimal_cutoff2(pd.DataFrame(prob)[1], X_test, y_test)
    y_test_pred= np.where(pd.DataFrame(prob)[1]<=cutoff, 0, 1)
    test_f1_cutoff = f1_score(y_test,y_test_pred)
    print("model_index " + str(i) + " : test_f1_score= " + str(test_f1_cutoff))
    
    # accumulate hold-out predictions so a pooled F1 can be computed afterwards
    test_preds = pd.concat([test_preds,
                       pd.DataFrame({ 
                           'model_index': i,
                           'actual':y_test,
                           'preds' :y_test_pred})]
                      , ignore_index=True)
    
    #train performance
    prob1 = lgbm_fit.predict_proba(X_train)
    prob2 = lr_fit.predict_proba(X_train)
    
    prob1 = pd.DataFrame(prob1)[1]
    prob2 = pd.DataFrame(prob2)[1]
    prob = c1*prob1+c2*prob2
    
    y_train_pred= np.where(pd.DataFrame(prob)[1]<=cutoff, 0, 1)
    train_f1_cutoff = f1_score(y_train,y_train_pred)
    print("model_index " + str(i) + " : train_f1_score= " + str(train_f1_cutoff))


    ## submission preds
    if sub_test.empty:
        # No submission rows in this segment; the estimators reject zero-row
        # input, so skip instead of aborting the whole loop.
        continue
    # here prob1/prob2 stay as 2-column arrays; column 1 is the positive class
    prob1 = lgbm_fit.predict_proba(sub_test[features])
    prob2 = lr_fit.predict_proba(sub_test[features]) 
    prob = c1*prob1+c2*prob2
    
    subm_pred = np.where(pd.DataFrame(prob)[1]<=cutoff, 0, 1)
    test.loc[(test['model_index'] == i), 'pred'] = subm_pred

In [None]:
# Pooled F1 over every hold-out prediction accumulated in test_preds.
f1_score(test_preds['actual'], test_preds['preds'])

## Feature Elim

NOT APPLIED DUE TO VERY LONG RUN DURATIONS

In [None]:
from sklearn.feature_selection import RFECV
## rfe added - but takes too much time
# lgbm & rf & opt ensemble.. new features .. 
# Same per-segment LightGBM + random-forest ensemble as the other cells, but
# preceded by recursive feature elimination. Only segment 4 is run here.
test['pred'] = np.nan  # np.NaN alias was removed in NumPy 2.0
test_preds = pd.DataFrame()

for i in [4]:#pd.unique(train.model_index):
    print(i)
    sub_train = train[(train['model_index'] == i)]
    sub_test  = test[(test['model_index'] == i)]
    print("data shape:", sub_train.shape[0])
    
    #feature selection
    # The selector is fitted on the whole segment, i.e. before the hold-out split.
    rfe = RandomForestClassifier()
    rfe = RFECV(estimator=rfe, step=1, cv=2,scoring='roc_auc')   # 2-fold cross-validation (cv=2)
    rfe = rfe.fit(sub_train[features], sub_train['artis_durumu'])
    best_features = sub_train[features].columns[rfe.support_]
    print('Optimal number of features :', rfe.n_features_)
    print('Best features :',best_features )
    #best_features = features
    
    X_train, X_test, y_train, y_test = train_test_split(sub_train[best_features],
                                                        sub_train['artis_durumu'],
                                                        test_size=0.2,
                                                        stratify = sub_train['artis_durumu'],
                                                        random_state=0)
    
    # NOTE(review): 'f1_score' does not look like a built-in LightGBM metric
    # name -- confirm it is not silently ignored.
    lgbm_fit = lgbm.LGBMClassifier(boosting_type='gbdt', 
                                   objective='binary', 
                                   metric='f1_score',
                                   #learning_rate = 0.005,
                                   feature_fraction = 0.4,
                                   bagging_fraction = 0.6,
                                   n_estimators = 300,
                                   max_depth = 3
                                  )
    # the hold-out split also serves as the early-stopping validation set
    lgbm_fit.fit(X_train,y_train,
                 eval_metric = "auc",
                 eval_set=[(X_train,y_train),(X_test,y_test)],
                 callbacks=[early_stopping(stopping_rounds=50, first_metric_only=True),
                            log_evaluation(period=10)])
    
    # despite the lr_ prefix this is the random-forest member of the ensemble
    lr_fit = RandomForestClassifier(random_state=SEED,
                                   n_estimators = 400) #class_weight
    lr_fit.fit(X_train,y_train)
    
    prob1 = lgbm_fit.predict_proba(X_test)
    prob2 = lr_fit.predict_proba(X_test)
    
    # keep only the positive-class column (as a Series named 1)
    prob1 = pd.DataFrame(prob1)[1]
    prob2 = pd.DataFrame(prob2)[1]
    
    # Blend weights and cutoff are both chosen using the hold-out labels, so the
    # hold-out F1 printed below is presumably optimistic.
    c1,c2 = optimal_ensemble(prob1,prob2,y_test)
    prob = c1*prob1+c2*prob2
    
    cutoff = get_optimal_cutoff2(pd.DataFrame(prob)[1], X_test, y_test)
    y_test_pred= np.where(pd.DataFrame(prob)[1]<=cutoff, 0, 1)
    test_f1_cutoff = f1_score(y_test,y_test_pred)
    print("model_index " + str(i) + " : test_f1_score= " + str(test_f1_cutoff))
    
    # accumulate hold-out predictions so a pooled F1 can be computed afterwards
    test_preds = pd.concat([test_preds,
                       pd.DataFrame({ 
                           'model_index': i,
                           'actual':y_test,
                           'preds' :y_test_pred})]
                      , ignore_index=True)
    
    #train performance
    prob1 = lgbm_fit.predict_proba(X_train)
    prob2 = lr_fit.predict_proba(X_train)
    
    prob1 = pd.DataFrame(prob1)[1]
    prob2 = pd.DataFrame(prob2)[1]
    prob = c1*prob1+c2*prob2
    
    y_train_pred= np.where(pd.DataFrame(prob)[1]<=cutoff, 0, 1)
    train_f1_cutoff = f1_score(y_train,y_train_pred)
    print("model_index " + str(i) + " : train_f1_score= " + str(train_f1_cutoff))


    ## submission preds
    if sub_test.empty:
        # No submission rows in this segment; the estimators reject zero-row
        # input, so skip instead of aborting the whole loop.
        continue
    # here prob1/prob2 stay as 2-column arrays; column 1 is the positive class
    prob1 = lgbm_fit.predict_proba(sub_test[best_features])
    prob2 = lr_fit.predict_proba(sub_test[best_features]) 
    prob = c1*prob1+c2*prob2
    
    subm_pred = np.where(pd.DataFrame(prob)[1]<=cutoff, 0, 1)
    test.loc[(test['model_index'] == i), 'pred'] = subm_pred
    

In [None]:
# Pooled F1 over every hold-out prediction accumulated in test_preds.
f1_score(test_preds['actual'], test_preds['preds'])

## Deeper level with Kapsam_tipi

In [None]:
# One row per (musteri_segmenti, sigorta_tip) pair with its policy count.
modeling_codes = pd.DataFrame(train.groupby(['musteri_segmenti','sigorta_tip'])['policy_id'].count()).reset_index().reset_index()
#modeling_codes.sort_values('sigorta_tip')

# sigorta_tip = 1 -- keep musteri_segmenti
# sigorta_tip = 4 -- do not detail on musteri_segmenti / use musteri_segmenti= 102
# sigorta_tip = 6 --  exclude 105 and 101
# sigorta_tip = 7 -- ok
# sigorta_tip = 8 -- do not detail on musteri_segmenti/ combine them
modeling_codes.rename(columns = {'index':'model_index'}, inplace = True)

# np.nan instead of np.NaN: the capitalised alias was removed in NumPy 2.0.
modeling_codes['model_index'] = np.nan
# sigorta_tip 1 and 7: one model per row (i.e. per customer segment)
modeling_codes.loc[modeling_codes['sigorta_tip'].isin([1,7]), 'model_index']= range(1, 1+ len(modeling_codes[modeling_codes['sigorta_tip'].isin([1,7])]))
# sigorta_tip 4 and 8: a single shared model each
modeling_codes.loc[modeling_codes['sigorta_tip'].isin([4]), 'model_index'] = 1 + modeling_codes['model_index'].max()
modeling_codes.loc[modeling_codes['sigorta_tip'].isin([8]), 'model_index'] = 1 + modeling_codes['model_index'].max()
# sigorta_tip 6, segments 101 and 105: one shared model
modeling_codes.loc[(modeling_codes['sigorta_tip'].isin([6])) & (modeling_codes['musteri_segmenti'].isin([101,105])), 'model_index'] = 1 + modeling_codes['model_index'].max()
# NOTE(review): the hard-coded range assumes exactly four rows are still
# unassigned here; any other count makes this assignment fail.
modeling_codes.loc[modeling_codes['model_index'].isnull(), 'model_index'] = range(16,20)
# manual merges of segments: 15 -> 16 and 2 -> 4
modeling_codes.loc[modeling_codes['model_index'] == 15,'model_index'] = 16
modeling_codes.loc[modeling_codes['model_index'] == 2,'model_index'] = 4
##
# Refine the segmentation with kapsam_tipi; each row inherits the base
# model_index of its (sigorta_tip, musteri_segmenti) pair.
modeling_codes2 = pd.DataFrame(train.groupby(['sigorta_tip','musteri_segmenti','kapsam_tipi'])['policy_id'].count()).reset_index().reset_index()
modeling_codes2 = modeling_codes2.sort_values('policy_id', ascending = False)
modeling_codes2 = modeling_codes2.merge(modeling_codes[['sigorta_tip','musteri_segmenti','model_index']], on = ['sigorta_tip','musteri_segmenti'])


In [None]:
# Give selected kapsam_tipi values inside base segments 11 and 9 their own
# model_index. Each split takes the current maximum + 1, so the order of the
# entries below determines which new index every split receives.
kapsam_splits = [
    (11, ['PENSION215', 'PENSION056', 'PENSION001', 'PENSION312',
          'PENSION247', 'PENSION121', 'PENSION195', 'PENSION251',
          'PENSION243', 'PENSION202', 'PENSION238', 'PENSION245']),
    (9, ['PENSION247', 'PENSION251', 'PENSION215',
         'PENSION312', 'PENSION195', 'PENSION243']),
]

for base_index, kapsam_codes in kapsam_splits:
    for kapsam_code in kapsam_codes:
        split_mask = (modeling_codes2['model_index'] == base_index) & (modeling_codes2['kapsam_tipi'] == kapsam_code)
        modeling_codes2.loc[split_mask, 'model_index'] = 1 + max(modeling_codes2['model_index'])

In [None]:
# Discard the previous segmentation so a fresh model_index can be merged in.
for frame in (train, test):
    frame.drop(columns=['model_index'], inplace=True)

In [None]:
# Attach the kapsam_tipi-level model_index to every row. With a left join,
# key combinations missing from modeling_codes2 get NaN.
merge_keys = ['sigorta_tip', 'musteri_segmenti', 'kapsam_tipi']
index_lookup = modeling_codes2[merge_keys + ['model_index']]

train = train.merge(index_lookup, on=merge_keys, how='left')
test = test.merge(index_lookup, on=merge_keys, how='left')

In [None]:
# Number of training rows per segment model.
train.model_index.value_counts()

In [None]:
## best config
# lgbm & rf & opt ensemble.. new features .. 
# For every model_index segment: fit LightGBM (with early stopping) and a
# random forest on an 80/20 stratified split, blend their positive-class
# probabilities with weights from optimal_ensemble, pick a cutoff with
# get_optimal_cutoff2, then label the submission rows of that segment.
test['pred'] = np.nan  # np.NaN alias was removed in NumPy 2.0
test_preds = pd.DataFrame()

for i in pd.unique(train.model_index):
    print(i)
    sub_train = train[(train['model_index'] == i)]
    sub_test  = test[(test['model_index'] == i)]
    
    X_train, X_test, y_train, y_test = train_test_split(sub_train[features],
                                                        sub_train['artis_durumu'],
                                                        test_size=0.2,
                                                        stratify = sub_train['artis_durumu'],
                                                        random_state=0)
    
    # NOTE(review): 'f1_score' does not look like a built-in LightGBM metric
    # name -- confirm it is not silently ignored.
    lgbm_fit = lgbm.LGBMClassifier(boosting_type='gbdt', 
                                   objective='binary', 
                                   metric='f1_score',
                                   feature_fraction = 0.4,
                                   bagging_fraction = 0.6,
                                   n_estimators = 200,
                                   max_depth = 3
                                  )
    # the hold-out split also serves as the early-stopping validation set
    lgbm_fit.fit(X_train,y_train,eval_metric = "auc",
                 eval_set=[(X_train,y_train),(X_test,y_test)],
                 callbacks=[early_stopping(stopping_rounds=50, first_metric_only=True),
                            log_evaluation(period=10)])
    
    # despite the lr_ prefix this is the random-forest member of the ensemble
    lr_fit = RandomForestClassifier(random_state=SEED,
                                   n_estimators = 400) #class_weight
    lr_fit.fit(X_train,y_train)
    
    prob1 = lgbm_fit.predict_proba(X_test)
    prob2 = lr_fit.predict_proba(X_test)
    
    # keep only the positive-class column (as a Series named 1)
    prob1 = pd.DataFrame(prob1)[1]
    prob2 = pd.DataFrame(prob2)[1]
    
    # Blend weights and cutoff are both chosen using the hold-out labels, so the
    # hold-out F1 printed below is presumably optimistic.
    c1,c2 = optimal_ensemble(prob1,prob2,y_test)
    prob = c1*prob1+c2*prob2
    
    cutoff = get_optimal_cutoff2(pd.DataFrame(prob)[1], X_test, y_test)
    y_test_pred= np.where(pd.DataFrame(prob)[1]<=cutoff, 0, 1)
    test_f1_cutoff = f1_score(y_test,y_test_pred)
    print("model_index " + str(i) + " : test_f1_score= " + str(test_f1_cutoff))
    
    # accumulate hold-out predictions so a pooled F1 can be computed afterwards
    test_preds = pd.concat([test_preds,
                       pd.DataFrame({ 
                           'model_index': i,
                           'actual':y_test,
                           'preds' :y_test_pred})]
                      , ignore_index=True)
    
    #train performance
    prob1 = lgbm_fit.predict_proba(X_train)
    prob2 = lr_fit.predict_proba(X_train)
    
    prob1 = pd.DataFrame(prob1)[1]
    prob2 = pd.DataFrame(prob2)[1]
    prob = c1*prob1+c2*prob2
    
    y_train_pred= np.where(pd.DataFrame(prob)[1]<=cutoff, 0, 1)
    train_f1_cutoff = f1_score(y_train,y_train_pred)
    print("model_index " + str(i) + " : train_f1_score= " + str(train_f1_cutoff))


    ## submission preds
    if sub_test.empty:
        # No submission rows in this segment; the estimators reject zero-row
        # input, so skip instead of aborting the whole loop.
        continue
    # here prob1/prob2 stay as 2-column arrays; column 1 is the positive class
    prob1 = lgbm_fit.predict_proba(sub_test[features])
    prob2 = lr_fit.predict_proba(sub_test[features]) 
    prob = c1*prob1+c2*prob2
    
    subm_pred = np.where(pd.DataFrame(prob)[1]<=cutoff, 0, 1)
    test.loc[(test['model_index'] == i), 'pred'] = subm_pred

In [None]:
# Pooled F1 over every hold-out prediction accumulated in test_preds.
f1_score(test_preds['actual'], test_preds['preds'])

## Deeper level with office_id

In [None]:
# One row per (musteri_segmenti, sigorta_tip) pair with its policy count.
modeling_codes = pd.DataFrame(train.groupby(['musteri_segmenti','sigorta_tip'])['policy_id'].count()).reset_index().reset_index()
#modeling_codes.sort_values('sigorta_tip')

# sigorta_tip = 1 -- keep musteri_segmenti
# sigorta_tip = 4 -- do not detail on musteri_segmenti / use musteri_segmenti= 102
# sigorta_tip = 6 --  exclude 105 and 101
# sigorta_tip = 7 -- ok
# sigorta_tip = 8 -- do not detail on musteri_segmenti/ combine them
modeling_codes.rename(columns = {'index':'model_index'}, inplace = True)

# np.nan instead of np.NaN: the capitalised alias was removed in NumPy 2.0.
modeling_codes['model_index'] = np.nan
# sigorta_tip 1 and 7: one model per row (i.e. per customer segment)
modeling_codes.loc[modeling_codes['sigorta_tip'].isin([1,7]), 'model_index']= range(1, 1+ len(modeling_codes[modeling_codes['sigorta_tip'].isin([1,7])]))
# sigorta_tip 4 and 8: a single shared model each
modeling_codes.loc[modeling_codes['sigorta_tip'].isin([4]), 'model_index'] = 1 + modeling_codes['model_index'].max()
modeling_codes.loc[modeling_codes['sigorta_tip'].isin([8]), 'model_index'] = 1 + modeling_codes['model_index'].max()
# sigorta_tip 6, segments 101 and 105: one shared model
modeling_codes.loc[(modeling_codes['sigorta_tip'].isin([6])) & (modeling_codes['musteri_segmenti'].isin([101,105])), 'model_index'] = 1 + modeling_codes['model_index'].max()
# NOTE(review): the hard-coded range assumes exactly four rows are still
# unassigned here; any other count makes this assignment fail.
modeling_codes.loc[modeling_codes['model_index'].isnull(), 'model_index'] = range(16,20)
# manual merges of segments: 15 -> 16 and 2 -> 4
modeling_codes.loc[modeling_codes['model_index'] == 15,'model_index'] = 16
modeling_codes.loc[modeling_codes['model_index'] == 2,'model_index'] = 4
##
# Refine the segmentation with office_id; each row inherits the base
# model_index of its (sigorta_tip, musteri_segmenti) pair.
modeling_codes2 = pd.DataFrame(train.groupby(['sigorta_tip','musteri_segmenti','office_id'])['policy_id'].count()).reset_index().reset_index()
modeling_codes2 = modeling_codes2.sort_values('policy_id', ascending = False)

modeling_codes2 = modeling_codes2.merge(modeling_codes[['sigorta_tip','musteri_segmenti','model_index']], on = ['sigorta_tip','musteri_segmenti'])
#modeling_codes2.loc[modeling_codes2['policy_id'] > 3000,'model_index']= range()

# inspection only: not the last expression of the cell, so nothing is displayed
modeling_codes2[(modeling_codes2['policy_id'] > 2000)]

# Give selected (base model_index, office_id) combinations their own
# model_index. Each split takes the current maximum + 1, so the order of the
# pairs determines which new index every split receives.
office_splits = [
    (9, 10006),
    (11, 10006),
    (11, 10017),
    (7, 10006),
    (6, 10006),
    (4, 10006),
    (3, 10006),
    (5, 10006),
    (11, 10091),
    (11, 10029),
    (11, 10117),
    (11, 10023),
]
for base_index, office in office_splits:
    split_mask = (modeling_codes2['model_index'] == base_index) & (modeling_codes2['office_id'] == office)
    modeling_codes2.loc[split_mask, 'model_index'] = 1 + max(modeling_codes2['model_index'])

In [None]:
#train.drop(columns = ['model_index_x','model_index_y'],inplace = True)
#test.drop(columns = ['model_index_x','model_index_y'],inplace = True)

# Discard the previous segmentation so a fresh model_index can be merged in.
for frame in (train, test):
    frame.drop(columns=['model_index'], inplace=True)

In [None]:
# Attach the office_id-level model_index to every row. With a left join,
# key combinations missing from modeling_codes2 get NaN.
merge_keys = ['sigorta_tip', 'musteri_segmenti', 'office_id']
index_lookup = modeling_codes2[merge_keys + ['model_index']]

train = train.merge(index_lookup, on=merge_keys, how='left')
test = test.merge(index_lookup, on=merge_keys, how='left')

In [None]:
# Number of training rows per segment model.
train.model_index.value_counts()

In [None]:
## best config
# lgbm & rf & opt ensemble.. new features .. 
# For every model_index segment: fit LightGBM (with early stopping) and a
# random forest on an 80/20 stratified split, blend their positive-class
# probabilities with weights from optimal_ensemble, pick a cutoff with
# get_optimal_cutoff2, then label the submission rows of that segment.
test['pred'] = np.nan  # np.NaN alias was removed in NumPy 2.0
test_preds = pd.DataFrame()

for i in pd.unique(train.model_index):
    print(i)
    sub_train = train[(train['model_index'] == i)]
    sub_test  = test[(test['model_index'] == i)]
    
    X_train, X_test, y_train, y_test = train_test_split(sub_train[features],
                                                        sub_train['artis_durumu'],
                                                        test_size=0.2,
                                                        stratify = sub_train['artis_durumu'],
                                                        random_state=0)
    
    # NOTE(review): 'f1_score' does not look like a built-in LightGBM metric
    # name -- confirm it is not silently ignored.
    lgbm_fit = lgbm.LGBMClassifier(boosting_type='gbdt', 
                                   objective='binary', 
                                   metric='f1_score',
                                   feature_fraction = 0.4,
                                   bagging_fraction = 0.6,
                                   n_estimators = 100,
                                   max_depth = 3
                                  )
    # the hold-out split also serves as the early-stopping validation set
    lgbm_fit.fit(X_train,y_train,eval_metric = "auc",
                 eval_set=[(X_train,y_train),(X_test,y_test)],
                 callbacks=[early_stopping(stopping_rounds=30, first_metric_only=True),
                            log_evaluation(period=10)])
    
    # despite the lr_ prefix this is the random-forest member of the ensemble
    lr_fit = RandomForestClassifier(random_state=SEED,
                                   n_estimators = 400) #class_weight
    lr_fit.fit(X_train,y_train)
    
    prob1 = lgbm_fit.predict_proba(X_test)
    prob2 = lr_fit.predict_proba(X_test)
    
    # keep only the positive-class column (as a Series named 1)
    prob1 = pd.DataFrame(prob1)[1]
    prob2 = pd.DataFrame(prob2)[1]
    
    # Blend weights and cutoff are both chosen using the hold-out labels, so the
    # hold-out F1 printed below is presumably optimistic.
    c1,c2 = optimal_ensemble(prob1,prob2,y_test)
    prob = c1*prob1+c2*prob2
    
    cutoff = get_optimal_cutoff2(pd.DataFrame(prob)[1], X_test, y_test)
    y_test_pred= np.where(pd.DataFrame(prob)[1]<=cutoff, 0, 1)
    test_f1_cutoff = f1_score(y_test,y_test_pred)
    print("model_index " + str(i) + " : test_f1_score= " + str(test_f1_cutoff))
    
    # accumulate hold-out predictions so a pooled F1 can be computed afterwards
    test_preds = pd.concat([test_preds,
                       pd.DataFrame({ 
                           'model_index': i,
                           'actual':y_test,
                           'preds' :y_test_pred})]
                      , ignore_index=True)
    
    #train performance
    prob1 = lgbm_fit.predict_proba(X_train)
    prob2 = lr_fit.predict_proba(X_train)
    
    prob1 = pd.DataFrame(prob1)[1]
    prob2 = pd.DataFrame(prob2)[1]
    prob = c1*prob1+c2*prob2
    
    y_train_pred= np.where(pd.DataFrame(prob)[1]<=cutoff, 0, 1)
    train_f1_cutoff = f1_score(y_train,y_train_pred)
    print("model_index " + str(i) + " : train_f1_score= " + str(train_f1_cutoff))


    ## submission preds
    if sub_test.empty:
        # No submission rows in this segment; the estimators reject zero-row
        # input, so skip instead of aborting the whole loop.
        continue
    # here prob1/prob2 stay as 2-column arrays; column 1 is the positive class
    prob1 = lgbm_fit.predict_proba(sub_test[features])
    prob2 = lr_fit.predict_proba(sub_test[features]) 
    prob = c1*prob1+c2*prob2
    
    subm_pred = np.where(pd.DataFrame(prob)[1]<=cutoff, 0, 1)
    test.loc[(test['model_index'] == i), 'pred'] = subm_pred

In [None]:
# Bar chart of the random-forest feature importances, largest first.
# lr_fit and X_train are the notebook globals left by the most recent training
# loop, so the chart reflects only the last segment that was fitted.
importances = lr_fit.feature_importances_
sorted_indices = np.argsort(importances)[::-1]  # descending importance

n_features = X_train.shape[1]
bar_positions = range(n_features)

fig = plt.gcf()
fig.set_size_inches(18, 8)

plt.title('Feature Importance')
plt.bar(bar_positions, importances[sorted_indices], align='center')
plt.xticks(bar_positions, X_train.columns[sorted_indices], rotation=90)
plt.tight_layout()
plt.show()

In [None]:
# Pooled F1 over every hold-out prediction accumulated in test_preds.
f1_score(test_preds['actual'], test_preds['preds'])

## Modeling with SMOTE Sampling

In [None]:
# lgbm & rf & opt ensemble with soft SMOTE
# Per segment: standardise the features, oversample the training part with
# SMOTE, fit LightGBM and a random forest on the oversampled data, blend their
# positive-class probabilities and label the submission rows of that segment.
test['pred'] = np.nan  # np.NaN alias was removed in NumPy 2.0
test_preds = pd.DataFrame()

for i in pd.unique(train.model_index):
    print(i)
    sub_train = train[(train['model_index'] == i)]
    sub_test  = test[(test['model_index'] == i)]
    
    X_train, X_test, y_train, y_test = train_test_split(sub_train[features],
                                                        sub_train['artis_durumu'],
                                                        test_size=0.2,
                                                        stratify = sub_train['artis_durumu'],
                                                        random_state=0)
    
    # scaler is fitted on the training part only and re-used for the other splits
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_train_scaled = pd.DataFrame(X_train_scaled, columns=X_train.columns)
    X_test_scaled = scaler.transform(X_test)
    X_test_scaled = pd.DataFrame(X_test_scaled, columns=X_test.columns)
    
    ## SMOTE 
    # NOTE(review): Series.ravel() is reportedly deprecated in recent pandas --
    # confirm against the pandas version in use.
    over = SMOTE(sampling_strategy=0.2, k_neighbors=10,random_state = 0)
    X_train_over, y_train_over = over.fit_resample(X_train_scaled, y_train.ravel())

    # NOTE(review): 'f1_score' does not look like a built-in LightGBM metric
    # name -- confirm it is not silently ignored.
    lgbm_fit = lgbm.LGBMClassifier(boosting_type='gbdt', 
                                   objective='binary', 
                                   metric='f1_score',
                                   feature_fraction = 0.4,
                                   bagging_fraction = 0.6,
                                   n_estimators = 100,
                                   max_depth = 3
                                  )
    lgbm_fit.fit(X_train_over,y_train_over)
    
    # despite the lr_ prefix this is the random-forest member of the ensemble
    lr_fit = RandomForestClassifier(random_state=SEED,
                                   n_estimators = 400) #class_weight
    lr_fit.fit(X_train_over,y_train_over)
    
    prob1 = lgbm_fit.predict_proba(X_test_scaled)
    prob2 = lr_fit.predict_proba(X_test_scaled)
    
    # keep only the positive-class column (as a Series named 1)
    prob1 = pd.DataFrame(prob1)[1]
    prob2 = pd.DataFrame(prob2)[1]
    
    # Blend weights and cutoff are both chosen using the hold-out labels, so the
    # hold-out F1 printed below is presumably optimistic.
    c1,c2 = optimal_ensemble(prob1,prob2,y_test)
    prob = c1*prob1+c2*prob2
    
    cutoff = get_optimal_cutoff2(pd.DataFrame(prob)[1], X_test_scaled, y_test)
    y_test_pred= np.where(pd.DataFrame(prob)[1]<=cutoff, 0, 1)
    test_f1_cutoff = f1_score(y_test,y_test_pred)
    print("model_index " + str(i) + " : test_f1_score= " + str(test_f1_cutoff))

    # accumulate hold-out predictions so a pooled F1 can be computed afterwards
    test_preds = pd.concat([test_preds,
                           pd.DataFrame({'actual':y_test,
                                        'preds' :y_test_pred})]
                          , ignore_index=True)
    
    if sub_test.empty:
        # No submission rows in this segment; the scaler and estimators reject
        # zero-row input, so skip instead of aborting the whole loop.
        continue
    # Keep the feature names, as done for X_test_scaled, so the submission
    # matrix has the same form as the data the models were validated on.
    sub_test_scaled = pd.DataFrame(scaler.transform(sub_test[features]), columns=features)
    # here prob1/prob2 stay as 2-column arrays; column 1 is the positive class
    prob1 = lgbm_fit.predict_proba(sub_test_scaled)
    prob2 = lr_fit.predict_proba(sub_test_scaled) 
    prob = c1*prob1+c2*prob2
    
    subm_pred = np.where(pd.DataFrame(prob)[1]<=cutoff, 0, 1)
    test.loc[(test['model_index'] == i), 'pred'] = subm_pred

## Ensemble of two models with mean probs

In [None]:
# lgbm & rf
# Per segment: fit LightGBM and a random forest, average their predicted class
# probabilities with equal weight, pick a cutoff with get_optimal_cutoff2 and
# label the submission rows of that segment.
test['pred'] = np.nan  # np.NaN alias was removed in NumPy 2.0
test_preds = pd.DataFrame()

for i in pd.unique(train.model_index):
    print(i)
    sub_train = train[(train['model_index'] == i)]
    sub_test  = test[(test['model_index'] == i)]
    
    X_train, X_test, y_train, y_test = train_test_split(sub_train[features],
                                                        sub_train['artis_durumu'],
                                                        test_size=0.2,
                                                        stratify = sub_train['artis_durumu'],
                                                        random_state=0)
    
    # NOTE(review): 'f1_score' does not look like a built-in LightGBM metric
    # name -- confirm it is not silently ignored.
    lgbm_fit = lgbm.LGBMClassifier(boosting_type='gbdt', 
                                   objective='binary', 
                                   metric='f1_score',
                                   feature_fraction = 0.4,
                                   bagging_fraction = 0.6,
                                   n_estimators = 100,
                                   max_depth = 3
                                  )
    lgbm_fit.fit(X_train,y_train)
    
    # despite the lr_ prefix this is the random-forest member of the ensemble
    lr_fit = RandomForestClassifier(random_state=SEED,
                                   n_estimators = 400) #class_weight
    lr_fit.fit(X_train,y_train)
    
    prob1 = lgbm_fit.predict_proba(X_test)
    prob2 = lr_fit.predict_proba(X_test)
    # element-wise mean of the two (n_rows, 2) probability arrays
    prob = np.mean( np.array([ prob1, prob2 ]), axis=0 )
    
    # NOTE(review): the full two-column array is passed here, whereas the other
    # training cells pass only the positive-class column -- confirm that
    # get_optimal_cutoff2 accepts both forms.
    cutoff = get_optimal_cutoff2(prob, X_test, y_test)
    y_test_pred= np.where(pd.DataFrame(prob)[1]<=cutoff, 0, 1)
    test_f1_cutoff = f1_score(y_test,y_test_pred)
    print("model_index " + str(i) + " : test_f1_score= " + str(test_f1_cutoff))

    # accumulate hold-out predictions so a pooled F1 can be computed afterwards
    test_preds = pd.concat([test_preds,
                           pd.DataFrame({'actual':y_test,
                                        'preds' :y_test_pred})]
                          , ignore_index=True)
    
    if sub_test.empty:
        # No submission rows in this segment; the estimators reject zero-row
        # input, so skip instead of aborting the whole loop.
        continue
    prob1 = lgbm_fit.predict_proba(sub_test[features])
    prob2 = lr_fit.predict_proba(sub_test[features])
    prob = np.mean( np.array([ prob1, prob2 ]), axis=0 )    
    
    subm_pred = np.where(pd.DataFrame(prob)[1]<=cutoff, 0, 1)
    test.loc[(test['model_index'] == i), 'pred'] = subm_pred

## Only LGBM 

In [None]:
## only LGBM
# One LightGBM classifier per `model_index` segment. The decision cutoff is
# chosen on a 20% hold-out of that segment and reused for its submission rows.
test['pred'] = np.nan  # `np.NaN` alias was removed in NumPy 2.0
test_preds = pd.DataFrame()  # hold-out labels/predictions pooled over segments

for i in pd.unique(train.model_index):
    print(i)
    sub_train = train[train['model_index'] == i]
    sub_test = test[test['model_index'] == i]

    # Stratify on the target so this hold-out matches the one used by the
    # ensemble cell (same random_state); otherwise the reported F1 scores and
    # tuned cutoffs of the two variants come from different splits.
    X_train, X_test, y_train, y_test = train_test_split(
        sub_train[features],
        sub_train['artis_durumu'],
        test_size=0.2,
        stratify=sub_train['artis_durumu'],
        random_state=0,
    )

    # NOTE(review): 'f1_score' does not appear among LightGBM's documented
    # built-in metric names, and `bagging_fraction` is documented to need
    # `bagging_freq > 0` to take effect -- confirm both do what is intended.
    lgbm_fit = lgbm.LGBMClassifier(
        boosting_type='gbdt',
        objective='binary',
        metric='f1_score',
        feature_fraction=0.4,
        bagging_fraction=0.6,
        n_estimators=100,
        max_depth=3,
    )
    lgbm_fit.fit(X_train, y_train)

    # The helper is defined elsewhere in the notebook.
    cutoff = get_optimal_cutoff(lgbm_fit, X_test, y_test)

    # Column 1 holds the probability of the positive class (label 1).
    holdout_proba = lgbm_fit.predict_proba(X_test)[:, 1]
    y_test_pred = np.where(holdout_proba <= cutoff, 0, 1)
    test_f1_cutoff = f1_score(y_test, y_test_pred)
    print(f"model_index {i!s} : test_f1_score= {test_f1_cutoff!s}")

    test_preds = pd.concat(
        [test_preds, pd.DataFrame({'actual': y_test, 'preds': y_test_pred})],
        ignore_index=True,
    )

    if sub_test.empty:
        # No submission rows for this segment: nothing to score, and
        # scikit-learn estimators reject 0-row input.
        continue

    # Apply the cutoff tuned on the hold-out to the submission rows.
    subm_proba = lgbm_fit.predict_proba(sub_test[features])[:, 1]
    subm_pred = np.where(subm_proba <= cutoff, 0, 1)
    test.loc[test['model_index'] == i, 'pred'] = subm_pred

In [None]:
# F1 over the hold-out predictions accumulated in `test_preds` by the most
# recently run training cell.
f1_score(test_preds['actual'], test_preds['preds'])

# Predictions for Submission

In [None]:
# Attach each policy's prediction to the submission template. The left join
# keeps every template row, in the template's order.
merged = sample.merge(test[['policy_id', 'pred']],
                      left_on='POLICY_ID',
                      right_on='policy_id',
                      how='left')

# A row with no prediction (NaN) is not equal to 0.0, so the mapping below
# writes it as class 1. Keep that outcome but report it instead of letting it
# pass silently.
n_unpredicted = int(merged['pred'].isna().sum())
if n_unpredicted:
    warnings.warn(
        f"{n_unpredicted} submission row(s) have no prediction and will be written as 1",
        RuntimeWarning,
    )

# Replace the template's placeholder target with the predicted one.
sample = (merged
          .drop(columns=['ARTIS_DURUMU', 'policy_id'])
          .rename(columns={'pred': 'ARTIS_DURUMU'}))

# Predictions were stored as floats; emit integer labels (0.0 -> 0, else 1).
sample['ARTIS_DURUMU'] = np.where(sample['ARTIS_DURUMU'] == 0.0, 0, 1)

In [None]:
# Class counts of the labels that will be submitted.
sample.ARTIS_DURUMU.value_counts()

In [None]:
# Write the submission file; the DataFrame index is not written.
sample.to_csv(path_or_buf='./submission.csv', index=False)