In [38]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier,RandomForestRegressor
from sklearn.metrics import roc_auc_score 
from sklearn.model_selection import GridSearchCV
import seaborn as sns
import time
from sklearn.naive_bayes import GaussianNB
import lightgbm as lgb
from tqdm import tqdm_notebook
from sklearn.ensemble import AdaBoostClassifier
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression
%matplotlib notebook

# Feature engineering

In [16]:
# Read the raw train/test tables; the `id` column is not a feature, so it is
# dropped from both frames.
train = pd.read_csv('train.csv').drop(columns = 'id')
test = pd.read_csv('test.csv').drop(columns = 'id')

# Rows whose `cancel` label equals -1 are excluded from training; report how many.
invalid_target = train.cancel == -1
print('Number of target we remove: {:,}'.format(invalid_target.sum()))
train = train[~invalid_target]

Number of target we remove: 3,452


In [17]:
# Training rows without a zip code are discarded; missing zip codes in the test
# set are replaced by the most frequent zip code seen in the training set.
train = train[train['zip.code'].notnull()]
# Assign the filled column back rather than calling fillna(inplace=True) on the
# selected column: the in-place form operates on an intermediate object and is
# not guaranteed to update `test` (it is a no-op under pandas Copy-on-Write).
test['zip.code'] = test['zip.code'].fillna(train['zip.code'].mode()[0])

# Stack the feature columns of both sets into one frame so the cleaning steps
# are applied uniformly; the `train` flag (1 = train, 0 = test) allows the two
# parts to be separated again.
train_copy = train.copy()
test_copy = test.copy()
train_copy.drop('cancel',axis = 1,inplace = True)
train_copy['train'] = 1
test_copy['train'] = 0
all_data = pd.concat([train_copy,test_copy],axis = 0).reset_index(drop = True)

# Labels are re-indexed from 0 so they line up positionally with the train rows
# that sit at the top of `all_data`.
target = train.cancel.reset_index(drop = True)
# Preliminary split taken before any cleaning; both frames are rebuilt from the
# cleaned `all_data` in a later cell.
all_data_train = all_data[all_data.train == 1].drop('train',axis = 1)
all_data_test = all_data[all_data.train == 0].drop('train',axis = 1)

## Age

In [18]:
# Ages above this threshold are treated as invalid and replaced by NaN.
Age_threshold = 100
age_is_implausible = all_data['ni.age'] > Age_threshold
all_data.loc[age_is_implausible, 'ni.age'] = np.nan

## Length at residence

In [19]:
# A residence period longer than the policyholder's age is treated as invalid
# and replaced by NaN.
exceeds_age = all_data['len.at.res'] > all_data['ni.age']
all_data.loc[exceeds_age, 'len.at.res'] = np.nan

# Share of rows whose length-at-residence is now missing.
len_at_res_null_rate = all_data['len.at.res'].isnull().sum() / len(all_data)
print('Length of residence null rate:{:.2%}'.format(len_at_res_null_rate))

Length of residence null rate:0.57%


## Tenure

In [20]:
# A tenure longer than the policyholder's age is treated as invalid and
# replaced by NaN.
exceeds_age = all_data['tenure'] > all_data['ni.age']
all_data.loc[exceeds_age, 'tenure'] = np.nan

# Share of rows whose tenure is now missing.
tenure_null_rate = all_data['tenure'].isnull().sum() / len(all_data)
print('Tenure null rate:{:.2%}'.format(tenure_null_rate))

Tenure null rate:0.10%


## Zipcode & income

In [21]:
# An earlier version of this cell read 'zip_code.csv' and 'US_Income.csv' and
# merged the zip-code table; it was replaced by the single workbook below.

# Attach the workbook's columns to every row by zip code. A left join keeps all
# rows of `all_data`, leaving NaN where the zip code has no match.
zip_income = pd.read_excel("MedianZIP-3.xlsx")
all_data = pd.merge(
    all_data,
    zip_income,
    how = 'left',
    left_on = 'zip.code',
    right_on = 'Zip',
)

In [22]:
# `Zip` was only the join key of the income table and duplicates `zip.code`.
all_data = all_data.drop(columns = ['Zip'])

In [23]:
# Collapse dwelling type into two levels: 'House' versus everything else
# (missing values also end up in 'Other').
is_house = all_data['dwelling.type'] == 'House'
all_data['dwelling.type'] = np.where(is_house, 'House', 'Other')

# Split train/test

In [24]:
# Separate the cleaned frame back into its train and test parts using the
# `train` flag, which is then no longer needed.
train_rows = all_data['train'] == 1
test_rows = all_data['train'] == 0
all_data_train = all_data[train_rows].drop(columns = 'train')
all_data_test = all_data[test_rows].drop(columns = 'train')

# Fill NA with mode & median

In [25]:
def fill_na_numerical(col):
    """Return column `col` of the global `all_data` with NaNs set to its median.

    The median is taken over the combined train+test frame. `all_data` itself
    is not modified; the filled Series is returned.
    """
    column = all_data[col]
    return column.fillna(column.median())

# Columns imputed with the training median.
numerical_cols = [
    'ni.age', 'len.at.res', 'premium',
    'n.adults', 'n.children', 'tenure',
    'Median', 'Mean', 'Pop',
]
# Columns replaced by their smoothed target mean.
categorical_cols = [
    'zip.code', 'house.color', 'credit',
    'coverage.type', 'dwelling.type', 'sales.channel',
    'ni.gender', 'ni.marital.status', 'claim.ind',
]

In [26]:
# Impute numerical gaps in both sets with the medians of the TRAINING rows, so
# no statistic is derived from the test set.
train_medians = all_data_train[numerical_cols].median()
all_data_train[numerical_cols] = all_data_train[numerical_cols].fillna(train_medians)
all_data_test[numerical_cols] = all_data_test[numerical_cols].fillna(train_medians)


In [27]:
# Household size = adults + children, added to both feature frames.
for frame in (all_data_train, all_data_test):
    frame['family_size'] = frame['n.adults'] + frame['n.children']

# Mean Encoding

In [28]:
def mean_encoding(col,smooth = 1):
    """Replace categorical column `col` by its smoothed target mean.

    Operates in place on the globals `all_data_train` / `all_data_test` and
    reads the global `target`. For each level of `col` the encoding is

        (n * level_mean + smooth * prior) / (n + smooth)

    where `n` is the level's row count in the training frame and `prior` is the
    overall target mean. Levels with no encoding (unseen in training, or
    missing) receive `prior`. The result is stored in `<col>_encoding` and the
    original column is dropped from both frames, so calling the function twice
    for the same column raises a KeyError.

    Parameters
    ----------
    col : str
        Name of the categorical column to encode.
    smooth : float, default 1
        Weight of the prior; larger values pull rare levels towards it.

    NOTE(review): the level means are computed on the same training rows they
    are mapped back onto, so the encoded feature has seen each row's own
    label — confirm this is acceptable or switch to out-of-fold encoding.
    """
    encoded_col = col + '_encoding'

    # Temporarily attach the label (aligned on the index) for the group-by.
    all_data_train['target'] = target
    prior = target.mean()
    n = all_data_train.groupby(col).size()

    means = all_data_train.groupby(col).target.mean()
    smooth_mean = (n*means + smooth*prior)/(n + smooth)

    # Assign the filled Series back instead of calling fillna(inplace=True) on
    # the selected column, which is not guaranteed to update the frame (and
    # does nothing under pandas Copy-on-Write).
    all_data_train[encoded_col] = all_data_train[col].map(smooth_mean).fillna(prior)
    all_data_test[encoded_col] = all_data_test[col].map(smooth_mean).fillna(prior)

    all_data_train.drop(['target',col],axis = 1,inplace = True)
    all_data_test.drop(col,axis = 1,inplace = True)

In [29]:
# Mean-encode every categorical column with the default smoothing; each call
# swaps the raw column for `<name>_encoding` in both frames.
for categorical_col in categorical_cols:
    mean_encoding(categorical_col)

# Other features

In [30]:
def add_feature(dt):
    """Add premium-ratio features to `dt` in place (returns None).

    Reads the columns 'premium', 'Median', 'Mean', 'n.adults' and
    'family_size' and appends, in this order:

    - 'pmed_ratio'  : premium / Median / n.adults
    - 'pmean_ratio' : premium / Mean / n.adults
    - 'p_perad'     : premium / n.adults
    - 'p_perp'      : premium / family_size
    """
    premium = dt['premium']
    adults = dt['n.adults']
    derived = {
        'pmed_ratio': premium / dt['Median'] / adults,
        'pmean_ratio': premium / dt['Mean'] / adults,
        'p_perad': premium / adults,
        'p_perp': premium / dt['family_size'],
    }
    for name, values in derived.items():
        dt[name] = values

# Derive the ratio features on both frames (modified in place), then preview
# the training frame.
for frame in (all_data_train, all_data_test):
    add_feature(frame)
all_data_train.head()

Unnamed: 0,year,ni.age,len.at.res,premium,n.adults,n.children,tenure,Median,Mean,Pop,...,coverage.type_encoding,dwelling.type_encoding,sales.channel_encoding,ni.gender_encoding,ni.marital.status_encoding,claim.ind_encoding,pmed_ratio,pmean_ratio,p_perad,p_perp
0,2013,37.0,18.0,950.507336,2.0,0.0,15.0,99669.2313,144247.8556,27946.0,...,0.251935,0.258435,0.181762,0.241706,0.257989,0.228874,0.004768,0.003295,475.253668,475.253668
1,2013,40.0,17.0,909.346046,5.0,0.0,15.0,31956.2752,38214.0329,38872.0,...,0.231298,0.258435,0.181762,0.241706,0.23574,0.228874,0.005691,0.004759,181.869209,181.869209
2,2013,45.0,14.0,897.084502,1.0,0.0,14.0,70750.2531,79724.2953,30051.0,...,0.231298,0.229433,0.329669,0.242612,0.257989,0.228874,0.01268,0.011252,897.084502,897.084502
3,2013,45.0,24.366136,979.039007,5.0,0.0,22.0,53746.2317,64582.271,15329.0,...,0.231298,0.229433,0.329669,0.242612,0.257989,0.295541,0.003643,0.003032,195.807801,195.807801
4,2013,36.0,16.0,932.379027,2.0,4.0,4.0,67760.3587,74438.4069,680.0,...,0.251935,0.229433,0.181762,0.242612,0.23574,0.228874,0.00688,0.006263,466.189513,155.396504


# Train Test Split in training data

In [31]:
# Hold out 33% of the labelled rows for local evaluation; the seed is fixed so
# the split is reproducible.
X, y = all_data_train, target

X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.33,
    random_state=123,
)

# XGB

In [32]:
### Stratified k-fold cross-validation helper
def skf_cv(X, y,clf,folds = 3):
    """Return the mean train and validation ROC AUC of `clf` over stratified folds.

    Parameters
    ----------
    X : array-like
        Feature matrix; converted with `np.array` before indexing.
    y : array-like
        Binary labels; converted with `np.array` before indexing.
    clf : estimator
        Object exposing `fit(X, y)` and `predict_proba(X)`. It is refitted in
        place on every fold, so after the call it holds the model trained on
        the LAST fold's training part, not on all of `X`.
    folds : int, default 3
        Number of stratified folds.

    Returns
    -------
    (float, float)
        Mean AUC on the training parts and mean AUC on the validation parts.
    """
    from sklearn.model_selection import StratifiedKFold

    X_arr,y_arr = np.array(X),np.array(y)
    # random_state only has an effect together with shuffle=True; without it
    # recent scikit-learn raises a ValueError and older releases ignored the seed.
    skf = StratifiedKFold(n_splits=folds,shuffle = True,random_state = 123)
    cv_train = []
    cv_test = []

    for train_index, val_index in skf.split(X_arr, y_arr):
        x_tr,x_val = X_arr[train_index],X_arr[val_index]
        y_tr,y_val = y_arr[train_index],y_arr[val_index]
        clf.fit(x_tr,y_tr)
        # Column 1 of predict_proba is the probability of the positive class.
        predict_test = clf.predict_proba(x_val)[:,1]
        predict_train = clf.predict_proba(x_tr)[:,1]
        cv_test.append(roc_auc_score(y_val,predict_test))
        cv_train.append(roc_auc_score(y_tr,predict_train))
    return np.mean(cv_train),np.mean(cv_test)


In [None]:
# XGBoost baseline: fit on the training split, then report AUC on the training
# and hold-out splits.
# NOTE(review): `min_samples_leaf` / `min_samples_split` are named like
# scikit-learn tree options — confirm XGBoost actually honours them.
xgb = XGBClassifier(
    # objective / booster
    objective='binary:logistic',
    booster='gbtree',
    base_score=0.5,
    # ensemble size and tree shape
    n_estimators=100,
    eta=0.02,
    max_depth=6,
    max_delta_step=0,
    min_child_weight=1,
    min_samples_leaf=5,
    min_samples_split=5,
    # row / column sampling
    subsample=0.8,
    colsample_bytree=.8,
    colsample_bylevel=1,
    # regularisation
    gamma=0.1,
    reg_alpha=0.1,
    reg_lambda=0.8,
    scale_pos_weight=1,
    # runtime
    missing=None,
    n_jobs=3,
    random_state=123,
    silent=True,
)

xgb.fit(X_train,y_train)
# Column 1 of predict_proba is the probability of the positive class.
train_scores = xgb.predict_proba(X_train)[:,1]
test_scores = xgb.predict_proba(X_test)[:,1]
auc_train = roc_auc_score(y_train,train_scores)
auc_test = roc_auc_score(y_test,test_scores)
print('train AUC: {train:.2%}, test AUC {test:.2%}'.format(train = auc_train,test = auc_test))

In [None]:
# Rank features by the fitted XGBoost importances and keep the 15 highest.
importance_pairs = sorted(zip(xgb.feature_importances_,X_train.columns))
feature_imp = pd.DataFrame(importance_pairs, columns=['Value','Feature'])
select_ft = feature_imp.sort_values(by="Value", ascending=False)["Feature"].iloc[:15]

X_train_select = X_train[select_ft]
X_test_select = X_test[select_ft]

# Refit the same estimator on the reduced feature set (this replaces the model
# fitted on all columns) and report AUC again.
xgb.fit(X_train_select,y_train)
train_scores = xgb.predict_proba(X_train_select)[:,1]
test_scores = xgb.predict_proba(X_test_select)[:,1]
auc_train = roc_auc_score(y_train,train_scores)
auc_test = roc_auc_score(y_test,test_scores)
print('train AUC: {train:.2%}, test AUC {test:.2%}'.format(train = auc_train,test = auc_test))

# RF

In [None]:
# Random forest baseline trained on the selected feature subset.
rf = RandomForestClassifier(class_weight = 'balanced_subsample',
                            n_estimators = 2000,
                            max_depth = 10,
                            min_samples_leaf = 15,
                            min_samples_split = 5,
                            max_features = 'sqrt',
                            random_state = 123,
                            n_jobs = 24)

rf.fit(X_train_select,y_train)

# Score on the same columns the forest was fitted on: predicting with the full
# X_train / X_test would fail because the feature count differs from fit time.
auc_train = roc_auc_score(y_train,rf.predict_proba(X_train_select)[:,1])
auc_test = roc_auc_score(y_test,rf.predict_proba(X_test_select)[:,1])
print('train AUC: {train:.2%}, test AUC {test:.2%}'.format(train = auc_train,test = auc_test))

# LGB

In [34]:
import lightgbm as lgb

In [None]:
# LightGBM baseline (DART boosting, class-balanced): fit on the training split
# and report AUC on the training and hold-out splits.
lgb_estimator = lgb.LGBMClassifier(
    # objective / boosting
    objective = 'binary',
    metric = 'auc',
    boosting_type = 'dart',
    class_weight = 'balanced',
    # ensemble size and tree shape
    n_estimators = 2000,
    learning_rate = 0.02,
    num_leaves = 2**9,
    max_depth = 9,
    min_data_in_leaf = 1000,
    subsample_for_bin=200000,
    # bagging and regularisation
    bagging_freq = 6,
    bagging_fraction = 0.7,
    reg_lambda = 0.8,
    # runtime
    random_seed = 123,
    verbosity = -1,
    num_threads = 24,
)

lgb_estimator.fit(X_train,y_train)

# Column 1 of predict_proba is the probability of the positive class.
train_scores = lgb_estimator.predict_proba(X_train)[:,1]
test_scores = lgb_estimator.predict_proba(X_test)[:,1]
auc_train = roc_auc_score(y_train,train_scores)
auc_test = roc_auc_score(y_test,test_scores)

print('train AUC: {train:.2%}, test AUC {test:.2%}'.format(train = auc_train,test = auc_test))


# Adaboost

In [36]:
# AdaBoost with the default base estimator. The seed matches the value used by
# the other models in this notebook so repeated runs give the same ensemble.
adaboost = AdaBoostClassifier(learning_rate = .1,
                             n_estimators  =200,
                             random_state = 123)

In [None]:
# Fit AdaBoost on the training split and report AUC on both splits.
adaboost.fit(X_train,y_train)
# Column 1 of predict_proba is the probability of the positive class.
train_scores = adaboost.predict_proba(X_train)[:,1]
test_scores = adaboost.predict_proba(X_test)[:,1]
auc_train = roc_auc_score(y_train,train_scores)
auc_test = roc_auc_score(y_test,test_scores)
print('train AUC: {train:.2%}, test AUC {test:.2%}'.format(train = auc_train,test = auc_test))


# NB

In [None]:
# Gaussian naive Bayes with default settings; it is not evaluated on its own
# here and is only fitted as a base learner in the stacking cells.
nb = GaussianNB()

# Stacking

In [37]:
from sklearn.model_selection import StratifiedKFold

def get_oof(clf, x_train, y_train, x_test, nfolds = 5):
    """Build out-of-fold (OOF) predictions of `clf` for stacking.

    Parameters
    ----------
    clf : estimator
        Object exposing `fit(X, y)` and `predict_proba(X)`; it is refitted in
        place once per fold.
    x_train, y_train : array-like
        Training features and labels. They are indexed with integer position
        arrays, so pass NumPy arrays (e.g. `df.values`), not DataFrames.
    x_test : array-like
        Test features scored by every fold model.
    nfolds : int, default 5
        Number of stratified folds.

    Returns
    -------
    (ndarray, ndarray)
        `oof_train` of shape (n_train, 1): for every training row, the
        positive-class probability from the fold model that did not see it.
        `oof_test` of shape (n_test, 1): the test probabilities averaged over
        the fold models.
    """
    ntrain = x_train.shape[0]
    ntest = x_test.shape[0]
    # random_state only has an effect together with shuffle=True; without it
    # recent scikit-learn raises a ValueError and older releases ignored the seed.
    kf = StratifiedKFold(n_splits = nfolds, shuffle = True, random_state=123)

    oof_train = np.zeros((ntrain,))
    oof_test_skf = np.empty((nfolds, ntest))

    for i, (train_index, test_index) in enumerate(kf.split(x_train,y_train)):
        clf.fit(x_train[train_index], y_train[train_index])

        # Held-out rows receive their prediction from this fold's model only.
        oof_train[test_index] = clf.predict_proba(x_train[test_index])[:,1]
        oof_test_skf[i, :] = clf.predict_proba(x_test)[:,1]

    oof_test = oof_test_skf.mean(axis=0)
    return oof_train.reshape(-1, 1), oof_test.reshape(-1, 1)

In [None]:
# Out-of-fold predictions of every base model on the full labelled set, plus
# their averaged predictions on the real test set. The arrays are extracted
# once because get_oof indexes its inputs by integer position.
X_values, y_values, test_values = X.values, y.values, all_data_test.values

rf_oof_train, rf_oof_test = get_oof(rf, X_values, y_values, test_values)
print('rf finish')

lgb_oof_train, lgb_oof_test = get_oof(lgb_estimator, X_values, y_values, test_values)
print('lgb finish')

xgb_oof_train, xgb_oof_test = get_oof(xgb, X_values, y_values, test_values)
print('xgb finish')

ada_oof_train, ada_oof_test = get_oof(adaboost, X_values, y_values, test_values)
print('ada finish')

nb_oof_train, nb_oof_test = get_oof(nb, X_values, y_values, test_values)
print('all finish')

In [None]:
# Meta-features: one column per base model (RF, LGB, XGB, NB), in the same
# order for train and test.
# NOTE(review): the AdaBoost out-of-fold predictions are not part of the
# stack — confirm that leaving them out is intentional.
x_train_stack = np.hstack([rf_oof_train, lgb_oof_train, xgb_oof_train, nb_oof_train])
x_test_stack = np.hstack([rf_oof_test, lgb_oof_test, xgb_oof_test, nb_oof_test])

In [None]:
# Meta-learner: class-balanced logistic regression fitted on the stacked
# out-of-fold predictions of the base models.
LR = LogisticRegression(
    C = 1,
    class_weight='balanced',
    solver='saga',
    max_iter = 200,
    random_state=123,
)

LR.fit(x_train_stack, y)

In [None]:
# Cross-validated AUC of the meta-learner; previously the score was computed
# but never shown.
cv_auc_train, cv_auc_test = skf_cv(x_train_stack, y,LR,folds = 5)
print('train AUC: {train:.2%}, test AUC {test:.2%}'.format(train = cv_auc_train,test = cv_auc_test))

# skf_cv refits LR in place on each fold's training part, leaving it trained on
# only part of the data. Fit it again on every row before scoring the test set.
LR.fit(x_train_stack, y)
result = LR.predict_proba(x_test_stack)[:,1]

# Output

In [None]:
# Submission frame: the ids from the raw test file (dropped from `test`
# earlier) next to the stacked model's predicted probability.
temp = pd.read_csv('test.csv', usecols = ['id']).rename(columns = {'id': 'ID'})
temp['Predicted'] = result

In [None]:
# Write the submission without the DataFrame index so the file holds only the
# columns of `temp`.
temp.to_csv('result.csv',index = False)

In [None]:
# Quick look at the first rows of the submission.
temp.head()