In [9]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score 
from sklearn.model_selection import GridSearchCV
import seaborn as sns
import time
import lightgbm as lgb
from tqdm import tqdm_notebook
%matplotlib notebook

In [2]:
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
train.drop('id',axis = 1,inplace = True)
test.drop('id',axis = 1,inplace = True)


In [3]:
train = train[train['cancel'] != -1]

In [4]:
train.dropna(inplace = True)

In [5]:
X = train.iloc[:,1:]
y = train.iloc[:,0]

In [6]:
X = pd.get_dummies(X,dummy_na = True)


In [7]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=123)

In [630]:

rf = RandomForestClassifier(class_weight = 'balanced',
                            n_estimators = 800,
                            max_depth = 15,
                            min_samples_leaf = 15,
                            min_samples_split = 5,
                            max_features = 'sqrt',
                            random_state = 123,
                            n_jobs = 24)
# parameters = {'n_estimators':[800,1000],
#              'min_samples_leaf':[15,25]}
# grid = GridSearchCV(estimator=rf, param_grid=parameters,cv = 3,verbose = 2,n_jobs = 24,scoring = 'roc_auc')
# grid.fit(X_train,y_train)

In [631]:
rf.fit(X_train,y_train)
auc_train = roc_auc_score(y_train,rf.predict(X_train))
auc_test = roc_auc_score(y_test,rf.predict(X_test))

print('train AUC: {train:.2%}, test AUC {test:.2%}'.format(train = auc_train,test = auc_test))

train AUC: 69.63%, test AUC 66.42%


In [636]:
plt.figure(figsize=(20, 10))
sns.barplot(x="Random Forest", y="features", data=feature_importance.sort_values(by="Random Forest", ascending=False))
plt.title('Random Forest Features (avg over folds)')
plt.tight_layout()
plt.show()

<IPython.core.display.Javascript object>

In [450]:
# from numpy.random import uniform
# from random import randint

# experiment = 20
# best_param = None
# best_test_auc = 0
# best_train_auc = None

# for i in tqdm_notebook(range(experiment)):
#     param_grid = {
#         'num_leaves': 2**randint(5,15),
#         'min_data_in_leaf': 10**randint(2,4),
#         'max_depth': randint(-1,15),
#         'learning_rate': 10**uniform(-1,-4),
#         'bagging_freq': randint(3,7),
#         'bagging_fraction': uniform(0.6,0.9),
#         'reg_alpha': uniform(0,1),
#         'reg_lambda': uniform(0,1),
#         'boosting_type=':'gbdt',
#         'random_seed' : 123,
#         'n_estimators':1000, 
#         'metric':'auc',
#         'objective':'binary', 
#         'verbosity' : -1,
#         'num_threads' : 24
#     }


#     lgb_estimator = lgb.LGBMClassifier(**param_grid)

#     cv_train,cv_test = skf_cv(X_train, y_train,lgb_estimator)
    
#     if best_test_auc < cv_test:
#         best_param = param_grid
#         best_test_auc = cv_test
#         best_train_auc = cv_train
        

# print('train AUC: {train:.2%}, test AUC {test:.2%}'.format(train = best_train_auc,test = best_test_auc))
# print(best_param)

### lgb final result

In [10]:
lgb_estimator = lgb.LGBMClassifier(num_leaves = 2**9,
                                  min_data_in_leaf = 1000,
                                   n_estimators = 2000,
                                   class_weight = 'balanced',
                                   subsample_for_bin=200000,
                                  max_depth = 9,
                                  learning_rate = 0.02,
                                  bagging_freq = 6,
                                  bagging_fraction = 0.7,
                                  reg_lambda = 0.8,
                                  random_seed = 123,
                                  metric = 'auc',
                                  objective = 'binary',
                                   boosting_type = 'dart',
                                  verbosity = -1,
                                  num_threads = 24)

lgb_estimator.fit(X_train,y_train)

auc_train = roc_auc_score(y_train,lgb_estimator.predict(X_train))
auc_test = roc_auc_score(y_test,lgb_estimator.predict(X_test))

In [11]:
print('train AUC: {train:.2%}, test AUC {test:.2%}'.format(train = auc_train,test = auc_test))


train AUC: 68.14%, test AUC 67.13%


In [416]:
feature_imp = pd.DataFrame(sorted(zip(lgb_estimator.feature_importances_,X_train.columns)), columns=['Value','Feature'])

plt.figure(figsize=(20, 10))
sns.barplot(x="Value", y="Feature", data=feature_imp.sort_values(by="Value", ascending=False))
plt.title('LightGBM Features (avg over folds)')
plt.tight_layout()
plt.show()

<IPython.core.display.Javascript object>

### Advanced Feature enigneering

In [79]:
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
train.drop('id',axis = 1,inplace = True)
test.drop('id',axis = 1,inplace = True)

# Drop target = -1
print('Number of target we remove: {:,}'.format(sum(train.cancel == -1)))
train = train[train.cancel != -1]

Number of target we remove: 3,452


In [80]:
train_copy = train.copy()
test_copy = test.copy()
train_copy.drop('cancel',axis = 1,inplace = True)
train_copy['train'] = 1
test_copy['train'] = 0
all_data = pd.concat([train_copy,test_copy],axis = 0).reset_index(drop = True)

In [81]:
target = train.cancel.reset_index(drop = True)
all_data_train = all_data[all_data.train == 1].drop('train',axis = 1)
all_data_test = all_data[all_data.train == 0].drop('train',axis = 1)

In [82]:
print('Shape of training: {}'.format(all_data_train.shape))
print('Length of target: {}'.format(len(target)))
print('Shape of testing: {}'.format(all_data_test.shape))

assert len(all_data_test) + len(all_data_train) == len(all_data)

Shape of training: (1045123, 16)
Length of target: 1045123
Shape of testing: (444430, 16)


### Age 

In [83]:
Age_threshold = 100

all_data.loc[all_data['ni.age'] > Age_threshold,'ni.age'] = np.nan

### Length at residence

In [84]:
all_data.loc[all_data['len.at.res'] > all_data['ni.age'],'len.at.res'] = np.nan

print('Length of residence null rate:{:.2%}'.format(all_data['len.at.res'].isnull().sum()/len(all_data)))

Length of residence null rate:0.57%


### tenure

In [85]:
all_data.loc[all_data['tenure'] > all_data['ni.age'],'tenure'] = np.nan

print('Tenure null rate:{:.2%}'.format(all_data['tenure'].isnull().sum()/len(all_data)))

Tenure null rate:0.10%


### Zip code

In [86]:
zipcode = pd.read_csv('zip_code.csv',sep = ';')
# zipcode
# zipcode = zipcode[['STATE','zipcode']]
# # zipcode.drop_duplicates(inplace = True)

all_data = all_data.merge(zipcode,left_on = 'zip.code',right_on = 'Zip',how = 'left')
# all_data.drop('zipcode',axis = 1,inplace = True)

In [87]:
all_data.drop(['Timezone','Daylight savings time flag','geopoint','Zip'],axis = 1,inplace = True)

In [88]:
# all_data['zip_region_code'] = all_data['zip.code'].apply(lambda x: str(x)[:3])

In [89]:
all_data_train.isnull().sum()

year                    0
zip.code              944
house.color           939
ni.age               1001
len.at.res            960
credit                905
coverage.type         979
dwelling.type         990
premium               955
sales.channel        1020
ni.gender             957
ni.marital.status     992
n.adults              929
n.children            935
tenure                978
claim.ind             982
dtype: int64

### More feauture engineering

In [90]:
# def age_group(df):
#     bins = (17,25,50,100)
#     group_names = ['Young Adult','Adult','Senior']
#     categories = pd.cut(df['ni.age'], bins, labels=group_names)
#     df['age.range'] = categories
#     return df

# def len_of_resid_group(df):
#     bins = (-1,10,15,20,100)
#     group_names = ['below 10','11-15','16-20','20+']
#     categories = pd.cut(df['len.at.res'], bins, labels=group_names)
#     df['len.at.res.range'] = categories
#     return df

# def n_adult_group(df):
#     bins = (-1,1,2,5,13)
#     group_names = ['one','two','3-5','above 6']
#     categories = pd.cut(df['n.adults'], bins, labels=group_names)
#     df['n.adults.range'] = categories
#     return df

# def n_children_group(df):
#     bins = (-1,0,3,13)
#     group_names = ['No Child','1-3','Above 4']
#     categories = pd.cut(df['n.children'], bins, labels=group_names)
#     df['n.children.range'] = categories
#     return df

# def tenure_group(df):
#     bins = (-1,5,10,15,20,40)
#     group_names = ['below 5','6-10','11-15','16-20','above 20']
#     categories = pd.cut(df['tenure'], bins, labels=group_names)
#     df['tenure.range'] = categories
#     return df

# def numerical_feature_encoding(df):
#     df = age_group(df)
#     df = len_of_resid_group(df)
#     df = n_adult_group(df)
#     df = n_children_group(df)
#     df = tenure_group(df)

In [91]:
# def family_size_encoding(df):
#     df.loc[df['family_size'] == 1,'family_size_encoding'] = 'live_alone'
#     df.loc[(all_data['family_size'] == 2) & (all_data['n.children'] == 0),'family_size_encoding'] = 'couple with no kid'
#     df.loc[(all_data['family_size'] == 2) & (all_data['n.children'] != 0),'family_size_encoding'] = 'couple with kids'
#     df.loc[(df['family_size'] >= 3) & (df['family_size'] <= 6) 
#            & (df['n.children'] >= df['n.adults']),'family_size_encoding'] = 'regular size family with more kids'
#     df.loc[(df['family_size'] >= 3) & (df['family_size'] <= 6) 
#            & (df['n.children'] < df['n.adults']),'family_size_encoding'] = 'regular size family with more adults'
#     df.loc[df['family_size'] >6,'family_size_encoding'] = 'large size family'
    
#     return df

# numerical_feature_encoding(all_data)
# all_data['family_size'] = all_data['n.adults'] + all_data['n.children']
# all_data['More_children'] = np.where(all_data['n.children'] >= all_data['n.adults'],1,0)
# all_data = family_size_encoding(all_data)

In [92]:
# Categorical encoding

all_data.loc[(all_data['dwelling.type'].isnull()) | (all_data['dwelling.type'] == 'Landlord'),'dwelling.type'] = 'Other'

### Split back train/test

In [93]:
all_data_train = all_data[all_data['train'] == 1].drop('train',axis = 1)
all_data_test = all_data[all_data['train'] == 0].drop('train',axis = 1)

### Fill NA with mode & median of training

In [94]:
# def fill_na_categorical(col):
#     return all_data[col].fillna(all_data[col].mode()[0])
def fill_na_numerical(col):
    return all_data[col].fillna(all_data[col].median())

numerical_cols = ['ni.age','len.at.res','premium','n.adults','n.children','tenure','Latitude','Longitude']


# numerical_cols = ['ni.age','len.at.res','premium','n.adults','n.children','tenure','Latitude','Longitude','family_size']
# # categorical_cols = ['zip.code','house.color','credit','coverage.type','dwelling.type','sales.channel',
# #                    'ni.gender','ni.marital.status','claim.ind','State','age.range','len.at.res.range',
# #                    'n.adults.range','n.children.range','tenure.range','family_size_encoding','City']

categorical_cols = ['zip.code','house.color','credit','coverage.type','dwelling.type','sales.channel',
                   'ni.gender','ni.marital.status','claim.ind','State','City']




In [95]:
# all_data_train[categorical_cols] = all_data_train[categorical_cols].fillna(all_data_train[categorical_cols].mode().iloc[0])
all_data_train[numerical_cols] = all_data_train[numerical_cols].fillna(all_data_train[numerical_cols].median())

# all_data_test[categorical_cols] = all_data_test[categorical_cols].fillna(all_data_train[categorical_cols].mode().iloc[0])
all_data_test[numerical_cols] = all_data_test[numerical_cols].fillna(all_data_train[numerical_cols].median())


### Mean encoding cols

In [96]:
def mean_encoding(col,smooth = 1):
    all_data_train['target'] = target
    prior = target.mean()
    n = all_data_train.groupby(col).size()

    means = all_data_train.groupby(col).target.mean() 
    smooth_mean = (n*means + smooth*prior)/(n + smooth)
    
    all_data_train[col + '_encoding'] = all_data_train[col].map(smooth_mean)
    all_data_test[col + '_encoding'] = all_data_test[col].map(smooth_mean)

    all_data_train[col + '_encoding'].fillna(prior,inplace = True)
    all_data_test[col + '_encoding'].fillna(prior,inplace = True)

    all_data_train.drop(['target',col],axis = 1,inplace = True)
    all_data_test.drop(col,axis = 1,inplace = True)

In [97]:
for col in categorical_cols:
    mean_encoding(col)

### Train Test Split

In [99]:
X = all_data_train
y = target

# X = pd.get_dummies(X)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=123)

In [None]:
param_grid = {
    'n_estimators':[1300,1500,2000],
    'min_samples_leaf':[5,10,15,20],
    'min_samples_split':[5,10,15,20,25]
    
}

rf = RandomForestClassifier(class_weight = 'balanced',
                            max_depth = 15,
                            max_features = 'sqrt',
                            random_state = 123,
                            n_jobs = 24)

grid = GridSearchCV(estimator=rf, param_grid=param_grid,cv = 3,verbose = 2,n_jobs = 24,scoring = 'roc_auc')
grid.fit(X_train,y_train)

print(grid.best_score_)

Fitting 3 folds for each of 60 candidates, totalling 180 fits


[Parallel(n_jobs=24)]: Using backend LokyBackend with 24 concurrent workers.
[Parallel(n_jobs=24)]: Done 114 tasks      | elapsed: 329.4min


In [108]:

rf = RandomForestClassifier(class_weight = 'balanced',
                            n_estimators = 1300,
                            max_depth = 15,
                            min_samples_leaf = 15,
                            min_samples_split = 5,
                            max_features = 'sqrt',
                            random_state = 123,
                            n_jobs = 24)

rf.fit(X_train,y_train)
auc_train = roc_auc_score(y_train,rf.predict(X_train))
auc_test = roc_auc_score(y_test,rf.predict(X_test))

print('train AUC: {train:.2%}, test AUC {test:.2%}'.format(train = auc_train,test = auc_test))


train AUC: 70.61%, test AUC 66.60%


In [102]:
feature_importance = pd.DataFrame({'Random Forest':rf.feature_importances_,
                                   'features':X_train.columns})

feature_importance = feature_importance.sort_values('Random Forest',ascending = False)

feature_importance

Unnamed: 0,Random Forest,features
11,0.278193,credit_encoding
14,0.16444,sales.channel_encoding
1,0.066231,ni.age
5,0.060387,n.children
9,0.060352,zip.code_encoding
3,0.046009,premium
8,0.044704,Longitude
19,0.04283,City_encoding
7,0.031964,Latitude
2,0.029756,len.at.res


In [103]:
plt.figure(figsize=(20, 10))
sns.barplot(x="Random Forest", y="features", data=feature_importance.sort_values(by="Random Forest", ascending=False))
plt.title('Random Forest Features (avg over folds)')
plt.tight_layout()
plt.show()

<IPython.core.display.Javascript object>

### lightGBM

In [315]:
import lightgbm as lgb

##### Random Search

In [366]:
def skf_cv(X, y,clf,folds = 3):
    from sklearn.model_selection import StratifiedKFold
    
    X_arr,y_arr = np.array(X),np.array(y)
    skf = StratifiedKFold(n_splits=folds,random_state = 123)
    cv_train = []
    cv_test = []
    
    for train_index, val_index in skf.split(X, y):
        x_tr,x_val = X_arr[train_index],X_arr[val_index]
        y_tr,y_val = y_arr[train_index],y_arr[val_index]
        clf.fit(x_tr,y_tr)
        predict_test = clf.predict(x_val)
        predict_train = clf.predict(x_tr)
        cv_test.append(roc_auc_score(y_val,predict_test))
        cv_train.append(roc_auc_score(y_tr,predict_train))
    return np.mean(cv_train),np.mean(cv_test)


In [397]:
from numpy.random import uniform
from random import randint

param_grid = {
        'n_estimators':[800,1000,1200,2000,5000,10000], 
}

lgb_estimator = lgb.LGBMClassifier(num_leaves = 2**9,
                                  min_data_in_leaf = 1000,
                                  max_depth = 9,
                                  learning_rate = 0.02,
                                  bagging_freq = 6,
                                  bagging_fraction = 0.7,
                                  reg_lambda = 0.8,
                                  random_seed = 123,
                                  metric = 'auc',
                                  objective = 'binary',
                                   boosting_type = 'dart',
                                  verbosity = -1,
                                  num_threads = 24)

grid = GridSearchCV(estimator=lgb_estimator, param_grid=param_grid,cv = 3,verbose = 2,scoring = 'roc_auc')
grid.fit(X_train,y_train)

print(grid.best_score_)

Fitting 3 folds for each of 6 candidates, totalling 18 fits
[CV] n_estimators=800 ................................................


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV] ................................. n_estimators=800, total= 2.3min
[CV] n_estimators=800 ................................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:  2.4min remaining:    0.0s


[CV] ................................. n_estimators=800, total= 2.4min
[CV] n_estimators=800 ................................................
[CV] ................................. n_estimators=800, total= 2.4min
[CV] n_estimators=1000 ...............................................
[CV] ................................ n_estimators=1000, total= 3.0min
[CV] n_estimators=1000 ...............................................
[CV] ................................ n_estimators=1000, total= 3.2min
[CV] n_estimators=1000 ...............................................
[CV] ................................ n_estimators=1000, total= 3.2min
[CV] n_estimators=1200 ...............................................
[CV] ................................ n_estimators=1200, total= 3.7min
[CV] n_estimators=1200 ...............................................
[CV] ................................ n_estimators=1200, total= 3.9min
[CV] n_estimators=1200 ...............................................
[CV] .

[Parallel(n_jobs=1)]: Done  18 out of  18 | elapsed: 219.5min finished


0.7317396110071686


In [399]:
print(grid.best_score_)

0.7317396110071686


In [436]:
lgb_estimator = lgb.LGBMClassifier(num_leaves = 2**9,
                                  min_data_in_leaf = 1000,
                                   n_estimators = 2000,
                                   class_weight = 'balanced',
                                   subsample_for_bin=200000,
                                  max_depth = 9,
                                  learning_rate = 0.02,
                                  bagging_freq = 6,
                                  bagging_fraction = 0.7,
                                  reg_lambda = 0.8,
                                  random_seed = 123,
                                  metric = 'auc',
                                  objective = 'binary',
                                   boosting_type = 'dart',
                                  verbosity = -1,
                                  num_threads = 24)

lgb_estimator.fit(X_train,y_train)

auc_train = roc_auc_score(y_train,lgb_estimator.predict(X_train))
auc_test = roc_auc_score(y_test,lgb_estimator.predict(X_test))

In [437]:
print('train AUC: {train:.2%}, test AUC {test:.2%}'.format(train = auc_train,test = auc_test))


train AUC: 68.13%, test AUC 67.01%


In [438]:
feature_imp = pd.DataFrame(sorted(zip(lgb_estimator.feature_importances_,X_train.columns)), columns=['Value','Feature'])

plt.figure(figsize=(20, 10))
sns.barplot(x="Value", y="Feature", data=feature_imp.sort_values(by="Value", ascending=False))
plt.title('LightGBM Features (avg over folds)')
plt.tight_layout()
plt.show()

<IPython.core.display.Javascript object>

In [964]:
lgb_estimator = lgb.LGBMClassifier(num_leaves = 2**9,
                                  min_data_in_leaf = 1000,
                                   n_estimators = 2000,
                                   class_weight = 'balanced',
                                   subsample_for_bin=200000,
                                  max_depth = 9,
                                  learning_rate = 0.02,
                                  bagging_freq = 6,
                                  bagging_fraction = 0.7,
                                  reg_lambda = 0.8,
                                  random_seed = 123,
                                  metric = 'auc',
                                  objective = 'binary',
                                   boosting_type = 'dart',
                                  verbosity = -1,
                                  num_threads = 24)

lgb_estimator.fit(X_train,y_train)

auc_train = roc_auc_score(y_train,lgb_estimator.predict(X_train))
auc_test = roc_auc_score(y_test,lgb_estimator.predict(X_test))

In [965]:
print('train AUC: {train:.2%}, test AUC {test:.2%}'.format(train = auc_train,test = auc_test))


train AUC: 68.23%, test AUC 66.99%


In [966]:
feature_imp = pd.DataFrame(sorted(zip(lgb_estimator.feature_importances_,X_train.columns)), columns=['Value','Feature'])

plt.figure(figsize=(20, 10))
sns.barplot(x="Value", y="Feature", data=feature_imp.sort_values(by="Value", ascending=False))
plt.title('LightGBM Features (avg over folds)')
plt.tight_layout()
plt.show()

<IPython.core.display.Javascript object>

In [12]:
all_data_train

NameError: name 'all_data_train' is not defined

In [None]:
lgb_estimator.fit(X_train,y_train)

In [1021]:
X_test_final = pd.get_dummies(all_data_test)

In [1064]:
result = rf.predict_proba(X_test_final)[:,1]

In [1066]:
temp = pd.DataFrame(pd.read_csv('test.csv')['id'])
temp['Predicted'] = result
temp.columns = ['ID','Predicted']

In [1067]:
temp.to_csv('result.csv',index = False)

In [1065]:
result

array([0.17144509, 0.46891321, 0.23437222, ..., 0.57582699, 0.25644704,
       0.37788014])

In [1063]:
result

array([0.14109213, 0.42809663, 0.21931016, ..., 0.58624985, 0.23887566,
       0.33094607])