In [1]:
import pandas as pd
import seaborn as sns
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler

In [2]:
raw_train_df = pd.read_csv('train.csv', index_col='id')
raw_test_df = pd.read_csv('test.csv', index_col='id')

pipe_data = raw_train_df.copy()
pipe_test = raw_test_df.copy()

df_train = raw_train_df.copy()
df_test = raw_test_df.copy()


In [4]:
df_X = raw_train_df.copy().drop(columns = ['Exited'])

y = raw_train_df.Exited

**_Create Pipeline_**

In [8]:
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder,StandardScaler,PowerTransformer
from sklearn.compose import ColumnTransformer, TransformedTargetRegressor,make_column_selector,make_column_transformer
import category_encoders as ce

In [9]:
import xgboost as xgb

In [38]:
class FeatureEngineering:
    def __init__(self, add_attributes=True):
        
        self.add_attributes = add_attributes
        
    def fit(self, X, y=None):
        
        return self
    
    def transform(self, X):
        
        if self.add_attributes:
            df = X.copy()
            surname_freq_new = df['Surname'].value_counts().to_dict()
            df['Surname'] = df['Surname'].map(surname_freq_new)
            df['balancesalaryratio'] = df['Balance']/df['EstimatedSalary']
            df['IsActive_by_CreditCard'] = df['HasCrCard'] * df['IsActiveMember']
            df['IsWorking'] = df['Age'].apply(lambda x: 1 if 18<=x<=60 else 0)
            result = df.drop(columns=['CustomerId'])
            
            return result
        else:
            return result

In [39]:
FeatureEngineering = FeatureEngineering(add_attributes = True).val

In [40]:
numerical_transformer = Pipeline(steps=[
    ('Scaler',StandardScaler())
])
# Preprocessing for categorical data
nominal_transformer = Pipeline(steps=[
    ('onehot', OneHotEncoder())
])

In [41]:
preprocessing = ColumnTransformer([
    ("cat",nominal_transformer,['Geography', 'Gender']),
    ("num",numerical_transformer,['CreditScore', 'Age', 'Tenure', 'Balance', 'NumOfProducts', 'HasCrCard', 'IsActiveMember', 'EstimatedSalary','IsWorking','Surname','balancesalaryratio','IsActive_by_CreditCard'])
])

In [42]:
data_processing = make_pipeline(FeatureEngineering,preprocessing)

In [43]:
data_processing

In [45]:
X = raw_train_df.copy().drop(columns=['Exited'])

In [46]:
X_processed = data_processing.fit_transform(X)

In [47]:
X_processed = pd.DataFrame(X_processed)

In [48]:
X_processed

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16
0,1.0,0.0,0.0,0.0,1.0,0.144135,-0.578074,-0.719973,-0.883163,0.814298,0.571263,-0.99555,1.369486,0.157928,-0.504635,-0.022925,-0.767616
1,1.0,0.0,0.0,0.0,1.0,-0.367706,-0.578074,-1.432694,-0.883163,0.814298,0.571263,1.00447,-1.254085,0.157928,-0.432418,-0.022925,1.302735
2,1.0,0.0,0.0,0.0,1.0,0.268974,0.211354,1.774548,-0.883163,0.814298,0.571263,-0.99555,1.437422,0.157928,1.430411,-0.022925,-0.767616
3,1.0,0.0,0.0,0.0,1.0,-0.941966,-0.465299,-1.076334,1.486918,-1.013348,0.571263,1.00447,-0.557018,0.157928,1.932227,-0.003713,1.302735
4,0.0,0.0,1.0,0.0,1.0,0.743362,-0.578074,-0.007253,-0.883163,0.814298,0.571263,1.00447,-1.938770,0.157928,-0.532411,-0.022925,1.302735
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
165029,0.0,0.0,1.0,1.0,0.0,0.131651,-0.578074,-1.076334,-0.883163,-1.013348,0.571263,1.00447,0.382957,0.157928,-0.867572,-0.022925,1.302735
165030,1.0,0.0,0.0,0.0,1.0,1.692140,-0.352523,-0.719973,-0.883163,-1.013348,-1.750507,-0.99555,0.382951,0.157928,-0.408346,-0.022925,-0.767616
165031,1.0,0.0,0.0,0.0,1.0,-1.141708,-0.803625,-0.007253,-0.883163,-1.013348,0.571263,1.00447,0.295366,0.157928,3.559888,-0.022925,1.302735
165032,0.0,0.0,1.0,1.0,0.0,-1.279031,-0.916401,0.705467,1.688303,-1.013348,-1.750507,1.00447,-0.823217,0.157928,0.841564,0.001841,-0.767616


In [49]:
X_processed.shape

(165034, 17)

**_OPTUNA Optimize Hyperparameter_**

In [50]:
import optuna
from optuna.samplers import TPESampler
from optuna.visualization import plot_contour
from optuna.visualization import plot_edf
from optuna.visualization import plot_intermediate_values
from optuna.visualization import plot_optimization_history
from optuna.visualization import plot_parallel_coordinate
from optuna.visualization import plot_param_importances
from optuna.visualization import plot_slice

In [1]:
from sklearn.metrics import roc_auc_score, log_loss, f1_score,accuracy_score

In [51]:
y_t = raw_train_df['Exited']

In [52]:
X_train, X_valid, y_train, y_valid = train_test_split(X_processed, y_t, test_size=0.2, random_state=43)

In [2]:
def objective(trial):
    train_x, valid_x, train_y, valid_y = X_train, X_valid, y_train, y_valid
    dtrain = xgb.DMatrix(train_x, label=train_y)
    dvalid = xgb.DMatrix(valid_x, label=valid_y)
        
    param = {
         "objective": "binary:logistic",
         # use exact for small dataset.
         "tree_method":  trial.suggest_categorical('tree_method', ['auto', 'exact','hist']),
         # defines booster, gblinear for linear functions.
         "booster": trial.suggest_categorical('booster', ['gbtree', 'dart']),
         # L2 regularization weight.
         "lambda": trial.suggest_float("lambda", 1e-8, 1.0, log=True),
         # L1 regularization weight.
         "alpha": trial.suggest_float("alpha", 1e-8, 1.0, log=True),
         # sampling ratio for training data.
         "subsample": trial.suggest_float("subsample", 0.2, 1.0),
         # sampling according to each tree.
         "learning_rate": trial.suggest_float("learning_rate", 0.001, 0.1, log=True),
         "colsample_bytree": trial.suggest_float("colsample_bytree", 0.2, 1.0),
         "max_leaves": trial.suggest_int("max_leaves", 10, 200, step=10),
         "max_depth": trial.suggest_int("max_depth", 5, 50, step=5),
         "min_child_weight": trial.suggest_int("min_child_weight", 0, 50, step=5),
         "gamma": trial.suggest_float("gamma", 1e-8, 1.0, log=True)
     }
       
    bst = xgb.train(param, dtrain)
    preds = bst.predict(dvalid)
    auc = roc_auc_score(valid_y, preds)
    return auc

In [3]:

study = optuna.create_study(direction="maximize")
res = study.optimize(objective, n_trials=100)

NameError: name 'optuna' is not defined

**_Choose Hyperparameter_**

In [55]:
parameters3 = {'tree_method': 'hist', 
             'booster': 'dart', 
             'lambda': 0.09193005870812547, 
             'alpha': 0.2820120776331993, 
             'subsample': 0.6600330986205394, 
             'learning_rate': 0.09012626076157491, 
             'colsample_bytree': 0.9763192944362807, 
             'max_leaves': 110, 'max_depth': 30, 
             'min_child_weight': 50, 
             'gamma': 1.8863426876973137e-06}

In [56]:
parameters4 = {'tree_method': 'exact', 'booster': 'dart', 'lambda': 1.589908724515038e-05, 'alpha': 0.034871905942965756, 'subsample': 0.49107879007027627, 'learning_rate': 0.09868130087520165, 'colsample_bytree': 0.824447261923239, 'max_leaves': 200, 'max_depth': 15, 'min_child_weight': 25, 'gamma': 0.014976536373905775}

In [57]:
xgb_tunned = xgb.XGBRegressor(**parameters4)

In [58]:
xgb_tunned.fit(X_train,y_train)

In [62]:
X_testing = raw_test_df.copy()

In [63]:
X_testing

Unnamed: 0_level_0,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
165034,15773898,Lucchese,586,France,Female,23.0,2,0.00,2,0.0,1.0,160976.75
165035,15782418,Nott,683,France,Female,46.0,2,0.00,1,1.0,0.0,72549.27
165036,15807120,K?,656,France,Female,34.0,7,0.00,2,1.0,0.0,138882.09
165037,15808905,O'Donnell,681,France,Male,36.0,8,0.00,1,1.0,0.0,113931.57
165038,15607314,Higgins,752,Germany,Male,38.0,10,121263.62,1,1.0,0.0,139431.00
...,...,...,...,...,...,...,...,...,...,...,...,...
275052,15662091,P'eng,570,Spain,Male,29.0,7,116099.82,1,1.0,1.0,148087.62
275053,15774133,Cox,575,France,Female,36.0,4,178032.53,1,1.0,1.0,42181.68
275054,15728456,Ch'iu,712,France,Male,31.0,2,0.00,2,1.0,0.0,16287.38
275055,15687541,Yegorova,709,France,Female,32.0,3,0.00,1,1.0,1.0,158816.58


In [64]:
Processed_X_testing = data_processing.fit_transform(X_testing)

In [65]:
X_test = pd.DataFrame(Processed_X_testing)

In [66]:
X_test

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16
0,1.0,0.0,0.0,1.0,0.0,-0.878176,-1.706504,-1.067887,-0.881274,0.820030,-1.746219,1.009580,0.967874,0.157373,0.127079,-0.024559,-0.761515
1,1.0,0.0,0.0,1.0,0.0,0.329567,0.888990,-1.067887,-0.881274,-1.015806,0.572666,-0.990511,-0.790939,0.157373,-0.951403,-0.024559,-0.761515
2,1.0,0.0,0.0,1.0,0.0,-0.006609,-0.465181,0.713922,-0.881274,0.820030,0.572666,-0.990511,0.528413,0.157373,0.264698,-0.024559,-0.761515
3,1.0,0.0,0.0,0.0,1.0,0.304665,-0.239486,1.070284,-0.881274,-1.015806,0.572666,-0.990511,0.032150,0.157373,-0.159393,-0.024559,-0.761515
4,0.0,1.0,0.0,0.0,1.0,1.188684,-0.013791,1.783008,1.050038,-1.015806,0.572666,-0.990511,0.539331,0.157373,-0.990723,-0.014461,-0.761515
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
110018,0.0,0.0,1.0,0.0,1.0,-1.077392,-1.029419,0.713922,0.967796,-1.015806,0.572666,1.009580,0.711510,0.157373,1.674589,-0.015456,1.313172
110019,1.0,0.0,0.0,1.0,0.0,-1.015137,-0.239486,-0.355164,1.954171,-1.015806,0.572666,1.009580,-1.394946,0.157373,-0.912083,0.024449,1.313172
110020,1.0,0.0,0.0,0.0,1.0,0.690645,-0.803724,-1.067887,-0.881274,0.820030,0.572666,-0.990511,-1.909981,0.157373,1.402160,-0.024559,-0.761515
110021,1.0,0.0,0.0,1.0,0.0,0.653292,-0.690876,-0.711526,-0.881274,-1.015806,0.572666,1.009580,0.924908,0.157373,-0.867147,-0.024559,1.313172


In [67]:
y_predict = xgb_tunned.predict(X_test)

In [69]:
result = pd.DataFrame(y_predict,columns=['id'])

In [71]:
result['Exited'] = result

In [72]:
result

Unnamed: 0,id,Exited
0,0.013663,0.013663
1,0.656205,0.656205
2,0.016312,0.016312
3,0.189464,0.189464
4,0.359285,0.359285
...,...,...
110018,0.066784,0.066784
110019,0.111581,0.111581
110020,-0.026548,-0.026548
110021,0.124718,0.124718


In [74]:
result.to_csv('submission.csv', index=False)
result.to_csv(r'C:\Users\peaks\Desktop\Submission\submission.csv', index=False)