In [1]:
import sklearn
import pandas as pd
import numpy as np
import os
from sklearn import preprocessing
from sklearn import impute
from sklearn import pipeline
from sklearn import compose
from sklearn import model_selection
from sklearn import tree, neighbors



In [2]:
dir = '../'
train_data = pd.read_csv(os.path.join(dir, 'train.csv'))
# train_data = train_data.iloc[:1000]
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 165034 entries, 0 to 165033
Data columns (total 14 columns):
 #   Column           Non-Null Count   Dtype  
---  ------           --------------   -----  
 0   id               165034 non-null  int64  
 1   CustomerId       165034 non-null  int64  
 2   Surname          165034 non-null  object 
 3   CreditScore      165034 non-null  int64  
 4   Geography        165034 non-null  object 
 5   Gender           165034 non-null  object 
 6   Age              165034 non-null  float64
 7   Tenure           165034 non-null  int64  
 8   Balance          165034 non-null  float64
 9   NumOfProducts    165034 non-null  int64  
 10  HasCrCard        165034 non-null  float64
 11  IsActiveMember   165034 non-null  float64
 12  EstimatedSalary  165034 non-null  float64
 13  Exited           165034 non-null  int64  
dtypes: float64(5), int64(6), object(3)
memory usage: 17.6+ MB


In [3]:
train_data.head(100)


Unnamed: 0,id,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,0,15674932,Okwudilichukwu,668,France,Male,33.0,3,0.00,2,1.0,0.0,181449.97,0
1,1,15749177,Okwudiliolisa,627,France,Male,33.0,1,0.00,2,1.0,1.0,49503.50,0
2,2,15694510,Hsueh,678,France,Male,40.0,10,0.00,2,1.0,0.0,184866.69,0
3,3,15741417,Kao,581,France,Male,34.0,2,148882.54,1,1.0,1.0,84560.88,0
4,4,15766172,Chiemenam,716,Spain,Male,33.0,5,0.00,2,1.0,1.0,15068.83,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,95,15791534,Scott,588,Germany,Male,30.0,10,126683.40,1,1.0,1.0,131636.55,0
96,96,15671139,Shih,659,Spain,Female,39.0,0,107042.74,1,1.0,0.0,102284.20,1
97,97,15576935,Ampt,743,Spain,Male,43.0,2,161807.18,2,0.0,1.0,93228.86,0
98,98,15669946,Shih,639,Germany,Female,43.0,7,123873.52,1,1.0,1.0,51113.17,0


In [4]:
cont_features = ['CustomerId', 'CreditScore',
       'Age', 'Tenure', 'Balance', 'EstimatedSalary']

stages = [('cont_imp',impute.SimpleImputer()),('scaler', preprocessing.StandardScaler())]
cont_pipe = pipeline.Pipeline(stages)
cont_pipe

In [5]:
cat_features = [ 'Geography', 'Gender', 'NumOfProducts', 'HasCrCard',
       'IsActiveMember']
stages = [('cat_imp',impute.SimpleImputer(strategy='most_frequent')),('ohe', preprocessing.OneHotEncoder())]
cat_pipe = pipeline.Pipeline(stages)
cat_pipe

In [6]:
pre_pipe = compose.ColumnTransformer(
    [ ("categorical", cat_pipe, cat_features), ("continous", cont_pipe, cont_features)]
)
pre_pipe

In [7]:

stages = [('pre',pre_pipe),  ("dt", tree.DecisionTreeClassifier())]
pipe = pipeline.Pipeline(stages)
pipe


In [15]:
### now set the fine tuning 
X_train = train_data
y_train = train_data['Exited']

pipe_grid = { 'pre__continous__cont_imp__strategy':['mean', 'median'], 'dt__max_depth':list(range(1,11)),'dt__min_samples_split':[2,5,10]}
cv = model_selection.KFold(10)
clf = model_selection.GridSearchCV(pipe, pipe_grid, cv=cv, scoring='accuracy',return_train_score=True)
try:
    clf.fit(X_train, y_train)
except Exception as e:
    print(e)
print(clf.best_params_)
print(clf.best_score_)
print(clf.best_index_)
print(clf.best_estimator_)


{'dt__max_depth': 8, 'dt__min_samples_split': 2, 'pre__continous__cont_imp__strategy': 'median'}
0.8627070475060734
43
Pipeline(steps=[('pre',
                 ColumnTransformer(transformers=[('categorical',
                                                  Pipeline(steps=[('cat_imp',
                                                                   SimpleImputer(strategy='most_frequent')),
                                                                  ('ohe',
                                                                   OneHotEncoder())]),
                                                  ['Geography', 'Gender',
                                                   'NumOfProducts', 'HasCrCard',
                                                   'IsActiveMember']),
                                                 ('continous',
                                                  Pipeline(steps=[('cont_imp',
                                                                   SimpleImp

In [17]:
clf.feature_names_in_

array(['id', 'CustomerId', 'Surname', 'CreditScore', 'Geography',
       'Gender', 'Age', 'Tenure', 'Balance', 'NumOfProducts', 'HasCrCard',
       'IsActiveMember', 'EstimatedSalary', 'Exited'], dtype=object)

In [48]:
smoke_test = pd.read_csv(os.path.join(dir, "test.csv"))
# print(smoke_test.shape)
print(smoke_test.head())
# print(smoke_test.info())



X_test = smoke_test
# print(X_test)
x = clf.predict_proba(X_test)
df = pd.DataFrame(x)
smoke_test['Exited'] = df[1]


smoke_test.to_csv(os.path.join(dir, "submission1.csv"), columns=["id", "Exited"], index=False)

       id  CustomerId    Surname  CreditScore Geography  Gender   Age  Tenure  \
0  165034    15773898   Lucchese          586    France  Female  23.0       2   
1  165035    15782418       Nott          683    France  Female  46.0       2   
2  165036    15807120         K?          656    France  Female  34.0       7   
3  165037    15808905  O'Donnell          681    France    Male  36.0       8   
4  165038    15607314    Higgins          752   Germany    Male  38.0      10   

     Balance  NumOfProducts  HasCrCard  IsActiveMember  EstimatedSalary  
0       0.00              2        0.0             1.0        160976.75  
1       0.00              1        1.0             0.0         72549.27  
2       0.00              2        1.0             0.0        138882.09  
3       0.00              1        1.0             0.0        113931.57  
4  121263.62              1        1.0             0.0        139431.00  
