## Read data

In [11]:
import numpy as np
import pandas as pd
# Load the raw customer table from disk and preview the first rows.
DATA_PATH = 'data_C.csv'
df = pd.read_csv(DATA_PATH)
df.head()

Unnamed: 0.1,Unnamed: 0,id,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,churn
0,0,157138,15638646,Chinweike,628.0,Spain,Female,50.0,4,160133.08,1,1,1,653.53,0
1,1,116144,15695825,Onwumelu,710.0,Spain,Male,34.0,8,0.0,2,1,1,72993.73,0
2,2,145603,15592765,Marks,663.0,France,Female,23.0,8,125470.81,1,1,1,174536.17,0
3,3,79929,15807525,Onyemachukwu,644.0,Spain,Male,45.0,8,0.0,2,0,1,28690.9,0
4,4,52554,15611551,Hill,627.0,Spain,Male,23.0,5,0.0,2,1,0,134483.07,0


In [12]:
# Number of missing values in each column (`isna` is the same check as `isnull`).
df.isna().sum()

Unnamed: 0          0
id                  0
CustomerId          0
Surname             0
CreditScore        12
Geography           0
Gender              0
Age                 0
Tenure              0
Balance             0
NumOfProducts       0
HasCrCard           0
IsActiveMember      0
EstimatedSalary     0
churn               0
dtype: int64

In [13]:
# Remove the columns that are not used as model inputs, in a single call.
# NOTE(review): 'Unnamed: 0' looks like a row index saved with the CSV — confirm.
columns_to_drop = ['Unnamed: 0', 'id', 'CustomerId', 'Surname']
df = df.drop(columns=columns_to_drop)

## Split data 

In [14]:
# Separate the target column ('churn') from the feature columns.
output_df = df['churn']
input_df = df.drop(columns=['churn'])

In [15]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    input_df,
    output_df,
    test_size=0.2,
    random_state=42,
)

In [16]:
# Preview the first five rows of the training features.
x_train.head(5)

Unnamed: 0,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary
15731,725.0,Spain,Female,31.0,2,0.0,2,0,0,71830.1
6996,688.0,Spain,Male,36.0,4,0.0,1,1,0,129870.93
30824,748.0,France,Female,23.0,7,118028.35,1,0,1,41626.78
40019,701.0,France,Female,45.0,5,0.0,2,1,1,158948.63
21501,590.0,France,Female,23.0,5,113923.81,3,1,0,196789.9


In [17]:
# (rows, columns) of the training features.
x_train.shape

(33006, 10)

## Preprocessing

### Impute missing values

In [18]:
# Fill missing CreditScore values with the mean learned from the TRAINING
# split only, so no information from the test split leaks into preprocessing.
CSMean = x_train['CreditScore'].mean()

# Assign the filled column back instead of calling
# `x_train['CreditScore'].fillna(..., inplace=True)`: that form is chained
# assignment on a selected column, which warns in recent pandas and leaves the
# frame unchanged when Copy-on-Write is enabled.
x_train = x_train.assign(CreditScore=x_train['CreditScore'].fillna(CSMean))
x_test = x_test.assign(CreditScore=x_test['CreditScore'].fillna(CSMean))

### Encoding

In [19]:
# Column dtypes and non-null counts of the full frame; object-typed columns
# are the ones that need to be encoded as numbers.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 41258 entries, 0 to 41257
Data columns (total 11 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   CreditScore      41246 non-null  float64
 1   Geography        41258 non-null  object 
 2   Gender           41258 non-null  object 
 3   Age              41258 non-null  float64
 4   Tenure           41258 non-null  int64  
 5   Balance          41258 non-null  float64
 6   NumOfProducts    41258 non-null  int64  
 7   HasCrCard        41258 non-null  int64  
 8   IsActiveMember   41258 non-null  int64  
 9   EstimatedSalary  41258 non-null  float64
 10  churn            41258 non-null  int64  
dtypes: float64(4), int64(5), object(2)
memory usage: 3.5+ MB


In [20]:
# Distinct Geography categories and how often each occurs.
df['Geography'].value_counts()

France     23469
Spain       9057
Germany     8732
Name: Geography, dtype: int64

In [21]:
# Distinct Gender categories and how often each occurs.
df['Gender'].value_counts()

Male      23219
Female    18039
Name: Gender, dtype: int64

In [22]:
# One shared mapping for both splits, so the train and test encodings cannot
# drift apart. `test_encode` is kept as an alias for backward compatibility.
train_encode = {
    "Geography": {"Germany": 0, "France": 1, "Spain": 2},
    "Gender": {"Male": 0, "Female": 1},
}
test_encode = train_encode


def _encode_categories(frame, codes):
    """Return a copy of ``frame`` with the columns named in ``codes`` integer-encoded.

    Parameters
    ----------
    frame : pandas.DataFrame
        Feature table containing every column named in ``codes``.
    codes : dict
        ``{column: {category: integer_code}}`` mapping.

    Returns
    -------
    pandas.DataFrame
        New frame; the input is not modified.

    Raises
    ------
    ValueError
        If a column holds a category that has no code in ``codes``.
    """
    encoded = frame.copy()
    for column, mapping in codes.items():
        # Already numeric means the cell was re-run; leave the column as is.
        if pd.api.types.is_numeric_dtype(encoded[column]):
            continue
        unknown = set(encoded[column].dropna().unique()) - set(mapping)
        if unknown:
            raise ValueError(
                f"Unmapped {column} categories: {sorted(map(str, unknown))}"
            )
        # `Series.map` yields a numeric column directly, without relying on the
        # implicit object->int downcasting that `DataFrame.replace` deprecates.
        encoded[column] = encoded[column].map(mapping)
    return encoded


x_train = _encode_categories(x_train, train_encode)
x_test = _encode_categories(x_test, test_encode)

In [24]:
# Check the training features after preprocessing: dtypes and non-null counts.
x_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 33006 entries, 15731 to 15795
Data columns (total 10 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   CreditScore      33006 non-null  float64
 1   Geography        33006 non-null  int64  
 2   Gender           33006 non-null  int64  
 3   Age              33006 non-null  float64
 4   Tenure           33006 non-null  int64  
 5   Balance          33006 non-null  float64
 6   NumOfProducts    33006 non-null  int64  
 7   HasCrCard        33006 non-null  int64  
 8   IsActiveMember   33006 non-null  int64  
 9   EstimatedSalary  33006 non-null  float64
dtypes: float64(4), int64(6)
memory usage: 2.8 MB


## Modelling

### Random Forest

In [145]:
from sklearn.ensemble import RandomForestClassifier

# Depth-limited random forest using the Gini criterion.
# `random_state` is fixed because the forest draws bootstrap samples and
# feature subsets at random; without a seed every run produces a different
# model and a different report. 42 matches the seed used elsewhere here.
rf = RandomForestClassifier(criterion='gini', max_depth=8, random_state=42)
rf.fit(x_train, y_train)

RandomForestClassifier(max_depth=8)

In [146]:
# Predict labels for the held-out test features with the fitted random forest.
y_predict = rf.predict(x_test)

In [147]:
from sklearn.metrics import classification_report

# Per-class precision / recall / F1 on the test split for the current predictions.
print(classification_report(y_true=y_test, y_pred=y_predict))

              precision    recall  f1-score   support

           0       0.87      0.96      0.92      6431
           1       0.79      0.51      0.62      1821

    accuracy                           0.86      8252
   macro avg       0.83      0.74      0.77      8252
weighted avg       0.85      0.86      0.85      8252



### Gradient Boosting (XGBoost)

In [169]:
import xgboost as xgb

# Small boosted-tree classifier with a fixed seed.
# The variable is named `xgboost` (the package itself is bound to `xgb`);
# later cells refer to the model by this name.
xgboost = xgb.XGBClassifier(
    n_estimators=4,
    max_depth=6,
    learning_rate=0.5,
    objective="binary:logistic",
    random_state=42,
)
xgboost.fit(x_train, y_train)

XGBClassifier(base_score=0.5, booster='gbtree', callbacks=None,
              colsample_bylevel=1, colsample_bynode=1, colsample_bytree=1,
              early_stopping_rounds=None, enable_categorical=False,
              eval_metric=None, gamma=0, gpu_id=-1, grow_policy='depthwise',
              importance_type=None, interaction_constraints='',
              learning_rate=0.5, max_bin=256, max_cat_to_onehot=4,
              max_delta_step=0, max_depth=6, max_leaves=0, min_child_weight=1,
              missing=nan, monotone_constraints='()', n_estimators=4, n_jobs=0,
              num_parallel_tree=1, predictor='auto', random_state=42,
              reg_alpha=0, reg_lambda=1, ...)

In [170]:
# Predict labels for the test features with the XGBoost model.
# This overwrites the `y_predict` produced earlier by the random forest.
y_predict = xgboost.predict(x_test)

In [171]:
from sklearn.metrics import classification_report

# Per-class precision / recall / F1 on the test split for the current predictions.
print(classification_report(y_true=y_test, y_pred=y_predict))

              precision    recall  f1-score   support

           0       0.88      0.95      0.92      6431
           1       0.77      0.55      0.64      1821

    accuracy                           0.86      8252
   macro avg       0.83      0.75      0.78      8252
weighted avg       0.86      0.86      0.86      8252



In [91]:
# Optional (disabled): save the fitted XGBoost model to a pickle file.
# import pickle as pkl
# filename = '2602148814.pkl'
# pkl.dump(xgboost, open(filename, 'wb'))