In [67]:
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, StackingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier

from sklearn.metrics import classification_report, accuracy_score


In [68]:
# Read the churn dataset from the working directory and preview its first rows.
DATA_FILE = 'ezybank_churn_dataset.csv'
df = pd.read_csv(DATA_FILE)
df.head()

Unnamed: 0,customer_id,age,account_balance,num_transactions,credit_score,is_active,churned
0,CUST001,56,8933,12,542,0,0
1,CUST002,69,8013,39,385,1,0
2,CUST003,46,3112,39,327,1,0
3,CUST004,32,7541,37,639,1,0
4,CUST005,60,6735,9,630,1,1


In [69]:
# Print the column names, dtypes and non-null counts of the loaded frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 500 entries, 0 to 499
Data columns (total 7 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   customer_id       500 non-null    object
 1   age               500 non-null    int64 
 2   account_balance   500 non-null    int64 
 3   num_transactions  500 non-null    int64 
 4   credit_score      500 non-null    int64 
 5   is_active         500 non-null    int64 
 6   churned           500 non-null    int64 
dtypes: int64(6), object(1)
memory usage: 27.5+ KB


In [70]:
# Feature matrix: every column except the customer identifier and the target.
x = df.drop(columns=['customer_id', 'churned'])
# Target vector: the `churned` column.
y = df['churned']
# Only the last expression of a cell is rendered, so `y` is the single value
# displayed here (a bare `x` placed before it would produce no output).
y

0      0
1      0
2      0
3      0
4      1
      ..
495    0
496    1
497    0
498    1
499    0
Name: churned, Length: 500, dtype: int64

In [71]:
## Split the dataset into train and test sets
# 20% of the rows are held out for testing; the fixed seed keeps the split
# reproducible across runs.
# NOTE(review): the split is not stratified on `y`; consider `stratify=y`,
# since the reported test support is uneven between the two classes.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=5,
)

In [72]:
## Scaling process
# The scaler is fitted on the training split only and then applied to both
# splits, so the test rows never influence the scaling statistics.
scaler = StandardScaler().fit(x_train)
x_train_scaled, x_test_scaled = (
    scaler.transform(split) for split in (x_train, x_test)
)

## Build Stacking Classifier

In [73]:
## Base learners
# (label, estimator) pairs in the form StackingClassifier expects. Building
# them from a dict keeps the labels unique; insertion order is preserved.
# NOTE(review): 'knm' looks like a typo for 'knn'. It is kept unchanged because
# the label is reused later when naming the meta-feature columns.
base_learners = list(
    {
        'lr': LogisticRegression(),
        'rf': RandomForestClassifier(random_state=10, n_estimators=100),
        'knm': KNeighborsClassifier(n_neighbors=5),
    }.items()
)

## Meta learner
# Final estimator that is trained on the base learners' outputs.
meta_learner = GradientBoostingClassifier(random_state=8, n_estimators=100)

In [74]:
# Stacked ensemble: the base learners feed the meta learner, using 5-fold
# cross-validation as configured by `cv`.
stack_model = StackingClassifier(
    estimators=base_learners,
    final_estimator=meta_learner,
    cv=5,
)
# Display the (still unfitted) estimator.
stack_model

In [75]:
# Train the stacked ensemble on the scaled training split.
stack_model.fit(X=x_train_scaled, y=y_train)

In [76]:
# Predict labels for the scaled held-out split.
y_pred = stack_model.predict(X=x_test_scaled)

In [77]:
# Per-class precision / recall / F1 of the stacked model on the test split.
stack_report = classification_report(y_true=y_test, y_pred=y_pred)
print(stack_report)

              precision    recall  f1-score   support

           0       0.72      0.88      0.79        72
           1       0.31      0.14      0.20        28

    accuracy                           0.67       100
   macro avg       0.52      0.51      0.49       100
weighted avg       0.61      0.67      0.63       100



In [78]:
# Stand-alone random forest trained and evaluated on the same scaled split,
# reported in the same format as the stacked model.
rf = RandomForestClassifier(n_estimators=100, random_state=42).fit(
    x_train_scaled, y_train
)
y_pred_rf = rf.predict(x_test_scaled)
rf_report = classification_report(y_test, y_pred_rf)
print(rf_report)

              precision    recall  f1-score   support

           0       0.71      0.86      0.78        72
           1       0.23      0.11      0.15        28

    accuracy                           0.65       100
   macro avg       0.47      0.48      0.46       100
weighted avg       0.58      0.65      0.60       100



In [79]:
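# Disabled experiment: CatBoost classifier on the same scaled split.
# Uncomment the lines below to run it (needs the `catboost` package).
#from catboost import CatBoostClassifier

#cbc = CatBoostClassifier(random_state=42)
#cbc.fit(x_train_scaled, y_train)
#y_pred_cbc = cbc.predict(x_test_scaled)
#print(classification_report(y_test, y_pred_cbc))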


In [80]:
# Second stacking variant. Differences from the earlier definition:
#   * stack_method='predict_proba' - base learners contribute probabilities;
#   * passthrough=True - the input features are passed to the meta learner
#     alongside the base-learner outputs.
# `cv=5` is spelled out to stay consistent with the earlier definition; it is
# also scikit-learn's default, so the fitted model is unchanged.
# NOTE: this rebinds `stack_model`, so the estimator fitted in the earlier
# cells is no longer reachable under that name.
stack_model = StackingClassifier(
    estimators=base_learners,
    final_estimator=meta_learner,
    cv=5,
    stack_method='predict_proba',
    passthrough=True,
)

In [81]:
# Fit the passthrough ensemble, then ask it for the matrix it builds from the
# training rows (base-learner outputs followed by the passed-through inputs).
# NOTE(review): presumably these are predictions from the fully fitted base
# learners rather than the out-of-fold ones used during fitting - confirm
# against the scikit-learn documentation before interpreting them.
meta_features = stack_model.fit(x_train_scaled, y_train).transform(x_train_scaled)
meta_features

array([[ 0.3682484 ,  0.19      ,  0.2       , ..., -0.14794084,
         1.12530906, -0.96559161],
       [ 0.31409265,  0.73      ,  0.4       , ...,  1.4652926 ,
         0.27997094,  1.03563452],
       [ 0.30652492,  0.17      ,  0.        , ...,  0.14537433,
         0.10964162,  1.03563452],
       ...,
       [ 0.33756467,  0.21      ,  0.6       , ..., -0.58791359,
         1.57952059,  1.03563452],
       [ 0.38230388,  0.24      ,  0.4       , ...,  0.62201148,
         0.34305587,  1.03563452],
       [ 0.39335016,  0.16      ,  0.4       , ...,  0.14537433,
         1.45965921,  1.03563452]])

In [82]:
# Show the (label, estimator) pairs whose labels are used for the column names below.
base_learners

[('lr', LogisticRegression()),
 ('rf', RandomForestClassifier(random_state=10)),
 ('knm', KNeighborsClassifier())]

In [83]:
# Number of input feature columns and number of base learners.
n_features, n_base = x_train_scaled.shape[1], len(base_learners)

In [84]:
# Start from an empty list of meta-feature column labels.
column_names = list()

In [85]:
# Collect each base learner's label, in registration order.
#  * The loop target is `_estimator` rather than `_`: binding `_` at the top
#    level of a notebook shadows IPython's "last output" variable.
#  * The membership check keeps this cell idempotent, so re-running it does
#    not append the same labels a second time.
for name, _estimator in base_learners:
    if name not in column_names:
        column_names.append(name)

In [86]:
# Inspect the labels collected so far.
column_names

['lr', 'rf', 'knm']

In [87]:
# When passthrough is enabled, add one positional label per input feature.
# The label text is kept as 'orig_test_<i>' so existing outputs still match,
# although the matrix being labelled was produced from x_train_scaled.
# The membership check keeps the cell idempotent: re-running it does not
# append the same labels again.
if stack_model.passthrough:
    passthrough_labels = [f'orig_test_{i}' for i in range(n_features)]
    column_names.extend(
        [label for label in passthrough_labels if label not in column_names]
    )

In [88]:
# Inspect the full label list after the passthrough labels were added.
column_names

['lr',
 'rf',
 'knm',
 'orig_test_0',
 'orig_test_1',
 'orig_test_2',
 'orig_test_3',
 'orig_test_4']

In [89]:
# Check the passthrough setting of the current stack_model.
stack_model.passthrough

True

In [90]:
# Print the label list on a single line (plain-text form of the display above).
print(column_names)

['lr', 'rf', 'knm', 'orig_test_0', 'orig_test_1', 'orig_test_2', 'orig_test_3', 'orig_test_4']


In [91]:
# Wrap the meta-feature matrix in a DataFrame so each column carries its label.
pd.DataFrame(data=meta_features, columns=column_names)

Unnamed: 0,lr,rf,knm,orig_test_0,orig_test_1,orig_test_2,orig_test_3,orig_test_4
0,0.368248,0.19,0.2,0.918606,-0.544999,-0.147941,1.125309,-0.965592
1,0.314093,0.73,0.4,-0.494893,-1.375300,1.465293,0.279971,1.035635
2,0.306525,0.17,0.0,-0.023727,-0.157397,0.145374,0.109642,1.035635
3,0.274133,0.05,0.4,-1.302606,0.511853,-0.331263,0.601704,-0.965592
4,0.252628,0.06,0.2,-0.764130,0.244846,-1.174544,-0.735697,-0.965592
...,...,...,...,...,...,...,...,...
395,0.259046,0.17,0.2,-0.764130,-1.839960,-1.101215,-0.186858,-0.965592
396,0.348550,0.20,0.0,0.514749,-1.735932,0.585347,0.431375,-0.965592
397,0.337565,0.21,0.6,0.312821,0.566564,-0.587914,1.579521,1.035635
398,0.382304,0.24,0.4,1.591700,-0.487206,0.622011,0.343056,1.035635
