In [183]:
# !pip install pandas
# !pip install scikit-learn
# !pip install xgboost
!pip install lightgbm

Defaulting to user installation because normal site-packages is not writeable
Collecting lightgbm
  Downloading lightgbm-4.5.0-py3-none-manylinux_2_28_x86_64.whl.metadata (17 kB)
Downloading lightgbm-4.5.0-py3-none-manylinux_2_28_x86_64.whl (3.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.6/3.6 MB[0m [31m2.6 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hInstalling collected packages: lightgbm
Successfully installed lightgbm-4.5.0


In [184]:
import pandas as pd
from sklearn.metrics import classification_report, roc_auc_score
from sklearn.model_selection import train_test_split
import xgboost as xgb
import lightgbm as lgb

# Load Train and Test

In [116]:
train = pd.read_csv('data/train.csv')
test = pd.read_csv('data/test.csv')

In [33]:
train.head()

Unnamed: 0,id,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,0,15674932,Okwudilichukwu,668,France,Male,33.0,3,0.0,2,1.0,0.0,181449.97,0
1,1,15749177,Okwudiliolisa,627,France,Male,33.0,1,0.0,2,1.0,1.0,49503.5,0
2,2,15694510,Hsueh,678,France,Male,40.0,10,0.0,2,1.0,0.0,184866.69,0
3,3,15741417,Kao,581,France,Male,34.0,2,148882.54,1,1.0,1.0,84560.88,0
4,4,15766172,Chiemenam,716,Spain,Male,33.0,5,0.0,2,1.0,1.0,15068.83,0


In [34]:
train.shape

(165034, 14)

# Data Investigation

In [36]:
train.isnull().sum()

id                 0
CustomerId         0
Surname            0
CreditScore        0
Geography          0
Gender             0
Age                0
Tenure             0
Balance            0
NumOfProducts      0
HasCrCard          0
IsActiveMember     0
EstimatedSalary    0
Exited             0
dtype: int64

In [37]:
train.shape

(165034, 14)

In [38]:
train.dtypes

id                   int64
CustomerId           int64
Surname             object
CreditScore          int64
Geography           object
Gender              object
Age                float64
Tenure               int64
Balance            float64
NumOfProducts        int64
HasCrCard          float64
IsActiveMember     float64
EstimatedSalary    float64
Exited               int64
dtype: object

In [39]:
cats = train.select_dtypes('object')

for c in cats.columns:
    print(c, cats[c].unique())

Surname ['Okwudilichukwu' 'Okwudiliolisa' 'Hsueh' ... 'Aliyev' 'McMinn' 'Elkins']
Geography ['France' 'Spain' 'Germany']
Gender ['Male' 'Female']


In [40]:
train['Exited'].unique()

array([0, 1])

In [41]:
train[train['Exited'].isnull()]

Unnamed: 0,id,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited


In [42]:
train['Exited'].value_counts()

Exited
0    130113
1     34921
Name: count, dtype: int64

In [43]:
print(f'No churn: {train['Exited'].value_counts()[0]} or {train['Exited'].value_counts()[0] / len(train) * 100:.2f}%')
print(f'Churn: {train['Exited'].value_counts()[1]} or {train['Exited'].value_counts()[1] / len(train) * 100:.2f}%')
print(f'Total: {len(train)}')

No churn: 130113 or 78.84%
Churn: 34921 or 21.16%
Total: 165034


# Label Encode Categorical Features

In [20]:
# getting an additional row in the data somehow below. Exited is NaN

In [21]:
# RMV = ['id', 'CustomerId', 'Surname', 'Exited']
# FEATURES = [c for c in train.columns if not c in RMV]
# combined = pd.concat([train,test], axis=0, ignore_index=True)
# CATS = []

# for c in FEATURES:
#     ftype = 'numerical'
#     if combined[c].dtype == 'object':
#         CATS.append(c)
#         ftype = 'categorical'
#     if combined[c].dtype == 'int64':
#         combined[c] = combined[c].astype('int32')
#     elif combined[c].dtype == 'float64':
#         combined[c] = combined[c].astype('float32')
        
# train = combined.iloc[:len(train)].copy()
# test = combined.iloc[len(train):].reset_index(drop=True).copy()
# train.head()

# Train Models

In [107]:
print(len(train))
print(len(test))

165034
110023


In [145]:
combined = pd.concat([train, test], axis=0, ignore_index=True).copy()
combined.drop(['id','CustomerId','Surname'], axis=1, inplace=True)

In [146]:
combined['isMale'] = combined['Gender'].map({'Male':1,'Female':0})
combined

Unnamed: 0,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited,isMale
0,668,France,Male,33.0,3,0.00,2,1.0,0.0,181449.97,0.0,1
1,627,France,Male,33.0,1,0.00,2,1.0,1.0,49503.50,0.0,1
2,678,France,Male,40.0,10,0.00,2,1.0,0.0,184866.69,0.0,1
3,581,France,Male,34.0,2,148882.54,1,1.0,1.0,84560.88,0.0,1
4,716,Spain,Male,33.0,5,0.00,2,1.0,1.0,15068.83,0.0,1
...,...,...,...,...,...,...,...,...,...,...,...,...
275052,570,Spain,Male,29.0,7,116099.82,1,1.0,1.0,148087.62,,1
275053,575,France,Female,36.0,4,178032.53,1,1.0,1.0,42181.68,,0
275054,712,France,Male,31.0,2,0.00,2,1.0,0.0,16287.38,,1
275055,709,France,Female,32.0,3,0.00,1,1.0,1.0,158816.58,,0


In [127]:
combined['Geography'].value_counts()

Geography
France     157386
Spain       60126
Germany     57545
Name: count, dtype: int64

In [None]:
dummy_geo = pd.get_dummies(combined['Geography'], columns=['Geography'], drop_first=True, dtype='int')
dummy_geo

Unnamed: 0,Germany,Spain
0,0,0
1,0,0
2,0,0
3,0,0
4,0,1
...,...,...
275052,0,1
275053,0,0
275054,0,0
275055,0,0


In [None]:
combined = pd.concat([combined, dummy_geo], axis=1)
combined

In [150]:
freq_geo = combined['Geography'].value_counts(normalize=True)
combined['Geography_Freq'] = combined['Geography'].map(freq_geo)
combined

Unnamed: 0,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited,isMale,Germany,Spain,Geography_Freq
0,668,France,Male,33.0,3,0.00,2,1.0,0.0,181449.97,0.0,1,0,0,0.572194
1,627,France,Male,33.0,1,0.00,2,1.0,1.0,49503.50,0.0,1,0,0,0.572194
2,678,France,Male,40.0,10,0.00,2,1.0,0.0,184866.69,0.0,1,0,0,0.572194
3,581,France,Male,34.0,2,148882.54,1,1.0,1.0,84560.88,0.0,1,0,0,0.572194
4,716,Spain,Male,33.0,5,0.00,2,1.0,1.0,15068.83,0.0,1,0,1,0.218595
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
275052,570,Spain,Male,29.0,7,116099.82,1,1.0,1.0,148087.62,,1,0,1,0.218595
275053,575,France,Female,36.0,4,178032.53,1,1.0,1.0,42181.68,,0,0,0,0.572194
275054,712,France,Male,31.0,2,0.00,2,1.0,0.0,16287.38,,1,0,0,0.572194
275055,709,France,Female,32.0,3,0.00,1,1.0,1.0,158816.58,,0,0,0,0.572194


In [151]:
mean_geo = combined.groupby('Geography')['Exited'].mean()
combined['Geography_Mean'] = combined['Geography'].map(mean_geo)
combined

Unnamed: 0,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited,isMale,Germany,Spain,Geography_Freq,Geography_Mean
0,668,France,Male,33.0,3,0.00,2,1.0,0.0,181449.97,0.0,1,0,0,0.572194,0.165282
1,627,France,Male,33.0,1,0.00,2,1.0,1.0,49503.50,0.0,1,0,0,0.572194,0.165282
2,678,France,Male,40.0,10,0.00,2,1.0,0.0,184866.69,0.0,1,0,0,0.572194,0.165282
3,581,France,Male,34.0,2,148882.54,1,1.0,1.0,84560.88,0.0,1,0,0,0.572194,0.165282
4,716,Spain,Male,33.0,5,0.00,2,1.0,1.0,15068.83,0.0,1,0,1,0.218595,0.172176
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
275052,570,Spain,Male,29.0,7,116099.82,1,1.0,1.0,148087.62,,1,0,1,0.218595,0.172176
275053,575,France,Female,36.0,4,178032.53,1,1.0,1.0,42181.68,,0,0,0,0.572194,0.165282
275054,712,France,Male,31.0,2,0.00,2,1.0,0.0,16287.38,,1,0,0,0.572194,0.165282
275055,709,France,Female,32.0,3,0.00,1,1.0,1.0,158816.58,,0,0,0,0.572194,0.165282


In [167]:
train = combined[:len(train)].copy()
test = combined[len(train):].reset_index(drop=True).copy()

In [168]:
X = train.drop('Exited', axis=1)
y = train[['Exited']]

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

In [170]:
predictors = [
    'CreditScore',
    'Age', 
    'Tenure', 
    'Balance',
    'NumOfProducts', 
    'HasCrCard', 
    'IsActiveMember', 
    'EstimatedSalary',
    'isMale', 
    'Germany', 
    'Spain', 
    'Geography_Freq', 
    'Geography_Mean'
]

## XGBoost

In [172]:
train_dmatrix = xgb.DMatrix(data=X_train[predictors], label=y_train)
val_dmatrix = xgb.DMatrix(data=X_val[predictors], label=y_val)

In [173]:
params = {
    'objective': 'binary:logistic',
    'eval_metric': 'auc',
    'max_depth': 3,
    'learning_rate': 0.1,
    'seed': 42
}

In [174]:
num_round = 100
bst = xgb.train(params, train_dmatrix, num_round)

In [None]:
# Make predictions on the test set
y_pred_prob = bst.predict(val_dmatrix)
y_pred = [1 if prob > 0.5 else 0 for prob in y_pred_prob]

# Evaluate the model
score = roc_auc_score(y_val, y_pred)
print(f"Score: {score:.2f}")

# Print a detailed classification report
print("Classification Report:")
print(classification_report(y_val, y_pred))

Score: 0.75
Classification Report:
              precision    recall  f1-score   support

         0.0       0.89      0.95      0.92     26052
         1.0       0.76      0.54      0.63      6955

    accuracy                           0.87     33007
   macro avg       0.82      0.75      0.77     33007
weighted avg       0.86      0.87      0.86     33007



## LightGBM

In [193]:
d_train = lgb.Dataset(X_train[predictors], label=y_train)

In [199]:
params = {
    "objective": "binary",          # Binary classification
    "metric": "binary_logloss",     # Loss function for binary classification
    "boosting_type": "gbdt",        # Gradient Boosting Decision Tree
    "num_leaves": 31,               # Maximum number of leaves in a tree
    "learning_rate": 0.07,          # Step size for each iteration
    "feature_fraction": 0.8         # Percentage of features to use per iteration
}

model = lgb.train(params, d_train, num_boost_round=100)

[LightGBM] [Info] Number of positive: 27966, number of negative: 104061
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.064496 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 868
[LightGBM] [Info] Number of data points in the train set: 132027, number of used features: 13
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.211820 -> initscore=-1.313988
[LightGBM] [Info] Start training from score -1.313988


In [200]:
y_pred = model.predict(X_val[predictors])

# Convert probabilities to binary predictions
y_pred = (y_pred > 0.5).astype(int)


# Evaluate the model
score = roc_auc_score(y_val, y_pred)
print(f"Score: {score:.2f}")

# Print a detailed classification report
print("Classification Report:")
print(classification_report(y_val, y_pred))

Score: 0.76
Classification Report:
              precision    recall  f1-score   support

         0.0       0.89      0.95      0.92     26052
         1.0       0.75      0.56      0.64      6955

    accuracy                           0.87     33007
   macro avg       0.82      0.76      0.78     33007
weighted avg       0.86      0.87      0.86     33007



# Create Submission CSV

In [177]:
submission = pd.read_csv('data/sample_submission.csv')
submission.head()

Unnamed: 0,id,Exited
0,165034,0.5
1,165035,0.5
2,165036,0.5
3,165037,0.5
4,165038,0.5


## Benchmark

### Random Guess

In [27]:
# dummy model - approx 1 in 5 customers churn (20% churn rate)
random_preds = submission.sample(frac=0.20, random_state=42).index
random_preds

Index([16412, 81431, 80555, 90124, 37688, 56720, 57437, 25027, 40756, 31166,
       ...
       22481, 12095, 86880, 46552, 97149, 74384, 41331, 10459, 53808, 33429],
      dtype='int64', length=22005)

In [28]:
for i in submission.index:
    if i in random_preds:
        submission.loc[i, 'Exited'] = 1
    else:
        submission.loc[i, 'Exited'] = 0

submission.head()

Unnamed: 0,id,Exited
0,165034,0.0
1,165035,0.0
2,165036,0.0
3,165037,1.0
4,165038,0.0


### Guess No One Churns

In [102]:
submission['Exited'] = 0
submission.to_csv('data/submission.csv', index=False) 
submission

Unnamed: 0,id,Exited
0,165034,0
1,165035,0
2,165036,0
3,165037,0
4,165038,0
...,...,...
110018,275052,0
110019,275053,0
110020,275054,0
110021,275055,0


## XGBoost

In [178]:
test_dmatrix = xgb.DMatrix(data=test[predictors])

In [179]:
# Make predictions on the test set
test_preds = bst.predict(test_dmatrix)
test_preds = [1 if prob > 0.5 else 0 for prob in test_preds]

In [182]:
submission['Exited'] = test_preds
submission.to_csv('data/submission.csv', index=False)
submission

Unnamed: 0,id,Exited
0,165034,0
1,165035,1
2,165036,0
3,165037,0
4,165038,0
...,...,...
110018,275052,0
110019,275053,0
110020,275054,0
110021,275055,0


Goal:
- Predict bank customer churn

To Do:
- Train/Test split
- Review feature distributions
- Preprocessing
- Encode categorical variables
- Scale variables?
- Drop features that aren't predictive
- Label Target variable (Exited)
- AUC ROC is the evaluation metric
- Estabilish baseline model (avg?)
- Evaluation framework
- Using best model make predictions
- Submit predictions