In [30]:
# !pip install pandas
# !pip install scikit-learn

In [31]:
import pandas as pd
from sklearn.metrics import roc_auc_score

# Load Train and Test

In [32]:
train = pd.read_csv('data/train.csv')

In [33]:
train.head()

Unnamed: 0,id,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,0,15674932,Okwudilichukwu,668,France,Male,33.0,3,0.0,2,1.0,0.0,181449.97,0
1,1,15749177,Okwudiliolisa,627,France,Male,33.0,1,0.0,2,1.0,1.0,49503.5,0
2,2,15694510,Hsueh,678,France,Male,40.0,10,0.0,2,1.0,0.0,184866.69,0
3,3,15741417,Kao,581,France,Male,34.0,2,148882.54,1,1.0,1.0,84560.88,0
4,4,15766172,Chiemenam,716,Spain,Male,33.0,5,0.0,2,1.0,1.0,15068.83,0


In [34]:
train.shape

(165034, 14)

In [35]:
test = pd.read_csv('data/test.csv')

# Data Investigation

In [36]:
train.isnull().sum()

id                 0
CustomerId         0
Surname            0
CreditScore        0
Geography          0
Gender             0
Age                0
Tenure             0
Balance            0
NumOfProducts      0
HasCrCard          0
IsActiveMember     0
EstimatedSalary    0
Exited             0
dtype: int64

In [37]:
train.shape

(165034, 14)

In [38]:
train.dtypes

id                   int64
CustomerId           int64
Surname             object
CreditScore          int64
Geography           object
Gender              object
Age                float64
Tenure               int64
Balance            float64
NumOfProducts        int64
HasCrCard          float64
IsActiveMember     float64
EstimatedSalary    float64
Exited               int64
dtype: object

In [39]:
cats = train.select_dtypes('object')

for c in cats.columns:
    print(c, cats[c].unique())

Surname ['Okwudilichukwu' 'Okwudiliolisa' 'Hsueh' ... 'Aliyev' 'McMinn' 'Elkins']
Geography ['France' 'Spain' 'Germany']
Gender ['Male' 'Female']


In [40]:
train['Exited'].unique()

array([0, 1])

In [41]:
train[train['Exited'].isnull()]

Unnamed: 0,id,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited


In [42]:
train['Exited'].value_counts()

Exited
0    130113
1     34921
Name: count, dtype: int64

In [43]:
print(f'No churn: {train['Exited'].value_counts()[0]} or {train['Exited'].value_counts()[0] / len(train) * 100:.2f}%')
print(f'Churn: {train['Exited'].value_counts()[1]} or {train['Exited'].value_counts()[1] / len(train) * 100:.2f}%')
print(f'Total: {len(train)}')

No churn: 130113 or 78.84%
Churn: 34921 or 21.16%
Total: 165034


# Label Encode Categorical Features

In [20]:
# getting an additional row in the data somehow below. Exited is NaN

In [21]:
# RMV = ['id', 'CustomerId', 'Surname', 'Exited']
# FEATURES = [c for c in train.columns if not c in RMV]
# combined = pd.concat([train,test], axis=0, ignore_index=True)
# CATS = []

# for c in FEATURES:
#     ftype = 'numerical'
#     if combined[c].dtype == 'object':
#         CATS.append(c)
#         ftype = 'categorical'
#     if combined[c].dtype == 'int64':
#         combined[c] = combined[c].astype('int32')
#     elif combined[c].dtype == 'float64':
#         combined[c] = combined[c].astype('float32')
        
# train = combined.iloc[:len(train)].copy()
# test = combined.iloc[len(train):].reset_index(drop=True).copy()
# train.head()

# Train Models

In [44]:
!pip install xgboost

Defaulting to user installation because normal site-packages is not writeable
Collecting xgboost
  Downloading xgboost-2.1.3-py3-none-manylinux_2_28_x86_64.whl.metadata (2.1 kB)
Collecting nvidia-nccl-cu12 (from xgboost)
  Downloading nvidia_nccl_cu12-2.24.3-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (1.8 kB)
Downloading xgboost-2.1.3-py3-none-manylinux_2_28_x86_64.whl (153.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m153.9/153.9 MB[0m [31m5.6 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hDownloading nvidia_nccl_cu12-2.24.3-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl (201.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m201.3/201.3 MB[0m [31m7.2 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hInstalling collected packages: nvidia-nccl-cu12, xgboost
Successfully installed nvidia-nccl-cu12-2.24.3 xgboost-2.1.3


In [60]:
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

In [47]:
train.head()

Unnamed: 0,id,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,0,15674932,Okwudilichukwu,668,France,Male,33.0,3,0.0,2,1.0,0.0,181449.97,0
1,1,15749177,Okwudiliolisa,627,France,Male,33.0,1,0.0,2,1.0,1.0,49503.5,0
2,2,15694510,Hsueh,678,France,Male,40.0,10,0.0,2,1.0,0.0,184866.69,0
3,3,15741417,Kao,581,France,Male,34.0,2,148882.54,1,1.0,1.0,84560.88,0
4,4,15766172,Chiemenam,716,Spain,Male,33.0,5,0.0,2,1.0,1.0,15068.83,0


In [53]:
train.dtypes

id                   int64
CustomerId           int64
Surname             object
CreditScore          int64
Geography           object
Gender              object
Age                float64
Tenure               int64
Balance            float64
NumOfProducts        int64
HasCrCard          float64
IsActiveMember     float64
EstimatedSalary    float64
Exited               int64
dtype: object

In [57]:
X = train[['CreditScore'
           ,'Age'
           ,'Tenure'
           ,'Balance'
           ,'NumOfProducts'
           ,'HasCrCard'
           ,'IsActiveMember'
           ,'EstimatedSalary']]
y = train[['Exited']]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [83]:
train_dmatrix = xgb.DMatrix(data=X_train, label=y_train, enable_categorical='True')
val_dmatrix = xgb.DMatrix(data=X_test, label=y_test, enable_categorical='True')

In [70]:
params = {
    'objective': 'binary:logistic',
    'eval_metric': 'auc',
    'max_depth': 3,
    'learning_rate': 0.1,
    'seed': 42
}

In [79]:
num_round = 100
bst = xgb.train(params, train_dmatrix, num_round)

In [80]:
# Make predictions on the test set
y_pred_prob = bst.predict(val_dmatrix)
y_pred = [1 if prob > 0.5 else 0 for prob in y_pred_prob]

# Evaluate the model
score = roc_auc_score(y_test, y_pred)
print(f"Score: {score:.2f}")

# Print a detailed classification report
print("Classification Report:")
print(classification_report(y_test, y_pred))

Score: 0.73
Classification Report:
              precision    recall  f1-score   support

           0       0.88      0.95      0.91     26052
           1       0.73      0.50      0.59      6955

    accuracy                           0.86     33007
   macro avg       0.81      0.73      0.75     33007
weighted avg       0.85      0.86      0.85     33007



# Compute CV Score

In [81]:
print(f'ROC AUC SCORE: {roc_auc_score(y_test, y_pred)}')

ROC AUC SCORE: 0.725192210833545


# Create Submission CSV

In [82]:
submission = pd.read_csv('data/sample_submission.csv')
submission.head()

Unnamed: 0,id,Exited
0,165034,0.5
1,165035,0.5
2,165036,0.5
3,165037,0.5
4,165038,0.5


## Benchmark

### Random Guess

In [27]:
# dummy model - approx 1 in 5 customers churn (20% churn rate)
random_preds = submission.sample(frac=0.20, random_state=42).index
random_preds

Index([16412, 81431, 80555, 90124, 37688, 56720, 57437, 25027, 40756, 31166,
       ...
       22481, 12095, 86880, 46552, 97149, 74384, 41331, 10459, 53808, 33429],
      dtype='int64', length=22005)

In [28]:
for i in submission.index:
    if i in random_preds:
        submission.loc[i, 'Exited'] = 1
    else:
        submission.loc[i, 'Exited'] = 0

submission.head()

Unnamed: 0,id,Exited
0,165034,0.0
1,165035,0.0
2,165036,0.0
3,165037,1.0
4,165038,0.0


### Guess No Churn

In [101]:
submission['Exited'] = 0
submission.to_csv('data/random_submission.csv', index=False) 
submission

Unnamed: 0,id,Exited
0,165034,0
1,165035,0
2,165036,0
3,165037,0
4,165038,0
...,...,...
110018,275052,0
110019,275053,0
110020,275054,0
110021,275055,0


## XGBoost

In [85]:
features = ['CreditScore'
           ,'Age'
           ,'Tenure'
           ,'Balance'
           ,'NumOfProducts'
           ,'HasCrCard'
           ,'IsActiveMember'
           ,'EstimatedSalary']

test_dmatrix = xgb.DMatrix(data=test[features])


In [96]:

# Make predictions on the test set
test_preds = bst.predict(test_dmatrix)
test_preds = [1 if prob > 0.5 else 0 for prob in test_preds]


In [97]:
submission['Exited'] = test_preds

In [98]:
submission.head()

Unnamed: 0,id,Exited
0,165034,0
1,165035,1
2,165036,0
3,165037,0
4,165038,0


## Submission

In [99]:
submission.to_csv('data/submission.csv', index=False)

Goal:
- Predict bank customer churn

To Do:
- Train/Test split
- Review feature distributions
- Preprocessing
- Encode categorical variables
- Scale variables?
- Drop features that aren't predictive
- Label Target variable (Exited)
- AUC ROC is the evaluation metric
- Estabilish baseline model (avg?)
- Evaluation framework
- Using best model make predictions
- Submit predictions

Benchmark - Random guess that 1 in 5 customers will churn. Private LB: 0.50022. Public LB: 0.49376