In [3]:
# Dependencies - run once if missing (%pip installs into the active kernel's environment):
# %pip install pandas
# %pip install scikit-learn

Defaulting to user installation because normal site-packages is not writeable
Collecting pandas
  Downloading pandas-2.2.3-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (89 kB)
Collecting numpy>=1.26.0 (from pandas)
  Downloading numpy-2.2.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (62 kB)
Collecting pytz>=2020.1 (from pandas)
  Downloading pytz-2024.2-py2.py3-none-any.whl.metadata (22 kB)
Collecting tzdata>=2022.7 (from pandas)
  Downloading tzdata-2024.2-py2.py3-none-any.whl.metadata (1.4 kB)
Downloading pandas-2.2.3-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (12.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.7/12.7 MB[0m [31m7.0 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hDownloading numpy-2.2.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (16.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m16.1/16.1 MB[0m [31m5.3 MB/s[0m eta [36m0:00:00[0m00:01

In [4]:
import pandas as pd
from sklearn.metrics import roc_auc_score

# Load Train and Test

In [30]:
# Load the labelled training set from the local data directory.
train = pd.read_csv('data/train.csv')

In [8]:
# Preview the first rows of the training frame to sanity-check the load.
train.head()

Unnamed: 0,id,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,0,15674932,Okwudilichukwu,668.0,France,Male,33.0,3.0,0.0,2.0,1.0,0.0,181449.97,0.0
1,1,15749177,Okwudiliolisa,627.0,France,Male,33.0,1.0,0.0,2.0,1.0,1.0,49503.5,0.0
2,2,15694510,Hsueh,678.0,France,Male,40.0,10.0,0.0,2.0,1.0,0.0,184866.69,0.0
3,3,15741417,Kao,581.0,France,Male,34.0,2.0,148882.54,1.0,1.0,1.0,84560.88,0.0
4,4,15766172,Chiemenam,716.0,Spain,Male,33.0,5.0,0.0,2.0,1.0,1.0,15068.83,0.0


In [9]:
# (rows, columns) of the training frame.
train.shape

(154229, 14)

In [10]:
# Load the test set from the local data directory.
test = pd.read_csv('data/test.csv')

# Data Investigation

In [11]:
# Number of missing values in each column of the training frame.
train.isna().sum()

id                 0
CustomerId         0
Surname            0
CreditScore        1
Geography          1
Gender             1
Age                1
Tenure             1
Balance            1
NumOfProducts      1
HasCrCard          1
IsActiveMember     1
EstimatedSalary    1
Exited             1
dtype: int64

In [12]:
# Row/column count of the training frame, for comparison with the per-column null counts.
train.shape

(154229, 14)

In [13]:
# Column dtypes: shows which columns were parsed as strings (object) vs. numbers.
train.dtypes

id                   int64
CustomerId           int64
Surname             object
CreditScore        float64
Geography           object
Gender              object
Age                float64
Tenure             float64
Balance            float64
NumOfProducts      float64
HasCrCard          float64
IsActiveMember     float64
EstimatedSalary    float64
Exited             float64
dtype: object

In [14]:
# Show the distinct values of every object-dtype (string) column in `train`.
cats = train.select_dtypes(include='object')

for name, column in cats.items():
    print(name, column.unique())

Surname ['Okwudilichukwu' 'Okwudiliolisa' 'Hsueh' ... 'McDowell' 'Herring' "O'Sul"]
Geography ['France' 'Spain' 'Germany' nan]
Gender ['Male' 'Female' nan]


In [27]:
# Distinct values of the target column (including NaN, if any rows lack a label).
train['Exited'].unique()

array([ 0.,  1., nan])

In [31]:
# Rows of `train` whose target value is missing.
train.loc[train['Exited'].isna()]

Unnamed: 0,id,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited


In [32]:
# Class balance of the target: number of rows per Exited value.
train['Exited'].value_counts()

Exited
0    130113
1     34921
Name: count, dtype: int64

In [41]:
# Report the class balance of the target: absolute counts and their share of
# all rows in `train`.
# The counts are computed once and the f-strings use double quotes on the
# outside, because reusing the same quote character inside a replacement
# field is a SyntaxError on Python versions before 3.12.
exited_counts = train['Exited'].value_counts()
n_rows = len(train)

print(f"No churn: {exited_counts[0]} or {exited_counts[0] / n_rows * 100:.2f}%")
print(f"Churn: {exited_counts[1]} or {exited_counts[1] / n_rows * 100:.2f}%")
print(f"Total: {n_rows}")

No churn: 130113 or 78.84%
Churn: 34921 or 21.16%
Total: 165034


# Label Encode Categorical Features

In [None]:
# NOTE: an unexpected extra row with Exited = NaN shows up in the data below; cause not yet identified.

In [15]:
# Columns that are not model inputs: identifiers, the surname, and the target.
RMV = ['id', 'CustomerId', 'Surname', 'Exited']
FEATURES = [col for col in train.columns if col not in RMV]

# Stack train on top of test with a fresh 0..n-1 index so both frames can be
# transformed together.
combined = pd.concat([train, test], ignore_index=True)

In [16]:
# Collect the categorical (object-dtype) feature names in CATS and downcast
# 64-bit numeric features to their 32-bit counterparts.
# The unused `ftype` bookkeeping variable from the earlier version is gone, and
# each column's dtype is looked up once and dispatched through a single chain.
CATS = []

for c in FEATURES:
    dtype = combined[c].dtype
    if dtype == 'object':
        CATS.append(c)
    elif dtype == 'int64':
        combined[c] = combined[c].astype('int32')
    elif dtype == 'float64':
        # NOTE: float32 carries fewer significant digits, so large values
        # (e.g. EstimatedSalary) are rounded slightly by this cast.
        combined[c] = combined[c].astype('float32')

In [17]:
# Split the stacked frame back into its two parts: the first len(train) rows
# are the training rows, the remainder is the test set (re-indexed from 0).
n_train = len(train)
train = combined.iloc[:n_train].copy()
test = combined.iloc[n_train:].reset_index(drop=True).copy()

In [18]:
# Preview the training rows after the dtype downcast.
train.head()

Unnamed: 0,id,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,0,15674932,Okwudilichukwu,668.0,France,Male,33.0,3.0,0.0,2.0,1.0,0.0,181449.96875,0.0
1,1,15749177,Okwudiliolisa,627.0,France,Male,33.0,1.0,0.0,2.0,1.0,1.0,49503.5,0.0
2,2,15694510,Hsueh,678.0,France,Male,40.0,10.0,0.0,2.0,1.0,0.0,184866.6875,0.0
3,3,15741417,Kao,581.0,France,Male,34.0,2.0,148882.546875,1.0,1.0,1.0,84560.882812,0.0
4,4,15766172,Chiemenam,716.0,Spain,Male,33.0,5.0,0.0,2.0,1.0,1.0,15068.830078,0.0


# Train Models

# Compute CV Score

In [19]:
# roc_auc_score(y, clf.predict_proba(X)[:, 1])
# roc_auc_score(y, clf.decision_function(X))

# Create Submission CSV

In [65]:
# Load the sample submission as a template for the output file and preview it.
submission = pd.read_csv('data/sample_submission.csv')
submission.head()

Unnamed: 0,id,Exited
0,165034,0.5
1,165035,0.5
2,165036,0.5
3,165037,0.5
4,165038,0.5


In [66]:
# dummy model - approx 1 in 5 customers churn (20% churn rate)
# Dummy model: roughly 1 in 5 customers churn (~20% churn rate), so draw a
# random 20% of the submission rows (fixed seed so the draw is repeatable)
# and keep their index labels.
random_preds = submission.sample(frac=0.20, random_state=42).index
random_preds

Index([16412, 81431, 80555, 90124, 37688, 56720, 57437, 25027, 40756, 31166,
       ...
       22481, 12095, 86880, 46552, 97149, 74384, 41331, 10459, 53808, 33429],
      dtype='int64', length=22005)

In [70]:
# Dummy baseline labels: 1.0 for rows whose index label is in `random_preds`,
# 0.0 for every other row.
# A single vectorised membership test replaces the per-row Python loop, which
# performed one `.loc` write for every row of the submission frame. The cast
# keeps the column as float, matching what the row-wise writes produced.
submission['Exited'] = submission.index.isin(random_preds).astype('float64')

submission.head()

Unnamed: 0,id,Exited
0,165034,0.0
1,165035,0.0
2,165036,0.0
3,165037,1.0
4,165038,0.0


In [71]:
# Write the predictions to disk; index=False keeps the DataFrame index out of the file.
submission.to_csv('data/submission.csv', index=False)

Goal:
- Predict bank customer churn

To Do:
- Train/Test split
- Review feature distributions
- Preprocessing
- Encode categorical variables
- Scale variables?
- Drop features that aren't predictive
- Label Target variable (Exited)
- AUC ROC is the evaluation metric
- Establish baseline model (avg?)
- Evaluation framework
- Using best model make predictions
- Submit predictions

Benchmark - Random guess that 1 in 5 customers will churn. Private LB: 0.50022. Public LB: 0.49376