### 1. Load the data
Load the training data and display the first few rows to get an overview of the dataset.

In [None]:
import pandas as pd

# Read the training set from disk into a DataFrame
df = pd.read_csv("data/train.csv")

# Preview the first 5 rows (the cell's last expression is rendered as a table)
df.head()

Unnamed: 0,id,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,0,15674932,Okwudilichukwu,668,France,Male,33.0,3,0.0,2,1.0,0.0,181449.97,0
1,1,15749177,Okwudiliolisa,627,France,Male,33.0,1,0.0,2,1.0,1.0,49503.5,0
2,2,15694510,Hsueh,678,France,Male,40.0,10,0.0,2,1.0,0.0,184866.69,0
3,3,15741417,Kao,581,France,Male,34.0,2,148882.54,1,1.0,1.0,84560.88,0
4,4,15766172,Chiemenam,716,Spain,Male,33.0,5,0.0,2,1.0,1.0,15068.83,0


### 2. Explore the Dataset
We'll check data types, look for missing values, and start understanding the structure of our dataset.

In [36]:
# Per-column overview: number of non-null entries and the column's dtype.
# The frame is indexed by column name; naming that index "Column" and then
# resetting it turns the names into a regular column.
summary_df = (
    pd.DataFrame({
        "Non-Null Count": df.count(),
        "Data Type": df.dtypes,
    })
    .rename_axis("Column")
    .reset_index()
)
summary_df


Unnamed: 0,Column,Non-Null Count,Data Type
0,id,165034,int64
1,CustomerId,165034,int64
2,Surname,165034,object
3,CreditScore,165034,int64
4,Geography,165034,object
5,Gender,165034,object
6,Age,165034,float64
7,Tenure,165034,int64
8,Balance,165034,float64
9,NumOfProducts,165034,int64


In [34]:
# Count the missing entries per column and show them as a one-column table
df.isna().sum().to_frame("Missing Values")

Unnamed: 0,Missing Values
id,0
CustomerId,0
Surname,0
CreditScore,0
Geography,0
Gender,0
Age,0
Tenure,0
Balance,0
NumOfProducts,0


In [None]:
# Frequency table for 'Geography': one row per distinct value with its count.
# Naming the index before resetting it yields the 'Geography' column directly.
df['Geography'].value_counts().rename_axis('Geography').reset_index(name='Count')

Unnamed: 0,Geography,Count
0,France,94215
1,Spain,36213
2,Germany,34606


### 3. Clean the Data
We'll drop the identifier columns that are irrelevant for modelling (`id`, `CustomerId`, `Surname`) and convert the categorical columns `Gender` and `Geography` into numeric values so the data is usable for modelling.

In [40]:
# Build the cleaned frame in one pipeline; every step returns a new DataFrame,
# so the original df is never modified.
df_clean = (
    df
    # Identifier columns are not used as model inputs
    .drop(columns=['id', 'CustomerId', 'Surname'])
    # Binary-encode 'Gender': Male -> 1, Female -> 0
    .assign(Gender=lambda frame: frame['Gender'].map({'Male': 1, 'Female': 0}))
    # One-hot encode 'Geography'; drop_first=True omits the first category so
    # the remaining dummy columns are not perfectly collinear
    .pipe(pd.get_dummies, columns=['Geography'], drop_first=True)
)

# Render the first 5 rows of the cleaned dataset as a table
df_clean.head()

Unnamed: 0,CreditScore,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited,Geography_Germany,Geography_Spain
0,668,1,33.0,3,0.0,2,1.0,0.0,181449.97,0,False,False
1,627,1,33.0,1,0.0,2,1.0,1.0,49503.5,0,False,False
2,678,1,40.0,10,0.0,2,1.0,0.0,184866.69,0,False,False
3,581,1,34.0,2,148882.54,1,1.0,1.0,84560.88,0,False,False
4,716,1,33.0,5,0.0,2,1.0,1.0,15068.83,0,False,True


### 4. Feature Engineering
To help the model, we create new features based on existing ones, like balance-to-salary ratio and a flag for having any balance.

In [44]:
# Account balance relative to estimated salary; the +1 in the denominator
# presumably guards against division by zero when EstimatedSalary is 0.
df_clean['BalanceSalaryRatio'] = df_clean['Balance'].div(df_clean['EstimatedSalary'].add(1))

In [45]:
# Integer flag: 1 when the customer holds a positive balance, otherwise 0
df_clean['HasBalance'] = df_clean['Balance'].gt(0).astype(int)

In [None]:
# Preview the engineered features next to the columns they were derived from
df_clean.loc[:, ['Balance', 'EstimatedSalary', 'BalanceSalaryRatio', 'HasBalance']].head()

CreditScore           0
Gender                0
Age                   0
Tenure                0
Balance               0
NumOfProducts         0
HasCrCard             0
IsActiveMember        0
EstimatedSalary       0
Exited                0
Geography_Germany     0
Geography_Spain       0
BalanceSalaryRatio    0
HasBalance            0
dtype: int64

In [None]:
# Count missing values per column after cleaning and feature engineering
df_clean.isna().sum()

Unnamed: 0,Balance,EstimatedSalary,BalanceSalaryRatio,HasBalance
0,0.0,181449.97,0.0,0
1,0.0,49503.5,0.0,0
2,0.0,184866.69,0.0,0
3,148882.54,84560.88,1.760634,1
4,0.0,15068.83,0.0,0


In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score
import pandas as pd

# Separate the target column ("Exited") from the predictor columns
y = df_clean["Exited"]
X = df_clean.drop(columns="Exited")

# Hold out 20% of the rows for testing; stratifying on y keeps the class
# proportions the same in both parts, and the seed makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# Baseline logistic regression fitted on the raw (unscaled) features
log_reg = LogisticRegression(max_iter=1000).fit(X_train, y_train)

# Hard class predictions, plus the predicted probability of class 1
y_pred = log_reg.predict(X_test)
y_proba = log_reg.predict_proba(X_test)[:, 1]

# Confusion matrix as a labelled table (rows = actual, columns = predicted)
print("Confusion Matrix:")
conf_matrix = pd.DataFrame(
    data=confusion_matrix(y_test, y_pred),
    columns=["Predicted 0", "Predicted 1"],
    index=["Actual 0", "Actual 1"],
)
display(conf_matrix)

# Per-class precision / recall / F1, transposed so each class is one row
print("\n Classification Report:")
report = classification_report(y_test, y_pred, output_dict=True)
report_df = pd.DataFrame(report).transpose()
display(report_df.round(2))

# ROC AUC is computed from the class-1 probabilities, not the hard labels
roc_score = roc_auc_score(y_test, y_proba)
print(f"\n ROC AUC Score: {roc_score:.4f}")


📊 Confusion Matrix:


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Unnamed: 0,Predicted 0,Predicted 1
Actual 0,24829,1194
Actual 1,4297,2687



📋 Classification Report:


Unnamed: 0,precision,recall,f1-score,support
0,0.85,0.95,0.9,26023.0
1,0.69,0.38,0.49,6984.0
accuracy,0.83,0.83,0.83,0.83
macro avg,0.77,0.67,0.7,33007.0
weighted avg,0.82,0.83,0.81,33007.0



🔍 ROC AUC Score: 0.8095


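### 6. Feature Scaling

To help the logistic regression model perform better, we standardise the features so they are on a similar scale. This helps the optimiser converge (the unscaled model above stopped at the iteration limit) and keeps features with large raw values, such as `Balance` and `EstimatedSalary`, from dominating the others.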


In [None]:
from sklearn.preprocessing import StandardScaler

# Learn the scaling statistics from the training features only, so nothing
# about the test set leaks into preprocessing
scaler = StandardScaler().fit(X_train)

# Apply the same fitted transformation to both splits
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

### 7. Retrain Logistic Regression with Scaled Data

We retrain the model using the scaled training set to help it converge and possibly improve performance.


In [51]:
# Refit logistic regression on the standardised features. max_iter is the same
# as for the unscaled baseline, so the change in input scaling is what differs.
log_reg_scaled = LogisticRegression(max_iter=1000, random_state=42).fit(
    X_train_scaled, y_train
)

# Hard class predictions, plus the predicted probability of class 1
y_pred_scaled = log_reg_scaled.predict(X_test_scaled)
y_proba_scaled = log_reg_scaled.predict_proba(X_test_scaled)[:, 1]

# Confusion matrix as a labelled table (rows = actual, columns = predicted)
print("Confusion Matrix:")
conf_matrix_scaled = pd.DataFrame(
    data=confusion_matrix(y_test, y_pred_scaled),
    columns=["Predicted 0", "Predicted 1"],
    index=["Actual 0", "Actual 1"],
)
display(conf_matrix_scaled)

# Per-class precision / recall / F1, transposed so each class is one row
print("\nClassification Report:")
report_scaled = classification_report(y_test, y_pred_scaled, output_dict=True)
report_scaled_df = pd.DataFrame(report_scaled).transpose()
display(report_scaled_df.round(2))

# ROC AUC is computed from the class-1 probabilities, not the hard labels
roc_score_scaled = roc_auc_score(y_test, y_proba_scaled)
print(f"\nROC AUC Score: {roc_score_scaled:.4f}")


Confusion Matrix:


Unnamed: 0,Predicted 0,Predicted 1
Actual 0,24835,1188
Actual 1,4324,2660



Classification Report:


Unnamed: 0,precision,recall,f1-score,support
0,0.85,0.95,0.9,26023.0
1,0.69,0.38,0.49,6984.0
accuracy,0.83,0.83,0.83,0.83
macro avg,0.77,0.67,0.7,33007.0
weighted avg,0.82,0.83,0.81,33007.0



ROC AUC Score: 0.8141


### 8. Train a Decision Tree Classifier

We now train a classification tree (CART) to capture more complex relationships between variables and see if it performs better than logistic regression, especially for detecting churners.


In [52]:
from sklearn.tree import DecisionTreeClassifier

# Decision tree with default (unconstrained) settings, fitted on the scaled
# features; the seed keeps the fitted tree reproducible across runs
tree_clf = DecisionTreeClassifier(random_state=42).fit(X_train_scaled, y_train)

# Hard class predictions, plus the predicted probability of class 1
y_pred_tree = tree_clf.predict(X_test_scaled)
y_proba_tree = tree_clf.predict_proba(X_test_scaled)[:, 1]

# Confusion matrix as a labelled table (rows = actual, columns = predicted)
print("Confusion Matrix:")
conf_matrix_tree = pd.DataFrame(
    data=confusion_matrix(y_test, y_pred_tree),
    columns=["Predicted 0", "Predicted 1"],
    index=["Actual 0", "Actual 1"],
)
display(conf_matrix_tree)

# Per-class precision / recall / F1, transposed so each class is one row
print("\nClassification Report:")
report_tree = classification_report(y_test, y_pred_tree, output_dict=True)
report_tree_df = pd.DataFrame(report_tree).transpose()
display(report_tree_df.round(2))

# ROC AUC is computed from the class-1 probabilities, not the hard labels
roc_score_tree = roc_auc_score(y_test, y_proba_tree)
print(f"\nROC AUC Score: {roc_score_tree:.4f}")


Confusion Matrix:


Unnamed: 0,Predicted 0,Predicted 1
Actual 0,22590,3433
Actual 1,3184,3800



Classification Report:


Unnamed: 0,precision,recall,f1-score,support
0,0.88,0.87,0.87,26023.0
1,0.53,0.54,0.53,6984.0
accuracy,0.8,0.8,0.8,0.8
macro avg,0.7,0.71,0.7,33007.0
weighted avg,0.8,0.8,0.8,33007.0



ROC AUC Score: 0.7063


### 9. Hyperparameter Tuning — Decision Tree

Decision trees can easily overfit. We’ll try limiting the depth of the tree and adjusting other parameters to find a better balance between bias and variance.


In [53]:
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeClassifier

# Candidate settings that limit tree growth; every combination is evaluated
param_grid = dict(
    max_depth=[3, 5, 7, 10, None],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)

# Base estimator whose hyperparameters are being searched
dt = DecisionTreeClassifier(random_state=42)

# 5-fold cross-validated grid search scored by ROC AUC, using all CPU cores
grid_search = GridSearchCV(
    estimator=dt,
    param_grid=param_grid,
    scoring='roc_auc',
    cv=5,
    n_jobs=-1,
)

# Run the search on the (unscaled) training split only
grid_search.fit(X_train, y_train)

# Estimator refit with the best-scoring parameter combination
best_dt = grid_search.best_estimator_

# Evaluate the tuned tree on the held-out test split
y_pred_dt_tuned = best_dt.predict(X_test)
y_proba_dt_tuned = best_dt.predict_proba(X_test)[:, 1]

# Confusion matrix as a labelled table (rows = actual, columns = predicted)
print("Confusion Matrix:")
conf_matrix_dt = pd.DataFrame(
    data=confusion_matrix(y_test, y_pred_dt_tuned),
    columns=["Predicted 0", "Predicted 1"],
    index=["Actual 0", "Actual 1"],
)
display(conf_matrix_dt)

# Per-class precision / recall / F1, transposed so each class is one row
print("\nClassification Report:")
report_dt = classification_report(y_test, y_pred_dt_tuned, output_dict=True)
report_dt_df = pd.DataFrame(report_dt).transpose()
display(report_dt_df.round(2))

# ROC AUC is computed from the class-1 probabilities, not the hard labels
roc_dt_score = roc_auc_score(y_test, y_proba_dt_tuned)
print(f"\nROC AUC Score: {roc_dt_score:.4f}")


Confusion Matrix:


Unnamed: 0,Predicted 0,Predicted 1
Actual 0,24727,1296
Actual 1,3244,3740



Classification Report:


Unnamed: 0,precision,recall,f1-score,support
0,0.88,0.95,0.92,26023.0
1,0.74,0.54,0.62,6984.0
accuracy,0.86,0.86,0.86,0.86
macro avg,0.81,0.74,0.77,33007.0
weighted avg,0.85,0.86,0.85,33007.0



ROC AUC Score: 0.8840


### 10. Random Forest — Baseline

We train a Random Forest to reduce overfitting by combining multiple decision trees with bagging and random feature selection.

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Random forest with default settings, fitted on the unscaled training split;
# the seed keeps the fitted ensemble reproducible across runs
rf_clf = RandomForestClassifier(random_state=42).fit(X_train, y_train)

# Hard class predictions, plus the predicted probability of class 1
y_pred_rf = rf_clf.predict(X_test)
y_proba_rf = rf_clf.predict_proba(X_test)[:, 1]

# Confusion matrix as a labelled table (rows = actual, columns = predicted)
print("Confusion Matrix:")
conf_matrix_rf = pd.DataFrame(
    data=confusion_matrix(y_test, y_pred_rf),
    columns=["Predicted 0", "Predicted 1"],
    index=["Actual 0", "Actual 1"],
)
display(conf_matrix_rf)

# Per-class precision / recall / F1, transposed so each class is one row
print("\nClassification Report:")
report_rf = classification_report(y_test, y_pred_rf, output_dict=True)
report_rf_df = pd.DataFrame(report_rf).transpose()
display(report_rf_df.round(2))

# ROC AUC is computed from the class-1 probabilities, not the hard labels
roc_rf_score = roc_auc_score(y_test, y_proba_rf)
print(f"\nROC AUC Score: {roc_rf_score:.4f}")


Confusion Matrix:


Unnamed: 0,Predicted 0,Predicted 1
Actual 0,24617,1406
Actual 1,3265,3719



Classification Report:


Unnamed: 0,precision,recall,f1-score,support
0,0.88,0.95,0.91,26023.0
1,0.73,0.53,0.61,6984.0
accuracy,0.86,0.86,0.86,0.86
macro avg,0.8,0.74,0.76,33007.0
weighted avg,0.85,0.86,0.85,33007.0



ROC AUC Score: 0.8726


### 11. AdaBoost — Baseline

We use AdaBoost, which sequentially combines weak learners, to try to improve performance on difficult samples, especially churners.

In [56]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier

# Weak learner: a depth-1 tree (a single split, i.e. a decision stump)
base_estimator = DecisionTreeClassifier(max_depth=1, random_state=42)

# Boosted ensemble of 100 stumps
ada_clf = AdaBoostClassifier(
    estimator=base_estimator,
    n_estimators=100,
    learning_rate=1.0,
    random_state=42,
)

# Fit on the unscaled training split
ada_clf.fit(X_train, y_train)

# Hard class predictions, plus the predicted probability of class 1
y_pred_ada = ada_clf.predict(X_test)
y_proba_ada = ada_clf.predict_proba(X_test)[:, 1]

# Confusion matrix as a labelled table (rows = actual, columns = predicted)
print("Confusion Matrix:")
conf_matrix_ada = pd.DataFrame(
    data=confusion_matrix(y_test, y_pred_ada),
    columns=["Predicted 0", "Predicted 1"],
    index=["Actual 0", "Actual 1"],
)
display(conf_matrix_ada)

# Per-class precision / recall / F1, transposed so each class is one row
print("\nClassification Report:")
report_ada = classification_report(y_test, y_pred_ada, output_dict=True)
report_ada_df = pd.DataFrame(report_ada).transpose()
display(report_ada_df.round(2))

# ROC AUC is computed from the class-1 probabilities, not the hard labels
roc_ada_score = roc_auc_score(y_test, y_proba_ada)
print(f"\nROC AUC Score: {roc_ada_score:.4f}")


Confusion Matrix:


Unnamed: 0,Predicted 0,Predicted 1
Actual 0,24746,1277
Actual 1,3387,3597



Classification Report:


Unnamed: 0,precision,recall,f1-score,support
0,0.88,0.95,0.91,26023.0
1,0.74,0.52,0.61,6984.0
accuracy,0.86,0.86,0.86,0.86
macro avg,0.81,0.73,0.76,33007.0
weighted avg,0.85,0.86,0.85,33007.0



ROC AUC Score: 0.8777


In [58]:
from catboost import CatBoostClassifier
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score

# Configure the CatBoost classifier
cat_clf = CatBoostClassifier(
    iterations=100,            # number of boosting rounds
    depth=6,
    learning_rate=0.1,
    loss_function='Logloss',
    eval_metric='AUC',
    random_seed=42,
    verbose=0,                 # suppress training output
)

# Fit on the unscaled training split
cat_clf.fit(X_train, y_train)

# Hard class predictions, plus the predicted probability of class 1
y_pred_cat = cat_clf.predict(X_test)
y_proba_cat = cat_clf.predict_proba(X_test)[:, 1]

# Confusion matrix as a labelled table (rows = actual, columns = predicted)
print("Confusion Matrix:")
conf_matrix_cat = pd.DataFrame(
    data=confusion_matrix(y_test, y_pred_cat),
    columns=["Predicted 0", "Predicted 1"],
    index=["Actual 0", "Actual 1"],
)
display(conf_matrix_cat)

# Per-class precision / recall / F1, transposed so each class is one row
print("\nClassification Report:")
report_cat = classification_report(y_test, y_pred_cat, output_dict=True)
report_cat_df = pd.DataFrame(report_cat).transpose()
display(report_cat_df.round(2))

# ROC AUC is computed from the class-1 probabilities, not the hard labels
roc_cat_score = roc_auc_score(y_test, y_proba_cat)
print(f"\nROC AUC Score: {roc_cat_score:.4f}")

Confusion Matrix:


Unnamed: 0,Predicted 0,Predicted 1
Actual 0,24792,1231
Actual 1,3241,3743



Classification Report:


Unnamed: 0,precision,recall,f1-score,support
0,0.88,0.95,0.92,26023.0
1,0.75,0.54,0.63,6984.0
accuracy,0.86,0.86,0.86,0.86
macro avg,0.82,0.74,0.77,33007.0
weighted avg,0.86,0.86,0.86,33007.0



ROC AUC Score: 0.8890
