In [2]:
# Importing necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, roc_auc_score
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, TransformerMixin

In [3]:
# Load the dataset
data = pd.read_csv('/content/Churn_Modelling.csv')

In [4]:
# Drop unnecessary columns
data = data.drop(columns=['RowNumber', 'CustomerId', 'Surname'])

In [5]:
# Encode categorical variables
label_encoder = LabelEncoder()
data['Gender'] = label_encoder.fit_transform(data['Gender'])
data = pd.get_dummies(data, columns=['Geography'], drop_first=True)

In [6]:
# Splitting the dataset into features and target variable
X = data.drop(columns=['Exited'])
y = data['Exited']

In [7]:
# Splitting the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [8]:
# Feature scaling
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [9]:
# Custom transformer for selecting specific columns
class ColumnSelector(BaseEstimator, TransformerMixin):
    def __init__(self, columns):
        self.columns = columns

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        return X[self.columns]

In [10]:
# Define pipelines for different models
logistic_regression_pipe = Pipeline([
    ('selector', ColumnSelector(columns=['Age', 'Balance', 'CreditScore', 'EstimatedSalary', 'Gender',
                                         'NumOfProducts', 'Tenure', 'HasCrCard', 'IsActiveMember',
                                         'Geography_Germany'])),
    ('scaler', StandardScaler()),
    ('clf', LogisticRegression())
])

random_forest_pipe = Pipeline([
    ('selector', ColumnSelector(columns=['Age', 'Balance', 'CreditScore', 'EstimatedSalary', 'Gender',
                                         'NumOfProducts', 'Tenure', 'HasCrCard', 'IsActiveMember',
                                         'Geography_Germany'])),
    ('scaler', StandardScaler()),
    ('clf', RandomForestClassifier())
])

gradient_boosting_pipe = Pipeline([
    ('selector', ColumnSelector(columns=['Age', 'Balance', 'CreditScore', 'EstimatedSalary', 'Gender',
                                         'NumOfProducts', 'Tenure', 'HasCrCard', 'IsActiveMember',
                                         'Geography_Germany'])),
    ('scaler', StandardScaler()),
    ('clf', GradientBoostingClassifier())
])

In [11]:
# Hyperparameter tuning using GridSearchCV
param_grid_logistic = {
    'clf__penalty': ['l2'],
    'clf__C': [0.001, 0.01, 0.1, 1, 10, 100]
}

param_grid_rf = {
    'clf__n_estimators': [100, 200, 300],
    'clf__max_depth': [None, 10, 20],
    'clf__min_samples_split': [2, 5, 10],
    'clf__min_samples_leaf': [1, 2, 4]
}

param_grid_gb = {
    'clf__n_estimators': [100, 200, 300],
    'clf__learning_rate': [0.01, 0.1, 0.5],
    'clf__max_depth': [3, 5, 7],
    'clf__min_samples_split': [2, 5, 10]
}


In [12]:
grid_search_logistic = GridSearchCV(logistic_regression_pipe, param_grid_logistic, cv=5, scoring='accuracy')
grid_search_logistic.fit(X_train, y_train)
best_logistic_model = grid_search_logistic.best_estimator_

In [13]:
# GridSearchCV for Random Forest
grid_search_rf = GridSearchCV(random_forest_pipe, param_grid_rf, cv=5, scoring='accuracy')
grid_search_rf.fit(X_train, y_train)
best_rf_model = grid_search_rf.best_estimator_

In [14]:
# GridSearchCV for Gradient Boosting
grid_search_gb = GridSearchCV(gradient_boosting_pipe, param_grid_gb, cv=5, scoring='accuracy')
grid_search_gb.fit(X_train, y_train)
best_gb_model = grid_search_gb.best_estimator_

In [15]:
# Ensemble of best models
ensemble_model = VotingClassifier(estimators=[
    ('logistic', best_logistic_model),
    ('random_forest', best_rf_model),
    ('gradient_boosting', best_gb_model)
], voting='soft')

ensemble_model.fit(X_train, y_train)

In [16]:
# Predictions
y_pred_ensemble = ensemble_model.predict(X_test)


In [17]:

# Model evaluation
print("Ensemble Model:")
print("Accuracy:", accuracy_score(y_test, y_pred_ensemble))
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred_ensemble))
print("Classification Report:")
print(classification_report(y_test, y_pred_ensemble))
print("ROC AUC Score:", roc_auc_score(y_test, y_pred_ensemble))

Ensemble Model:
Accuracy: 0.863
Confusion Matrix:
[[1556   51]
 [ 223  170]]
Classification Report:
              precision    recall  f1-score   support

           0       0.87      0.97      0.92      1607
           1       0.77      0.43      0.55       393

    accuracy                           0.86      2000
   macro avg       0.82      0.70      0.74      2000
weighted avg       0.85      0.86      0.85      2000

ROC AUC Score: 0.7004169101149392
