In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score

In [None]:
df = pd.read_csv("/content/Churn_Modelling.csv")

In [None]:
df = df.drop(columns=["RowNumber", "CustomerId", "Surname"])

In [None]:
df['Gender'] = LabelEncoder().fit_transform(df['Gender'])
df = pd.get_dummies(df, columns=['Geography'], drop_first=True)

In [None]:
X = df.drop("Exited", axis=1)
y = df["Exited"]

In [None]:
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

In [None]:
models = {
    "Logistic Regression": LogisticRegression(),
    "Random Forest": RandomForestClassifier(),
    "Gradient Boosting": GradientBoostingClassifier()
}

In [None]:
for name, model in models.items():
    model.fit(X_train, y_train)
    preds = model.predict(X_test)
    print(f"\n{name}")
    print("Classification Report:\n", classification_report(y_test, preds))
    print("ROC AUC Score:", roc_auc_score(y_test, preds))


Logistic Regression
Classification Report:
               precision    recall  f1-score   support

           0       0.84      0.96      0.89      1607
           1       0.59      0.22      0.33       393

    accuracy                           0.82      2000
   macro avg       0.71      0.59      0.61      2000
weighted avg       0.79      0.82      0.78      2000

ROC AUC Score: 0.5932909614583779

Random Forest
Classification Report:
               precision    recall  f1-score   support

           0       0.88      0.96      0.92      1607
           1       0.76      0.47      0.58       393

    accuracy                           0.87      2000
   macro avg       0.82      0.72      0.75      2000
weighted avg       0.86      0.87      0.85      2000

ROC AUC Score: 0.7179728953006171

Gradient Boosting
Classification Report:
               precision    recall  f1-score   support

           0       0.89      0.96      0.92      1607
           1       0.75      0.49      0.6

model fixed as Gradient Boosting

featuring part

In [None]:
# Basic preprocessing

# --- Feature Engineering ---
df['AgeGroup'] = pd.cut(df['Age'], bins=[0, 30, 45, 60, 100], labels=[0, 1, 2, 3])
df['BalanceSalaryRatio'] = df['Balance'] / (df['EstimatedSalary'] + 1)
df['TenureByAge'] = df['Tenure'] / df['Age']
df['CreditScorePerAge'] = df['CreditScore'] / df['Age']
df['IsHighValueCustomer'] = ((df['Balance'] > df['Balance'].median()) &
                             (df['EstimatedSalary'] > df['EstimatedSalary'].median())).astype(int)
df['Products_CreditCard'] = df['NumOfProducts'] * df['HasCrCard']
df['ActiveHighBalance'] = ((df['IsActiveMember'] == 1) &
                           (df['Balance'] > df['Balance'].median())).astype(int)

# Replace NaNs from division
df.fillna(0, inplace=True)

# Split features and target
X = df.drop("Exited", axis=1)
y = df["Exited"]

# Optional: scale
X_scaled = StandardScaler().fit_transform(X)


parameter tuning

In [11]:
import pandas as pd
import numpy as np
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import classification_report, roc_auc_score
from scipy.stats import randint, uniform

In [12]:
df = pd.read_csv("/content/Churn_Modelling.csv")

In [13]:
df = df.drop(columns=["RowNumber", "CustomerId", "Surname"])
df['Gender'] = LabelEncoder().fit_transform(df['Gender'])
df = pd.get_dummies(df, columns=['Geography'], drop_first=True)

In [14]:
df['AgeGroup'] = pd.cut(df['Age'], bins=[0, 30, 45, 60, 100], labels=[0, 1, 2, 3])
df['BalanceSalaryRatio'] = df['Balance'] / (df['EstimatedSalary'] + 1)
df['TenureByAge'] = df['Tenure'] / df['Age']
df['CreditScorePerAge'] = df['CreditScore'] / df['Age']
df['IsHighValueCustomer'] = ((df['Balance'] > df['Balance'].median()) &
                             (df['EstimatedSalary'] > df['EstimatedSalary'].median())).astype(int)
df['Products_CreditCard'] = df['NumOfProducts'] * df['HasCrCard']
df['ActiveHighBalance'] = ((df['IsActiveMember'] == 1) &
                           (df['Balance'] > df['Balance'].median())).astype(int)
df.fillna(0, inplace=True)


In [15]:
X = df.drop("Exited", axis=1)
y = df["Exited"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [16]:
param_dist = {
    'n_estimators': randint(50, 300),
    'learning_rate': uniform(0.01, 0.2),
    'max_depth': randint(3, 10),
    'min_samples_split': randint(2, 10),
    'min_samples_leaf': randint(1, 10),
    'subsample': uniform(0.6, 0.4)
}

In [17]:
gbc = GradientBoostingClassifier(random_state=42)
random_search = RandomizedSearchCV(gbc, param_distributions=param_dist, n_iter=50, cv=5,
                                   scoring='roc_auc', random_state=42, verbose=1, n_jobs=-1)

In [18]:
random_search.fit(X_train, y_train)

Fitting 5 folds for each of 50 candidates, totalling 250 fits


In [19]:
best_model = random_search.best_estimator_
y_pred = best_model.predict(X_test)

In [20]:

print("Best Parameters:", random_search.best_params_)
print("\nClassification Report:\n", classification_report(y_test, y_pred))
print("ROC AUC Score:", roc_auc_score(y_test, y_pred))


Best Parameters: {'learning_rate': np.float64(0.03201038490553535), 'max_depth': 3, 'min_samples_leaf': 9, 'min_samples_split': 5, 'n_estimators': 254, 'subsample': np.float64(0.9442922333025374)}

Classification Report:
               precision    recall  f1-score   support

           0       0.88      0.96      0.92      1607
           1       0.75      0.48      0.58       393

    accuracy                           0.87      2000
   macro avg       0.82      0.72      0.75      2000
weighted avg       0.86      0.87      0.85      2000

ROC AUC Score: 0.7186228823958793


saving trained model

In [None]:
import joblib

# Save the trained model
joblib.dump(best_model, "churn_model.pkl")
print("Model saved as churn_model.pkl")


Model saved as churn_model.pkl


created module

In [9]:
import pandas as pd
import joblib

class ChurnPredictor:
    def __init__(self, model_path="churn_model.pkl"):
        self.model = joblib.load(model_path)

    def preprocess(self, df):
        # Basic preprocessing
        df['Gender'] = df['Gender'].map({'Male': 1, 'Female': 0})
        df = pd.get_dummies(df, columns=['Geography'], drop_first=True)

        # Ensure all expected one-hot encoded columns are present
        for col in ['Geography_Germany', 'Geography_Spain']:
            if col not in df.columns:
                df[col] = 0

        # Feature engineering (must match training time)
        df['AgeGroup'] = pd.cut(df['Age'], bins=[0, 30, 45, 60, 100], labels=[0, 1, 2, 3])
        df['BalanceSalaryRatio'] = df['Balance'] / (df['EstimatedSalary'] + 1)
        df['TenureByAge'] = df['Tenure'] / df['Age']
        df['CreditScorePerAge'] = df['CreditScore'] / df['Age']
        df['IsHighValueCustomer'] = (
            (df['Balance'] > df['Balance'].median()) &
            (df['EstimatedSalary'] > df['EstimatedSalary'].median())
        ).astype(int)
        df['Products_CreditCard'] = df['NumOfProducts'] * df['HasCrCard']
        df['ActiveHighBalance'] = (
            (df['IsActiveMember'] == 1) &
            (df['Balance'] > df['Balance'].median())
        ).astype(int)

        df.fillna(0, inplace=True)

        # Drop columns that the model wasn’t trained with
        drop_cols = [col for col in ["RowNumber", "CustomerId", "Surname", "Exited"] if col in df.columns]
        df.drop(columns=drop_cols, inplace=True, errors='ignore')

        return df

    def predict(self, df):
        df_processed = self.preprocess(df.copy())
        predictions = self.model.predict(df_processed)
        prediction_probs = self.model.predict_proba(df_processed)[:, 1]
        return predictions, prediction_probs


In [5]:
import pandas as pd


# Load sample customer data (new or unseen)
new_data = pd.read_csv("/content/Churn_Modelling.csv")

# Create predictor instance
predictor = ChurnPredictor("churn_model.pkl")

# Predict
predictions, probs = predictor.predict(new_data)

# Output
new_data['PredictedChurn'] = predictions
new_data['ChurnProbability'] = probs
print(new_data[['PredictedChurn', 'ChurnProbability']].head())
new_data.to_csv("churn_predictions.csv", index=False)

   PredictedChurn  ChurnProbability
0               0          0.313107
1               0          0.192308
2               1          0.930990
3               0          0.064133
4               0          0.166795


In [10]:
import pandas as pd

predictor = ChurnPredictor("churn_model.pkl")

input_data = {
    "CreditScore": int(input("Enter Credit Score: ")),
    "Geography": input("Enter Geography (France/Germany/Spain): "),
    "Gender": input("Enter Gender (Male/Female): "),
    "Age": int(input("Enter Age: ")),
    "Tenure": int(input("Enter Tenure: ")),
    "Balance": float(input("Enter Balance: ")),
    "NumOfProducts": int(input("Enter Number of Products: ")),
    "HasCrCard": int(input("Has Credit Card (1 = Yes, 0 = No): ")),
    "IsActiveMember": int(input("Is Active Member (1 = Yes, 0 = No): ")),
    "EstimatedSalary": float(input("Enter Estimated Salary: "))
}

user_df = pd.DataFrame([input_data])

predictions, probs = predictor.predict(user_df)

user_df['PredictedChurn'] = predictions
user_df['ChurnProbability'] = probs
print("\nPrediction Result:")
print(user_df[['PredictedChurn', 'ChurnProbability']])


Enter Credit Score: 619
Enter Geography (France/Germany/Spain): France
Enter Gender (Male/Female): Female
Enter Age: 42
Enter Tenure: 2
Enter Balance: 0
Enter Number of Products: 1
Has Credit Card (1 = Yes, 0 = No): 0
Is Active Member (1 = Yes, 0 = No): 1
Enter Estimated Salary: 101348.88

Prediction Result:
   PredictedChurn  ChurnProbability
0               0          0.316993
