In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score

df = pd.read_csv("Churn_Modelling.csv")
df = df.drop(columns=["RowNumber", "CustomerId", "Surname"])

le = LabelEncoder()
df["Gender"] = le.fit_transform(df["Gender"])
df = pd.get_dummies(df, columns=["Geography"], drop_first=True)

scaler = StandardScaler()
numerical_features = ["CreditScore", "Age", "Balance", "Tenure", "NumOfProducts"]
df[numerical_features] = scaler.fit_transform(df[numerical_features])

X = df.drop(columns=["Exited"])
y = df["Exited"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

lr = LogisticRegression()
lr.fit(X_train, y_train)

rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X_train, y_train)

gb = GradientBoostingClassifier(n_estimators=100, learning_rate=0.1, random_state=42)
gb.fit(X_train, y_train)

print("Model Performance:")
print(f"Logistic Regression Accuracy: {accuracy_score(y_test, lr.predict(X_test))}")
print(f"Random Forest Accuracy: {accuracy_score(y_test, rf.predict(X_test))}")
print(f"Gradient Boosting Accuracy: {accuracy_score(y_test, gb.predict(X_test))}")

importances = rf.feature_importances_
feature_importance_df = pd.DataFrame({"Feature": X.columns, "Importance": importances}).sort_values(by="Importance", ascending=False)
print("\nFeature Importance (Random Forest):")
print(feature_importance_df)

def predict_churn(input_data):
    input_df = pd.DataFrame([input_data])
    input_df["Gender"] = le.transform(input_df["Gender"])
    input_df = pd.get_dummies(input_df, columns=["Geography"], drop_first=True)
    for col in X.columns:
        if col not in input_df:
            input_df[col] = 0
    input_df[numerical_features] = scaler.transform(input_df[numerical_features])
    input_df = input_df[X.columns]
    predictions = {
        "Logistic Regression": lr.predict(input_df)[0],
        "Random Forest": rf.predict(input_df)[0],
        "Gradient Boosting": gb.predict(input_df)[0]
    }
    result_labels = {0: "No Churn", 1: "Churn"}
    predictions = {model: result_labels[pred] for model, pred in predictions.items()}
    return predictions

new_customer = {
    "CreditScore": 400,
    "Geography": "Germany",
    "Gender": "Male",
    "Age": 45,
    "Tenure": 1,
    "Balance": 50000,
    "NumOfProducts": 2
}

predictions = predict_churn(new_customer)
print("\nPredictions for External Input:")
for model, prediction in predictions.items():
    print(f"{model}: {prediction}")


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Model Performance:
Logistic Regression Accuracy: 0.812
Random Forest Accuracy: 0.866
Gradient Boosting Accuracy: 0.8675

Feature Importance (Random Forest):
              Feature  Importance
2                 Age    0.239783
8     EstimatedSalary    0.146638
0         CreditScore    0.144214
4             Balance    0.138613
5       NumOfProducts    0.130255
3              Tenure    0.082427
7      IsActiveMember    0.041110
9   Geography_Germany    0.025711
1              Gender    0.019258
6           HasCrCard    0.018722
10    Geography_Spain    0.013268

Predictions for External Input:
Logistic Regression: No Churn
Random Forest: Churn
Gradient Boosting: Churn


In [6]:
test_inputs = [
    {
        "CreditScore": 850,
        "Geography": "France",
        "Gender": "Female",
        "Age": 30,
        "Tenure": 10,
        "Balance": 100000,
        "NumOfProducts": 1
    },
    {
        "CreditScore": 400,
        "Geography": "Germany",
        "Gender": "Male",
        "Age": 45,
        "Tenure": 1,
        "Balance": 50000,
        "NumOfProducts": 2
    },
    {
        "CreditScore": 600,
        "Geography": "Spain",
        "Gender": "Female",
        "Age": 50,
        "Tenure": 5,
        "Balance": 0,
        "NumOfProducts": 1
    },
    
    {
        "CreditScore": 300,
        "Geography": "Germany",
        "Gender": "Female",
        "Age": 60,
        "Tenure": 2,
        "Balance": 50000,
        "NumOfProducts": 2
    }
]

for i, input_data in enumerate(test_inputs):
    predictions = predict_churn(input_data)
    majority_vote = max(set(predictions.values()), key=list(predictions.values()).count)
    print(f"Test Case {i + 1}: {majority_vote}")


Test Case 1: No Churn
Test Case 2: Churn
Test Case 3: Churn
Test Case 4: Churn
