In [7]:
# Import Libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score

# Load Dataset
df = pd.read_csv("Churn_Modelling.csv")

# Feature Selection: Remove Irrelevant Columns
df = df.drop(columns=["RowNumber", "CustomerId", "Surname"])

# Encode Categorical Variables
le = LabelEncoder()
df["Gender"] = le.fit_transform(df["Gender"])  # Encode Gender
df = pd.get_dummies(df, columns=["Geography"], drop_first=True)  # One-hot encode Geography

# Feature Scaling
scaler = StandardScaler()
numerical_features = ["CreditScore", "Age", "Balance", "Tenure", "NumOfProducts"]
df[numerical_features] = scaler.fit_transform(df[numerical_features])

# Split Dataset into Features and Target
X = df.drop(columns=["Exited"])  # Features
y = df["Exited"]  # Target variable (Churn)

# Train-Test Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Logistic Regression Model
lr = LogisticRegression()
lr.fit(X_train, y_train)

# Random Forest Model
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X_train, y_train)

# Gradient Boosting Model
gb = GradientBoostingClassifier(n_estimators=100, learning_rate=0.1, random_state=42)
gb.fit(X_train, y_train)

# Evaluate Models
print("Model Performance:")
print(f"Logistic Regression Accuracy: {accuracy_score(y_test, lr.predict(X_test))}")
print(f"Random Forest Accuracy: {accuracy_score(y_test, rf.predict(X_test))}")
print(f"Gradient Boosting Accuracy: {accuracy_score(y_test, gb.predict(X_test))}")

# Feature Importance from Random Forest
importances = rf.feature_importances_
feature_importance_df = pd.DataFrame({"Feature": X.columns, "Importance": importances}).sort_values(by="Importance", ascending=False)
print("\nFeature Importance (Random Forest):")
print(feature_importance_df)

# Predict Churn for External Input
# Predict Churn for External Input
def predict_churn(input_data):
    """
    Predict whether a customer will churn based on external input.
    
    Args:
    input_data (dict): Dictionary containing customer details.
    
    Returns:
    dict: Predictions from Logistic Regression, Random Forest, and Gradient Boosting models.
    """
    # Convert input data into DataFrame
    input_df = pd.DataFrame([input_data])
    
    # Encode Categorical Variables
    input_df["Gender"] = le.transform(input_df["Gender"])  # Encode Gender
    input_df = pd.get_dummies(input_df, columns=["Geography"], drop_first=True)
    
    # Handle missing columns due to one-hot encoding
    for col in X.columns:
        if col not in input_df:
            input_df[col] = 0
    
    # Scale Numerical Features
    input_df[numerical_features] = scaler.transform(input_df[numerical_features])
    input_df = input_df[X.columns]  # Reorder columns to match training data
    
    # Make Predictions
    predictions = {
        "Logistic Regression": lr.predict(input_df)[0],
        "Random Forest": rf.predict(input_df)[0],
        "Gradient Boosting": gb.predict(input_df)[0]
    }
    
    # Translate numeric predictions into labels
    result_labels = {0: "No Churn", 1: "Churn"}
    predictions = {model: result_labels[pred] for model, pred in predictions.items()}
    
    return predictions

new_customer = {
    "CreditScore": 400,        # Low credit score
    "Geography": "Germany",   # Customer location
    "Gender": "Male",         # Male
    "Age": 45,                # Mid-aged customer
    "Tenure": 1,              # Short tenure
    "Balance": 50000,         # Moderate balance
    "NumOfProducts": 2        # Low-to-moderate product engagement
}




predictions = predict_churn(new_customer)
print("\nPredictions for External Input:")
for model, prediction in predictions.items():
    print(f"{model}: {prediction}")


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Model Performance:
Logistic Regression Accuracy: 0.812
Random Forest Accuracy: 0.866
Gradient Boosting Accuracy: 0.8675

Feature Importance (Random Forest):
              Feature  Importance
2                 Age    0.239783
8     EstimatedSalary    0.146638
0         CreditScore    0.144214
4             Balance    0.138613
5       NumOfProducts    0.130255
3              Tenure    0.082427
7      IsActiveMember    0.041110
9   Geography_Germany    0.025711
1              Gender    0.019258
6           HasCrCard    0.018722
10    Geography_Spain    0.013268

Predictions for External Input:
Logistic Regression: No Churn
Random Forest: Churn
Gradient Boosting: Churn
