Supervised Learning – Classification

In [7]:
# =============================================
# 📊 CUSTOMER CHURN PREDICTION - CLASSIFICATION
# Logistic Regression vs Random Forest
# =============================================

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

# ==========================
# 🧾 1. LOAD DATA
# ==========================
df = pd.read_csv("customer_churn_dataset.csv")

print("✅ Dataset loaded successfully!")
print("Dataset shape:", df.shape)
print("Columns:", df.columns.tolist())

# ==========================
# 🎯 2. DEFINE TARGET VARIABLE
# ==========================
# If 'Churn' column exists, use it; otherwise look for similar ones
target_candidates = ["Churn", "Exited", "Customer Status", "Attrition_Flag"]
target_col = None

for col in target_candidates:
    if col in df.columns:
        target_col = col
        break

if target_col is None:
    raise ValueError("❌ Target column not found. Please check dataset column names.")

print(f"\n🎯 Target variable selected: {target_col}")

# Encode target variable
y = df[target_col].replace({'Yes': 1, 'No': 0, 'Exited': 1, 'Active': 0}).fillna(0)

# Drop target from feature set
X = df.drop(columns=[target_col])

# ==========================
# 🧹 3. DATA CLEANING
# ==========================
# Handle categorical features
X = pd.get_dummies(X, drop_first=True)

# Fill missing values with mean
X = X.fillna(X.mean())

print("\n✅ Data cleaned successfully!")
print("Final shape after cleaning:", X.shape)

if X.shape[0] == 0:
    raise ValueError("❌ Dataset is empty after cleaning. Check preprocessing steps!")

# ==========================
# 🧪 4. TRAIN/TEST SPLIT
# ==========================
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

print("\n📦 Training set size:", X_train.shape)
print("📦 Testing set size:", X_test.shape)

# ==========================
# 🤖 5. LOGISTIC REGRESSION
# ==========================
log_model = LogisticRegression(max_iter=1000)
log_model.fit(X_train, y_train)
log_pred = log_model.predict(X_test)
log_acc = accuracy_score(y_test, log_pred)

# ==========================
# 🌲 6. RANDOM FOREST
# ==========================
rf_model = RandomForestClassifier(random_state=42)
rf_model.fit(X_train, y_train)
rf_pred = rf_model.predict(X_test)
rf_acc = accuracy_score(y_test, rf_pred)

# ==========================
# 📈 7. RESULTS
# ==========================
print("\n=============================")
print("📊 MODEL ACCURACY COMPARISON")
print("=============================")
print(f"Logistic Regression Accuracy : {log_acc:.4f}")
print(f"Random Forest Accuracy       : {rf_acc:.4f}")

# Optional detailed report
print("\nDetailed Classification Report (Random Forest):")
print(classification_report(y_test, rf_pred))

# ==========================
# ✅ 8. CONCLUSION
# ==========================
if rf_acc > log_acc:
    print("\n✅ Random Forest performs better than Logistic Regression.")
else:
    print("\n✅ Logistic Regression performs better than Random Forest.")


✅ Dataset loaded successfully!
Dataset shape: (64374, 12)
Columns: ['CustomerID', 'Age', 'Gender', 'Tenure', 'Usage Frequency', 'Support Calls', 'Payment Delay', 'Subscription Type', 'Contract Length', 'Total Spend', 'Last Interaction', 'Churn']

🎯 Target variable selected: Churn

✅ Data cleaned successfully!
Final shape after cleaning: (64374, 13)

📦 Training set size: (51499, 13)
📦 Testing set size: (12875, 13)


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT

Increase the number of iterations to improve the convergence (max_iter=1000).
You might also want to scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(



📊 MODEL ACCURACY COMPARISON
Logistic Regression Accuracy : 0.8375
Random Forest Accuracy       : 0.9964

Detailed Classification Report (Random Forest):
              precision    recall  f1-score   support

           0       0.99      1.00      1.00      6793
           1       1.00      0.99      1.00      6082

    accuracy                           1.00     12875
   macro avg       1.00      1.00      1.00     12875
weighted avg       1.00      1.00      1.00     12875


✅ Random Forest performs better than Logistic Regression.
