# Customer Churn Prediction - EDA, Preprocessing & Model Evaluation

In [1]:
# Import libraries
from pathlib import Path

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, precision_score, recall_score, f1_score, roc_curve, auc

## Load and explore data

In [2]:
# Read the customer churn CSV into `df` and preview its first rows.
churn_csv_path = "data/WA_Fn-UseC_-Telco-Customer-Churn.csv"
df = pd.read_csv(churn_csv_path)
df.head()

## Data Cleaning & Preprocessing

In [3]:
# TODO: preprocessing is not implemented in this cell yet.
# Planned steps: drop irrelevant columns, handle missing values,
# and encode the target and categorical columns as needed.

## Model Training and Evaluation

In [4]:
# Prepare features, split the data, fit both classifiers and collect their
# test-set predictions and positive-class probabilities.

# Work on a copy so `df` keeps the raw data and this cell can be re-run safely.
model_df = df.copy()

# NOTE(review): the next two steps assume the standard Telco churn schema
# (a `customerID` row identifier and a text-typed `TotalCharges` column) --
# confirm against the loaded file. Both are no-ops when the column is absent.
model_df = model_df.drop(columns=['customerID'], errors='ignore')
if 'TotalCharges' in model_df.columns:
    # Non-numeric entries (e.g. blanks) become NaN and are dropped just below.
    model_df['TotalCharges'] = pd.to_numeric(model_df['TotalCharges'], errors='coerce')
model_df = model_df.dropna()

# Train-test split
# scikit-learn estimators need numeric features, so one-hot encode every
# remaining text column. The target keeps its original labels because the
# evaluation cells compare against 'Yes'/'No'.
X = pd.get_dummies(model_df.drop('Churn', axis=1), drop_first=True)
y = model_df['Churn']
# Stratify so train and test keep the same churn rate as the full data set.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)

# Train models
log_model = LogisticRegression(max_iter=1000)
# Seed the forest so repeated runs produce the same metrics.
rf_model = RandomForestClassifier(random_state=42)
log_model.fit(X_train, y_train)
rf_model.fit(X_train, y_train)

# Predictions
log_preds = log_model.predict(X_test)
rf_preds = rf_model.predict(X_test)
# Column 1 of predict_proba is the second entry of `classes_`; for a
# 'No'/'Yes' target that is the 'Yes' (churn) class.
log_probs = log_model.predict_proba(X_test)[:, 1]
rf_probs = rf_model.predict_proba(X_test)[:, 1]

## Classification Reports

In [5]:
# Print each model's per-class precision/recall/F1 report under its heading.
reports = (
    ("Logistic Regression Classification Report:", log_preds),
    ("Random Forest Classification Report:", rf_preds),
)
for heading, predictions in reports:
    print(heading)
    print(classification_report(y_test, predictions))

## ROC Curves

In [6]:
# Convert target to binary if needed
# roc_curve is given 0/1 labels so the positive class is unambiguous; any
# label other than 'No'/'Yes' maps to NaN.
y_test_bin = y_test.map({'No': 0, 'Yes': 1})

# ROC curves
fpr_log, tpr_log, _ = roc_curve(y_test_bin, log_probs)
fpr_rf, tpr_rf, _ = roc_curve(y_test_bin, rf_probs)

# Explicit figure/axes handles keep the plot independent of pyplot's
# "current figure" state.
fig, ax = plt.subplots(figsize=(8, 6))
ax.plot(fpr_log, tpr_log, label='Logistic Regression')
ax.plot(fpr_rf, tpr_rf, label='Random Forest')
ax.plot([0, 1], [0, 1], 'k--')  # chance-level diagonal for reference
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.set_title('ROC Curve')
ax.legend()

# savefig does not create missing directories, so make sure the target
# folder exists before writing (otherwise a fresh checkout fails here).
roc_path = Path("outputs/charts/roc_curve.png")
roc_path.parent.mkdir(parents=True, exist_ok=True)
fig.savefig(roc_path)
plt.show()

## Save Model Comparison as CSV

In [7]:
# Save model metrics to CSV
# Score against the original string labels with an explicit positive class.
# This yields the same values as comparing 0/1 labels with `preds == 'Yes'`,
# but does not rely on `y_test_bin` having been created by the ROC cell.
model_preds = {'Logistic Regression': log_preds, 'Random Forest': rf_preds}
metrics_df = pd.DataFrame([
    {
        'Model': model_name,
        'Accuracy': accuracy_score(y_test, preds),
        'Precision': precision_score(y_test, preds, pos_label='Yes'),
        'Recall': recall_score(y_test, preds, pos_label='Yes'),
        'F1 Score': f1_score(y_test, preds, pos_label='Yes'),
    }
    for model_name, preds in model_preds.items()
])

# to_csv does not create missing directories; ensure the folder exists first.
metrics_path = Path("outputs/model_comparison.csv")
metrics_path.parent.mkdir(parents=True, exist_ok=True)
metrics_df.to_csv(metrics_path, index=False)
metrics_df