# In [2]:
# ===============================
# Phase 1: Exploration & Baseline Modeling
# Project: Customer Churn Prediction
# ===============================

# -------------------------------
# 1. Setup / Imports
# -------------------------------
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, roc_auc_score, confusion_matrix

import warnings
warnings.filterwarnings('ignore')

# -------------------------------
# 2. Mount Google Drive
# -------------------------------
# Colab-only step: mount Google Drive at /content/drive so that files stored
# on Drive can be read through ordinary filesystem paths.
from google.colab import drive
drive.mount('/content/drive')

# Path of the Telco customer-churn CSV on the mounted Drive.
# Update path according to your Drive folder
csv_path = "/content/drive/MyDrive/customer-churn-ml-engineering/WA_Fn-UseC_-Telco-Customer-Churn.csv"

# -------------------------------
# 3. Load Dataset
# -------------------------------
import sys
from pathlib import Path

# The recorded run of this cell failed with
# "ModuleNotFoundError: No module named 'src'": in a fresh Colab runtime the
# project package is not on sys.path, so put the project folder there first.
# NOTE(review): assumes the `src` package lives in the same Drive folder as
# the CSV — confirm, and adjust if the repository is cloned elsewhere.
project_root = str(Path(csv_path).parent)
if project_root not in sys.path:
    sys.path.insert(0, project_root)

from src.data.loader import load_csv

# Read the raw churn dataset via the project loader.
df = load_csv(csv_path)

# A bare `df.head()` that is not the last expression of a notebook cell is
# never rendered, so print the preview explicitly.
print(df.head())

# -------------------------------
# 4. Exploratory Data Analysis (EDA)
# -------------------------------

# Basic info
# DataFrame.info() writes its summary directly to stdout and returns None,
# so wrapping it in print() would add a stray "None" line to the output.
df.info()
print(df.describe())

# Missing values (count of nulls per column)
print(df.isnull().sum())

# Class balance (share of each 'Churn' value)
print(df['Churn'].value_counts(normalize=True))

# Visualizations: bar chart of the target distribution
sns.countplot(x='Churn', data=df)
plt.show()

# Example: Histogram of numeric columns
numeric_cols = df.select_dtypes(include=np.number).columns.tolist()
df[numeric_cols].hist(bins=20, figsize=(12, 8))
plt.show()

# -------------------------------
# 5. Preprocessing / Feature Engineering
# -------------------------------

# Imports used only by the preprocessing step.
import joblib
from src.features.preprocess import prepare_features
from src.utils.paths import MODELS_DIR

# Turn the raw frame into model inputs (X), target (y) and the preprocessing
# artifacts returned alongside them.
X, y, artifacts = prepare_features(df)

# Persist the artifacts under the project's models directory.
artifacts_path = MODELS_DIR / "preprocess_artifacts.joblib"
joblib.dump(artifacts, artifacts_path)

# -------------------------------
# 6. Baseline Models
# -------------------------------

# Train/test split
# The original cell used X_train / X_test / y_train / y_test without ever
# creating them. Hold out a stratified 20% test set so both baselines are
# scored on unseen data with the same class balance as the full dataset.
# NOTE(review): assumes y holds the binary churn label produced by
# prepare_features — confirm.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

# Random Forest
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X_train, y_train)
y_pred_rf = rf.predict(X_test)
# ROC-AUC is a ranking metric: score it with the predicted probability of the
# positive class (second column of predict_proba), not with hard 0/1 labels,
# which would collapse the ROC curve to a single operating point.
y_proba_rf = rf.predict_proba(X_test)[:, 1]
print("Random Forest ROC-AUC:", roc_auc_score(y_test, y_proba_rf))

# XGBoost
xgb = XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42)
xgb.fit(X_train, y_train)
y_pred_xgb = xgb.predict(X_test)
y_proba_xgb = xgb.predict_proba(X_test)[:, 1]
print("XGBoost ROC-AUC:", roc_auc_score(y_test, y_proba_xgb))

# Confusion matrix example (uses the hard class predictions)
cm = confusion_matrix(y_test, y_pred_xgb)
sns.heatmap(cm, annot=True, fmt='d')
plt.xlabel("Predicted")
plt.ylabel("Actual")
plt.show()

# -------------------------------
# 7. Notes & Next Steps
# -------------------------------
# - Check feature importance
# - Handle class imbalance if needed
# - Prepare pipeline for Phase 2 (production-ready code)


# Cell output: Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# Cell output: ModuleNotFoundError: No module named 'src'