In [None]:
# Mount Google Drive into the Colab filesystem so files stored on Drive can be
# read through ordinary paths under the mount point.
from google.colab import drive

drive_mount_point = '/content/drive'
drive.mount(drive_mount_point)

In [None]:
# -----------------------------
# 1️⃣ GitHub Credentials & Repo
# -----------------------------
import os
from getpass import getpass

USERNAME = 'mule1993'            # GitHub username
REPO_NAME = 'customer-churn-ml-engineering'

# SECURITY: a personal access token must never be written into the notebook.
# The PAT that was previously hard-coded on this line has been exposed to
# everyone with access to this file and should be revoked in GitHub settings.
# The token is now taken from the GITHUB_TOKEN environment variable; when that
# is unset (or empty) the user is asked for it through a hidden prompt, so the
# value never appears in the notebook source or its saved output.
TOKEN = os.environ.get('GITHUB_TOKEN') or getpass('GitHub personal access token: ')

# Authenticated HTTPS remote and the local checkout location.
REPO_URL = f'https://{USERNAME}:{TOKEN}@github.com/{USERNAME}/{REPO_NAME}.git'
REPO_PATH = f'/content/{REPO_NAME}'

# -----------------------------
# 2️⃣ Clone or update repo
# -----------------------------
import os
if not os.path.exists(REPO_PATH):
    !git clone $REPO_URL $REPO_PATH
else:
    %cd $REPO_PATH
    !git pull

# -----------------------------
# 3️⃣ Change working directory to repo root
# -----------------------------
%cd $REPO_PATH

# -----------------------------
# 4️⃣ Ensure __init__.py exists in all src folders
# -----------------------------
# Each listed folder (relative to the repo root) gets an empty __init__.py
# unless one is already present; existing files are left untouched.
folders = ['src', 'src/data', 'src/features', 'src/models', 'src/utils']
for package_dir in folders:
    marker = os.path.join(REPO_PATH, package_dir, '__init__.py')
    if os.path.exists(marker):
        continue
    # Opening in append mode creates the file without truncating anything.
    with open(marker, 'a'):
        pass

# -----------------------------
# 5️⃣ Add repo root to Python path
# -----------------------------
import sys

# Register the repo root once, at the front of the search path, so that the
# `src` package from this checkout is found before anything else.
if sys.path.count(REPO_PATH) == 0:
    sys.path.insert(0, REPO_PATH)

# -----------------------------
# 6️⃣ Verify imports
# -----------------------------
# Smoke test: import every project entry point the notebook relies on. A
# missing module is reported and the cell carries on; any other error raised
# during import still propagates.
try:
    from src.data.loader import load_csv
    from src.features.preprocess import prepare_features
    from src.models.train import train_model
    from src.models.evaluate import evaluate_model
    from src.models.predict import (
        load_artifacts,
        predict,
    )
except ModuleNotFoundError as import_error:
    print("❌ Module import failed:", import_error)
else:
    print("✅ All modules imported successfully")


In [None]:

# ===============================
# Phase 1: Exploration & Baseline Modeling
# Project: Customer Churn Prediction
# ===============================

# -------------------------------
# 1. Setup / Imports
# -------------------------------
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, roc_auc_score, confusion_matrix

import sys

# Project code is imported through the `src` package rather than by appending
# every sub-folder to sys.path and importing flat module names. The flat form
# had two problems:
#   * the folders were appended to the END of sys.path, so any installed
#     package called `evaluate`, `train`, `paths`, ... would be imported
#     instead of the project file;
#   * the same source file ended up loaded twice under two different module
#     names (e.g. `paths` here and `src.utils.paths` in other cells).
# NOTE(review): this assumes the project modules do not themselves rely on
# flat imports of their siblings — confirm against the repo.
REPO_ROOT = '/content/customer-churn-ml-engineering'
if REPO_ROOT not in sys.path:
    sys.path.insert(0, REPO_ROOT)

# The module objects are still bound under their previous short names so that
# any cell referring to e.g. `loader.<something>` keeps working.
from src.data import loader
from src.data.loader import load_csv

from src.features import preprocess
from src.features.preprocess import prepare_features

from src.models import train
from src.models.train import train_model
from src.models import evaluate
from src.models.evaluate import evaluate_model

import joblib

from src.utils import paths
from src.utils.paths import MODELS_DIR

import warnings
# Silences every warning for the rest of the session.
warnings.filterwarnings('ignore')

# -------------------------------
# 2. Mount Google Drive
# -------------------------------
# Drive is mounted by the first cell of the notebook (drive.mount), so nothing
# needs to be done here beyond pointing at the file.

# Update path according to your Drive folder
csv_path = "/content/drive/MyDrive/customer-churn-ml-engineering/WA_Fn-UseC_-Telco-Customer-Churn.csv"

# -------------------------------
# 3. Load Dataset
# -------------------------------
from IPython.display import display

df = load_csv(csv_path)
# Only the last expression of a cell is rendered automatically; a bare
# `df.head()` in the middle of the cell is evaluated and thrown away, so the
# preview has to be displayed explicitly.
display(df.head())

# -------------------------------
# 4. Exploratory Data Analysis (EDA)
# -------------------------------

# Basic info
# DataFrame.info() writes its report to stdout itself and returns None, so
# wrapping it in print() only appended a stray "None" line to the output.
df.info()
print(df.describe())

# Missing values (count of nulls per column)
print(df.isnull().sum())

# Class balance (relative frequency of each Churn value)
print(df['Churn'].value_counts(normalize=True))

# Visualizations
sns.countplot(x='Churn', data=df)
plt.show()

# Example: Histogram of numeric columns
numeric_cols = df.select_dtypes(include=np.number).columns.tolist()
df[numeric_cols].hist(bins=20, figsize=(12, 8))
plt.show()

# -------------------------------
# 5. Preprocessing / Feature Engineering
# -------------------------------

# prepare_features yields the feature matrix, the target and the fitted
# preprocessing artifacts; the artifacts are saved for reuse at prediction time.
X, y, artifacts = prepare_features(df)

preprocess_artifacts_path = MODELS_DIR / "preprocess_artifacts.joblib"
joblib.dump(artifacts, preprocess_artifacts_path)

# Alternative destination on Drive (not used):
#   /content/drive/MyDrive/ml_artifacts/preprocess_artifacts.joblib

# Train/test split: 20% held out, fixed seed, class proportions of y preserved.
split_options = dict(test_size=0.2, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)


# -------------------------------
# 6. Baseline Models
# -------------------------------
# Training module followed by evaluation module.
# (The original notes label this baseline as XGBoost.)

model = train_model(X_train, y_train, MODELS_DIR)

# Persist the fitted MODEL under the filename the prediction cell loads as the
# model. This line previously dumped `artifacts` (the preprocessing objects),
# so "churn_model.joblib" did not contain a model at all.
joblib.dump(model, MODELS_DIR / "churn_model.joblib")

# Hold-out evaluation; the metrics mapping exposes "roc_auc" and "report".
metrics = evaluate_model(model, X_test, y_test)
print(metrics["roc_auc"])
print(metrics["report"])



# Class predictions for the held-out split.
y_pred_xgb = model.predict(X_test)

# Confusion matrix example: counts of actual vs. predicted labels, drawn as an
# annotated heatmap with integer cell labels.
cm = confusion_matrix(y_test, y_pred_xgb)
heatmap_ax = sns.heatmap(cm, annot=True, fmt='d')
heatmap_ax.set_xlabel("Predicted")
heatmap_ax.set_ylabel("Actual")
plt.show()

# -------------------------------
# 7. Notes & Next Steps
# -------------------------------
# - Check feature importance
# - Handle class imbalance if needed
# - Prepare pipeline for Phase 2 (production-ready code)


In [None]:
from src.models.predict import load_artifacts, predict
from src.utils.paths import MODELS_DIR

# Reload the persisted model and preprocessing artifacts from MODELS_DIR.
model_file = MODELS_DIR / "churn_model.joblib"
artifacts_file = MODELS_DIR / "preprocess_artifacts.joblib"
model, artifacts = load_artifacts(model_file, artifacts_file)

# Score a reproducible 5-row sample of the frame loaded earlier in the notebook.
sample = df.sample(n=5, random_state=42)
preds = predict(sample, model, artifacts)

preds
