In [28]:
# Cell 1: Import Libraries
import pandas as pd
import joblib
from xgboost import XGBClassifier
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.metrics import classification_report, roc_auc_score
from imblearn.over_sampling import SMOTE

# For reproducibility
import numpy as np
np.random.seed(42)


In [29]:
# Cell 2: Load Preprocessed & Feature-Selected Data
# Make sure you have run your merged data_preprocessing.py before this notebook
X_train = pd.read_csv("../data/processed/X_train_selected.csv")
X_test  = pd.read_csv("../data/processed/X_test_selected.csv")
y_train = pd.read_csv("../data/processed/y_train.csv").squeeze()
y_test  = pd.read_csv("../data/processed/y_test.csv").squeeze()

print("X_train shape:", X_train.shape)
print("X_test shape:", X_test.shape)
print("y_train distribution:")
print(y_train.value_counts())


X_train shape: (5634, 10)
X_test shape: (1409, 10)
y_train distribution:
Churn
0    4139
1    1495
Name: count, dtype: int64


In [30]:
# Cell 3: Hyperparameter Tuning with XGBoost
# Create a pipeline using XGBClassifier
pipeline = Pipeline([
    ('model', XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42))
])

# Define the hyperparameter grid
param_grid = {
    'model__n_estimators': [50, 100, 200],
    'model__max_depth': [3, 5, 7],
    'model__learning_rate': [0.01, 0.1, 0.2]
}

# Set up grid search (using F1 score for evaluation)
grid = GridSearchCV(pipeline, param_grid, cv=5, scoring='f1', n_jobs=-1)
grid.fit(X_train, y_train)

print("Best hyperparameters (XGBoost):", grid.best_params_)

# Save the best estimator (optional)
joblib.dump(grid.best_estimator_, '../models/best_model_xgb.pkl')


Best hyperparameters (XGBoost): {'model__learning_rate': 0.2, 'model__max_depth': 5, 'model__n_estimators': 50}


Parameters: { "use_label_encoder" } are not used.



['../models/best_model_xgb.pkl']

In [31]:
# Cell 4: Handle Class Imbalance with SMOTE and Train Final Model
# Apply SMOTE to training data to handle class imbalance
smote = SMOTE(random_state=42)
X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)

# Load best estimator from hyperparameter tuning (or use grid.best_estimator_)
model = joblib.load('../models/best_model_xgb.pkl')
model.fit(X_train_smote, y_train_smote)

# Evaluate the model on test data
y_pred = model.predict(X_test)
print("Classification Report (XGBoost):")
print(classification_report(y_test, y_pred))
print(f"ROC AUC: {roc_auc_score(y_test, y_pred):.4f}")

# Save the final model
joblib.dump(model, '../models/final_model_xgb.pkl')
print("Final XGBoost model saved as 'models/final_model_xgb.pkl'")


Classification Report (XGBoost):
              precision    recall  f1-score   support

           0       0.91      0.70      0.79      1035
           1       0.49      0.80      0.61       374

    accuracy                           0.73      1409
   macro avg       0.70      0.75      0.70      1409
weighted avg       0.80      0.73      0.74      1409

ROC AUC: 0.7523
Final XGBoost model saved as 'models/final_model_xgb.pkl'


Parameters: { "use_label_encoder" } are not used.



In [32]:
import pandas as pd
import numpy as np
import pickle
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# 🚀 Load dataset
df = pd.read_csv("../data/raw/churn.csv")  # Make sure this file exists in your directory

# 🔍 Check data types (to confirm categorical variables)
print("Data Types:\n", df.dtypes)

# ✅ Convert categorical variables using One-Hot Encoding
df = pd.get_dummies(df, drop_first=True)

# 🎯 Define Features & Target
TARGET = "Churn_Yes"  # After one-hot encoding, "Churn" becomes "Churn_Yes"
FEATURES = [col for col in df.columns if col != TARGET]  # Select all columns except target

X = df[FEATURES]
y = df[TARGET]

# 🔀 Train-Test Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# 📏 Scale the numerical features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# 🌲 Train a RandomForest model
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train_scaled, y_train)

# 📈 Evaluate the model
y_pred = model.predict(X_test_scaled)
accuracy = accuracy_score(y_test, y_pred)
print(f"Model Accuracy: {accuracy:.2f}")

# 💾 Save the model & scaler
with open("model.pkl", "wb") as model_file:
    pickle.dump(model, model_file)

with open("scaler.pkl", "wb") as scaler_file:
    pickle.dump(scaler, scaler_file)

print("✅ Model & Scaler saved successfully!")


Data Types:
 customerID           object
gender               object
SeniorCitizen         int64
Partner              object
Dependents           object
tenure                int64
PhoneService         object
MultipleLines        object
InternetService      object
OnlineSecurity       object
OnlineBackup         object
DeviceProtection     object
TechSupport          object
StreamingTV          object
StreamingMovies      object
Contract             object
PaperlessBilling     object
PaymentMethod        object
MonthlyCharges      float64
TotalCharges         object
Churn                object
dtype: object
Model Accuracy: 0.80
✅ Model & Scaler saved successfully!
