In [None]:
import os
from pathlib import Path

import matplotlib.pyplot as plt
import mlflow
import pandas as pd
import seaborn as sns
from churn_training import evaluate_model
from churn_training import prepare_data
from churn_training import train_model
from dotenv import load_dotenv
from sklearn.model_selection import train_test_split

ModuleNotFoundError: No module named 'modeling'

In [None]:
# Name of the label column. NOTE(review): not referenced by any cell shown in
# this notebook -- confirm whether prepare_data() relies on its own constant.
TARGET_COLUMN = "Churn"
# Raw churn CSV, addressed relative to the notebook's working directory.
CUSTOMER_CHURN_DATASET = "../../../data/customer_churn_0.csv"
df = pd.read_csv(filepath_or_buffer=CUSTOMER_CHURN_DATASET)

In [None]:
# Column dtypes as inferred by read_csv (no explicit dtype mapping was passed).
df.dtypes

In [None]:
# Preview the first rows of the raw frame.
df.head()

In [None]:
# (number of rows, number of columns) of the raw frame.
df.shape

In [None]:
# Number of columns that contain at least one missing value.
int(df.isna().any().sum())

In [None]:
# Number of rows that repeat an earlier row across all columns.
df.duplicated().sum()

In [None]:
def plot_numerical_variables(numerical_df):
    """Plot a histogram with a KDE overlay for every float/int column.

    Columns are selected with ``select_dtypes(include=["float", "int"])`` and
    drawn on a grid that is three panels wide. Grid cells left over in the
    last row are hidden.

    Args:
        numerical_df: DataFrame to plot. Columns of any other dtype are
            ignored.

    Returns:
        None. The figure is rendered with ``plt.show()``. If the frame has no
        float/int columns, a short message is printed and nothing is drawn.
    """
    numerical_cols = numerical_df.select_dtypes(include=["float", "int"]).columns
    if len(numerical_cols) == 0:
        # A grid with zero rows is rejected by plt.subplots(), so stop here
        # instead of surfacing an unrelated-looking ValueError.
        print("No numerical columns to plot.")
        return

    n_cols = 3
    n_rows = (len(numerical_cols) + n_cols - 1) // n_cols  # ceiling division
    # squeeze=False always returns a 2-D array of Axes, so flatten() is valid
    # for every grid shape.
    _, axes = plt.subplots(
        n_rows, n_cols, figsize=(5 * n_cols, 4 * n_rows), squeeze=False
    )
    axes = axes.flatten()
    for ax, col in zip(axes, numerical_cols):
        sns.histplot(data=numerical_df, x=col, kde=True, ax=ax)
        ax.set_title(f"Distribution of {col}")
        ax.tick_params(axis="x", rotation=45)
    # Hide any unused subplots
    for ax in axes[len(numerical_cols) :]:
        ax.set_visible(False)
    plt.tight_layout()
    plt.show()

In [None]:
# Draw the distribution grid for the float/int columns of the raw frame.
plot_numerical_variables(df)

In [None]:
# The .env file is looked up three directories above the current working
# directory (parents[2] of the resolved cwd).
project_root = Path().resolve().parents[2]
print(project_root)
env_path = project_root / ".env"
load_dotenv(dotenv_path=env_path)

# This should be set in your .env file
MLFLOW_TRACKING_URI = os.getenv("MLFLOW_TRACKING_URI")
print(f"MLFLOW_TRACKING_URI: {MLFLOW_TRACKING_URI}")
if not MLFLOW_TRACKING_URI:
    # Covers both an unset variable (None) and an empty string.
    raise ValueError("MLFLOW_TRACKING_URI is not set. Please check your .env file.")

# Point the MLflow client at the tracking server and select the experiment
# that the following cells use.
mlflow.set_tracking_uri(MLFLOW_TRACKING_URI)
mlflow.set_experiment("mlops-churn-pipeline")

In [None]:
# prepare_data comes from churn_training and returns two objects from the raw frame.
# NOTE(review): presumably X is the feature matrix and y the churn label -- confirm in churn_training.
X, y = prepare_data(df)

In [None]:
# Shapes of the two objects returned by prepare_data.
X.shape, y.shape

In [None]:
# Hold out 20% of the rows for testing; the fixed random_state keeps the
# split identical across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [None]:
# Shapes of the four pieces produced by the train/test split.
X_train.shape, X_test.shape, y_train.shape, y_test.shape

In [None]:
# Baseline hyperparameters with a fixed seed.
base_params = dict(
    n_estimators=100,
    max_depth=3,
    learning_rate=0.1,
    random_state=42,
)
# NOTE(review): `clf` is rebound by the tuned-model cell further down, so this
# baseline fit is not the model that the evaluation cells score.
clf = train_model(X_train, y_train, base_params)

In [None]:
# To run hyperparameter tuning, first import `tune_model_with_cv` (it is not imported in the first cell), then uncomment the following line:
# clf = tune_model_with_cv(X_train, y_train)

In [None]:
# Fit the final model using the best hyperparameters found so far.
# The values below were taken from earlier tuning runs.
# Recorded X_test precision/recall/f1: 0.92 0.81 0.86
# NOTE(review): unlike base_params, no random_state is set here -- confirm
# that train_model seeds the estimator itself.
best_params_to_date = dict(
    n_estimators=352,
    learning_rate=0.07154324375438634,
    max_depth=7,
    min_child_weight=1,
    gamma=0.23500630396472585,
    subsample=0.9472361823473306,
    colsample_bytree=0.6149847610884563,
    reg_alpha=0.029080723124195962,
    reg_lambda=1.9394489642211972,
)
clf = train_model(X_train, y_train, best_params_to_date)

In [None]:
# First evaluate the tuned model on the training data to check for bias.
# The last argument is a string label for this evaluation run.
evaluate_model(clf, X_train, y_train, "X_train")

In [None]:
# Next evaluate the tuned model on the held-out test data to check for variance.
# log_model=True is passed together with the training split.
# NOTE(review): presumably the training data is used to build the logged model's
# signature / input example in MLflow -- confirm in churn_training.evaluate_model.
evaluate_model(
    clf,
    X_test,
    y_test,
    "X_test",
    log_model=True,
    log_model_X_train=X_train,
    log_model_y_train=y_train,
)