In [123]:
import numpy as np
import pandas as pd

from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import GridSearchCV

In [124]:
# Fetch the California housing dataset and assemble a single frame that holds
# the feature columns plus the regression target under 'MedHouseVal'.
housing = fetch_california_housing()
df = pd.DataFrame(
    data=housing.data,
    columns=housing.feature_names,
).assign(MedHouseVal=housing.target)

In [125]:
# Feature columns excluded from modelling.
# NOTE(review): the variable name suggests these were judged weakly correlated
# with the target; the analysis behind that choice is not shown in this notebook.
lowcorricated = ['AveBedrms', 'Population', 'AveOccup', 'Longitude']

# `df` is rebound to the reduced frame, so running this cell a second time
# would look for columns that are already gone and raise KeyError.
# errors="ignore" skips labels that are no longer present, which keeps the
# cell safe to re-run without changing the first-run result.
df = df.drop(columns=lowcorricated, errors="ignore")

In [126]:
# Separate the regression target from the predictor columns.
y = df['MedHouseVal']
x = df.drop(columns=['MedHouseVal'])

In [127]:
# Hold out 20% of the rows for evaluation; the fixed seed makes the split
# repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
)

In [135]:
import mlflow
import mlflow.sklearn
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error, r2_score
from mlflow.tracking import MlflowClient
import yaml

# X_train, X_test, y_train and y_test are produced by the train/test split cell
# above; that cell must be run before this one.

# Select (or create) the MLflow experiment that every run below is logged to.
mlflow.set_experiment("Model_Comparison_Regression_MLOPs")

# Trackers for the best-scoring candidate. The R² tracker starts at -inf so
# that the first finite score always replaces it.
best_overall_model, best_overall_r2 = None, float('-inf')
best_model_name = best_run_id = best_artifact_path = ""

# --- Linear Regression ---
# Fit a plain linear regression, score it on the held-out split, and record
# the parameter, metrics and fitted model in its own MLflow run.
with mlflow.start_run(run_name="Linear_Regression") as run_lr:
    lr_model = LinearRegression().fit(X_train, y_train)
    lr_preds = lr_model.predict(X_test)

    # Cast to built-in floats so the metrics are plain Python numbers.
    r2_lr = float(r2_score(y_test, lr_preds))
    mse_lr = float(mean_squared_error(y_test, lr_preds))

    mlflow.log_param("model_type", "LinearRegression")
    mlflow.log_metric("mse", mse_lr)
    mlflow.log_metric("r2", r2_lr)

    # Log the fitted estimator and register it under its own registry name.
    artifact_path_lr = "linear_regression_model"
    mlflow.sklearn.log_model(
        sk_model=lr_model,
        artifact_path=artifact_path_lr,
        registered_model_name="LinearRegressionModel",
    )

    print(
        "\nLinear Regression:",
        f"  MSE: {mse_lr:.4f}",
        f"  R²: {r2_lr:.4f}",
        sep="\n",
    )

    # Promote this run to "best so far" when it beats the current leader.
    if r2_lr > best_overall_r2:
        best_overall_model, best_overall_r2 = lr_model, r2_lr
        best_model_name = "LinearRegressionModel"
        best_run_id, best_artifact_path = run_lr.info.run_id, artifact_path_lr

# --- Decision Tree with Grid Search ---
# Tune a decision-tree regressor with 5-fold cross-validated grid search on
# the training split, then score the refit best estimator on the held-out
# split and record everything in its own MLflow run.
with mlflow.start_run(run_name="Tuned_Decision_Tree") as run_dt:
    param_grid = dict(
        max_depth=[3, 5, 10, None],
        min_samples_split=[2, 5, 10],
        min_samples_leaf=[1, 2, 4],
    )

    grid_search = GridSearchCV(
        DecisionTreeRegressor(random_state=42),
        param_grid,
        scoring='r2',
        cv=5,
        n_jobs=-1,  # use every available core
    )
    grid_search.fit(X_train, y_train)

    best_dt_model = grid_search.best_estimator_
    dt_preds = best_dt_model.predict(X_test)

    # Cast to built-in floats so the metrics are plain Python numbers.
    r2_dt = float(r2_score(y_test, dt_preds))
    mse_dt = float(mean_squared_error(y_test, dt_preds))

    # Replace a None hyper-parameter value with the string "None" before the
    # winning parameters are logged and printed.
    best_params = {
        name: ("None" if value is None else value)
        for name, value in grid_search.best_params_.items()
    }

    mlflow.log_param("model_type", "DecisionTreeRegressor")
    mlflow.log_params(best_params)
    mlflow.log_metric("mse", mse_dt)
    mlflow.log_metric("r2", r2_dt)

    # Log the tuned estimator and register it under its own registry name.
    artifact_path_dt = "decision_tree_model"
    mlflow.sklearn.log_model(
        sk_model=best_dt_model,
        artifact_path=artifact_path_dt,
        registered_model_name="DecisionTreeRegressorModel",
    )

    print(
        "\nTuned Decision Tree Regressor:",
        f"  Best Parameters: {best_params}",
        f"  MSE: {mse_dt:.4f}",
        f"  R²: {r2_dt:.4f}",
        sep="\n",
    )

    # Promote this run to "best so far" when it beats the current leader.
    if r2_dt > best_overall_r2:
        best_overall_model, best_overall_r2 = best_dt_model, r2_dt
        best_model_name = "DecisionTreeRegressorModel"
        best_run_id, best_artifact_path = run_dt.info.run_id, artifact_path_dt

# --- Export metrics safely to YAML ---
def _metric_history_records(history):
    """Convert MLflow metric-history entries into plain dictionaries.

    Args:
        history: Iterable of metric entries as returned by
            ``MlflowClient.get_metric_history``.

    Returns:
        list[dict]: One dictionary per entry with the keys ``key``, ``value``,
        ``step``, ``timestamp``, ``run_id`` and ``model_id``.
    """
    return [
        {
            "key": m.key,
            # Explicit casts guarantee built-in numbers, which is what
            # yaml.safe_dump requires (it rejects e.g. NumPy scalars).
            "value": float(m.value),
            "step": int(m.step),
            "timestamp": int(m.timestamp),
            "run_id": m.run_id,
            # Read defensively: the entry may not expose `model_id`.
            "model_id": getattr(m, "model_id", None),
        }
        for m in history
    ]


client = MlflowClient()

# Fetch the recorded history of both metrics for the winning run.
mse_history = client.get_metric_history(best_run_id, "mse")
r2_history = client.get_metric_history(best_run_id, "r2")

# Convert to primitive dictionaries
mse_safe = _metric_history_records(mse_history)
r2_safe = _metric_history_records(r2_history)

# Save to YAML. safe_dump only emits standard YAML tags, so the file can be
# read back with yaml.safe_load; a non-primitive value raises here instead of
# being written as a Python-specific tag.
with open("best_model_metrics.yaml", "w", encoding="utf-8") as f:
    yaml.safe_dump({
        "best_model_name": best_model_name,
        "best_run_id": best_run_id,
        "mse_history": mse_safe,
        "r2_history": r2_safe
    }, f)


# --- Final Output ---
# Summarise which candidate won, the run it came from, and where the metrics
# file was written.
print(
    f"\n✅ Best Model: {best_model_name} with R² = {best_overall_r2:.4f}",
    f"   Run ID: {best_run_id}",
    "   Metrics saved to: best_model_metrics.yaml",
    sep="\n",
)



Registered model 'LinearRegressionModel' already exists. Creating a new version of this model...
Created version '15' of model 'LinearRegressionModel'.



Linear Regression:
  MSE: 0.6495
  R²: 0.5043





Tuned Decision Tree Regressor:
  Best Parameters: {'max_depth': 10, 'min_samples_leaf': 4, 'min_samples_split': 10}
  MSE: 0.5480
  R²: 0.5818

✅ Best Model: DecisionTreeRegressorModel with R² = 0.5818
   Run ID: c817527b086e4650b4fb21fc381cf3b5
   Metrics saved to: best_model_metrics.yaml
   Model registered as: ACaliforniaHousingModel (from: runs:/1450f1c9110640c89419761e465d777c/decision_tree_model)


Registered model 'DecisionTreeRegressorModel' already exists. Creating a new version of this model...
Created version '14' of model 'DecisionTreeRegressorModel'.


In [136]:
# --- Register the best model ---
# Build the runs:/ URI of the winning run's logged model and register that
# artifact under the shared registry name.
model_uri = "runs:/{}/{}".format(best_run_id, best_artifact_path)
registration_result = mlflow.register_model(
    model_uri,
    "HousingPricePredication",
)


Successfully registered model 'HousingPricePredication'.
Created version '1' of model 'HousingPricePredication'.


In [141]:
import pickle
from pathlib import Path

# Persist the model that won the comparison. `best_overall_model` is whichever
# candidate scored the highest R² in the training cell; pickling
# `best_dt_model` directly would save the decision tree even when the linear
# model had won.
model_path = Path("../models/model.pkl")

# Create the target directory if needed so the write does not fail with
# FileNotFoundError on a fresh checkout.
model_path.parent.mkdir(parents=True, exist_ok=True)

with model_path.open("wb") as f:
    pickle.dump(best_overall_model, f)