### Prepare environment

In [0]:
%run ../environment/prepare_environment

# Linear Regression - California Housing Price Prediction

This notebook will cover
- Training and evaluating a linear regression model using scikit-learn
- Tracking experiments and results with MLflow
- Visualizing model performance with prediction plots and residuals

**Why linear regression?**
- It's the gold standard for interpretable regression
- Fast, robust, and a great baseline for many problems
- Easy to explain to business stakeholders

In [0]:
# Set imports and logger configuration
import os
import logging
import mlflow
import mlflow.sklearn
import numpy as np
import pandas as pd
from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
import matplotlib.pyplot as plt

# Request INFO-level output on the root logger so the progress messages emitted
# by the helper functions below are shown.
# NOTE(review): basicConfig is a no-op when the root logger already has handlers
# (some notebook runtimes pre-configure logging) — confirm messages appear.
logging.basicConfig(level=logging.INFO)
# Named logger used by the data-loading, splitting, and training helpers.
logger = logging.getLogger("california-lr-pipeline")

## 1. Load and Explore the Dataset

The California housing dataset is a classic regression dataset with 8 numeric features describing housing districts in California; the target is the median house value per district. We will start with simple data exploration.

In [0]:
def load_california_data():
    """Fetch the California housing dataset and wrap it in pandas objects.

    Returns:
        tuple: ``(X, y)`` where ``X`` is a DataFrame whose columns are the
        dataset's ``feature_names`` and ``y`` is the target as a Series
        named ``'MEDV'``.
    """
    dataset = fetch_california_housing()

    # Target first, then features; the two constructions are independent.
    y = pd.Series(dataset.target, name='MEDV')
    X = pd.DataFrame(dataset.data, columns=dataset.feature_names)

    logger.info(f"Loaded California housing data: {X.shape[0]} rows, {X.shape[1]} features.")
    return X, y

# Load the features (X) and target (y) once; later cells reuse these names.
X, y = load_california_data()
# Preview the first rows of the features and of the target.
print(X.head())
print(y.head())

## 2. Train/Test Split

We will split the data into training and test sets (80/20). Holding out a test set is standard practice: it lets us measure how well the model generalizes to unseen data and makes overfitting visible.

In [0]:
def split_data(X, y, test_size=0.2, seed=42):
    """Split features and target into train and test partitions.

    Args:
        X: Feature table.
        y: Target values aligned with ``X``.
        test_size: Passed to ``train_test_split`` as ``test_size`` (default 0.2).
        seed: Passed to ``train_test_split`` as ``random_state`` so the split
            is reproducible (default 42).

    Returns:
        tuple: ``(X_train, X_test, y_train, y_test)``.
    """
    parts = train_test_split(X, y, test_size=test_size, random_state=seed)
    X_train, X_test = parts[0], parts[1]
    logger.info(f"Train: {X_train.shape[0]}, Test: {X_test.shape[0]}")
    return tuple(parts)

# Split with the function defaults (test_size=0.2, seed=42); the four results
# are reused by the training and inference cells below.
X_train, X_test, y_train, y_test = split_data(X, y)

## 3. Model Training and MLflow Logging

Train a linear regression model, then log its evaluation metrics, diagnostic plots, and the fitted model to MLflow using the scikit-learn flavor. Finally, register the model in Unity Catalog.

In [0]:
def _save_and_log_figure(fig, filename, artifact_dir):
    """Save ``fig`` as ``artifact_dir/filename``, close it, and log the file to the active MLflow run.

    Args:
        fig: Matplotlib figure to persist.
        filename: File name (including extension) for the saved image.
        artifact_dir: Local directory the image is written to; created if missing.

    Returns:
        str: Local path of the saved image.
    """
    os.makedirs(artifact_dir, exist_ok=True)
    path = os.path.join(artifact_dir, filename)
    # Lay out this specific figure rather than pyplot's implicit "current" one.
    fig.tight_layout()
    fig.savefig(path)
    # Close explicitly so figures do not accumulate in the kernel.
    plt.close(fig)
    mlflow.log_artifact(path)
    return path


def train_and_log(
    X_train,
    y_train,
    X_test,
    y_test,
    registered_model_name="ai_ml_in_practice.telco_customer_churn_silver.california_lr_model",
    artifact_dir='mlflow_artifacts',
):
    """Train a linear regression model and record the run in MLflow.

    Inside a single MLflow run named ``california_linear_regression`` this
    fits ``LinearRegression`` on the training data, logs test-set MSE and R2,
    logs three diagnostic plots (actual vs. predicted, residuals, and
    coefficients), logs the fitted model under the name ``california_lr``,
    and registers it as a new version of ``registered_model_name``.

    Args:
        X_train: Training features; must expose ``.columns`` (a DataFrame),
            because the column names label the coefficient plot.
        y_train: Training target.
        X_test: Test features; must support ``.iloc`` (the first five rows are
            logged as the model's input example).
        y_test: Test target; must support ``.min()``/``.max()`` and
            subtraction of the prediction array.
        registered_model_name: Registry name the logged model is registered
            under. Defaults to the name this notebook has always used.
        artifact_dir: Local directory where plot images are written before
            being logged to MLflow.

    Returns:
        The fitted ``LinearRegression`` estimator.
    """
    with mlflow.start_run(run_name="california_linear_regression") as run:
        # Train a model
        model = LinearRegression()
        model.fit(X_train, y_train)

        # Evaluate on the held-out test set (MSE and R2)
        y_pred = model.predict(X_test)
        mse = mean_squared_error(y_test, y_pred)
        r2 = r2_score(y_test, y_pred)
        mlflow.log_metrics({
            'mse': mse,
            'r2': r2
        })
        logger.info(f"Test MSE: {mse:.2f}, R2: {r2:.2f}")

        # Plot predictions vs actual; the dashed diagonal marks perfect predictions
        fig, ax = plt.subplots(figsize=(6, 6))
        ax.scatter(y_test, y_pred, alpha=0.7)
        ax.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], 'r--')
        ax.set_xlabel('Actual Median House Value')
        ax.set_ylabel('Predicted Median House Value')
        ax.set_title('Linear Regression: Actual vs Predicted')
        _save_and_log_figure(fig, 'actual_vs_pred.png', artifact_dir)

        # Residuals plot; the dashed line at zero marks a perfect prediction
        fig, ax = plt.subplots(figsize=(6, 4))
        residuals = y_test - y_pred
        ax.scatter(y_pred, residuals, alpha=0.7)
        ax.axhline(0, color='red', linestyle='--')
        ax.set_xlabel('Predicted Median House Value')
        ax.set_ylabel('Residuals')
        ax.set_title('Residuals Plot')
        _save_and_log_figure(fig, 'residuals.png', artifact_dir)

        # Coefficient plot, largest magnitude first. Feature names come from
        # the training data passed in, not from a notebook-level global, so
        # the labels always match the columns the model was fitted on.
        coef_df = pd.DataFrame({'Feature': X_train.columns, 'Coefficient': model.coef_})
        coef_df = coef_df.sort_values('Coefficient', key=abs, ascending=False)
        fig, ax = plt.subplots(figsize=(8, 6))
        coef_df.plot.bar(x='Feature', y='Coefficient', ax=ax, legend=False, color='teal')
        ax.set_title('Feature Importances (Linear Regression Coefficients)')
        ax.set_ylabel('Coefficient Value')
        _save_and_log_figure(fig, 'feature_importance.png', artifact_dir)

        # Log the model to MLflow and register it in one call, so no run URI
        # has to be assembled by hand. The input example is also logged with
        # the model.
        mlflow.sklearn.log_model(
            model,
            name='california_lr',
            input_example=X_test.iloc[:5],
            registered_model_name=registered_model_name,
        )
        logger.info('MLflow run completed. Run ID: %s', run.info.run_id)

        return model

# Run the training pipeline on the split created above and keep the fitted
# estimator that train_and_log returns.
model = train_and_log(X_train, y_train, X_test, y_test)

## 4. Batch Inference and Model Loading

In production, you often need to load a model and run batch inference. Here is how you do it with MLflow and scikit-learn.

In [0]:
# Resolve the newest registered version instead of pinning version 1: every
# run of the training cell registers a new version, so a fixed "/1" would
# silently load a stale model after the first re-run.
registered_model_name = "ai_ml_in_practice.telco_customer_churn_silver.california_lr_model"
model_versions = mlflow.MlflowClient().search_model_versions(
    f"name='{registered_model_name}'"
)
if not model_versions:
    raise RuntimeError(
        f"No registered versions found for model '{registered_model_name}'; "
        "run the training cell first."
    )
# int() makes the comparison numeric whether the client reports the version
# as a string or as an integer.
latest_version = max(int(mv.version) for mv in model_versions)

loaded_model = mlflow.sklearn.load_model(
    f"models:/{registered_model_name}/{latest_version}"
)

# Batch inference example: score the first five held-out rows
sample = X_test.iloc[:5]
pred = loaded_model.predict(sample)
print("Sample predictions:", pred)