# Experiment Machine Learning

In [0]:
import shap
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from pyspark.sql.functions import col
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from xgboost import XGBRegressor

from databricks.feature_store import FeatureStoreClient
import mlflow
import mlflow.sklearn
from mlflow.tracking import MlflowClient
# Feature Store client; used below to read the taxi feature table
fs = FeatureStoreClient()

In [0]:
# Read the taxi feature table from the Feature Store (the result is converted with
# .toPandas() further down, so it is a Spark DataFrame at this point)
FEATURE_TABLE_NAME = "autoguidovie.nycity_taxi.fs_taxi_data"
taxi_data = fs.read_table(FEATURE_TABLE_NAME)

In [0]:
# Convert the PySpark DataFrame to a pandas DataFrame, then separate features and target
df = taxi_data.toPandas()

# 'Y' is the regression target; 'pr_key' is excluded from the features
# (presumably a row identifier -- confirm against the feature table definition)
non_feature_columns = ['Y', 'pr_key']
X = df.drop(columns=non_feature_columns)
Y = df['Y']

In [0]:
# One-hot encode the categorical features and swap them into the feature matrix.
# NOTE: this cell overwrites X, so re-running it without re-running the previous cell
# fails (the original categorical columns are no longer present in X).
categorical_features = ['vendor_id', 'store_and_fwd_flag', 'payment_type']

try:
    # scikit-learn >= 1.2 names the dense-output switch `sparse_output`;
    # the old `sparse` keyword was removed in 1.4
    encoder = OneHotEncoder(sparse_output=False)
except TypeError:
    # Older scikit-learn releases only accept the original `sparse` keyword
    encoder = OneHotEncoder(sparse=False)

X_encoded = pd.DataFrame(
    encoder.fit_transform(X[categorical_features]),
    columns=encoder.get_feature_names_out(categorical_features),
    # Reuse X's index so the column-wise concat below aligns row-for-row even if
    # X does not carry a default 0..n-1 index
    index=X.index,
)

# Replace the original categorical columns with their one-hot encoded counterparts
X = pd.concat(
    [X.drop(columns=categorical_features), X_encoded],
    axis=1,
)

In [0]:
# Standardize every feature column with a StandardScaler
# NOTE(review): the scaler is fit on the full dataset before the train/test split below,
# so test rows contribute to the scaling statistics -- confirm this is acceptable
scaler = StandardScaler()
scaler.fit(X)
X_scaled = scaler.transform(X)

In [0]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible
X_train, X_test, Y_train, Y_test = train_test_split(
    X_scaled,
    Y,
    test_size=0.2,
    random_state=42,
)

# Select the MLflow experiment that the runs in the following cells are recorded under
experiment_path = "/Experiments/nycity-taxi/NyCity Taxi - Base Reference"
mlflow.set_experiment(experiment_path)


In [0]:
# Baseline run: fit a decision-tree regressor, evaluate it on the held-out split and
# record the metrics, the model and a feature-importance chart in MLflow.
# The `with` block ends the MLflow run on exit, so no explicit mlflow.end_run() is needed.
with mlflow.start_run(run_name='DecisionTreeRegressor'):
    # Default hyperparameters; the seed is pinned (same value as the split) so that
    # repeated executions build the same tree and report the same metrics
    reg_tree = DecisionTreeRegressor(random_state=42)
    reg_tree.fit(X_train, Y_train)

    # Predict on the held-out test set
    Y_pred = reg_tree.predict(X_test)

    # Evaluate the model
    r2 = r2_score(Y_test, Y_pred)
    rmse = np.sqrt(mean_squared_error(Y_test, Y_pred))
    mae = mean_absolute_error(Y_test, Y_pred)

    # Log the metrics and the fitted model in MLflow
    mlflow.log_metrics({'R2': r2, 'rmse': rmse, 'mae': mae})
    mlflow.sklearn.log_model(reg_tree, "model")

    # Feature importances, sorted ascending so the largest bar is drawn at the top.
    # Column names come from X because the model was trained on X_scaled.
    importance = reg_tree.feature_importances_
    features = X.columns
    indices = np.argsort(importance)

    # Build the chart once, log that figure to MLflow, then display and release it
    fig, ax = plt.subplots(figsize=(10, 6))
    ax.barh(range(len(indices)), importance[indices], align='center')
    ax.set_yticks(range(len(indices)))
    ax.set_yticklabels([features[i] for i in indices])
    ax.set_xlabel('Relative Importance')
    ax.set_title('Feature Importances')
    mlflow.log_figure(fig, "feature_importances.png")
    plt.show()
    plt.close(fig)

In [0]:
# Tuned run: randomized hyperparameter search over an XGBoost regressor, evaluation of the
# best estimator on the held-out split, MLflow logging, and SHAP explanations.
# The `with` block ends the MLflow run on exit, so no explicit mlflow.end_run() is needed.
with mlflow.start_run(run_name='Hypertuned XGBoost'):
    xgb = XGBRegressor(objective='reg:squarederror', random_state=42)

    # Candidate values that RandomizedSearchCV samples from
    param_distributions = {
        'n_estimators': np.arange(100, 301, 100),
        'learning_rate': [0.01, 0.1, 0.2],
        'max_depth': np.arange(3, 8),
        'subsample': [0.8, 1],
        'colsample_bytree': [0.8, 1]
    }

    # 10 sampled configurations, each scored by R2 with 3-fold cross-validation
    random_search = RandomizedSearchCV(
        estimator=xgb,
        param_distributions=param_distributions,
        n_iter=10,
        scoring='r2',
        cv=3,
        verbose=2,
        random_state=42,
    )

    # Run the search on the training split
    random_search.fit(X_train, Y_train)

    # Keep the best model found by the search
    best_xgb = random_search.best_estimator_

    # Make predictions on the test set
    Y_pred = best_xgb.predict(X_test)

    # Evaluate the model
    r2 = r2_score(Y_test, Y_pred)
    rmse = np.sqrt(mean_squared_error(Y_test, Y_pred))
    mae = mean_absolute_error(Y_test, Y_pred)

    # Log the selected hyperparameters, the metrics and the model in MLflow
    mlflow.log_params(random_search.best_params_)
    mlflow.log_metrics({'R2': r2, 'rmse': rmse, 'mae': mae})
    mlflow.sklearn.log_model(best_xgb, "model")

    # Feature importances, sorted ascending so the largest bar is drawn at the top.
    # Column names come from X because the model was trained on X_scaled.
    importance = best_xgb.feature_importances_
    features = X.columns
    indices = np.argsort(importance)

    # Build the chart once, log that figure to MLflow, then display and release it
    fig, ax = plt.subplots(figsize=(10, 6))
    ax.barh(range(len(indices)), importance[indices], align='center')
    ax.set_yticks(range(len(indices)))
    ax.set_yticklabels([features[i] for i in indices])
    ax.set_xlabel('Relative Importance')
    ax.set_title('Feature Importances (Best XGBoost)')
    mlflow.log_figure(fig, "xgboost_feature_importances.png")
    plt.show()
    plt.close(fig)

    # SHAP values of the best model on the test split
    explainer = shap.TreeExplainer(best_xgb)
    shap_values = explainer.shap_values(X_test)

    # SHAP summary plot across all features
    shap.summary_plot(shap_values, X_test, feature_names=features)

    # SHAP dependence plots, one per feature of interest
    dependence_features = [
        "trip_distance",
        "pickup_day_of_week",
        "pickup_hour",
        "dropoff_clusters_m",
        "pickup_clusters_m",
    ]
    for feature_name in dependence_features:
        shap.dependence_plot(feature_name, shap_values, X_test, feature_names=features)

In [0]:
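# run_id = "e88a9f75f759457395c3f71af4f17c1c"
# model_uri = f"runs:/{run_id}/best_xgboost_model"
# NOTE: the runs in this notebook log their model under the artifact path "model";
# make sure the path above matches the artifact path used by the referenced run.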
# model_name = "XGBoostTaxiFareModel" 

# registered_model = mlflow.register_model(model_uri, model_name)

# client = MlflowClient()
# client.transition_model_version_stage(
#     name=model_name,
#     version=registered_model.version,
#     stage="Staging"
# )

# Load the model artifact from a specific run
# run_id = "e88a9f75f759457395c3f71af4f17c1c"
# model_uri = f"runs:/{run_id}/best_xgboost_model"
# model = mlflow.sklearn.load_model(model_uri)