### Creating an online feature table
**Relevant docs**:
 - [Databricks online feature stores](https://docs.databricks.com/aws/en/machine-learning/feature-store/online-feature-store)
 - [Prerequisites for publishing to an online store](https://docs.databricks.com/aws/en/machine-learning/feature-store/online-feature-store#prerequisites-for-publishing-to-online-stores)
 - [Publish a feature table](https://docs.databricks.com/aws/en/machine-learning/feature-store/online-feature-store#publish-a-feature-table)
 - [Publish modes](https://docs.databricks.com/aws/en/machine-learning/feature-store/online-feature-store#publish-modes)
 - [Feature engineering client](https://api-docs.databricks.com/python/feature-engineering/latest/feature_engineering.client.html)

In [0]:
%pip install databricks-feature-engineering==0.14.0 mlflow==3.0.1 xgboost==3.0.0
%restart_python

In [0]:
import time
import requests
import re
from io import StringIO
from typing import Iterable, Tuple
import pandas as pd
import mlflow
from mlflow.models import infer_signature

from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer, OneHotEncoder, StandardScaler
from xgboost import XGBClassifier

from pyspark.sql.types import (
    StructType,
    StructField,
    StringType,
    IntegerType,
    DoubleType,
)

import pyspark.sql.functions as func
from pyspark.sql.functions import col

from databricks.feature_engineering import FeatureEngineeringClient, FeatureLookup

In [0]:
# Declare the notebook's text widgets as (name, label) pairs; each one starts
# with an empty default value.
for widget_name, widget_label in (
    ("online_store_name", "Enter online store name"),
    ("catalog_name", "Enter catalog name"),
    ("schema_name", "Enter schema name"),
    ("model_name", "Enter model name"),
):
    dbutils.widgets.text(widget_name, "", widget_label)

In [0]:
#dbutils.widgets.removeAll()

In [0]:
# Read the widget values and derive the fully qualified Unity Catalog names
# (catalog.schema.object) that the rest of the notebook uses.
table_name = "advanced_churn_bronze_customers"
online_store_name = dbutils.widgets.get("online_store_name")
catalog_name = dbutils.widgets.get("catalog_name")
schema_name = dbutils.widgets.get("schema_name")

uc_namespace = f"{catalog_name}.{schema_name}"
uc_model_name = f"{uc_namespace}.{dbutils.widgets.get('model_name')}"
uc_location = f"{uc_namespace}.{table_name}"

print(f"UC location: {uc_location}\nModel name: {uc_model_name}\nOnline store name: {online_store_name}")

In [0]:
# MLflow configuration for this notebook: scikit-learn autologging is switched
# off here, runs go to the experiment at `exerpiment_path`, and the model
# registry URI is set to "databricks-uc".
exerpiment_path = "/Users/marshall.carter@databricks.com/workshop_experiment_mlc"

mlflow.sklearn.autolog(disable=True)
mlflow.set_experiment(experiment_name=exerpiment_path)
mlflow.set_registry_uri("databricks-uc")

The capacity options correspond to different provisioned instance performance tiers: "CU_1", "CU_2", "CU_4", and "CU_8". Each capacity unit allocates about 16 GB of RAM to the provisioned database instance, along with all associated CPU and local SSD resources. Scaling up increases these resources linearly.

In [0]:
# Feature Engineering client; the cells below use it to create and look up the
# online store, publish feature tables, build the training set and log the model.
fe = FeatureEngineeringClient()

In [0]:
# Provision the online store named by the `online_store_name` widget.
# Valid capacity options: "CU_1", "CU_2", "CU_4", "CU_8"
online_store_capacity = "CU_1"
fe.create_online_store(name=online_store_name, capacity=online_store_capacity)

In [0]:
# Poll the online store until it finishes provisioning.
#
# The wait is bounded: if the store never reports AVAILABLE or FAILED, the cell
# raises TimeoutError instead of spinning forever.
poll_interval_seconds = 5
provisioning_timeout_seconds = 30 * 60
provisioning_deadline = time.monotonic() + provisioning_timeout_seconds

while True:
    store = fe.get_online_store(name=online_store_name)
    state = store.state.value

    if state == "AVAILABLE":
        print("Online store is ready 🎉")
        break

    if state == "FAILED":
        raise RuntimeError(f"Online store {online_store_name} failed to provision")

    # Any other state is treated as "still provisioning"; give up once the
    # deadline has passed so a stuck instance does not block the notebook.
    if time.monotonic() >= provisioning_deadline:
        raise TimeoutError(
            f"Online store {online_store_name} was still in state {state} "
            f"after {provisioning_timeout_seconds} seconds"
        )

    time.sleep(poll_interval_seconds)

In [0]:
# Prepare the demographic feature table for publishing: turn on the Delta change
# data feed and mark the customer_i_d column NOT NULL.
demographic_features_table = f"{uc_location}_demographic_features"

spark.sql(
    f"ALTER TABLE {demographic_features_table} "
    "SET TBLPROPERTIES ('delta.enableChangeDataFeed' = 'true')"
)
spark.sql(
    f"ALTER TABLE {demographic_features_table} "
    "ALTER COLUMN customer_i_d SET NOT NULL"
)

In [0]:
# Publish the demographic feature table to the online store; the online table
# gets the source table's name plus an "_online" suffix.
demographic_source_table = f"{uc_location}_demographic_features"
online_store = fe.get_online_store(name=online_store_name)

fe.publish_table(
    online_store=online_store,
    source_table_name=demographic_source_table,
    online_table_name=f"{demographic_source_table}_online",
    # `publish_mode` argument is optional and defaults to "TRIGGERED" mode if not specified
)

In [0]:
# Prepare the service feature table for publishing: turn on the Delta change
# data feed and mark the customer_i_d column NOT NULL.
service_features_table = f"{uc_location}_service_features"

spark.sql(
    f"ALTER TABLE {service_features_table} "
    "SET TBLPROPERTIES ('delta.enableChangeDataFeed' = 'true')"
)
spark.sql(
    f"ALTER TABLE {service_features_table} "
    "ALTER COLUMN customer_i_d SET NOT NULL"
)

In [0]:
# Publish the service feature table to the online store; the online table gets
# the source table's name plus an "_online" suffix.
service_source_table = f"{uc_location}_service_features"
online_store = fe.get_online_store(name=online_store_name)

fe.publish_table(
    online_store=online_store,
    source_table_name=service_source_table,
    online_table_name=f"{service_source_table}_online",
    # `publish_mode` argument is optional and defaults to "TRIGGERED" mode if not specified
)

#### Train and register a model that uses feature lookups

In [0]:
# One lookup per feature table, both keyed on customer_i_d.
feature_lookups = [
    FeatureLookup(
        table_name=f"{uc_location}_{feature_table}",
        lookup_key="customer_i_d",
    )
    for feature_table in ("demographic_features", "service_features")
]

# Only the lookup key and the label are read from the base table here.
target = spark.table(uc_location).select("customer_i_d", "churn")

training_set = fe.create_training_set(
    df=target,
    feature_lookups=feature_lookups,
    exclude_columns=[],
    label="churn",
)

training_df = training_set.load_df()
display(training_df)

In [0]:
# Bring the training set into pandas, then split it into the model inputs (X)
# and a 0/1 churn label (y). The customer key is dropped from the inputs.
df = training_df.toPandas()

churn_to_int = {"Yes": 1, "No": 0}
non_feature_columns = ["churn", "customer_i_d"]

y = df["churn"].map(churn_to_int).astype("int8")
X = df.drop(columns=non_feature_columns)

In [0]:
def create_preprocessor(
    numeric_features: Iterable[str], categorical_features: Iterable[str]
) -> ColumnTransformer:
    """Assemble the feature-preparation step that feeds the XGBoost model.

    Numeric columns are coerced with ``pd.to_numeric`` (unparseable values
    become NaN), median-imputed and standardized. Categorical columns are
    imputed with the most frequent value and one-hot encoded, with categories
    unseen during fitting ignored at transform time.

    Args:
        numeric_features: Names of the columns handled by the numeric branch.
        categorical_features: Names of the columns handled by the categorical
            branch.

    Returns:
        An unfitted ``ColumnTransformer`` with a "numeric" and a "categorical"
        branch.
    """
    to_numeric = FunctionTransformer(
        lambda data: data.apply(pd.to_numeric, errors="coerce"), validate=False
    )
    numeric_steps = [
        ("coerce_numeric", to_numeric),
        ("imputer", SimpleImputer(strategy="median")),
        ("scaler", StandardScaler()),
    ]
    categorical_steps = [
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("onehot", OneHotEncoder(handle_unknown="ignore")),
    ]

    column_branches = [
        ("numeric", Pipeline(steps=numeric_steps), list(numeric_features)),
        ("categorical", Pipeline(steps=categorical_steps), list(categorical_features)),
    ]
    return ColumnTransformer(transformers=column_branches)

In [0]:
# Columns treated as numeric when present in X; every other column in X goes
# through the categorical branch.
numeric_candidates = ("senior_citizen", "tenure", "monthly_charges", "total_charges")
numeric_features = [name for name in numeric_candidates if name in X.columns]
categorical_features = [name for name in X.columns if name not in numeric_features]

preprocessor = create_preprocessor(numeric_features, categorical_features)

xgb_classifier = XGBClassifier(objective="binary:logistic", eval_metric="logloss")

# Full model: preprocessing followed by the XGBoost classifier.
pipeline_steps = [
    ("preprocessor", preprocessor),
    ("model", xgb_classifier),
]
pipeline = Pipeline(steps=pipeline_steps)

In [0]:
# Train the pipeline inside an MLflow run, log the validation ROC AUC, and log
# the model through the Feature Engineering client so that the feature lookups
# in `training_set` are packaged with it.
with mlflow.start_run(run_name="feature_store") as run:
    mlflow.autolog(disable=False)

    run_id = run.info.run_id

    # Stratified 80/20 split with a fixed seed.
    X_train, X_val, y_train, y_val = train_test_split(
        X, y, test_size=0.2, stratify=y, random_state=42
    )

    pipeline.fit(X_train, y_train)
    val_predictions = pipeline.predict(X_val)

    # ROC AUC is a ranking metric, so it is computed from the positive-class
    # probability (column 1 of predict_proba), not from the hard 0/1 labels
    # returned by predict().
    val_scores = pipeline.predict_proba(X_val)[:, 1]
    auc = roc_auc_score(y_val, val_scores)

    mlflow.log_metric("val_roc_auc", auc)
    # Kept for interactive inspection; it is not passed to fe.log_model below.
    signature = infer_signature(X, y)

    artifact_path = "pipeline"
    fe.log_model(
        model=pipeline,
        artifact_path=artifact_path,
        flavor=mlflow.sklearn,
        training_set=training_set,
        registered_model_name=uc_model_name,
    )

    model_uri = f"runs:/{run_id}/{artifact_path}"
    print(model_uri)

#### Register model to UC

In [0]:
from mlflow.tracking import MlflowClient
# MLflow client instance.
# NOTE(review): `client` is not referenced by any other cell visible in this notebook.
client = MlflowClient()

In [0]:
# Register the logged model under its Unity Catalog name and tag the version.
# NOTE(review): the fe.log_model call in the training cell already passes
# registered_model_name=uc_model_name - confirm that registering again here
# is intended.
model_version_tags = {"feature_store": True}
model_info = mlflow.register_model(
    model_uri,
    name=uc_model_name,
    tags=model_version_tags,
)

print(model_info)

#### Serve the model and query the model serving endpoint

In [0]:
# Load only the customer ids from the base table into pandas; they are the
# request payload sent to the serving endpoint below.
customer_ids = spark.table(uc_location).select("customer_i_d")
df = customer_ids.toPandas()
df.head()

In [0]:
# Authenticate using service principal credentials saved as Databricks secrets
CLIENT_ID = dbutils.secrets.get(scope="scope_name", key=f"secret_key")
CLIENT_SECRET = dbutils.secrets.get(scope="scope_name", key=f"secret_key")
DATABRICKS_HOST = "" #Enter the databricks workspace_url

In [0]:
from databricks.sdk import WorkspaceClient

In [0]:
# Workspace client authenticated with the service principal credentials.
w = WorkspaceClient(
    host=DATABRICKS_HOST,
    client_id=CLIENT_ID,
    client_secret=CLIENT_SECRET,
)

# Send the first three customer ids as records to the serving endpoint.
sample_records = df.head(3).to_dict(orient="records")
predictions = w.serving_endpoints_data_plane.query(
    name="basic_model_endpoint",
    dataframe_records=sample_records,
)

predictions.predictions