### Integrate Feature Store tables and feature lookups
**Relevant docs**:

 - [Using features to train models](https://docs.databricks.com/aws/en/machine-learning/feature-store/train-models-with-feature-store#train-models-and-perform-batch-inference-with-feature-tables)

In [0]:
%pip install databricks-feature-engineering==0.14.0 mlflow==3.0.1 xgboost==3.0.0

In [0]:
import requests
import re
from io import StringIO
from typing import Iterable, Tuple
from sklearn.datasets import load_iris
import pandas as pd
import mlflow
from mlflow.models import infer_signature

from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer, OneHotEncoder, StandardScaler
from xgboost import XGBClassifier

from pyspark.sql.types import (
    StructType,
    StructField,
    StringType,
    IntegerType,
    DoubleType,
)

import pyspark.sql.functions as func
from pyspark.sql.functions import col

from databricks.feature_engineering import FeatureEngineeringClient, FeatureLookup

In [0]:
# Configure MLflow for this notebook: disable scikit-learn autologging,
# select the experiment that runs are recorded under, and point the model
# registry at the "databricks-uc" URI.
mlflow.sklearn.autolog(disable=True)
# NOTE(review): the experiment path is hard-coded to one user's workspace
# folder — confirm it is valid for whoever runs this notebook.
exerpiment_path = "/Users/marshall.carter@databricks.com/workshop_experiment_mlc"
mlflow.set_experiment(exerpiment_path) 
mlflow.set_registry_uri("databricks-uc")

In [0]:
# Declare the text widgets this notebook reads its configuration from.
# Each entry is (widget name, label shown next to the input box); every
# widget starts out empty.
for widget_name, widget_label in (
    ("catalog_name", "Enter catalog name"),
    ("schema_name", "Enter schema name"),
    ("model_name", "Enter model name"),
):
    dbutils.widgets.text(widget_name, "", widget_label)

In [0]:
# Uncomment the next line to remove all widgets from this notebook.
# dbutils.widgets.removeAll()

In [0]:
# Read the catalog and schema chosen in the widgets; the source table name
# is fixed.
catalog_name = dbutils.widgets.get("catalog_name")
schema_name = dbutils.widgets.get("schema_name")
table_name = "advanced_churn_bronze_customers"

# Build the dot-separated "<catalog>.<schema>.<name>" identifiers for the
# source table and for the model to be logged.
uc_location = ".".join((catalog_name, schema_name, table_name))
uc_model_name = ".".join(
    (catalog_name, schema_name, dbutils.widgets.get("model_name"))
)
print(f"UC location: {uc_location}\nModel name: {uc_model_name}")

In [0]:
# Load the source table named by `uc_location` as a Spark DataFrame and
# preview it in the notebook.
features = spark.table(uc_location)
display(features)

In [0]:
# Column groups that become two separate feature tables.
demographic_cols = ["gender", "senior_citizen", "partner", "dependents"]
service_cols = [
    "tenure",
    "phone_service",
    "multiple_lines",
    "internet_service",
    "online_security",
    "online_backup",
    "device_protection",
    "tech_support",
    "streaming_t_v",
    "streaming_movies",
    "contract",
    "paperless_billing",
    "payment_method",
    "total_charges",
]

# Every slice keeps `customer_i_d` so it can serve as the join key later.
# `target` holds only the key and the `churn` label.
target = features.select("customer_i_d", "churn")
demographic_features = features.select("customer_i_d", *demographic_cols)
service_features = features.select("customer_i_d", *service_cols)

display(target)

#### Create the Feature Store tables

In [0]:
# Client used below to create feature tables, build the training set,
# log the model, and run batch scoring.
fe = FeatureEngineeringClient()

In [0]:
# Echo the full name the demographic feature table will be created under
# (the expression's value is shown as the cell output).
f"{uc_location}_demographic_features"

In [0]:
# Create the demographic feature table from `demographic_features`, keyed
# by `customer_i_d`.
# NOTE(review): presumably this raises if the table already exists, so the
# cell may not be re-runnable — confirm against the client documentation.
fe.create_table(
    name=f"{uc_location}_demographic_features",
    description="demographic features",
    primary_keys=["customer_i_d"],
    schema=demographic_features.schema,
    df=demographic_features,
)

In [0]:
# Create the service feature table from `service_features`, keyed by
# `customer_i_d`.
# NOTE(review): presumably this raises if the table already exists, so the
# cell may not be re-runnable — confirm against the client documentation.
fe.create_table(
    name=f"{uc_location}_service_features",
    description="service features",
    primary_keys=["customer_i_d"],
    schema=service_features.schema,
    df=service_features,
)

#### Perform a feature lookup and join

Create the feature lookup spec and training dataset

In [0]:
# One lookup per feature table: (table-name suffix, columns to pull from it).
# Both tables are joined on `customer_i_d`.
feature_lookups = [
    FeatureLookup(
        table_name=f"{uc_location}_{table_suffix}",
        feature_names=lookup_cols,
        lookup_key="customer_i_d",
    )
    for table_suffix, lookup_cols in (
        ("demographic_features", demographic_cols),
        ("service_features", service_cols),
    )
]

# Build the training set from the key/label frame `target` and the lookups
# above, with `churn` as the label and no columns excluded.
training_set = fe.create_training_set(
    df=target,
    feature_lookups=feature_lookups,
    exclude_columns=[],
    label="churn",
)

# Materialize the training set as a Spark DataFrame and preview it.
training_df = training_set.load_df()
display(training_df)

#### Train model

In [0]:
# Load the training set and convert it to a pandas DataFrame for
# scikit-learn / XGBoost training.
df = training_set.load_df().toPandas()

In [0]:
# Model inputs: everything except the label and the customer key.
X = df.drop(["churn", "customer_i_d"], axis="columns")

# Binary label: "Yes" -> 1, "No" -> 0, stored compactly as int8.
y = df["churn"].map({"Yes": 1, "No": 0}).astype("int8")

In [0]:
def create_preprocessor(
    numeric_features: Iterable[str], categorical_features: Iterable[str]
) -> ColumnTransformer:
    """Assemble the column-wise preprocessing applied ahead of the XGBoost model.

    Args:
        numeric_features: Names of the columns to coerce to numbers,
            median-impute, and standardize.
        categorical_features: Names of the columns to impute with the most
            frequent value and one-hot encode.

    Returns:
        An unfitted ColumnTransformer with a "numeric" and a "categorical"
        branch.
    """
    # Values that cannot be parsed as numbers become NaN (errors="coerce"),
    # leaving them for the median imputer in the next step.
    to_numeric = FunctionTransformer(
        lambda data: data.apply(pd.to_numeric, errors="coerce"), validate=False
    )
    numeric_steps = [
        ("coerce_numeric", to_numeric),
        ("imputer", SimpleImputer(strategy="median")),
        ("scaler", StandardScaler()),
    ]

    categorical_steps = [
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("onehot", OneHotEncoder(handle_unknown="ignore")),
    ]

    return ColumnTransformer(
        transformers=[
            ("numeric", Pipeline(steps=numeric_steps), list(numeric_features)),
            (
                "categorical",
                Pipeline(steps=categorical_steps),
                list(categorical_features),
            ),
        ]
    )

In [0]:
# Columns treated as numeric, limited to those actually present in X;
# every remaining column is treated as categorical.
numeric_features = [
    name
    for name in ("senior_citizen", "tenure", "monthly_charges", "total_charges")
    if name in X.columns
]
categorical_features = [name for name in X.columns if name not in numeric_features]

preprocessor = create_preprocessor(numeric_features, categorical_features)

# Binary classifier evaluated with log loss during training.
xgb_classifier = XGBClassifier(objective="binary:logistic", eval_metric="logloss")

# Full model: preprocessing followed by the classifier.
pipeline = Pipeline(
    steps=[
        ("preprocessor", preprocessor),
        ("model", xgb_classifier),
    ]
)

Log the model using the feature engineering client

In [0]:
# Train the pipeline inside an MLflow run, record the validation ROC AUC,
# and log the model through the Feature Engineering client together with
# `training_set`.
with mlflow.start_run(run_name="feature_store") as run:

    # Enable MLflow autologging for the training below.
    mlflow.autolog(disable=False)

    run_id = run.info.run_id

    # Hold out 20% for validation, stratified on the label, with a fixed seed.
    X_train, X_val, y_train, y_val = train_test_split(
        X, y, test_size=0.2, stratify=y, random_state=42
    )

    pipeline.fit(X_train, y_train)

    # Hard 0/1 class predictions, kept available for inspection.
    val_predictions = pipeline.predict(X_val)

    # ROC AUC measures ranking quality, so it must be computed from a
    # continuous score. Scoring it on thresholded labels collapses the curve
    # to a single operating point. Column 1 of predict_proba is the
    # probability of class 1.
    val_scores = pipeline.predict_proba(X_val)[:, 1]
    auc = roc_auc_score(y_val, val_scores)

    mlflow.log_metric("val_roc_auc", auc)

    # Inferred from the full feature frame and label; it is not passed to
    # fe.log_model in this cell.
    signature = infer_signature(X, y)

    artifact_path = "pipeline"
    model_info = fe.log_model(
        model=pipeline,
        artifact_path=artifact_path,
        flavor=mlflow.sklearn,
        training_set=training_set,
    )

    # URI used by the batch-scoring cell to locate the logged model.
    model_uri = f"runs:/{run_id}/{artifact_path}"
    print(model_uri)


In [0]:
# Preview the key/label frame that is passed to batch scoring below.
display(target)

In [0]:
# Batch-score the rows of `target` with the model logged at `model_uri`,
# then preview the result.
predictions = fe.score_batch(df=target, model_uri=model_uri)

display(predictions)