In [0]:
%run ./_common

In [0]:
# create a sample table to be used for the lesson
def create_details_table(self):
    """Load the telco customer-details CSV and save it as the `customer_details` table.

    The CSV under `DA.paths.datasets` is read with a header row and an inferred
    schema, and its `CustomerID` column is renamed to `id`. After switching to
    the catalog named by `DA.catalog_name`, the DataFrame overwrites the
    `customer_details` table (the name is unqualified, so no schema is chosen
    here).
    """
    df_customer_details = spark.read.csv(
        f"{DA.paths.datasets}/telco/customer-details.csv", header=True, inferSchema=True
    ).withColumnRenamed("CustomerID", "id")

    # save to catalog
    spark.sql(f"USE CATALOG {DA.catalog_name}")
    df_customer_details.write.mode("overwrite").saveAsTable("customer_details")


# Register the function to the DBAcademyHelper class so it can be called as DA.create_details_table()
DBAcademyHelper.monkey_patch(create_details_table)

In [0]:
def preprocess_customer_data(self):
    """Build the `customer_features` feature table from the telco CSV files.

    Steps performed:
      1. Read the customer-demographics and customer-details CSVs and join
         them on customerID == id.
      2. Drop the columns that are not used as features.
      3. One-hot encode the categorical columns as float64 and cast `tenure`
         to float64.
      4. Normalise column names (non-alphanumeric characters become `_`,
         everything is lower-cased) and drop the `churn` label column.
      5. Recreate the `customer_features` feature table keyed on `customerid`
         and write the features to it in overwrite mode.
    """
    import pyspark.pandas as ps
    import re
    from pyspark.sql.functions import col
    from databricks.feature_store import FeatureStoreClient

    fs = FeatureStoreClient()

    # Select the lesson catalog up front so that dropping, creating and writing
    # the feature table all resolve "customer_features" in the same catalog.
    spark.sql(f"USE CATALOG {DA.catalog_name}")

    # Read data sets and join them by customer id
    sdf_customer_demographics = spark.read.csv(
        f"{DA.paths.datasets}/telco/customer-demographics.csv",
        header=True,
        inferSchema=True
    )
    sdf_customer_details = spark.read.csv(
        f"{DA.paths.datasets}/telco/customer-details.csv",
        header=True,
        inferSchema=True
    ).withColumnRenamed("customerID", "id")

    # Join the two datasets on customerID
    sdf_customers = sdf_customer_demographics.join(
        sdf_customer_details, col("customerID") == col("id")
    )

    # Convert Spark DataFrame to pandas-on-Spark DataFrame
    df_customers = ps.DataFrame(sdf_customers)

    # Exclude columns that are not needed
    df_customers = df_customers.drop(
        columns=[
            "Dependents",
            "id",
            "MultipleLines",
            "OnlineSecurity",
            "OnlineBackup",
            "DeviceProtection",
            "TechSupport",
            "PaperlessBilling",
            "TotalCharges",
        ]
    )

    # One-hot encode categorical features
    df_customers_ohe = ps.get_dummies(
        df_customers,
        columns=[
            "gender",
            "SeniorCitizen",
            "Partner",
            "PhoneService",
            "InternetService",
            "StreamingTV",
            "StreamingMovies",
            "Contract",
            "PaymentMethod",
        ],
        dtype="float64",
    )

    # Convert integer columns to float64 (best practice for MLflow)
    df_customers_ohe['tenure'] = df_customers_ohe['tenure'].astype('float64')

    # Clean up column names by replacing non-alphanumeric characters with
    # underscores and lowercasing them. The loop variable is deliberately not
    # called `col`, which is the pyspark function imported above.
    df_customers_ohe.columns = [
        re.sub(r"[^a-zA-Z0-9_]", "_", column_name).lower()
        for column_name in df_customers_ohe.columns
    ]

    # Drop the feature table if it exists. This is best-effort: a failure
    # (typically "table not found" on a first run) is ignored, but only for
    # ordinary exceptions so that KeyboardInterrupt/SystemExit still propagate.
    try:
        fs.drop_table('customer_features')
    except Exception:
        pass

    # exclude prediction column and save the features to the feature table
    df_customers_ohe = df_customers_ohe.drop(columns=["churn"])

    fs.create_table(
        name="customer_features",
        primary_keys=["customerid"],
        schema=df_customers_ohe.spark.schema(),
        description="This customer-level table contains one-hot encoded and numeric features."
    )

    fs.write_table(df=df_customers_ohe.to_spark(), name="customer_features", mode="overwrite")

# Register the function to the DBAcademyHelper class
DBAcademyHelper.monkey_patch(preprocess_customer_data)

In [0]:
from databricks.feature_store import FeatureLookup, FeatureStoreClient
from sklearn.model_selection import train_test_split
def create_feature_store(self):
    """Assemble the labelled training data and return it as a pandas DataFrame.

    The `id` and `Churn` columns of the `customer_details` table are used as
    the lookup frame; features are pulled from the `customer_features` feature
    table using `id` as the lookup key. `Churn` is the label and `id` is
    excluded from the resulting training set.

    Returns:
        The loaded training set converted with `toPandas()`.
    """
    feature_store = FeatureStoreClient()

    # Lookup frame: the customer id plus the Churn label
    labels_sdf = spark.read.table('customer_details').select(['id','Churn'])

    # Single lookup against the feature table, matched on the `id` column
    customer_lookup = FeatureLookup(table_name='customer_features', lookup_key='id')

    churn_training_set = feature_store.create_training_set(
        df=labels_sdf,
        feature_lookups=[customer_lookup],
        label='Churn',
        exclude_columns=['id'],
    )

    return churn_training_set.load_df().toPandas()
    
# Register the function to the DBAcademyHelper class
DBAcademyHelper.monkey_patch(create_feature_store)

In [0]:
import mlflow
import mlflow.sklearn
from mlflow.models.signature import infer_signature
from mlflow.deployments import get_deploy_client
from mlflow.tracking.client import MlflowClient

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score

import pyspark.pandas as ps

import re

from pyspark.sql.functions import col

from databricks.feature_store import FeatureStoreClient

import requests
import json



def create_model(self):
    """Train, register and serve the churn classifier used in the lesson.

    Steps performed:
      1. Load the training frame (features joined to the `Churn` label) and
         split it 80/20 into train and test sets.
      2. Fit a RandomForestClassifier inside an MLflow run, log the test F1
         score and log the model under the `model-artifacts` artifact path.
      3. Register the logged model in Unity Catalog and give the version that
         was just registered the `staging` alias.
      4. Create the `get-started-model-serving-endpoint` serving endpoint for
         that version if the endpoint does not exist yet.
      5. PATCH the endpoint permissions so the `users` group has CAN_QUERY.

    Progress and failures of steps 4 and 5 are reported with `print`.
    """
    # Reuse the shared helper instead of repeating the feature lookup /
    # training-set construction inline.
    training_df = self.create_feature_store()

    # Use the training dataset to store variables X, the features, and y, the target variable.
    X = training_df.drop("Churn", axis=1)
    y = training_df["Churn"]

    # Convert categorical labels to numerical labels
    y = y.map({'Yes': 1.0, 'No': 0.0})

    # Split the data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # set the path for mlflow experiment
    mlflow.set_experiment(f"/Users/{DA.username}/get-started-ml-experiment")

    with mlflow.start_run(run_name = 'end_to_end_ml_on_databricks_run') as run:
        # Initialize the Random Forest classifier
        rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42)

        # Fit the model on the training data
        rf_classifier.fit(X_train, y_train)

        # Make predictions on the test data
        y_pred = rf_classifier.predict(X_test)

        # Enable automatic logging of input samples, metrics, parameters, and models
        # NOTE(review): this is switched on only after fit() has already run, so
        # it presumably does not capture the training above; the metric and the
        # model are logged explicitly below. Confirm whether it is intended for
        # later cells before moving it.
        mlflow.sklearn.autolog(
            log_input_examples = True,
            silent = True
        )

        mlflow.log_metric("test_f1", f1_score(y_test, y_pred))

        mlflow.sklearn.log_model(
            rf_classifier,
            artifact_path = "model-artifacts",
            input_example=X_train[:3],
            signature=infer_signature(X_train, y_train)
        )

        model_uri = f"runs:/{run.info.run_id}/model-artifacts"

    # Modify the registry uri to point to Unity Catalog
    mlflow.set_registry_uri("databricks-uc")

    # Define the model name
    model_name = f"{DA.catalog_name}.{DA.schema_name}.my_model_{DA.unique_name('-')}"

    # Register the model in the model registry
    registered_model = mlflow.register_model(model_uri=model_uri, name=model_name)

    # Use the version that register_model actually returned rather than
    # assuming "1": if the registered model already exists, a new version
    # number is assigned and version 1 would be a stale model.
    model_version = str(registered_model.version)

    # Initialize an MLflow Client
    registry_client = MlflowClient()

    # Assign a "staging" alias to the freshly registered model version
    registry_client.set_registered_model_alias(
        name=registered_model.name,  # The registered model name
        alias="staging",  # The alias representing the staging environment
        version=model_version  # The version that was just registered
    )

    deploy_client = get_deploy_client("databricks")
    endpoint_name = "get-started-model-serving-endpoint"

    try:
        # Attempt to get the endpoint
        existing_endpoint = deploy_client.get_endpoint(endpoint_name)
        print(f"Endpoint '{endpoint_name}' already exists.")
    except Exception as e:
        # If not found, create the endpoint
        if "RESOURCE_DOES_NOT_EXIST" in str(e):
            print(f"Creating a new endpoint: {endpoint_name}")
            endpoint = deploy_client.create_endpoint(
                name=endpoint_name,
                config={
                    "served_entities": [
                        {
                            "name": "my-model",
                            "entity_name": model_name,
                            "entity_version": model_version,
                            "workload_size": "Small",
                            "scale_to_zero_enabled": True
                        }
                    ],
                    "traffic_config": {
                        "routes": [
                            {
                                "served_model_name": "my-model",
                                "traffic_percentage": 100
                            }
                        ]
                    }
                }
            )
        else:
            print(f"An error occurred: {e}")

    # update permissions so all users can query the endpoint
    endpoint_id = deploy_client.get_endpoint(endpoint=endpoint_name).id
    DATABRICKS_INSTANCE = dbutils.notebook.entry_point.getDbutils().notebook().getContext().apiUrl().get()
    TOKEN = dbutils.notebook.entry_point.getDbutils().notebook().getContext().apiToken().get()
    # API endpoint for updating the serving endpoint's permissions
    url = f"{DATABRICKS_INSTANCE}/api/2.0/permissions/serving-endpoints/{endpoint_id}"

    # Headers for the request
    headers = {
        "Authorization": f"Bearer {TOKEN}",
        "Content-Type": "application/json"
    }

    # Payload to set permissions for the 'users' group
    payload = {
        "access_control_list": [
            {
                "group_name": "users",
                "permission_level": "CAN_QUERY"
            }
        ]
    }

    # Send the PATCH request to update permissions
    response = requests.patch(url, headers=headers, data=json.dumps(payload))

    # Check the response
    if response.status_code == 200:
        print("Permissions updated successfully.")
    else:
        print(f"Failed to update permissions: {response.text}")

# Register the function to the DBAcademyHelper class so it can be called as DA.create_model()
DBAcademyHelper.monkey_patch(create_model)

In [0]:
# Lesson setup. The order of the steps below matters: the helper methods read
# DA.paths / DA.catalog_name, and each later step reads tables written by an
# earlier one.
DA = DBAcademyHelper(course_config, lesson_config)  # Create the DA object
DA.reset_lesson()                                   # Reset the lesson to a clean state
DA.init()                                           # Performs basic initialization including creating schemas and catalogs
DA.conclude_setup()                                 # Finalizes the state and prints the config for the student

# Create customer_details table
DA.create_details_table()

# Create feature table (customer_features)
DA.preprocess_customer_data()

# Create model and serving endpoint for instruction
# (reads both customer_details and customer_features, so it must run last)
DA.create_model()