In [0]:
# Install required package
%pip install databricks-feature-engineering

from pyspark.sql import SparkSession

# Load data from Delta table
wine_df = spark.table("jpmc_group_catalog.mlops.wine_data_poc_dabs")


In [0]:
from pyspark.sql.functions import col, log, when
from pyspark.sql.types import DoubleType

# Log-transform skewed features. The +1 shift maps a raw value of 0 to
# log(1) = 0 instead of an undefined log(0).
wine_df = wine_df.withColumn("log_residual_sugar", log(col("residual_sugar") + 1))
wine_df = wine_df.withColumn("log_chlorides", log(col("chlorides") + 1))

# Create interaction features
wine_df = wine_df.withColumn("sulphates_alcohol", col("sulphates") * col("alcohol"))
# The 1e-5 term keeps the denominator non-zero when volatile_acidity is 0.
wine_df = wine_df.withColumn("acidity_ratio", col("fixed_acidity") / (col("volatile_acidity") + 1e-5))

# Bin pH into categorical groups:
#   low    : pH < 3
#   medium : 3 <= pH < 3.5
#   high   : pH >= 3.5
# Every bin has an explicit condition instead of a catch-all otherwise():
# a comparison against NULL evaluates to NULL, so a catch-all branch would
# silently label rows with a missing pH as "high". Without otherwise(),
# unmatched rows (NULL pH) get a NULL bin.
ph_value = col("PH")
wine_df = wine_df.withColumn(
    "pH_bin",
    when(ph_value < 3, "low")
    .when(ph_value < 3.5, "medium")
    .when(ph_value >= 3.5, "high"),
)

# One-hot encode pH_bin into 0.0/1.0 indicator columns.
# A NULL pH_bin yields NULL in all three indicators (NULL == "x" is NULL),
# so a missing pH stays visibly missing rather than being encoded as a bin.
wine_df = wine_df.withColumn("pH_bin_low", (col("pH_bin") == "low").cast(DoubleType()))
wine_df = wine_df.withColumn("pH_bin_medium", (col("pH_bin") == "medium").cast(DoubleType()))
wine_df = wine_df.withColumn("pH_bin_high", (col("pH_bin") == "high").cast(DoubleType()))

In [0]:
# Feature Engineering: Create a reusable Python UDF in Unity Catalog
# This UDF calculates the average of three acidity-related features:
# - fixed_acidity, volatile_acidity, citric_acid
# The UDF is registered in Unity Catalog for governance and reusability across teams
#
# NULL handling: a SQL NULL reaches the Python body as None, and None cannot
# be added to a number, so the body returns None (SQL NULL) when any input is
# missing instead of failing the whole query with a TypeError.
#
# NOTE(review): parameters and result are declared as SQL `float`
# (single precision) - confirm that `double` is not required by consumers.

avg_acidity_udf_fn_dabs = """
CREATE OR REPLACE FUNCTION jpmc_group_catalog.mlops.avg_acidity_udf_fn_dabs(fixed_acidity float, volatile_acidity float, citric_acid float)
RETURNS float
LANGUAGE PYTHON
COMMENT "Extract AVG values of acidity columns"
AS $$
def avg_acidity_udf_fn_dabs(fixed_acidity: float, volatile_acidity: float, citric_acid: float) -> float:
    # Propagate NULL: if any component is missing the average is undefined.
    if fixed_acidity is None or volatile_acidity is None or citric_acid is None:
        return None
    avg = (fixed_acidity + volatile_acidity + citric_acid)/3
    return avg
return avg_acidity_udf_fn_dabs(fixed_acidity, volatile_acidity, citric_acid)
$$
"""

# Execute the DDL; CREATE OR REPLACE makes re-running this cell safe.
spark.sql(avg_acidity_udf_fn_dabs)

In [0]:

from pyspark.sql.functions import monotonically_increasing_id

# Surrogate row key for the feature table.
# NOTE(review): monotonically_increasing_id() produces unique but
# non-consecutive IDs whose values depend on how the data is partitioned, and
# the DataFrame is lazily evaluated - confirm the IDs are stable enough to be
# used as a primary key across re-computations and other notebooks.
wine_id_col = monotonically_increasing_id()
wine_df = wine_df.withColumn("wine_id", wine_id_col)


In [0]:
# Columns written to the feature table: the primary key, a subset of the raw
# measurements, and the engineered features.
# Not included: fixed_acidity, volatile_acidity, citric_acid and the string
# column pH_bin.
# NOTE(review): the acidity columns are presumably left out because they are
# inputs to the avg-acidity Unity Catalog function - confirm this is intended.

feature_columns = [
    # primary key
    "wine_id",
    # raw measurements
    "residual_sugar",
    "chlorides",
    "free_sulfur_dioxide",
    "total_sulfur_dioxide",
    "density",
    "PH",
    "sulphates",
    "alcohol",
    # engineered: log transforms and interactions
    "log_residual_sugar",
    "log_chlorides",
    "sulphates_alcohol",
    "acidity_ratio",
    # engineered: one-hot pH bins. Names use the exact casing the columns were
    # created with (pH_bin_*), so the selection does not depend on Spark's
    # case-insensitive column resolution.
    "pH_bin_low",
    "pH_bin_medium",
    "pH_bin_high",
]

features_df = wine_df.select(*feature_columns)




In [0]:
import importlib.util
import subprocess
import sys

# Make sure databricks-feature-engineering is importable WITHOUT restarting
# the Python process.
#
# Restarting Python here (dbutils.library.restartPython()) would discard every
# variable built by the earlier cells - including `features_df`, which the
# next cell writes to the feature table - so a top-to-bottom run would fail
# with a NameError. The package is already installed by the `%pip install`
# at the top of the notebook, so this cell only acts as a fallback.
#
# If a restart is genuinely required (e.g. to pick up an upgraded version of
# an already-imported package), move the install + restart into the very
# first cell of the notebook, before any state is created.
try:
    _fe_installed = importlib.util.find_spec("databricks.feature_engineering") is not None
except (ImportError, ValueError):
    # find_spec raises when the parent package `databricks` itself is missing.
    _fe_installed = False

if not _fe_installed:
    # Install the package into the interpreter running this notebook.
    subprocess.check_call([sys.executable, "-m", "pip", "install", "databricks-feature-engineering"])
    # Let the import system see the freshly installed distribution.
    importlib.invalidate_caches()

In [0]:
# Create feature table with wine_id as primary key
from databricks.feature_engineering import FeatureEngineeringClient

fe = FeatureEngineeringClient()
feature_table_name = "jpmc_group_catalog.mlops.wine_features_engineered_dabs"

# Table definition gathered in one place; the DataFrame supplies both the
# initial data and the schema.
# NOTE(review): confirm how create_table behaves when the table already
# exists, since this cell runs on every execution of the notebook.
feature_table_spec = dict(
    name=feature_table_name,
    primary_keys=["wine_id"],
    df=features_df,
    schema=features_df.schema,
    description="Engineered features for wine quality Random Forest model",
)

fe.create_table(**feature_table_spec)

In [0]:

# Render the engineered feature DataFrame in the notebook output.
display(features_df)