# Phase 3: Gold Layer - Feature Engineering

This notebook creates model-ready features from cleaned Silver data.

## Import Libraries

In [0]:
from datetime import datetime
from pyspark.sql import functions as F

from pyspark.sql.functions import (
    col,
    count,
    current_timestamp,
    lit,
    log,
    median,
    regexp_extract,
    trim,
    when,
)
from pyspark.sql.types import *
from pyspark.sql.window import Window

## Configuration

In [0]:
# Source and target tables
SILVER_TABLE_NAME = "silver_vehicles"
GOLD_TABLE_NAME = "gold_vehicles"

# Train/test split
TRAIN_TEST_SPLIT = 0.8
RANDOM_SEED = 42

# Current year for age calculation
CURRENT_YEAR = datetime.now().year

## Read Silver Data

In [0]:
df_silver = spark.table(SILVER_TABLE_NAME)
print(f"Silver data loaded: {df_silver.count()} rows")
df_silver.printSchema()

Silver data loaded: 19461 rows
root
 |-- record_id: long (nullable = true)
 |-- ingestion_timestamp: timestamp (nullable = true)
 |-- source_file: string (nullable = true)
 |-- Make: string (nullable = true)
 |-- Model: string (nullable = true)
 |-- Year: integer (nullable = true)
 |-- Kilometres: double (nullable = true)
 |-- BodyType: string (nullable = true)
 |-- Engine: string (nullable = true)
 |-- Transmission: string (nullable = true)
 |-- Drivetrain: string (nullable = true)
 |-- ExteriorColour: string (nullable = true)
 |-- InteriorColour: string (nullable = true)
 |-- Passengers: double (nullable = true)
 |-- Doors: double (nullable = true)
 |-- FuelType: string (nullable = true)
 |-- City: double (nullable = true)
 |-- Highway: double (nullable = true)
 |-- Price: float (nullable = true)
 |-- silver_processing_timestamp: timestamp (nullable = true)



## Step 1: Create Derived Features

In [0]:
# Vehicle age
df_gold = df_silver.withColumn("vehicle_age", lit(CURRENT_YEAR) - col("Year"))

# Average fuel efficiency
df_gold = df_gold.withColumn(
    "avg_fuel_efficiency", (col("City") + col("Highway")) / 2.0
)

# Price per kilometre (log-transform if needed to handle skew)
df_gold = df_gold.withColumn(
    "price_per_km",
    when(col("Kilometres") > 0, col("Price") / col("Kilometres")).otherwise(None),
)

# Extract engine displacement from Engine column (if available)
# Pattern: look for numbers followed by "L" (e.g., "2.0L", "3.5L")
# Handle empty strings gracefully
df_gold = df_gold.withColumn(
    "engine_displacement",
    when(
        (regexp_extract(col("Engine"), r"(\d+\.?\d*)\s*L", 1) != "")
        & (regexp_extract(col("Engine"), r"(\d+\.?\d*)\s*L", 1).isNotNull()),
        regexp_extract(col("Engine"), r"(\d+\.?\d*)\s*L", 1).cast("float"),
    ).otherwise(None),
)

# Extract cylinder count from Engine column
# Handle empty strings gracefully
df_gold = df_gold.withColumn(
    "cylinder_count",
    when(
        col("Engine").rlike(r"(\d+)\s*cyl")
        & (regexp_extract(col("Engine"), r"(\d+)\s*cyl", 1) != ""),
        regexp_extract(col("Engine"), r"(\d+)\s*cyl", 1).cast("int"),
    )
    .when(
        col("Engine").rlike(r"V(\d+)")
        & (regexp_extract(col("Engine"), r"V(\d+)", 1) != ""),
        regexp_extract(col("Engine"), r"V(\d+)", 1).cast("int"),
    )
    .when(
        col("Engine").rlike(r"I-(\d+)")
        & (regexp_extract(col("Engine"), r"I-(\d+)", 1) != ""),
        regexp_extract(col("Engine"), r"I-(\d+)", 1).cast("int"),
    )
    .when(
        col("Engine").rlike(r"(\d+)\s*Cylinder")
        & (regexp_extract(col("Engine"), r"(\d+)\s*Cylinder", 1) != ""),
        regexp_extract(col("Engine"), r"(\d+)\s*Cylinder", 1).cast("int"),
    )
    .otherwise(None),
)

print("Derived features sample:")
display(df_gold.select(
    "Year",
    "vehicle_age",
    "City",
    "Highway",
    "avg_fuel_efficiency",
    "Price",
    "Kilometres",
    "price_per_km",
    "Engine",
    "engine_displacement",
    "cylinder_count",
).limit(10))

Derived features sample:


Year,vehicle_age,City,Highway,avg_fuel_efficiency,Price,Kilometres,price_per_km,Engine,engine_displacement,cylinder_count
2007,18,9.099999904632568,6.450000047683716,7.774999976158142,5599.0,192845.0,0.0290336798983639,4 CYLINDER ENGINE,,
2009,16,9.099999904632568,6.450000047683716,7.774999976158142,7899.0,196000.0,0.0403010204081632,4 CYLINDER ENGINE,,
2011,14,9.5,6.5,8.0,12900.0,129000.0,0.1,4 CYLINDER ENGINE,,
2011,14,8.699999809265137,6.400000095367432,7.549999952316284,8888.0,218000.0,0.0407706422018348,4 CYLINDER ENGINE,,
2002,23,8.199999809265137,6.099999904632568,7.1499998569488525,6999.0,123338.0,0.0567465014837276,4 CYLINDER ENGINE,,
2002,23,8.199999809265137,6.099999904632568,7.1499998569488525,2998.0,229000.0,0.0130917030567685,1.7L L4 SOHC 16V,1.7,
2003,22,8.199999809265137,6.099999904632568,7.1499998569488525,3200.0,240000.0,0.0133333333333333,1.7L L4 SOHC 16V,1.7,
2003,22,8.199999809265137,6.099999904632568,7.1499998569488525,2800.0,240500.0,0.0116424116424116,1.7L L4 SOHC 16V,1.7,
2013,12,9.800000190734863,6.800000190734863,8.300000190734863,11000.0,130000.0,0.0846153846153846,4 CYLINDER ENGINE 2.0L,2.0,
2014,11,9.800000190734863,6.800000190734863,8.300000190734863,11199.0,194000.0,0.0577268041237113,4 CYLINDER ENGINE 2.0L,2.0,


## Step 2: Feature Selection

In [0]:
# Select features to keep (drop Exterior Colour, Interior Colour, Passengers, Doors, Engine)
df_gold = df_gold.select(
    col("record_id"),
    col("ingestion_timestamp"),
    col("source_file"),
    col("Make"),
    col("Model"),
    col("Year"),
    col("vehicle_age"),
    col("Kilometres"),
    col("BodyType"),
    col("engine_displacement"),
    col("cylinder_count"),
    col("Transmission"),
    col("Drivetrain"),
    col("FuelType"),
    col("City"),
    col("Highway"),
    col("avg_fuel_efficiency"),
    col("Price"),
    current_timestamp().alias("gold_processing_timestamp"),
)

print("Selected features:")
df_gold.printSchema()

Selected features:
root
 |-- record_id: long (nullable = true)
 |-- ingestion_timestamp: timestamp (nullable = true)
 |-- source_file: string (nullable = true)
 |-- Make: string (nullable = true)
 |-- Model: string (nullable = true)
 |-- Year: integer (nullable = true)
 |-- vehicle_age: integer (nullable = true)
 |-- Kilometres: double (nullable = true)
 |-- BodyType: string (nullable = true)
 |-- engine_displacement: float (nullable = true)
 |-- cylinder_count: integer (nullable = true)
 |-- Transmission: string (nullable = true)
 |-- Drivetrain: string (nullable = true)
 |-- FuelType: string (nullable = true)
 |-- City: double (nullable = true)
 |-- Highway: double (nullable = true)
 |-- avg_fuel_efficiency: double (nullable = true)
 |-- Price: float (nullable = true)
 |-- gold_processing_timestamp: timestamp (nullable = false)



## Step 3: Handle Missing Values in Derived Features

In [0]:
# Impute missing values in derived features
# For engine_displacement and cylinder_count, use median by Make/Model

window_make_model = Window.partitionBy("Make", "Model")

# Impute engine_displacement
df_gold = df_gold.withColumn(
    "median_displacement", median("engine_displacement").over(window_make_model)
)
overall_median_displacement = df_gold.agg(
    median("engine_displacement").alias("median")
).collect()[0]["median"]

df_gold = df_gold.withColumn(
    "engine_displacement",
    when(
        col("engine_displacement").isNull(),
        when(
            col("median_displacement").isNotNull(), col("median_displacement")
        ).otherwise(overall_median_displacement),
    ).otherwise(col("engine_displacement")),
)

# Impute cylinder_count
df_gold = df_gold.withColumn(
    "median_cylinders", median("cylinder_count").over(window_make_model)
)
overall_median_cylinders = df_gold.agg(
    median("cylinder_count").alias("median")
).collect()[0]["median"]

df_gold = df_gold.withColumn(
    "cylinder_count",
    when(
        col("cylinder_count").isNull(),
        when(col("median_cylinders").isNotNull(), col("median_cylinders")).otherwise(
            overall_median_cylinders
        ),
    ).otherwise(col("cylinder_count")),
)

# Drop temporary columns
df_gold = df_gold.drop("median_displacement", "median_cylinders")

## Step 4: Train/Test Split

In [0]:
# Perform 80/20 split with random seed (BEFORE frequency encoding to avoid data leakage)
train_df, test_df = df_gold.randomSplit(
    [TRAIN_TEST_SPLIT, 1 - TRAIN_TEST_SPLIT], seed=RANDOM_SEED
)

## Step 5: Frequency Encoding for Model

In [0]:
# Calculate frequency encoding for Model (high cardinality categorical)
# IMPORTANT: Calculate frequency on TRAINING data only to avoid data leakage
# This converts Model into a numerical feature based on how common each model is

# Calculate frequency of each Model in TRAINING data only
model_freq = train_df.groupBy("Model").agg(count("*").alias("model_frequency"))

# Join frequency back to both train and test dataframes
train_df = train_df.join(model_freq, "Model", "left")
test_df = test_df.join(model_freq, "Model", "left")

# Handle models in test set that weren't in training set (set frequency to 1)
train_df = train_df.withColumn(
    "model_frequency",
    when(col("model_frequency").isNull(), 1).otherwise(col("model_frequency")),
)
test_df = test_df.withColumn(
    "model_frequency",
    when(col("model_frequency").isNull(), 1).otherwise(col("model_frequency")),
)

# Log transform frequency to reduce impact of very common models
train_df = train_df.withColumn(
    "model_frequency_log",
    log(col("model_frequency") + 1),  # +1 to avoid log(0)
)
test_df = test_df.withColumn("model_frequency_log", log(col("model_frequency") + 1))

print("Model frequency encoding sample (training data):")
# train_df.select("Make", "Model", "model_frequency", "model_frequency_log").show(
#     10, truncate=False
# )
display(
    train_df.select("Make", "Model", "model_frequency", "model_frequency_log").limit(10)
)

# Calculate counts
train_count = train_df.count()
test_count = test_df.count()
total_count = train_count + test_count

print(f"\nTraining set size: {train_count} rows")
print(f"Test set size: {test_count} rows")
print(f"Train percentage: {round(train_count / total_count * 100, 2)}%")
print(f"Test percentage: {round(test_count / total_count * 100, 2)}%")

# Add split indicator
train_df = train_df.withColumn("split", lit("train"))
test_df = test_df.withColumn("split", lit("test"))

# Combine for storage (after frequency encoding)
df_gold_with_split = train_df.union(test_df)

Model frequency encoding sample (training data):


Make,Model,model_frequency,model_frequency_log
Acura,MDX,88,4.48863636973214
Acura,MDX,88,4.48863636973214
Acura,RDX,93,4.543294782270004
Acura,RDX,93,4.543294782270004
Acura,MDX,88,4.48863636973214
Acura,TLX,56,4.04305126783455
Acura,TSX,13,2.6390573296152584
Acura,TLX,56,4.04305126783455
Acura,TLX,56,4.04305126783455
Acura,RDX,93,4.543294782270004



Training set size: 15634 rows
Test set size: 3827 rows
Train percentage: 80.34%
Test percentage: 19.66%


## Step 6: Categorical Encoding Setup

Note: We'll prepare the data for encoding, but actual encoding will be done in the model training pipeline

In [0]:
# Identify categorical columns (Model removed - using frequency encoding instead)
categorical_cols = ["Make", "BodyType", "FuelType", "Transmission", "Drivetrain"]

# Identify numerical columns for scaling (includes model_frequency_log)
numerical_cols = [
    "Kilometres",
    "City",
    "Highway",
    "vehicle_age",
    "avg_fuel_efficiency",
    "engine_displacement",
    "cylinder_count",
    "model_frequency_log",
]

# Target variable
target_col = "Price"

print("Categorical columns:", categorical_cols)
print("Numerical columns:", numerical_cols)
print("Target column:", target_col)

Categorical columns: ['Make', 'BodyType', 'FuelType', 'Transmission', 'Drivetrain']
Numerical columns: ['Kilometres', 'City', 'Highway', 'vehicle_age', 'avg_fuel_efficiency', 'engine_displacement', 'cylinder_count', 'model_frequency_log']
Target column: Price


## Step 7: Prepare Data for ML Pipeline

In [0]:
# Ensure all categorical columns are strings and handle nulls
for cat_col in categorical_cols:
    df_gold_with_split = df_gold_with_split.withColumn(
        cat_col,
        when(col(cat_col).isNull() | (col(cat_col) == ""), "Unknown").otherwise(
            col(cat_col).cast("string")
        ),
    )

# Ensure all numerical columns are numeric and handle nulls/empty strings
# Check for empty strings before casting
for num_col in numerical_cols:
    df_gold_with_split = df_gold_with_split.withColumn(
        num_col,
        when(
            (col(num_col).isNotNull()) & (trim(col(num_col).cast("string")) != ""),
            col(num_col).cast("double"),
        ).otherwise(None),
    )

# Ensure target is numeric (handle empty strings safely)
df_gold_with_split = df_gold_with_split.withColumn(
    target_col,
    when(
        (col(target_col).isNotNull()) & (trim(col(target_col).cast("string")) != ""),
        col(target_col).cast("double"),
    ).otherwise(None),
)

# Filter out any rows with null target
df_gold_with_split = df_gold_with_split.filter(col(target_col).isNotNull())

print("Data preparation complete")
print(f"Final row count: {df_gold_with_split.count()}")

Data preparation complete
Final row count: 19461


## Step 7: Write to Delta Lake

In [0]:
# Write to Delta table (managed table in Unity Catalog)
df_gold_with_split.write.format("delta").mode("overwrite").option(
    "overwriteSchema", "true"
).saveAsTable(GOLD_TABLE_NAME)

print(f"Gold table '{GOLD_TABLE_NAME}' created successfully!")
print("Table is managed by Unity Catalog")

# Verify table creation
print("\nVerification - Sample from Gold table:")
display(spark.table(GOLD_TABLE_NAME).orderBy(F.rand()).limit(5))

print(f"\nFinal row count: {spark.table(GOLD_TABLE_NAME).count()}")

Gold table 'gold_vehicles' created successfully!
Table is managed by Unity Catalog

Verification - Sample from Gold table:


Model,record_id,ingestion_timestamp,source_file,Make,Year,vehicle_age,Kilometres,BodyType,engine_displacement,cylinder_count,Transmission,Drivetrain,FuelType,City,Highway,avg_fuel_efficiency,Price,gold_processing_timestamp,model_frequency,model_frequency_log,split
GLK-Class,7980,2025-12-05T04:03:04.121Z,ensf612project-data.csv,Mercedes-Benz,2013,12.0,149985.0,SUV,2.5,6.0,AUTOMATIC,AWD,Gasoline,11.100000381469728,8.100000381469727,9.600000381469728,16485.0,2025-12-05T08:48:39.744Z,6,1.9459101490553128,train
Escape,13018,2025-12-05T04:03:04.121Z,ensf612project-data.csv,Ford,2022,3.0,3474.0,SUV,2.0,6.0,CVT,4WD,Gasoline,5.5,6.400000095367432,5.950000047683716,46924.0,2025-12-05T08:48:39.744Z,219,5.393627546352362,train
A5,1021,2025-12-05T04:03:04.121Z,ensf612project-data.csv,Audi,2023,2.0,90.0,Hatchback,2.0,4.0,AUTOMATIC,AWD,Gasoline,10.0,7.0,8.5,63221.0,2025-12-05T08:48:39.744Z,70,4.262679877041316,test
Levante,5785,2025-12-05T04:03:04.121Z,ensf612project-data.csv,Maserati,2017,8.0,63319.0,SUV,3.0,6.0,AUTOMATIC,AWD,Gasoline,16.799999237060547,12.399999618530272,14.59999942779541,59800.0,2025-12-05T08:48:39.744Z,40,3.713572066704308,train
XC60,1959,2025-12-05T04:03:04.121Z,ensf612project-data.csv,Volvo,2020,5.0,54065.0,SUV,2.0,4.0,AUTOMATIC,AWD,Gasoline,9.800000190734863,8.699999809265137,9.25,48880.0,2025-12-05T08:48:39.744Z,102,4.634728988229636,train



Final row count: 19461


## Step 8: Create Separate Train/Test Tables (Optional but useful)

In [0]:
# Create separate tables for train and test sets
train_table_name = f"{GOLD_TABLE_NAME}_train"
test_table_name = f"{GOLD_TABLE_NAME}_test"

# Write train set (managed table in Unity Catalog)
train_df.write.format("delta").mode("overwrite").option(
    "overwriteSchema", "true"
).saveAsTable(train_table_name)

# Write test set (managed table in Unity Catalog)
test_df.write.format("delta").mode("overwrite").option(
    "overwriteSchema", "true"
).saveAsTable(test_table_name)

print(
    f"Train table '{train_table_name}' created: {spark.table(train_table_name).count()} rows"
)
print(
    f"Test table '{test_table_name}' created: {spark.table(test_table_name).count()} rows"
)

Train table 'gold_vehicles_train' created: 15634 rows
Test table 'gold_vehicles_test' created: 3827 rows


## Summary

In [0]:
print("=" * 80)
print("GOLD LAYER FEATURE ENGINEERING COMPLETE")
print("=" * 80)
print(f"Source: {SILVER_TABLE_NAME}")
print(f"Target: {GOLD_TABLE_NAME}")
total_final = df_gold_with_split.count()
train_final = train_df.count()
test_final = test_df.count()
print(f"Total rows: {total_final}")
print(f"Training rows: {train_final}")
print(f"Test rows: {test_final}")
print("Features created:")
print("  - vehicle_age")
print("  - avg_fuel_efficiency")
print("  - price_per_km")
print("  - engine_displacement")
print("  - cylinder_count")
print("  - model_frequency_log (frequency encoding for Model)")
print(
    f"Categorical features: {len(categorical_cols)} (Model removed - using frequency encoding)"
)
print(f"Numerical features: {len(numerical_cols)}")
print(f"Processing timestamp: {datetime.now()}")
print("=" * 80)

GOLD LAYER FEATURE ENGINEERING COMPLETE
Source: silver_vehicles
Target: gold_vehicles
Total rows: 19461
Training rows: 15634
Test rows: 3827
Features created:
  - vehicle_age
  - avg_fuel_efficiency
  - price_per_km
  - engine_displacement
  - cylinder_count
  - model_frequency_log (frequency encoding for Model)
Categorical features: 5 (Model removed - using frequency encoding)
Numerical features: 8
Processing timestamp: 2025-12-05 08:49:03.175356
