In [0]:
from pyspark.sql.functions import col, to_timestamp, year, month
from pyspark.ml.feature import StringIndexer, VectorAssembler, StandardScaler

# Notebook 2: build the Silver layer from the Bronze Parquet data.
# Flow: load -> select/cast/clean -> index Property_Type -> standardize Price -> write.
# Relies on the `spark` session that the notebook runtime provides.

# 1. Load the Parquet data we just created
df = spark.read.parquet("/Volumes/workspace/default/uk_land_registry/bronze_parquet")

# 2. Cleaning & Temporal Considerations (Requirement 4a)
# Keep only the columns used below, cast Price to double (the assembler/scaler
# work on numeric input) and parse Date into a proper timestamp.
# dropna() removes every row that has a null in ANY of the selected columns.
# NOTE(review): depending on the session's ANSI setting, a Date that does not
# match "yyyy-MM-dd HH:mm" or a non-numeric Price presumably becomes null and
# is therefore dropped here silently -- compare row counts if that matters.
silver_df = df.select(
    col("Price").cast("double"),
    to_timestamp(col("Date"), "yyyy-MM-dd HH:mm").alias("Sale_Date"),
    col("Property_Type"),
    col("Old_New"),
    col("Town_City"),
).dropna()

# Expose the sale year as its own column to support temporal analysis.
silver_df = silver_df.withColumn("Sale_Year", year(col("Sale_Date")))

# 3. Feature Engineering (Requirement 2a)
# Map the categorical Property_Type strings to numeric indices in type_label.
# With StringIndexer's default ordering the most frequent label receives 0.0.
indexer = StringIndexer(inputCol="Property_Type", outputCol="type_label")
indexed_df = indexer.fit(silver_df).transform(silver_df)

# 4. Normalization/Scaling (Requirement 2a - 'Scaling/Normalization')
# StandardScaler operates on a vector column, so Price is first wrapped into a
# one-element vector. Only Price is scaled; type_label stays a separate,
# unscaled column.
assembler = VectorAssembler(inputCols=["Price"], outputCol="unscaled_features")
assembled_df = assembler.transform(indexed_df)

# withMean=True and withStd=True: centre on the mean and divide by the
# standard deviation, so large raw Price magnitudes do not dominate.
# NOTE(review): the indexer and scaler are fitted on the full dataset; if a
# train/test split happens downstream, consider fitting on the training part
# only to avoid leaking test statistics.
scaler = StandardScaler(
    inputCol="unscaled_features",
    outputCol="scaled_features",
    withStd=True,
    withMean=True,
)
scaler_model = scaler.fit(assembled_df)
final_engineered_df = scaler_model.transform(assembled_df)

# 5. Save the Silver Layer
# "overwrite" replaces any previous output at this path. The intermediate
# unscaled_features column is written as well because it is never dropped.
final_engineered_df.write.mode("overwrite").parquet("/Volumes/workspace/default/uk_land_registry/silver_engineered_parquet")

# Plain string literal: the message has no placeholders, so no f-string needed.
print("Notebook 2 Complete: Silver Layer stored with Scaling and Feature Engineering applied.")
# Quick sanity preview of the engineered columns.
final_engineered_df.select("Price", "scaled_features", "type_label").show(5)

Notebook 2 Complete: Silver Layer stored with Scaling and Feature Engineering applied.
+--------+--------------------+----------+
|   Price|     scaled_features|type_label|
+--------+--------------------+----------+
|156000.0|[-0.0795096213290...|       3.0|
| 82500.0|[-0.1544214218193...|       2.0|
|165000.0|[-0.0703367477996...|       1.0|
|205000.0|[-0.0295684210021...|       1.0|
|117000.0|[-0.1192587399565...|       0.0|
+--------+--------------------+----------+
only showing top 5 rows
