In [0]:
# --- ADLS via SAS (FixedSASTokenProvider) ------------------------------------
# Configures this Spark session to reach the ADLS Gen2 container with a fixed
# SAS token, lists the container root (creating /bronze with a sanity file when
# the container is empty) and displays the listing as a table.
storage_account = "stnzrentdev"
container = "nz-rent"
dfs_fqdn = f"{storage_account}.dfs.core.windows.net"
abfss_url = f"abfss://{container}@{dfs_fqdn}/"

# SECURITY: the SAS token is a credential and must not live in notebook source.
# It is read from a Databricks secret scope instead.
# NOTE(review): the scope/key names below are placeholders — point them at the
# secret that actually holds the token, and revoke/rotate the token that was
# previously committed in this cell.
sas_secret_scope = "nz-rent"
sas_secret_key = "adls-sas-token"

# The token is used WITHOUT a leading '?', so strip one defensively in case the
# secret was pasted straight from the Azure portal.
sas_token_raw = (
    dbutils.secrets.get(scope=sas_secret_scope, key=sas_secret_key)
    .strip()
    .lstrip("?")
)
if not sas_token_raw:
    raise ValueError(
        f"Secret {sas_secret_scope}/{sas_secret_key} is empty; cannot configure SAS access"
    )

# 0) Clear possibly conflicting configs. This is deliberately best-effort:
#    a key that was never set (or cannot be unset) must not stop the notebook.
conflicting_keys = [
    f"fs.azure.account.key.{dfs_fqdn}",
    f"fs.azure.sas.{container}.{storage_account}.dfs.core.windows.net",
]
for k in conflicting_keys:
    try:
        spark.conf.unset(k)
    except Exception:
        pass

# 1) Tell Spark to use SAS with FixedSASTokenProvider
spark.conf.set(f"fs.azure.account.auth.type.{dfs_fqdn}", "SAS")
spark.conf.set(
    f"fs.azure.sas.token.provider.type.{dfs_fqdn}",
    "org.apache.hadoop.fs.azurebfs.sas.FixedSASTokenProvider",
)
spark.conf.set(f"fs.azure.sas.fixed.token.{dfs_fqdn}", sas_token_raw)

# 2) List container; bootstrap /bronze if empty
entries = dbutils.fs.ls(abfss_url)
if not entries:
    dbutils.fs.mkdirs(abfss_url + "bronze")
    dbutils.fs.put(abfss_url + "bronze/_sanity.txt", "hello databricks", overwrite=True)
    # Re-list so the table below reflects the freshly created folder.
    entries = dbutils.fs.ls(abfss_url)

# 3) Show as table
rows = [(e.path, e.size, e.modificationTime) for e in entries]
display(spark.createDataFrame(rows, ["path", "size", "mtime"]).orderBy("path"))

In [0]:
from pyspark.sql.functions import avg, round as rd


def _read_gold(table_name):
    """Return the Delta table stored under gold/<table_name> in the container."""
    return spark.read.format("delta").load(f"{abfss_url}gold/{table_name}")


# Load the three gold tables used by the rest of the notebook.
fact = _read_gold("fact_rent")
dim_t = _read_gold("dim_time")
dim_s = _read_gold("dim_suburb")

# Trendline: mean of median_rent per (region, date_month), rounded to 2 decimals
# and sorted so each region's months come out in order.
rent_with_dims = fact.join(dim_t, "time_id").join(dim_s, "suburb_id")
avg_rent_col = rd(avg("median_rent"), 2).alias("avg_rent")
trend_region = (
    rent_with_dims.groupBy("region", "date_month")
    .agg(avg_rent_col)
    .orderBy("region", "date_month")
)

display(trend_region)


In [0]:
import os

# DBFS location (Databricks FileStore path) that receives the report images.
save_dir = "/dbfs/FileStore/reports/"

# Create the directory when it does not exist yet; an existing one is left as is.
os.makedirs(name=save_dir, exist_ok=True)
print("Directory ready:", save_dir)

In [0]:
import matplotlib.pyplot as plt
import pandas as pd

# Collect the Spark aggregate to the driver as a pandas frame for plotting.
pd_trend = trend_region.toPandas()

# One line per region, all drawn on the same axes.
fig, ax = plt.subplots(figsize=(8, 5))
for region_name, region_rows in pd_trend.groupby("region"):
    ax.plot(
        region_rows["date_month"],
        region_rows["avg_rent"],
        marker="o",
        label=region_name,
    )

ax.set_title("Average Median Rent by Region (NZ)")
ax.set_xlabel("Month")
ax.set_ylabel("Average Rent (NZD)")
plt.xticks(rotation=45)  # tilt month labels so they do not overlap
ax.legend()
ax.grid(alpha=0.3)
fig.tight_layout()

# Persist the chart under the FileStore reports folder, then render it inline.
fig.savefig("/dbfs/FileStore/reports/trendline_region.png", dpi=120)
plt.show()

In [0]:
from pyspark.sql.window import Window
from pyspark.sql.functions import lag, round as rd, col

# Per-suburb window ordered by month; lag() over it yields the previous row's rent.
# NOTE(review): treating that as "previous month" assumes one row per suburb per
# month with no gaps — confirm against the gold tables.
w = Window.partitionBy("suburb_id").orderBy("date_month")

# Month-over-month change in percent, rounded to 2 decimals.
pct_change = (col("median_rent") - col("prev_rent")) / col("prev_rent") * 100

rent_joined = fact.join(dim_t, "time_id").join(dim_s, "suburb_id")
rent_with_prev = rent_joined.withColumn("prev_rent", lag("median_rent").over(w))
rent_with_mom = rent_with_prev.withColumn("mom_pct", rd(pct_change, 2))
# Rows whose mom_pct is null (e.g. the first row of each suburb) are dropped.
df_mom = rent_with_mom.where(col("mom_pct").isNotNull())

# Latest month present after the filter above.
max_month_row = df_mom.agg({"date_month": "max"}).collect()[0]
latest_month = max_month_row[0]

# Ten largest month-over-month increases within the latest month.
latest_rows = df_mom.where(col("date_month") == latest_month)
top_gainers = latest_rows.orderBy(col("mom_pct").desc()).limit(10)

display(top_gainers.select("region", "suburb_name", "mom_pct", "median_rent"))

In [0]:
# Collect the top-10 gainers to the driver for plotting.
pd_top = top_gainers.toPandas()

# Horizontal bars: one per suburb, length = month-over-month change in percent.
fig, ax = plt.subplots(figsize=(7, 5))
ax.barh(pd_top["suburb_name"], pd_top["mom_pct"], color="skyblue")
ax.set_title("Top Month-over-Month Gainers – Latest Month")
ax.set_xlabel("Change vs Previous Month (%)")
ax.set_ylabel("Suburb")
fig.tight_layout()

# Persist the chart under the FileStore reports folder, then render it inline.
fig.savefig("/dbfs/FileStore/reports/top_mom_gainers.png", dpi=120)
plt.show()

In [0]:
from pyspark.sql.functions import stddev_samp

# Per-suburb frame covering the current row and the two rows before it
# (ordered by month), i.e. a trailing window of up to three rows.
w3 = Window.partitionBy("suburb_id").orderBy("date_month").rowsBetween(-2, 0)

# Sample standard deviation of median_rent over that frame, rounded to 2 decimals.
rolling_std = rd(stddev_samp("median_rent").over(w3), 2)

rent_joined_vol = fact.join(dim_t, "time_id").join(dim_s, "suburb_id")
df_vol = rent_joined_vol.withColumn("rolling3_std", rolling_std)

# Latest month present in the joined data.
max_month_row_vol = df_vol.agg({"date_month": "max"}).collect()[0]
latest_month = max_month_row_vol[0]

# Ten suburbs with the highest rolling std in the latest month.
latest_vol_rows = df_vol.where(col("date_month") == latest_month)
top_vol = latest_vol_rows.orderBy(col("rolling3_std").desc()).limit(10)

display(top_vol.select("region", "suburb_name", "rolling3_std"))

In [0]:
# Collect the top-10 most volatile suburbs to the driver for plotting.
pd_vol = top_vol.toPandas()

# One marker per suburb; height = rolling 3-row standard deviation of rent.
fig, ax = plt.subplots(figsize=(7, 5))
ax.scatter(pd_vol["suburb_name"], pd_vol["rolling3_std"], s=80, alpha=0.7)
ax.set_title("Top Volatile Suburbs (3-Month Rolling STD)")
ax.set_xlabel("Suburb")
ax.set_ylabel("Rolling 3-Month Std (NZD)")
plt.xticks(rotation=45)  # tilt suburb names so they do not overlap
fig.tight_layout()

# Persist the chart under the FileStore reports folder, then render it inline.
fig.savefig("/dbfs/FileStore/reports/volatility.png", dpi=120)
plt.show()