In [0]:
from pyspark.sql.functions import md5, concat_ws, col, explode, split

In [0]:
# Inputs: Wi-Fi locations from the bronze schema and the line dimension
# from the silver schema.
wifi_locations_df = spark.read.table("mta_bronze.wifi_location")
dim_lines_df = spark.read.table("mta_silver.dim_line")

In [0]:
# Build one (wfi_sk, wfi_lines) row per line token of each Wi-Fi location.
#
# wfi_sk is the md5 of station, station complex and the *un-split* lines
# string joined with "_". It is computed before the explode so that every
# exploded row of a location carries the same key.
# NOTE(review): concat_ws skips NULL inputs, so rows that differ only in which
# of the three columns is NULL hash to the same key. The expression is left
# unchanged because the key presumably has to agree with wfi_sk values built
# elsewhere — confirm before altering it.
wifi_line_df = (
    wifi_locations_df
    .select("wfi_station", "wfi_station_complex", "wfi_lines")
    .withColumn(
        "wfi_sk",
        md5(concat_ws("_", col("wfi_station"), col("wfi_station_complex"), col("wfi_lines"))),
    )
    # An empty pattern splits the string into single characters, one token per
    # character (assumes every line identifier is exactly one character and the
    # string holds no separators — TODO confirm against the bronze data).
    .withColumn("wfi_lines", explode(split(col("wfi_lines"), "")))
    # Splitting on "" leaves a trailing empty token on Spark releases before
    # 3.4; drop it so it cannot turn into a row that matches no line at all.
    .filter(col("wfi_lines") != "")
    .select("wfi_sk", "wfi_lines")
)

In [0]:
# Map each exploded line token to its dimension key. The left join keeps every
# (wfi_sk, line) pair, so lin_sk is NULL wherever dim_line has no matching
# lin_nk.
wifi_line_final_df = (
    wifi_line_df
    .join(
        dim_lines_df,
        on=wifi_line_df["wfi_lines"] == dim_lines_df["lin_nk"],
        how="left",
    )
    .select("wfi_sk", "lin_sk")
)

In [0]:
# Persist the Wi-Fi-to-line lookup as a Delta table, replacing existing rows
# on every run. mergeSchema is enabled on the writer (presumably to tolerate
# added columns — confirm it is still needed).
(
    wifi_line_final_df.write
    .format("delta")
    .mode("overwrite")
    .option("mergeSchema", "true")
    .saveAsTable("mta_silver.lkp_wifi_line")
)

In [0]:
%sql
-- Spot-check the table written above: show ten arbitrary rows.
SELECT
  *
FROM
  mta_silver.lkp_wifi_line
LIMIT
  10;

wfi_sk,lin_sk
a7707dd7765cb454e09efa86d46fef95,dfcf28d0734569a6a693bc8194de62bf
f510bc5509ee070031b4c645bc05d70a,3a3ea00cfc35332cedf6e5e9a32e94da
f510bc5509ee070031b4c645bc05d70a,800618943025315f869e4e1f09471012
79bceca414d3e33c3ec11a0d0747bd6e,3a3ea00cfc35332cedf6e5e9a32e94da
79bceca414d3e33c3ec11a0d0747bd6e,800618943025315f869e4e1f09471012
79bceca414d3e33c3ec11a0d0747bd6e,69691c7bdcc3ce6d5d8a1361f22d04ac
79bceca414d3e33c3ec11a0d0747bd6e,e1e1d3d40573127e9ee0480caf1283d6
4b4eb79d0671cfd3b9747363c0aed70b,800618943025315f869e4e1f09471012
07d28580235bc37a78f7d0ede7364665,800618943025315f869e4e1f09471012
c63e4538a48dafaeb4e53db882c379a3,8f14e45fceea167a5a36dedd4bea2543


In [0]:
# End the notebook run here and hand the exit value "Success" back to whatever
# invoked it (e.g. a parent notebook using dbutils.notebook.run, or a job task).
dbutils.notebook.exit("Success")