In [0]:
%run "../utils/custom_functions"

In [0]:
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, DoubleType, BooleanType
from pyspark.sql.functions import lit

In [0]:
# Declared layout of the MTA Wi-Fi locations CSV: column name paired with its
# Spark type, in file order. Every field is declared with nullable=False.
wifi_location_schema = StructType(fields=[
    StructField(column_name, column_type(), False)
    for column_name, column_type in (
        ("STATION", StringType),
        ("STATION_COMPLEX", StringType),
        ("LINES", StringType),
        ("IS_HISTORICAL", StringType),
        ("BOROUGH", StringType),
        ("COUNTY", StringType),
        ("LATITUDE", DoubleType),
        ("LONGITUDE", DoubleType),
        ("WIFI_AVAILABLE", StringType),
        ("ATT", StringType),
        ("SPRINT", StringType),
        ("TMOBILE", StringType),
        ("VERIZON", StringType),
        ("Location", StringType),
        ("Georeference", StringType),
    )
])

In [0]:
# Read the raw MTA Wi-Fi locations CSV from the data-lake mount.
#
# The schema must be supplied through DataFrameReader.schema(). The previous
# .option("schema", ...) call is not a recognised CSV option, so it was
# silently ignored: every column came back as a string and column names were
# taken from the file header instead of wifi_location_schema.
#
# With an explicit schema and header=True, the header row is skipped and the
# column names/types come from wifi_location_schema (matched by position).
#
# NOTE(review): if the bronze Delta table was already written by an earlier run
# with all-string columns, the downstream "overwrite" write may be rejected for
# a schema mismatch (LATITUDE/LONGITUDE are now double) -- confirm, and either
# drop the old table or overwrite its schema once.
wifi_location_df = spark.read \
    .option("header", True) \
    .schema(wifi_location_schema) \
    .csv("/mnt/mtasubwaydl/raw/mta_wi-fi_locations.csv")

In [0]:
# Source column name -> "wfi_"-prefixed, lower-case name used from here on.
wifi_location_column_renames = {
    "STATION": "wfi_station",
    "STATION_COMPLEX": "wfi_station_complex",
    "LINES": "wfi_lines",
    "IS_HISTORICAL": "wfi_is_historical",
    "BOROUGH": "wfi_borough",
    "COUNTY": "wfi_county",
    "LATITUDE": "wfi_latitude",
    "LONGITUDE": "wfi_longitude",
    "WIFI_AVAILABLE": "wfi_wifi_available",
    "ATT": "wfi_att",
    "SPRINT": "wfi_sprint",
    "TMOBILE": "wfi_tmobile",
    "VERIZON": "wfi_verizon",
    "Location": "wfi_location",
    "Georeference": "wfi_georeference",
}

# Apply the renames one at a time, in the order listed above, always starting
# from wifi_location_df so that re-running this cell gives the same result.
wifi_location_renamed_df = wifi_location_df
for source_column, target_column in wifi_location_column_renames.items():
    wifi_location_renamed_df = wifi_location_renamed_df.withColumnRenamed(source_column, target_column)

In [0]:
# Pass the renamed frame through add_ingestion_date (brought in by the %run of
# ../utils/custom_functions; presumably adds a "wfi"-prefixed ingestion
# timestamp column -- confirm against the helper), then tag every row with a
# constant wfi_source value.
wifi_location_final_df = (
    add_ingestion_date(wifi_location_renamed_df, alias="wfi")
    .withColumn("wfi_source", lit("data.gov"))
)

In [0]:
# Storage location and catalog name of the bronze Wi-Fi locations table.
wifi_location_bronze_path = "/mnt/mtasubwaydl/bronze/wifi_location"
wifi_location_bronze_table = "mta_bronze.wifi_location"

# Replace the Delta data at the bronze path, partitioned by borough.
(
    wifi_location_final_df.write
    .format("delta")
    .mode("overwrite")
    .partitionBy("wfi_borough")
    .save(wifi_location_bronze_path)
)

# Register the path in the catalog only when the table is not there yet;
# later runs just overwrite the files behind the existing table.
wifi_location_table_registered = spark.catalog.tableExists(wifi_location_bronze_table)
if not wifi_location_table_registered:
    spark.catalog.createTable(
        tableName=wifi_location_bronze_table,
        path=wifi_location_bronze_path,
        source="delta",
    )

In [0]:
%sql
-- Quick visual check of the freshly written bronze table (first 20 rows).
SELECT *
FROM mta_bronze.wifi_location
LIMIT 20;

In [0]:
# End the notebook run and hand the string "Success" back as its exit value,
# so a caller that launched this notebook can check the outcome.
dbutils.notebook.exit("Success")