In [0]:
from pyspark.sql.types import StructType, \
                              StructField, \
                              StringType, \
                              IntegerType, \
                              FloatType, \
                              DateType, \
                              ByteType, \
                              TimestampType

from pyspark.sql import functions as F

In [0]:
%fs ls  "/FileStore/data/"

path,name,size,modificationTime
dbfs:/FileStore/data/payments.csv,payments.csv,57666115,1688666479000
dbfs:/FileStore/data/riders.csv,riders.csv,5594949,1688666472000
dbfs:/FileStore/data/stations.csv,stations.csv,49552,1688666473000
dbfs:/FileStore/data/trips.csv,trips.csv,440125504,1688666512000


In [0]:
path = "/FileStore/data/"

In [0]:
def write_data(data, table_name):
    """
    This function helps write delta format to the bronze store.
    """
    data.write.format("delta")\
            .mode("overwrite")\
            .save(f"/bronze_data_store/{table_name}data/")
    return f"Final save path for {data} is: /bronze_data_store/{table_name}data/"

"""
def write_data(data, format_type, table_name):
    '''
    This function helps write delta format to the bronze store.
    '''
    data.write.format(str(format_type))\
            .mode("overwrite")\
            .save(f"/bronze_data_store/{table_name}data/")
"""

Out[3]: '\ndef write_data(data, format_type, table_name):\n    \'\'\'\n    This function helps write delta format to the bronze store.\n    \'\'\'\n    data.write.format(str(format_type))            .mode("overwrite")            .save(f"/bronze_data_store/{table_name}data/")\n'

In [0]:
# Create a gold data store in Delta Lake tables

def read_create_gold_table(table_name):
    """
    This function reads bronze data store and 
    then writes gold level tables.
    """
    df = spark.read.format("delta")\
            .load(f"/bronze_data_store/{table_name}data/")

    # Save as table
    df.write.format("delta")\
        .mode("overwrite")\
        .saveAsTable(f"gold_{table_name}")
    return df

In [0]:
def write_starTables(data, table_name):
    data.write.format("delta")\
        .mode("overwrite")\
        .saveAsTable(table_name)

In [0]:
schema_payment = StructType([ \
    StructField("payment_id",IntegerType(),False), \
    StructField("date",DateType(),True), \
    StructField("amount",FloatType(),True), \
    StructField("ride_id", IntegerType(), True)
  ])

paymentDf = spark.read.format("csv") \
        .option("inferSchema","false") \
        .option("header", "false") \
        .option("sep", ",") \
        .schema(schema_payment) \
        .load(path + "payments.csv")
display(paymentDf.head(5))

# Write payment df to bronze
write_data(paymentDf, "paymentDf")

payment_id,date,amount,ride_id
1,2019-05-01,9.0,1000
2,2019-06-01,9.0,1000
3,2019-07-01,9.0,1000
4,2019-08-01,9.0,1000
5,2019-09-01,9.0,1000


In [0]:
schema_rider = StructType([ \
    StructField("rider_id",IntegerType(),False), \
    StructField("first",StringType(),True), \
    StructField("last",StringType(),True), \
    StructField("address",StringType(),True), \
    StructField("birthday", DateType(),True), \
    StructField("account_start_date", DateType(),True), \
    StructField("account_end_date", DateType(),True), \
    StructField("is_member", StringType(),True)
  ])

riderDf = spark.read.format("csv") \
        .option("inferSchema","false") \
        .option("header", "false") \
        .option("sep", ",") \
        .schema(schema_rider) \
        .load(path + "riders.csv")
display(riderDf.head(5))

# Write rider df to bronze
write_data(riderDf, "riderDf")

rider_id,first,last,address,birthday,account_start_date,account_end_date,is_member
1000,Diana,Clark,1200 Alyssa Squares,1989-02-13,2019-04-23,,True
1001,Jennifer,Smith,397 Diana Ferry,1976-08-10,2019-11-01,2020-09-01,True
1002,Karen,Smith,644 Brittany Row Apt. 097,1998-08-10,2022-02-04,,True
1003,Bryan,Roberts,996 Dickerson Turnpike,1999-03-29,2019-08-26,,False
1004,Jesse,Middleton,7009 Nathan Expressway,1969-04-11,2019-09-14,,True


In [0]:
schema_station = StructType([ \
    StructField("station_id",StringType(),False), \
    StructField("name",StringType(),True), \
    StructField("latitude",FloatType(),True), \
    StructField("longitude", FloatType(), True)
  ])

stationDf = spark.read.format("csv") \
        .option("inferSchema","false") \
        .option("header", "false") \
        .option("sep", ",") \
        .schema(schema_station) \
        .load(path + "stations.csv")
display(stationDf.head(5))

# Write station df to bronze
write_data(stationDf, "stationDf")

station_id,name,latitude,longitude
525,Glenwood Ave & Touhy Ave,42.01269912719727,-87.66606140136719
KA1503000012,Clark St & Lake St,41.88579559326172,-87.631103515625
637,Wood St & Chicago Ave,41.895633697509766,-87.67206573486328
13216,State St & 33rd St,41.83473205566406,-87.62582397460938
18003,Fairbanks St & Superior St,41.895809173583984,-87.62025451660156


In [0]:
display (
    stationDf.select('station_id').distinct()
)

station_id
13192
KA1504000162
15634
15539
TA1305000022
KA1503000075
13133
TA1305000009
KA1504000135
17660


In [0]:
schema_trip = StructType([ \
    StructField("trip_id",StringType(),False), \
    StructField("rideable_type", StringType(),True), \
    StructField("started_at", TimestampType(),True), \
    StructField("ended_at", TimestampType(),True), \
    StructField("start_station_id",StringType(),True), \
    StructField("end_station_id",StringType(),True), \
    StructField("rider_id",IntegerType(),True)
  ])

tripsDf = spark.read.format("csv") \
        .option("inferSchema","false") \
        .option("header", "false") \
        .option("sep", ",") \
        .schema(schema_trip) \
        .load(path + "trips.csv")

display(tripsDf.head(5))

# Write trips df to bronze
write_data(tripsDf, "tripsDf")

trip_id,rideable_type,started_at,ended_at,start_station_id,end_station_id,rider_id
89E7AA6C29227EFF,classic_bike,2021-02-12T16:14:56.000+0000,2021-02-12T16:21:43.000+0000,525,660,71934
0FEFDE2603568365,classic_bike,2021-02-14T17:52:38.000+0000,2021-02-14T18:12:09.000+0000,525,16806,47854
E6159D746B2DBB91,electric_bike,2021-02-09T19:10:18.000+0000,2021-02-09T19:19:10.000+0000,KA1503000012,TA1305000029,70870
B32D3199F1C2E75B,classic_bike,2021-02-02T17:49:41.000+0000,2021-02-02T17:54:06.000+0000,637,TA1305000034,58974
83E463F23575F4BF,electric_bike,2021-02-23T15:07:23.000+0000,2021-02-23T15:22:37.000+0000,13216,TA1309000055,39608


In [0]:
# Create a gold data store in Delta Lake tables
read_create_gold_table("paymentDf")
read_create_gold_table("stationDf")
read_create_gold_table("riderDf")
read_create_gold_table("tripsDf")

Out[32]: DataFrame[trip_id: string, rideable_type: string, started_at: timestamp, ended_at: timestamp, start_station_id: string, end_station_id: string, rider_id: int]

In [0]:
#Transform the data into the star schema for a Gold data store

In [0]:
payment_table = spark.table("gold_paymentdf")
print(f"Payment table count is: {payment_table.count()}")
rider_table = spark.table("gold_riderdf")
print(f"Rider table count is: {rider_table.count()}")
station_table = spark.table("gold_stationdf")
print(f"Station table count is:{station_table.count()}")
trips_table = spark.table("gold_tripsdf")
print(f"Trips table count is: {trips_table.count()}")

Payment table count is: 1946607
Rider table count is: 75000
Station table count is:838
Trips table count is: 4584921


In [0]:
rider_table.columns

Out[68]: ['rider_id',
 'first',
 'last',
 'address',
 'birthday',
 'account_start_date',
 'account_end_date',
 'is_member']

In [0]:
rider_df = spark.table("gold_riderdf")
print(f"Rider table count is: {rider_df.count()}")

dimRider = rider_df.withColumn('rider_key', F.col('rider_id'))\
        .select(['rider_id',
                'first',
                'last',
                'address',
                'account_start_date',
                'account_end_date',
                'is_member']
                )
display(dimRider.head(5))

# write_starTables(dimRider, "dimRider")

Rider table count is: 75000


rider_id,first,last,address,account_start_date,account_end_date,is_member
1000,Diana,Clark,1200 Alyssa Squares,2019-04-23,,True
1001,Jennifer,Smith,397 Diana Ferry,2019-11-01,2020-09-01,True
1002,Karen,Smith,644 Brittany Row Apt. 097,2022-02-04,,True
1003,Bryan,Roberts,996 Dickerson Turnpike,2019-08-26,,False
1004,Jesse,Middleton,7009 Nathan Expressway,2019-09-14,,True


In [0]:
station_df = spark.table("gold_stationdf")
# print(type(station_df))
print(f"Station table count is: {station_table.count()}")

dimStation = station_df.withColumn('station_key', F.col('station_id'))\
        .select("*")
# display(dimStation.head(5))

write_starTables(dimStation, "dimStation")

Station table count is: 838


In [0]:
# Create trip fact table
trips_df = spark.table("gold_tripsdf")
print(f"Trips table count is: {trips_table.count()}")

trip_facts = trips_df.select("*")\
        .withColumn('station_key', F.col('station_id'))
# display(dimStation.head(5))

write_starTables(trip_facts, "trip_facts")

In [0]:
# Payment Fact table
payment_df = spark.table("gold_paymentdf")
print(f"Payment table count is: {payment_table.count()}")

payment_df = payment_df.withColumnRenamed("payment_key", "payment_id")

payment_facts = payment_df.withColumn("payment_key", F.col('payment_id'))\
    .select("*")
display(payment_facts.head(5))


Payment table count is: 1946607


payment_id,date,amount,ride_id,payment_key
1,2019-05-01,9.0,1000,1
2,2019-06-01,9.0,1000,2
3,2019-07-01,9.0,1000,3
4,2019-08-01,9.0,1000,4
5,2019-09-01,9.0,1000,5


In [0]:
payment_df_demo = payment_df.join(rider_df, 
                                  rider_df.rider_id == payment_df.ride_id, 
                                  how="inner")\
                            .join("2nd_table",
                                  "station_df.station_id == 2nd_table_id"
                                  how="inner")
                            .select('payment_id', 'is_member')
payment_df_demo.display()

payment_id,is_member
1,True
2,True
3,True
4,True
5,True
6,True
7,True
8,True
9,True
10,True


In [0]:
# Create dimDate table




In [0]:
payment_table.createOrReplaceTempView("payment_tbl")

In [0]:
spark.sql(
    '''
        SELECT *
        FROM payment_tbl 
        LIMIT 10
    ''').display()

payment_id,date,amount,ride_id
1,2019-05-01,9.0,1000
2,2019-06-01,9.0,1000
3,2019-07-01,9.0,1000
4,2019-08-01,9.0,1000
5,2019-09-01,9.0,1000
6,2019-10-01,9.0,1000
7,2019-11-01,9.0,1000
8,2019-12-01,9.0,1000
9,2020-01-01,9.0,1000
10,2020-02-01,9.0,1000


In [0]:
%sql
SELECT * 
FROM payment_tbl

payment_id,date,amount,ride_id
1,2019-05-01,9.0,1000
2,2019-06-01,9.0,1000
3,2019-07-01,9.0,1000
4,2019-08-01,9.0,1000
5,2019-09-01,9.0,1000
6,2019-10-01,9.0,1000
7,2019-11-01,9.0,1000
8,2019-12-01,9.0,1000
9,2020-01-01,9.0,1000
10,2020-02-01,9.0,1000
