In [0]:
from pyspark.sql.types import StructType, \
                              StructField, \
                              StringType, \
                              IntegerType, \
                              FloatType, \
                              DateType, \
                              ByteType, \
                              TimestampType
from pyspark.sql.functions import *

In [0]:
%fs ls  "/FileStore/raw_data"

path,name,size,modificationTime
dbfs:/FileStore/raw_data/payments.csv,payments.csv,57666115,1688746378000
dbfs:/FileStore/raw_data/riders.csv,riders.csv,5594949,1688746366000
dbfs:/FileStore/raw_data/stations.csv,stations.csv,49552,1688746366000
dbfs:/FileStore/raw_data/trips.csv,trips.csv,440125504,1688746435000


In [0]:
path = "/FileStore/raw_data/"

In [0]:
def write_data(data, table_name):
    """
    This function helps write delta format to the bronze store.
    """
    data.write.format("delta")\
            .mode("overwrite")\
            .save(f"/bronze_data_store/{table_name}data/")
    return f"Final save path for {table_name} is: /bronze_data_store/{table_name}data/"

# """
# def write_data(data, format_type, table_name):
#     '''
#     This function helps write delta format to the bronze store.
#     '''
#     data.write.format(str(format_type))\
#             .mode("overwrite")\
#             .save(f"/bronze_data_store/{table_name}data/")
# """

In [0]:
# Create a gold data store in Delta Lake tables

def read_create_gold_table(table_name):
    """
    This function reads bronze data store and 
    then writes gold level tables.
    """
    df = spark.read.format("delta")\
            .load(f"/bronze_data_store/{table_name}data/")

    # Save as table
    df.write.format("delta")\
        .mode("overwrite")\
        .saveAsTable(f"gold_{table_name}")
    return df

In [0]:
def write_starTables(data, table_name):
    data.write.format("delta")\
        .mode("overwrite")\
        .saveAsTable(table_name)

In [0]:
schema_payment = StructType([ \
    StructField("payment_id",IntegerType(),False), \
    StructField("date",DateType(),True), \
    StructField("amount",FloatType(),True), \
    StructField("ride_id", IntegerType(), True)
  ])

paymentDf = spark.read.format("csv") \
        .option("inferSchema","false") \
        .option("header", "false") \
        .option("sep", ",") \
        .schema(schema_payment) \
        .load(path + "payments.csv")
display(paymentDf.head(5))

# Write payment df to bronze
write_data(paymentDf, "paymentDf")

payment_id,date,amount,ride_id
1,2019-05-01,9.0,1000
2,2019-06-01,9.0,1000
3,2019-07-01,9.0,1000
4,2019-08-01,9.0,1000
5,2019-09-01,9.0,1000


Out[6]: 'Final save path for paymentDf is: /bronze_data_store/paymentDfdata/'

In [0]:
schema_rider = StructType([ \
    StructField("rider_id",IntegerType(),False), \
    StructField("first",StringType(),True), \
    StructField("last",StringType(),True), \
    StructField("address",StringType(),True), \
    StructField("birthday", DateType(),True), \
    StructField("account_start_date", DateType(),True), \
    StructField("account_end_date", DateType(),True), \
    StructField("is_member", StringType(),True)
  ])

riderDf = spark.read.format("csv") \
        .option("inferSchema","false") \
        .option("header", "false") \
        .option("sep", ",") \
        .schema(schema_rider) \
        .load(path + "riders.csv")
display(riderDf.head(5))

# Write rider df to bronze
write_data(riderDf, "riderDf")

rider_id,first,last,address,birthday,account_start_date,account_end_date,is_member
1000,Diana,Clark,1200 Alyssa Squares,1989-02-13,2019-04-23,,True
1001,Jennifer,Smith,397 Diana Ferry,1976-08-10,2019-11-01,2020-09-01,True
1002,Karen,Smith,644 Brittany Row Apt. 097,1998-08-10,2022-02-04,,True
1003,Bryan,Roberts,996 Dickerson Turnpike,1999-03-29,2019-08-26,,False
1004,Jesse,Middleton,7009 Nathan Expressway,1969-04-11,2019-09-14,,True


Out[7]: 'Final save path for riderDf is: /bronze_data_store/riderDfdata/'

In [0]:
schema_station = StructType([ \
    StructField("station_id",StringType(),False), \
    StructField("name",StringType(),True), \
    StructField("latitude",FloatType(),True), \
    StructField("longitude", FloatType(), True)
  ])

stationDf = spark.read.format("csv") \
        .option("inferSchema","false") \
        .option("header", "false") \
        .option("sep", ",") \
        .schema(schema_station) \
        .load(path + "stations.csv")
display(stationDf.head(5))

# Write station df to bronze
write_data(stationDf, "stationDf")

station_id,name,latitude,longitude
525,Glenwood Ave & Touhy Ave,42.01269912719727,-87.66606140136719
KA1503000012,Clark St & Lake St,41.88579559326172,-87.631103515625
637,Wood St & Chicago Ave,41.895633697509766,-87.67206573486328
13216,State St & 33rd St,41.83473205566406,-87.62582397460938
18003,Fairbanks St & Superior St,41.895809173583984,-87.62025451660156


Out[8]: 'Final save path for stationDf is: /bronze_data_store/stationDfdata/'

In [0]:
schema_trip = StructType([ \
    StructField("trip_id",StringType(),False), \
    StructField("rideable_type", StringType(),True), \
    StructField("started_at", TimestampType(),True), \
    StructField("ended_at", TimestampType(),True), \
    StructField("start_station_id",StringType(),True), \
    StructField("end_station_id",StringType(),True), \
    StructField("rider_id",IntegerType(),True)
  ])

tripsDf = spark.read.format("csv") \
        .option("inferSchema","false") \
        .option("header", "false") \
        .option("sep", ",") \
        .schema(schema_trip) \
        .load(path + "trips.csv")

display(tripsDf.head(5))

# Write trips df to bronze
write_data(tripsDf, "tripsDf")

trip_id,rideable_type,started_at,ended_at,start_station_id,end_station_id,rider_id
89E7AA6C29227EFF,classic_bike,2021-02-12T16:14:56.000+0000,2021-02-12T16:21:43.000+0000,525,660,71934
0FEFDE2603568365,classic_bike,2021-02-14T17:52:38.000+0000,2021-02-14T18:12:09.000+0000,525,16806,47854
E6159D746B2DBB91,electric_bike,2021-02-09T19:10:18.000+0000,2021-02-09T19:19:10.000+0000,KA1503000012,TA1305000029,70870
B32D3199F1C2E75B,classic_bike,2021-02-02T17:49:41.000+0000,2021-02-02T17:54:06.000+0000,637,TA1305000034,58974
83E463F23575F4BF,electric_bike,2021-02-23T15:07:23.000+0000,2021-02-23T15:22:37.000+0000,13216,TA1309000055,39608


Out[9]: 'Final save path for tripsDf is: /bronze_data_store/tripsDfdata/'

In [0]:
# Create a gold data store in Delta Lake tables
read_create_gold_table("paymentDf")
read_create_gold_table("stationDf")
read_create_gold_table("riderDf")
read_create_gold_table("tripsDf")

Out[10]: DataFrame[trip_id: string, rideable_type: string, started_at: timestamp, ended_at: timestamp, start_station_id: string, end_station_id: string, rider_id: int]

In [0]:
#Transform the data into the star schema for a Gold data store

In [0]:
payment_table = spark.table("gold_paymentdf")
print(f"Payment table count is: {payment_table.count()}")
rider_table = spark.table("gold_riderdf")
print(f"Rider table count is: {rider_table.count()}")
station_table = spark.table("gold_stationdf")
print(f"Station table count is:{station_table.count()}")
trips_table = spark.table("gold_tripsdf")
print(f"Trips table count is: {trips_table.count()}")

Payment table count is: 1946607
Rider table count is: 75000
Station table count is:838
Trips table count is: 4584921


In [0]:
rider_df = spark.table("gold_riderdf")
print(f"Rider table count is: {rider_df.count()}")

dimRider = rider_df.withColumn('rider_key', col('rider_id'))\
        .select(['rider_id',
                'first',
                'last',
                'address',
                'account_start_date',
                'account_end_date',
                'is_member']
                )
display(dimRider.head(50))

write_starTables(dimRider, "dimRider")

Rider table count is: 75000


rider_id,first,last,address,account_start_date,account_end_date,is_member
1000,Diana,Clark,1200 Alyssa Squares,2019-04-23,,True
1001,Jennifer,Smith,397 Diana Ferry,2019-11-01,2020-09-01,True
1002,Karen,Smith,644 Brittany Row Apt. 097,2022-02-04,,True
1003,Bryan,Roberts,996 Dickerson Turnpike,2019-08-26,,False
1004,Jesse,Middleton,7009 Nathan Expressway,2019-09-14,,True
1005,Christine,Rodriguez,224 Washington Mills Apt. 467,2020-03-24,,False
1006,Alicia,Taylor,1137 Angela Locks,2020-11-27,2021-12-01,True
1007,Benjamin,Fernandez,979 Phillips Ways,2016-12-11,,False
1008,John,Crawford,7691 Evans Court,2021-03-28,2021-07-01,True
1009,Victoria,Ritter,9922 Jim Crest Apt. 319,2020-06-12,2021-11-01,True


In [0]:
station_df = spark.table("gold_stationdf")
# print(type(station_df))
print(f"Station table count is: {station_table.count()}")

dimStation = station_df.withColumn('station_key', col('station_id'))\
        .select("*")
display(dimStation.head(5))

write_starTables(dimStation, "dimStation")

Station table count is: 838


station_id,name,latitude,longitude,station_key
525,Glenwood Ave & Touhy Ave,42.01269912719727,-87.66606140136719,525
KA1503000012,Clark St & Lake St,41.88579559326172,-87.631103515625,KA1503000012
637,Wood St & Chicago Ave,41.895633697509766,-87.67206573486328,637
13216,State St & 33rd St,41.83473205566406,-87.62582397460938,13216
18003,Fairbanks St & Superior St,41.895809173583984,-87.62025451660156,18003


In [0]:
# Payment Fact table
payment_df = spark.table("gold_paymentdf")
print(f"Payment table count is: {payment_table.count()}")

payment_df = payment_df.withColumnRenamed("payment_key", "payment_id")

payment_facts = payment_df.select(
                        col("payment_id").alias("payment_key"),
                        col("amount").alias("amount"),
                        col("ride_id").alias("rider_key"),
                        date_format(col("date"), "yyyyMMdd").alias("date_key"))
display(payment_facts.head(15))

write_starTables(payment_facts, "payment_facts")

Payment table count is: 1946607


payment_key,amount,rider_key,date_key
1,9.0,1000,20190501
2,9.0,1000,20190601
3,9.0,1000,20190701
4,9.0,1000,20190801
5,9.0,1000,20190901
6,9.0,1000,20191001
7,9.0,1000,20191101
8,9.0,1000,20191201
9,9.0,1000,20200101
10,9.0,1000,20200201


In [0]:
# Create dimDate table
distinct_date_key = payment_df.select(date_format(col("date"), "yyyyMMdd").alias("date_key")).distinct()

# Calculate other columns and join with distinct_date_key
dimDate = payment_df.join(distinct_date_key,
                          (date_format(payment_df.date, "yyyyMMdd")) == distinct_date_key.date_key, 
                          "inner") \
    .select(
        col("date_key"), 
        col("date"), 
        year("date").alias("year"), 
        quarter("date").alias("quarter"),
        month("date").alias("month"),                
        dayofweek("date").alias("dayofweek"),
        weekofyear("date").alias("week")
    ) \
    .withColumn("is_weekend", when((dayofweek("date") == 6) | (dayofweek("date") == 7), True).otherwise(False))
dimDate.display(50)

write_starTables(dimDate, "dimDate")

date_key,date,year,quarter,month,dayofweek,week,is_weekend
20130601,2013-06-01,2013,2,6,7,22,True
20130601,2013-06-01,2013,2,6,7,22,True
20130601,2013-06-01,2013,2,6,7,22,True
20130601,2013-06-01,2013,2,6,7,22,True
20130601,2013-06-01,2013,2,6,7,22,True
20130601,2013-06-01,2013,2,6,7,22,True
20130601,2013-06-01,2013,2,6,7,22,True
20130601,2013-06-01,2013,2,6,7,22,True
20130601,2013-06-01,2013,2,6,7,22,True
20130601,2013-06-01,2013,2,6,7,22,True


In [0]:
# Create trip fact table
trips_df = spark.table("gold_tripsdf")
print(f"Trips table count is: {trips_table.count()}")

trip_facts = trips_df.join(
    rider_df, 
    rider_df.rider_id == trips_df.rider_id,
    how="inner")\
    .join(
        payment_df,
        payment_df.ride_id== trips_df.rider_id,
        "inner")\
    .select(
    col("trip_id").alias("trip_key")
    , ((col("ended_at").cast("long") - col("started_at").cast("long"))/60).alias("trip_duration")
    , (year(trips_df.ended_at) - year(rider_df.birthday)).alias("rider_age_at_trip")
    , col("start_station_id").alias("station_key")
    , (rider_df.rider_id).alias("rider_key")
    , (date_format(payment_df.date, "yyyyMMdd")).alias("date_key")
    , (trips_df.rideable_type).alias("rideable_type")
    , col("started_at").alias("started_at")
    , col("ended_at").alias("ended_at")
    , (trips_df.start_station_id).alias("start_station_id")
    , (trips_df.end_station_id).alias("end_station_id")
    )
display(trip_facts.head(500))

write_starTables(trip_facts, "trip_facts")

Trips table count is: 4584921


trip_key,trip_duration,rider_age_at_trip,station_key,rider_key,date_key,rideable_type,started_at,ended_at,start_station_id,end_station_id
89E7AA6C29227EFF,6.783333333333333,38,525,71934,20220201,classic_bike,2021-02-12T16:14:56.000+0000,2021-02-12T16:21:43.000+0000,525,660
89E7AA6C29227EFF,6.783333333333333,38,525,71934,20220101,classic_bike,2021-02-12T16:14:56.000+0000,2021-02-12T16:21:43.000+0000,525,660
89E7AA6C29227EFF,6.783333333333333,38,525,71934,20211201,classic_bike,2021-02-12T16:14:56.000+0000,2021-02-12T16:21:43.000+0000,525,660
89E7AA6C29227EFF,6.783333333333333,38,525,71934,20211101,classic_bike,2021-02-12T16:14:56.000+0000,2021-02-12T16:21:43.000+0000,525,660
89E7AA6C29227EFF,6.783333333333333,38,525,71934,20211001,classic_bike,2021-02-12T16:14:56.000+0000,2021-02-12T16:21:43.000+0000,525,660
89E7AA6C29227EFF,6.783333333333333,38,525,71934,20210901,classic_bike,2021-02-12T16:14:56.000+0000,2021-02-12T16:21:43.000+0000,525,660
89E7AA6C29227EFF,6.783333333333333,38,525,71934,20210801,classic_bike,2021-02-12T16:14:56.000+0000,2021-02-12T16:21:43.000+0000,525,660
89E7AA6C29227EFF,6.783333333333333,38,525,71934,20210701,classic_bike,2021-02-12T16:14:56.000+0000,2021-02-12T16:21:43.000+0000,525,660
89E7AA6C29227EFF,6.783333333333333,38,525,71934,20210601,classic_bike,2021-02-12T16:14:56.000+0000,2021-02-12T16:21:43.000+0000,525,660
89E7AA6C29227EFF,6.783333333333333,38,525,71934,20210501,classic_bike,2021-02-12T16:14:56.000+0000,2021-02-12T16:21:43.000+0000,525,660


In [0]:
trips_df.columns

Out[18]: ['trip_id',
 'rideable_type',
 'started_at',
 'ended_at',
 'start_station_id',
 'end_station_id',
 'rider_id']

In [0]:
# print(payment_df.columns)
# print(rider_df.columns)
# print(trips_df.columns)

In [0]:
# display(
#     payment_df.select(
#                     F.col("date"), 
#                     year("date").alias("year"), 
#                     month("date").alias("month"),
#                     quarter("date").alias("quarter"),
#                     dayofweek("date").alias("dayofweek"),
#                     weekofyear("date").alias("week")
#                   )\
#               .withColumn("date_key", F.date_format(F.col("date"), "yyyyMMdd"))\
#               .withColumn("is_weekend", when((dayofweek("date") == 6) | (dayofweek("date") == 7), True).otherwise(False))
#         )

In [0]:
# payment_table.createOrReplaceTempView("payment_tbl")

In [0]:
# spark.sql(
#     '''
#         SELECT *
#         FROM payment_tbl 
#         LIMIT 10
#     ''').display()

In [0]:
# %sql
# SELECT * 
# FROM payment_tbl

## Business Questions

The business outcomes you are designing for:

  1.	Analyze how much time is spent per ride 
    o	Based on date and time factors such as day of week and time of day

    o	Based on which station is the starting and / or ending station

    o	Based on age of the rider at time of the ride

    o	Based on whether the rider is a member or a casual rider

In [0]:
cte = trip_facts.join(broadcast(dimDate),
                      trip_facts.date_key == dimDate.date_key) \
    .select(
        dimDate.dayofweek.alias("day_of_week")
        , (minute(trip_facts.started_at) - minute(trip_facts.ended_at)).alias("ride_time")
        , (when((hour(trip_facts.started_at) > 5) & (hour(trip_facts.started_at) < 12),"morning"
        ).when(
            (hour(trip_facts.started_at) >= 12) & (hour(trip_facts.started_at) < 17),"afternoon"        )
        .when(
            (hour(trip_facts.started_at) >= 17) & (hour(trip_facts.started_at) <= 23),"evening"
        ).when(
            (hour(trip_facts.started_at) >= 0) & (hour(trip_facts.started_at) < 4),"early_morning"
        ).otherwise("NA").alias("time_of_day"))
    )
display(cte)

day_of_week,ride_time,time_of_day
3,-7,afternoon
3,-7,afternoon
3,-7,afternoon
3,-7,afternoon
3,-7,afternoon
3,-7,afternoon
3,-7,afternoon
3,-7,afternoon
3,-7,afternoon
3,-7,afternoon


In [0]:
"""
The business outcomes you are designing for are as follows:
    Analyze how much time is spent per ride
        Based on date and time factors such as day of week and time of day
        Based on which station is the starting and / or ending station
        Based on age of the rider at time of the ride
        Based on whether the rider is a member or a casual rider

-- Analyze how much time is spent per ride,
--  Based on date and time factors such as day of week and time of day

"""

# Create DataFrames for trip_facts and dimDate tables
trip_facts = spark.table("trip_facts")
dimDate = spark.table("dimDate")

# Perform the necessary transformations
cte = trip_facts.join(broadcast(dimDate),
                      trip_facts.date_key == dimDate.date_key) \
    .select(
        dimDate.dayofweek.alias("day_of_week")
        , (minute(trip_facts.ended_at) - minute(trip_facts.started_at)).alias("ride_time")
        , (when((hour(trip_facts.started_at) > 5) & (hour(trip_facts.started_at) < 12),"morning"
        ).when(
            (hour(trip_facts.started_at) >= 12) & (hour(trip_facts.started_at) < 17),"afternoon"        )
        .when(
            (hour(trip_facts.started_at) >= 17) & (hour(trip_facts.started_at) <= 23),"evening"
        ).when(
            (hour(trip_facts.started_at) >= 0) & (hour(trip_facts.started_at) < 4),"early_morning"
        ).otherwise("NA").alias("time_of_day"))
    )
# cte.show(5)

# Perform the final aggregation
result = cte.groupBy(["day_of_week", "time_of_day", "ride_time"]).agg({"ride_time": "avg"})\
        .select("day_of_week"
                , "time_of_day"
                , "ride_time"
                )
# Show the result
result.show(5)

+-----------+---------+-----------+
|day_of_week|ride_time|time_of_day|
+-----------+---------+-----------+
|          3|       33|  afternoon|
|          3|       33|  afternoon|
|          3|       33|  afternoon|
|          3|       33|  afternoon|
|          3|       33|  afternoon|
+-----------+---------+-----------+
only showing top 5 rows



In [0]:
"""
----------------------------------------------------
----------------------------------------------------
-- Analyze how much time is spent per ride,
--	Based on which station is the starting and / or ending station
"""

# Create DataFrames
trip_facts = spark.table("trip_facts")

cte = trip_facts.select(
    col("start_station_id").alias("starting_station")
    , col("end_station_id").alias("end_station")
    , (minute(col("ended_at")) - minute(col("started_at"))).alias("ride_time")
)

res = cte.groupBy(["starting_station", "end_station", "ride_time"]).agg({'ride_time':'avg'}).sort(col("ride_time").desc())\
        .select(
            "starting_station"
            , "end_station"
            , col("avg(ride_time)").alias("avg_ride_time")
                )
res.show(5)

+----------------+------------+-------------+
|starting_station| end_station|avg_ride_time|
+----------------+------------+-------------+
|    TA1308000001|       13022|         59.0|
|           13022|       13042|         59.0|
|    TA1306000029|       13245|         59.0|
|           13008|TA1305000029|         59.0|
|           13074|TA1309000061|         59.0|
+----------------+------------+-------------+
only showing top 5 rows



In [0]:
'''
----------------------------------------------------
----------------------------------------------------
-- Analyze how much time is spent per ride,
--	   Based on age of the rider at time of the ride
'''

# Create DataFrames
trip_facts = spark.table("trip_facts")

cte = trip_facts.select(

    (trip_facts.rider_age_at_trip).alias("age")
    , (trip_facts.trip_duration).alias("time_per_ride")
)
# cte.show(5)

# res = cte.groupBy(["starting_station", "end_station", "ride_time"]).agg({'ride_time':'avg'}).sort(col("ride_time").desc())

result = cte.groupBy(["age", "time_per_ride"]).agg({"time_per_ride":"avg"}).sort(col("age").asc(), col("time_per_ride").desc())\
            .select(
                "age"
                , col("avg(time_per_ride)").alias("avg_time_per_ride")
                    )
result.show(5)

+---+------------------+
|age| avg_time_per_ride|
+---+------------------+
| 15|25730.066666666655|
| 15|          14458.25|
| 15|           2774.25|
| 15|1733.9166666666665|
| 15|            1596.7|
+---+------------------+
only showing top 5 rows



## Business Questions

The business outcomes you are designing for:

  2.	Analyze how much money is spent 
  
    o	Per month, quarter, year

    o	Per member, based on the age of the rider at account start

In [0]:
dimDate.columns

Out[54]: ['date_key',
 'date',
 'year',
 'quarter',
 'month',
 'dayofweek',
 'week',
 'is_weekend']

In [0]:
dimDate.select("date_key").show(5)

+--------+
|date_key|
+--------+
|20130601|
|20130601|
|20130601|
|20130601|
|20130601|
+--------+
only showing top 5 rows



In [0]:
payment_facts.columns

Out[55]: ['payment_key', 'amount', 'rider_key', 'date_key']

In [0]:
'''
----------------------------------------------------
----------------------------------------------------
-- 2. Analyze how much money is spent
--		Per month, quarter, year
'''

# Create DataFrames for trip_facts and dimDate tables
payment_facts = spark.table("payment_facts")
dimDate = spark.table("dimDate")

temp_result = payment_facts.join(
    dimDate
    , dimDate.date_key == payment_facts.date_key
    , how="inner")\
    .select(
    	dimDate.year.alias("year")
		, dimDate.quarter.alias("quarter")
		, dimDate.month.alias("month")
		, payment_facts.amount.alias("amount")
) 
# temp_result.show(5)

result = temp_result.groupBy(["year","month","quarter"]).agg({"amount":"sum"}).sort(col("year").asc(),
                                                                                    col("quarter").asc(),
                                                                                    col("month").asc(),
                                                                                    (col("sum(amount)")).alias("money_spent").desc())
																				
result.show(5)														

+----+-----+-------+------------------+
|year|month|quarter|       sum(amount)|
+----+-----+-------+------------------+
|2013|    2|      1|12.899999618530273|
|2013|    3|      1| 66237.75005793571|
|2013|    4|      2| 282677.8488073349|
|2013|    5|      2| 717211.4406394958|
|2013|    6|      2|1404411.6014900208|
+----+-----+-------+------------------+
only showing top 5 rows



In [0]:
'''
----------------------------------------------------
----------------------------------------------------
-- 2. Analyze how much money is spent
--		Per member, based on the age of the rider at account start
'''
# Create DataFrames for trip_facts and dimDate tables
payment_facts = spark.table("payment_facts")
trip_facts = spark.table("trip_facts")
dimDate = spark.table("dimDate")

temp_df = payment_facts.join(
        dimDate
        , dimDate.date_key == payment_facts.date_key
        , how="inner")\
    .join(
        trip_facts
        , trip_facts.rider_key == payment_facts.rider_key
    )\
    .select(
        (trip_facts.rider_key).alias("rider_id")
        , (trip_facts.rider_age_at_trip).alias("age")
        , (payment_facts.amount).alias("amount")
    )

result_df = temp_df.groupBy(["rider_id","age"]).agg({"amount":"sum"}).sort(col("age").asc(),
                                                                           (col("sum(amount)")).alias("money_spent").desc())
result_df.show(5)

+--------+---+--------------------+
|rider_id|age|         sum(amount)|
+--------+---+--------------------+
|   12040| 15|3.184502533410892E11|
|   15523| 15|     3.1425487236E11|
|   50037| 15|    2.87918184771E11|
|   36209| 15|     2.6463128895E11|
|   43813| 15|2.492455014036479...|
+--------+---+--------------------+
only showing top 5 rows



## Business Questions

The business outcomes you are designing for:

  3.	EXTRA CREDIT - Analyze how much money is spent per member 
  
    o	Based on how many rides the rider averages per month
    
    o	Based on how many minutes the rider spends on a bike per month

In [0]:
dimRider.columns

Out[70]: ['rider_id',
 'first',
 'last',
 'address',
 'account_start_date',
 'account_end_date',
 'is_member']

In [0]:
'''
-- 3. EXTRA CREDIT - Analyze how much money is spent per member
--	    Based on how many rides the rider averages per month
''' 
# SELECT 
# 	DISTINCT r.rider_id as rider,
# 	dt.month
# 	, COUNT(*) as num_of_rides
# 	, SUM(fp.amount) AS total_spent_monthly
# 	--, SUM(fp.amount)/COUNT(*) as avg_amount -- 356790
# 	 , AVG(fp.amount) as avg_spent_monthly_per_ride
# FROM fact_payment fp
# 	INNER JOIN dimRiders r
# 		ON r.rider_key = fp.rider_key
# 	INNER JOIN dimDate dt
# 			ON fp.date_key = dt.date_key
# GROUP BY r.rider_id, dt.month
# ORDER BY 3 DESC, 4 DESC, 5 DESC

# Create DataFrames for trip_facts and dimDate tables
payment_facts = spark.table("payment_facts")
dimRider = spark.table("dimRider")
dimDate = spark.table("dimDate")

temp_df = payment_facts.join(
    dimRider,
    dimRider.rider_id == payment_facts.rider_key,
    how = "inner")\
    .join(
        dimDate,
        dimDate.date_key == payment_facts.date_key
    )\
    .select(
        (dimRider.rider_id).alias("rider")
        , (dimDate.month).alias("month")
        , (payment_facts.amount).alias("amount")
    )
temp_df.show(3)

resultDf = temp_df.groupBy(
    ["rider", "month"]
).agg(
    count("*").alias("cnt"),
    sum(col("amount")).alias("money_spent"),
    avg(col("amount")).alias("avg_money_spent")

    # {"amount"}:{"avg"}
).sort(
    col("cnt").desc(),
    col("money_spent").desc(),
    col("avg_money_spent").desc()
)
resultDf.show(5)

+-----+-----+------+
|rider|month|amount|
+-----+-----+------+
| 1000|    5|   9.0|
| 1000|    5|   9.0|
| 1000|    5|   9.0|
+-----+-----+------+
only showing top 3 rows



In [0]:
'''
-- 3. EXTRA CREDIT - Analyze how much money is spent per member
--	    Based on how many minutes the rider spends on a bike per month
'''
# SELECT
# 	DISTINCT t.rider_key as rider
# 	, dt.month
# 	, t.trip_duration AS min_on_bike
# 	, COUNT(*) as num_of_rides
# 	, SUM(fp.amount) AS total_spent_monthly
# 	--, SUM(fp.amount)/COUNT(*) as avg_amount -- 356790
# 	 , AVG(fp.amount) as avg_spent_amt_per_ride
# FROM fact_payment fp
# 	INNER JOIN fact_trip t
# 		ON t.rider_key = fp.rider_key
# 	INNER JOIN dimDate dt
# 			ON fp.date_key = dt.date_key
# GROUP BY t.rider_key, dt.month, t.trip_duration
# ORDER BY 1 ASC, 2 ASC, 3 DESC, 4 DESC, 5 DESC, 6 DESC


# Create DataFrames for trip_facts and dimDate tables
payment_facts = spark.table("payment_facts")
trip_facts = spark.table("trip_facts")
dimDate = spark.table("dimDate")

temp_df = payment_facts.join(
    trip_facts,
    trip_facts.rider_key == payment_facts.rider_key,
    how = "inner")\
    .join(
        dimDate,
        dimDate.date_key == payment_facts.date_key
    )\
    .select(
        (dimRider.rider_id).alias("rider")
        , (dimDate.month).alias("month")
        , (trip_facts.trip_duration).alias("trip_duration")
        , (payment_facts.amount).alias("amount")
    )
temp_df.show(3)

resultDf = temp_df.groupBy(
    ["rider", "month"]
).agg(
    count("*").alias("cnt"),
    {"amount"}:{"sum"},
    {"amount"}:{"avg"}
).sort(
    col("cnt").desc(),
    col("sum(amount)").desc(),
    col("avg(amount)").desc()
)
resultDf.show(5)
