In [1]:
import ConnectionConfig as cc
from pyspark.sql.functions import arrays_overlap, split, when, col, expr
from pyspark.sql.functions import date_format
import math
from pyspark.sql.functions import udf
from pyspark.sql.types import DoubleType
from delta import DeltaTable

# Project-local setup hook from the ConnectionConfig module (imported as `cc`).
# NOTE(review): presumably prepares environment variables/paths needed before a
# Spark session is created — confirm in ConnectionConfig.
cc.setupEnvironment()

In [2]:
# Obtain the session object used by every cell below (`spark.read`, `spark.sql`,
# `spark.stop`) from the project-local ConnectionConfig helper.
# NOTE(review): the arguments look like an application name and a core/partition
# count — confirm against ConnectionConfig.startLocalCluster.
spark = cc.startLocalCluster("analyse", 4)
# Last expression of the cell, so the notebook renders the active session.
spark.getActiveSession()

In [3]:
# EXTRACT
# Read the ride fact table and its four dimension tables from the local Delta
# warehouse, then expose each one as a temp view so the Spark SQL cells below
# can refer to it by name.

factRide = spark.read.load("./spark-warehouse/fact_rit", format="delta")

dim_weather = spark.read.load("spark-warehouse/dim_weather", format="delta")
dim_klant = spark.read.load("spark-warehouse/dim_klant", format="delta")
dim_slot = spark.read.load("spark-warehouse/dim_slot", format="delta")
dim_date = spark.read.load("spark-warehouse/dimdate", format="delta")

# (view name, DataFrame) pairs, registered in the same order as before.
for view_name, frame in (
    ("viewRide", factRide),
    ("viewWeather", dim_weather),
    ("viewKlant", dim_klant),
    ("viewSlot", dim_slot),
    ("viewDate", dim_date),
):
    frame.createOrReplaceTempView(view_name)

# Wat zijn de drukke momenten (op dagbasis) in de week t.o.v. het weekend?

In [4]:
# TRANSFORM

# Count rides per calendar date (fact rows joined to the date dimension on
# date_SK), label each date with its weekday name ('EEEE' pattern), and sort the
# busiest dates first.
druk = spark.sql("""
select count(*) as count, d.CalendarDate, date_format(d.CalendarDate, 'EEEE') as CalendarDay from viewRide r
inner join viewDate d on r.date_SK = d.date_SK
group by d.CalendarDate order by count desc
""")
# Print a preview of the result (show() prints the top 20 rows by default).
druk.show()

+-----+------------+-----------+
|count|CalendarDate|CalendarDay|
+-----+------------+-----------+
| 7891|  2022-06-21|    Tuesday|
| 7891|  2021-02-14|     Sunday|
| 7407|  2020-10-13|    Tuesday|
| 7084|  2023-01-30|     Monday|
| 7058|  2022-09-12|     Monday|
| 7030|  2020-08-04|    Tuesday|
| 6871|  2022-08-01|     Monday|
| 6761|  2020-07-27|     Monday|
| 6688|  2021-11-23|    Tuesday|
| 6505|  2022-01-11|    Tuesday|
| 6366|  2021-02-28|     Sunday|
| 6332|  2020-07-20|     Monday|
| 6330|  2023-07-30|     Sunday|
| 6221|  2020-04-21|    Tuesday|
| 6179|  2021-05-24|     Monday|
| 6139|  2022-12-06|    Tuesday|
| 6139|  2023-07-09|     Sunday|
| 6126|  2021-05-18|    Tuesday|
| 6053|  2022-05-15|     Sunday|
| 6022|  2021-09-12|     Sunday|
+-----+------------+-----------+
only showing top 20 rows



In [5]:


# Persist the per-day ride counts as a Delta table, replacing the output of any previous run.
druk.write.save("spark-warehouse/analyse/drukke_momenten_per_dag", format="delta", mode="overwrite")

#### De drukste momenten zijn op maandag en dinsdag in de week en op zondag in het weekend.

# Hebben datumparameters invloed op de afgelegde afstand?

In [6]:
# Sum the ridden distance (distance_km) per calendar date, label each date with
# its weekday name, and sort the dates with the largest total distance first.
distance_traveled = spark.sql("""
select sum(distance_km) as distance, d.CalendarDate, date_format(d.CalendarDate, 'EEEE') as CalendarDay from viewRide r
inner join viewDate d on r.date_SK = d.date_SK
group by d.CalendarDate
order by distance desc
""")
# Print a preview of the result (top 20 rows by default).
distance_traveled.show()

+------------------+------------+-----------+
|          distance|CalendarDate|CalendarDay|
+------------------+------------+-----------+
|21576.858531696795|  2022-06-21|    Tuesday|
|21185.390099687498|  2021-02-14|     Sunday|
| 20341.04235682846|  2020-10-13|    Tuesday|
|19528.076133437404|  2020-08-04|    Tuesday|
|18925.975675413167|  2022-09-12|     Monday|
|18557.910692016463|  2021-11-23|    Tuesday|
| 18453.26930116918|  2023-01-30|     Monday|
|  18236.3413793343|  2020-07-27|     Monday|
|18230.477983224206|  2022-08-01|     Monday|
|18020.809159663004|  2022-01-11|    Tuesday|
| 17357.25206993964|  2020-07-20|     Monday|
|  17248.8494071148|  2021-02-28|     Sunday|
|16890.022233199972|  2020-04-21|    Tuesday|
|16852.087513168794|  2022-12-06|    Tuesday|
|16784.411539023462|  2020-01-12|     Sunday|
|16759.522001199268|  2022-04-25|     Monday|
| 16739.50464120908|  2023-07-30|     Sunday|
| 16737.26978563482|  2021-05-18|    Tuesday|
| 16658.61235561652|  2021-05-24| 

In [7]:
# Persist the per-day distance totals as a Delta table, replacing the output of any previous run.
distance_traveled.write.save("spark-warehouse/analyse/afgelegde_afstand_per_dag", format="delta", mode="overwrite")

#### De afgelegde afstand is het grootst op maandag en dinsdag in de week en zondag in het weekend.

# Heeft weer invloed op ritten?

In [10]:
# Count rides per value of the fact table's weather_descriptor_SK column.
# The weather dimension view (viewWeather) is not joined here; the grouping
# value is read straight from viewRide.
weer = spark.sql("""
select count(*) as count, weather_descriptor_SK from viewRide
group by weather_descriptor_SK
""")
# Print the result (no ORDER BY, so row order is not guaranteed).
weer.show()

+-------+---------------------+
|  count|weather_descriptor_SK|
+-------+---------------------+
|3828645|    weertype onbekend|
|    129|            aangenaam|
|    159|          onaangenaam|
|    154|             neutraal|
+-------+---------------------+



In [11]:
# Persist the ride counts per weather descriptor as a Delta table, replacing the output of any previous run.
weer.write.save("spark-warehouse/analyse/weer_invloed_op_ritten", format="delta", mode="overwrite")

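#### Het weer heeft geen aantoonbare invloed op de ritten (bij ons): voor bijna alle ritten (3.828.645) is het weertype onbekend, waardoor de overige categorieën te klein zijn om er conclusies uit te trekken.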

# Wat is de invloed van de woonplaats van de gebruikers op het gebruik van de vehicles?

In [12]:
# Count rides per customer, carrying the customer's address along, and sort the
# customers with the most rides first.
# NOTE(review): the join matches the fact column klant_id_SK against
# viewKlant.userid. If viewKlant can hold more than one row per userid (e.g.
# historised addresses), each ride is counted once per matching row — verify
# the key relationship against the dimension's schema.
invloed = spark.sql("""
select count(*) as count, r.klant_id_SK, k.address from viewRide r
inner join viewKlant k on r.klant_id_SK = k.userid
group by r.klant_id_SK, k.address
order by count desc
""")
# Print a preview of the result (top 20 rows, long strings truncated by default).
invloed.show()

+-----+-----------+--------------------+
|count|klant_id_SK|             address|
+-----+-----------+--------------------+
| 2672|      43968|Oudesteenweg 108 ...|
| 2552|      19109|Levergem 150 0005...|
| 2456|      15229|Marnix Gijsenlaan...|
| 2432|        355|Kapeldijkstraat 1...|
| 2392|      37367|Mexicobruggen 120...|
| 2376|      28331|Raapstraat 62 , 2...|
| 2360|      37660|Kunstambachtslaan...|
| 2352|      29289|Blokkersdijkstraa...|
| 2344|       9344|Haantjeslei 155 ,...|
| 2328|      54381|Jan van de Wervel...|
| 2320|       6859|Salamander 47 , 2...|
| 2280|      54620|Stokerijstraat 12...|
| 2264|      25607|Kloetstraat 52 , ...|
| 2256|       5750|Kattendijkbrug 14...|
| 2248|      26437|Hazarddam 156 , 9...|
| 2232|      48252|Kapellenboslei 10...|
| 2192|      25327|Melgesdreef 70 , ...|
| 2184|      51860|Laurentia Poststr...|
| 2144|      34805|De Winkelhaak 194...|
| 2080|      19475|Eiermarkt 17 0105...|
+-----+-----------+--------------------+
only showing top

In [13]:
# LOAD

# Persist the ride counts per customer/address as a Delta table, replacing the output of any previous run.
invloed.write.save("spark-warehouse/analyse/invloed_woonplaats_adres", format="delta", mode="overwrite")

### Gebruiker 43968 maakt de meeste ritten.

# We willen voorspellen welke sloten preventief onderhoud nodig hebben. Bekijk hoe vaak slotnummers relatief gezien gebruikt worden.

In [14]:
# TRANSFORM

# Count rides per start slot (start_slot_id_SK) and sort the most frequently
# used slots first.
onderhoud = spark.sql("""
select count(*) as count, r.start_slot_id_SK from viewRide r
group by r.start_slot_id_SK
order by count desc
""")
# Print a preview of the result (top 20 rows by default).
onderhoud.show()

+-----+----------------+
|count|start_slot_id_SK|
+-----+----------------+
|12078|            1548|
|11996|            3258|
|11956|            3024|
|11939|             270|
|11928|            2682|
|11899|            2772|
|11890|            2718|
|11873|            2790|
|11867|            2825|
|11863|             738|
|11858|             774|
|11856|            2538|
|11840|            2969|
|11825|            2988|
|11825|            2358|
|11824|            2430|
|11813|            1530|
|11804|             809|
|11799|             702|
|11793|            1926|
+-----+----------------+
only showing top 20 rows



In [15]:
# LOAD

# Persist the usage counts per start slot as a Delta table, replacing the output of any previous run.
onderhoud.write.save("spark-warehouse/analyse/slot_onderhoud", format="delta", mode="overwrite")

#### Slot 1548 wordt het meest gebruikt.

# Als een klant zijn abonnement stopzet, willen we kunnen voorspellen op welke stations dit het meeste effect zal hebben.

In [16]:
# TRANSFORM

# Count rides (non-null klant_id_SK values) per zipcode of the slot where the
# ride started, sorted with the highest count first.
# NOTE(review): the result is grouped by zipcode, not by station, although the
# question above asks about stations — confirm this is intended.
# NOTE(review): viewKlant contributes no selected column; as an inner join it
# only drops rides without a matching userid (and would multiply counts if
# userid is not unique in viewKlant) — verify whether the join is needed.
stop = spark.sql("""
select count(r.klant_id_SK) as count, s.zipcode from viewRide r
inner join viewKlant k on r.klant_id_SK = k.userid
inner join viewSlot s on r.start_slot_id_SK = s.lockid
group by s.zipcode
order by count desc
""")
# Print a preview of the result (top 20 rows by default).
stop.show()

+-------+-------+
|  count|zipcode|
+-------+-------+
|1070672|   2000|
| 727079|   2018|
| 408388|   2060|
| 342507|   2600|
| 263608|   2140|
| 239664|   2100|
| 198620|   2030|
| 179201|   2660|
| 127082|   2610|
| 103545|   2050|
|  96081|   2170|
|  67579|   2020|
+-------+-------+



In [17]:
# LOAD

# Persist the ride counts per start-slot zipcode as a Delta table, replacing the output of any previous run.
stop.write.save("spark-warehouse/analyse/stop_abonnement_station", format="delta", mode="overwrite")

#### Het meeste effect zal het hebben op postcode 2000.

# Eigen Analyse Vragen

# Welk abonnementstype wordt het meest gebruikt?

In [18]:
# TRANSFORM

# Count the rows of the customer dimension (viewKlant) per subscriptiontypeid,
# most common subscription type first. This counts customer rows, not rides.
abo = spark.sql("""
select count(*) as count, subscriptiontypeid from viewKlant group by subscriptiontypeid order by count desc
""")
# Print the result.
abo.show()

+-----+------------------+
|count|subscriptiontypeid|
+-----+------------------+
|43037|                 3|
| 8724|                 2|
| 8507|                 1|
+-----+------------------+



In [19]:
# LOAD

# Persist the customer counts per subscription type as a Delta table, replacing the output of any previous run.
abo.write.save("spark-warehouse/analyse/pop_subscription_type", format="delta", mode="overwrite")

#### Het abonnementstype 3 wordt veruit het meest gebruikt.

# Welke stations worden het meest gebruikt?

In [20]:
# TRANSFORM

# Count rides per station number of the slot where the ride started (fact rows
# joined to the slot dimension on start_slot_id_SK = lockid), busiest first.
pop = spark.sql("""
select count(*) as count, s.stationnr from viewRide r
inner join viewSlot s on r.start_slot_id_SK = s.lockid
group by stationnr
order by count desc
""")
# Print a preview of the result (top 20 rows by default).
pop.show()

+-----+---------+
|count|stationnr|
+-----+---------+
|41558|      227|
|41406|      305|
|41370|      225|
|40062|      307|
|39893|      033|
|38919|      123|
|38767|      043|
|38660|      059|
|38447|      061|
|38252|      055|
|38179|      137|
|38162|      203|
|38156|      050|
|38088|      105|
|38030|      038|
|38023|      127|
|37910|      133|
|37910|      224|
|37870|      051|
|37558|      048|
+-----+---------+
only showing top 20 rows



In [21]:
# LOAD

# Persist the ride counts per start station as a Delta table, replacing the output of any previous run.
pop.write.save("spark-warehouse/analyse/station_usage", format="delta", mode="overwrite")

#### Het station met nummer 227 wordt het meest gebruikt.

# In welke straat wonen de meeste mensen?

In [22]:
# TRANSFORM

# Count customer rows per (street, postcode), both parsed out of the free-text
# address column, largest group first:
#   - street:   the address up to and including its first comma, with trailing
#               spaces, digits and commas trimmed off (i.e. the house number is
#               stripped);
#   - postcode: the 4 characters starting two positions after the first comma.
# NOTE(review): substr is called with start position 0; Spark SQL is 1-based but
# appears to treat 0 like 1 — confirm against the Spark substring docs.
# NOTE(review): for an address without a comma, instr() returns 0, so street
# would be empty and postcode would be cut from position 2 of the address —
# confirm every address contains a comma.
street = spark.sql("""
select count(*) as count, 
    trim(trailing ' 1234567890 ,' from substr(address, 0, instr(address, ','))) as street, 
    substr(address, instr(address, ',')+2, 4) as postcode 
from viewKlant 
group by street, postcode 
order by count desc
""")
# Print a preview of the result (top 20 rows, long strings truncated by default).
street.show()

+-----+--------------------+--------+
|count|              street|postcode|
+-----+--------------------+--------+
|   36|                Wijk|    2530|
|   28|         Reepkenslei|    2550|
|   28| Prins Boudewijnlaan|    2650|
|   26|         Kanaaldok B|    2040|
|   25|          Liersebaan|    2240|
|   25|         Heidestraat|    2070|
|   24|          Krijgsbaan|    2640|
|   24|         Toeffelhoek|    2530|
|   23|Jozef de Veusters...|    2650|
|   23|          Sint Jozef|    2520|
|   22|     Antwerpsestraat|    2640|
|   21|           Koggelaan|    2050|
|   21|            Keerbaan|    2520|
|   21|  Broechemsesteenweg|    2520|
|   21|         Binnenbeemd|    2550|
|   21|Frans De Vriendts...|    2140|
|   21|         Van Eycklei|    2018|
|   20|       Melselestraat|    9120|
|   20|                Wijk|    2531|
|   20|      Boereveldseweg|    2070|
+-----+--------------------+--------+
only showing top 20 rows



In [23]:
# LOAD

# Persist the customer counts per street/postcode as a Delta table, replacing the output of any previous run.
street.write.save("spark-warehouse/analyse/street_population", format="delta", mode="overwrite")

#### De meeste klanten wonen in de straat "Wijk" (postcode 2530).

# Hoeveel ritten worden er per maand gemaakt?

In [24]:
# TRANSFORM

# Count rides per calendar year and month (fact rows joined to the date
# dimension on date_SK), in chronological order.
monthly = spark.sql("""
    SELECT 
        YEAR(d.CalendarDate) as year,
        MONTH(d.CalendarDate) as month,
        COUNT(*) as monthly_rides
    FROM viewRide r
    INNER JOIN viewDate d ON r.date_SK = d.date_SK
    GROUP BY year, month
    ORDER BY year, month
""")
# Print a preview of the result (top 20 rows by default).
monthly.show()

+----+-----+-------------+
|year|month|monthly_rides|
+----+-----+-------------+
|2019|    9|        35241|
|2019|   10|        86627|
|2019|   11|        84729|
|2019|   12|        80170|
|2020|    1|        80265|
|2020|    2|        68930|
|2020|    3|        83958|
|2020|    4|        73587|
|2020|    5|        78325|
|2020|    6|        71876|
|2020|    7|        90183|
|2020|    8|        72259|
|2020|    9|        74006|
|2020|   10|        84996|
|2020|   11|        81580|
|2020|   12|        76010|
|2021|    1|        76534|
|2021|    2|        80567|
|2021|    3|        87935|
|2021|    4|        79074|
+----+-----+-------------+
only showing top 20 rows



In [25]:
# LOAD

# Persist the monthly ride counts as a Delta table, replacing the output of any previous run.
monthly.write.save("spark-warehouse/analyse/monthly_rides", format="delta", mode="overwrite")

#### In september 2019 werden iets meer dan 35.000 ritten gemaakt, terwijl er in oktober van hetzelfde jaar meer dan 86.000 ritten werden gemaakt.

In [26]:
# Shut down the Spark session now that every analysis result has been written.
spark.stop()