In [1]:
from pyspark.sql import SparkSession
from delta import configure_spark_with_delta_pip
import math
import ConnectionConfig as cc
from pyspark.sql.functions import *
from datetime import *
import pandas as pd
from pyspark.sql.types import *
import openmeteo_requests
from retry_requests import retry
import requests_cache
import os
import json
import pandas as pd
import time
from pathlib import Path


cc.setupEnvironment()

In [2]:
spark = cc.startLocalCluster("factTable",4)
spark.getActiveSession()


In [3]:
#EXTRACT
cc.set_connectionProfile("VeloBike")
print(cc.create_jdbc())

jdbc:postgresql://localhost:5433/velodb


In [4]:
## ADDING FORMULAS
def fetch_weather_data(end_date:str, latitude, longitude):
    session = requests_cache.CachedSession('.cache', expire_after=-1)
    retry_session = retry(session, retries=5, backoff_factor=0.2)
    openmeteo = openmeteo_requests.Client(session=retry_session)

    end_date_dt = datetime.strptime(end_date, "%Y-%m-%d")
    start_date_dt = end_date_dt - timedelta(days=30)
    start_date = start_date_dt.strftime("%Y-%m-%d")
    url = "https://archive-api.open-meteo.com/v1/archive"
    params = {
        "latitude": latitude,
        "longitude": longitude,
        "start_date": start_date,
        "end_date": end_date,
        "hourly": ["temperature_2m", "weather_code"],
        "timezone": "auto"
    }
    responses = openmeteo.weather_api(url, params=params)
    if not responses:
        return pd.DataFrame({
            "date": [],
            "temperature_2m": [],
            "weather_code": [],
            "error": ["No data available for the specified dates"]
        })
    response = responses[0]
    hourly = response.Hourly()
    hourly_timestamps = pd.date_range(
        start=pd.to_datetime(hourly.Time(), unit="s", utc=True),
        end=pd.to_datetime(hourly.TimeEnd(), unit="s", utc=True),
        freq=pd.Timedelta(seconds=hourly.Interval()),
        inclusive="left"
    )

    hourly_temperature_2m = hourly.Variables(0).ValuesAsNumpy()
    hourly_weather_code = hourly.Variables(1).ValuesAsNumpy()

    df = pd.DataFrame({
        "date": hourly_timestamps,
        "temperature_2m": hourly_temperature_2m,
        "weather_code": hourly_weather_code
    })

    return df


def to_weather_code_convertion(weather_dict):
    temp = weather_dict["temperature_2m"]
    code = weather_dict["weather_code"]
    print(f"Temperature: {temp}. Code: {code}")
    if float(temp) > 14 and float(code) < 1:
        return 2
    elif float(code) > 50:
        return 1
    else:
        return 3



# Haversine formula
def haversine(lat1, lon1, lat2, lon2):
    R = 6371  # Earth radius in km
    phi1 = math.radians(lat1)
    phi2 = math.radians(lat2)
    delta_phi = math.radians(lat2 - lat1)
    delta_lambda = math.radians(lon2 - lon1)
    a = math.sin(delta_phi / 2.0) ** 2 + \
        math.cos(phi1) * math.cos(phi2) * math.sin(delta_lambda / 2.0) ** 2
    c = 2 * math.atan2(math.sqrt(a), math.sqrt(1 - a))
    return R * c

haversine_udf = udf(haversine, DoubleType())
spark.udf.register("haversine_km", haversine_udf)

def convert_lat_long(coord_str):
    coord_str = coord_str.strip("()")
    latitude, longitude = map(float, coord_str.split(","))
    return latitude, longitude

def convert_to_nearest_hour(dt):
    return dt.replace(minute=0, second=0, microsecond=0)


seasonal_dates = ["2019-02-22", "2019-06-22", "2019-09-22"]
last_date_col = to_date(lit(seasonal_dates[:10]))

haversine_udf = udf(haversine, DoubleType())
spark.udf.register("haversine_km", haversine_udf)

weather_schema = StructType([
    StructField("rideid", IntegerType(), True),
    StructField("weather", IntegerType(), True)
])



rides_df = spark.read.format("jdbc").option("driver" , "org.postgresql.Driver").option("url", cc.create_jdbc()).option("dbtable", "rides").option("user", cc.get_Property("username")).option("password", cc.get_Property("password"))    .load()

locks_df = spark.read.format("jdbc").option("driver" , "org.postgresql.Driver").option("url", cc.create_jdbc()).option("dbtable", "locks").option("user", cc.get_Property("username")).option("password", cc.get_Property("password"))    .load()

stations_df  = spark.read.format("jdbc").option("driver" , "org.postgresql.Driver").option("url", cc.create_jdbc()).option("dbtable", "stations").option("user", cc.get_Property("username")).option("password", cc.get_Property("password"))    .load()

subscriptions_df = spark.read.format("jdbc").option("driver" , "org.postgresql.Driver").option("url", cc.create_jdbc()).option("dbtable", "subscriptions").option("user", cc.get_Property("username")).option("password", cc.get_Property("password"))    .load()

vehicle_df = spark.read.format("jdbc").option("driver" , "org.postgresql.Driver").option("url", cc.create_jdbc()).option("dbtable", "vehicles").option("user", cc.get_Property("username")).option("password", cc.get_Property("password"))    .load()


bikelots_df = spark.read.format("jdbc").option("driver" , "org.postgresql.Driver").option("url", cc.create_jdbc()).option("dbtable", "bikelots").option("user", cc.get_Property("username")).option("password", cc.get_Property("password"))    .load()

biketype_df = spark.read.format("jdbc").option("driver" , "org.postgresql.Driver").option("url", cc.create_jdbc()).option("dbtable", "bike_types").option("user", cc.get_Property("username")).option("password", cc.get_Property("password"))    .load()



In [5]:
valid_rides_df = rides_df.filter(rides_df.endtime >= rides_df.starttime)
valid_rides_df = valid_rides_df.filter(valid_rides_df.vehicleid.isNotNull())
valid_rides_df.count()

4138218

In [17]:
# EXTRACT
dim_lock = spark.read.format("delta").load("spark-warehouse/dimlock")
dim_date = spark.read.format("delta").load("spark-warehouse/dimdate")
dim_weather = spark.read.format("delta").load("spark-warehouse/dimweather")
dim_vehicle = spark.read.format("delta").load("spark-warehouse/dimvehicle")
dim_user = spark.read.format("delta").load("spark-warehouse/userdim")


subscriptions_df.createOrReplaceTempView("subscriptions")
valid_rides_df.createOrReplaceTempView("rides")
vehicle_df.createOrReplaceTempView("vehicles")
bikelots_df.createOrReplaceTempView("bikelots")
stations_df.createOrReplaceTempView("stations")
locks_df.createOrReplaceTempView("locks")
biketype_df.createOrReplaceTempView("bike_types")

dim_date.createOrReplaceTempView("dimDate")
dim_user.createOrReplaceTempView("dimUser")
dim_lock.createOrReplaceTempView("dimLock")
dim_weather.createOrReplaceTempView("dimWeather")
dim_vehicle.createOrReplaceTempView("dimVehicle")



In [18]:
## UNIQUE zipcodes
zipcodes_gpscoord = spark.sql("SELECT MIN(gpscoord) as gpscoord, zipcode FROM stations group by zipcode")

In [19]:

os.makedirs("weather_data", exist_ok=True)


wmo_to_main = {
    0: ("Clear", "clear sky", "01d"),
    1: ("Mainly clear", "few clouds", "02d"),
    2: ("Partly cloudy", "scattered clouds", "03d"),
    3: ("Overcast", "overcast clouds", "04d"),
    45: ("Fog", "fog", "50d"),
    48: ("Rime fog", "rime fog", "50d"),
    51: ("Drizzle", "light drizzle", "09d"),
    53: ("Drizzle", "moderate drizzle", "09d"),
    55: ("Drizzle", "dense drizzle", "09d"),
    61: ("Rain", "slight rain", "10d"),
    63: ("Rain", "moderate rain", "10d"),
    65: ("Rain", "heavy rain", "10d"),
    71: ("Snow", "slight snow", "13d"),
    73: ("Snow", "moderate snow", "13d"),
    75: ("Snow", "heavy snow", "13d"),
    80: ("Rain showers", "slight rain showers", "09d"),
    81: ("Rain showers", "moderate rain showers", "09d"),
    82: ("Rain showers", "violent rain showers", "09d"),
    95: ("Thunderstorm", "thunderstorm", "11d")
}

weather_by_zipcode = {}

for date_str in seasonal_dates:
    for row in zipcodes_gpscoord.collect():
        lat, lon = convert_lat_long(row.gpscoord)
        weather_data = fetch_weather_data(date_str, lat, lon)
        if not weather_data.empty:
            weather_data["zipcode"] = row.zipcode

            for _, weather_row in weather_data.iterrows():
                timestamp = weather_row["date"]
                key = (row.zipcode, timestamp.strftime("%Y-%m-%d %H:%M:%S"))
                weather_by_zipcode[key] = weather_row

                code = int(weather_row["weather_code"])
                temp_raw = weather_row.get("temperature_2m", None)
                try:
                    temp = round(float(temp_raw), 1) if temp_raw is not None else None
                except (ValueError, TypeError):
                    temp = None

                main, description, icon = wmo_to_main.get(code, ("Unknown", "unknown", "50d"))

                json_obj = {
                    "zipCode": int(row.zipcode),
                    "coord": {"lon": lon, "lat": lat},
                    "weather": [{
                        "id": code,
                        "main": main,
                        "description": description,
                        "icon": icon
                    }],
                    "base": "stations",
                    "dt": timestamp.isoformat(),
                    "temp": temp
                }

                file_name = f"weather_data/{row.zipcode}_{timestamp.strftime('%Y-%m-%dT%H-%M')}.json"
                with open(file_name, "w") as f:
                    json.dump(json_obj, f, indent=4)

rides_filtered = valid_rides_df.filter(
    to_date(valid_rides_df["starttime"]).isin(seasonal_dates)
).select("rideid", "starttime", "startlockid")


rides_with_locks = rides_filtered.join(
    locks_df,
    rides_filtered.startlockid == locks_df.lockid,
    "left"
).withColumnRenamed("stationid", "lock_stationid")

rides_with_stations = rides_with_locks.join(
    stations_df,
    rides_with_locks.lock_stationid == stations_df.stationid,
    "left"
)

# === Step 5: Match ride + zip + time with weather ===
weather_schema = StructType([
    StructField("rideid", IntegerType(), True),
    StructField("weather", IntegerType(), True)
])

weather_results = []

for row in rides_with_stations.collect():
    current_zipcode = row["zipcode"]
    if current_zipcode is None:
        continue

    rounded_starttime = convert_to_nearest_hour(row.starttime)
    rounded_str = rounded_starttime.strftime("%Y-%m-%d %H:%M:%S")
    key = (current_zipcode, rounded_str)

    weather_info = weather_by_zipcode.get(key)
    if weather_info is not None:
        print(f"Weather for ride {row['rideid']} at {rounded_str}:")
        print(f"Temperature: {weather_info['temperature_2m']}°C, Weather Code: {weather_info['weather_code']}")
        weather_results.append({
            "rideid": row.rideid,
            "weather": to_weather_code_convertion(weather_info)
        })
    else:
        print(f"No weather data available for ride {row['rideid']} at {rounded_str}")
        weather_results.append({"rideid": row.rideid, "weather": 4})

# === Step 6: Save Results as Spark View ===
if weather_results:
    weather_df = spark.createDataFrame(weather_results, schema=weather_schema)
    print("Saved weather data into the dataframe")
else:
    weather_df = spark.createDataFrame([], schema=weather_schema)
    print(" Created empty dataframe.")

weather_df.createOrReplaceTempView("weather_api")

Weather for ride 18 at 2019-09-22 08:00:00:
Temperature: 20.66950035095215°C, Weather Code: 3.0
Temperature: 20.66950035095215. Code: 3.0
Weather for ride 23 at 2019-09-22 08:00:00:
Temperature: 20.682498931884766°C, Weather Code: 3.0
Temperature: 20.682498931884766. Code: 3.0
Weather for ride 26 at 2019-09-22 08:00:00:
Temperature: 20.682498931884766°C, Weather Code: 3.0
Temperature: 20.682498931884766. Code: 3.0
Weather for ride 27 at 2019-09-22 08:00:00:
Temperature: 20.272001266479492°C, Weather Code: 3.0
Temperature: 20.272001266479492. Code: 3.0
Weather for ride 34 at 2019-09-22 08:00:00:
Temperature: 20.66950035095215°C, Weather Code: 3.0
Temperature: 20.66950035095215. Code: 3.0
Weather for ride 36 at 2019-09-22 08:00:00:
Temperature: 20.70199966430664°C, Weather Code: 3.0
Temperature: 20.70199966430664. Code: 3.0
Weather for ride 42 at 2019-09-22 08:00:00:
Temperature: 20.682498931884766°C, Weather Code: 3.0
Temperature: 20.682498931884766. Code: 3.0
Weather for ride 43 at 201

In [20]:
rides_fact_df = spark.sql("""
WITH base AS (
  SELECT
     haversine_km(
          CAST(split(regexp_replace(CAST(startpoint AS STRING), '[()]', ''), ',')[0] AS DOUBLE),
          CAST(split(regexp_replace(CAST(startpoint AS STRING), '[()]', ''), ',')[1] AS DOUBLE),
          CAST(split(regexp_replace(CAST(endpoint AS STRING), '[()]', ''), ',')[0] AS DOUBLE),
          CAST(split(regexp_replace(CAST(endpoint AS STRING), '[()]', ''), ',')[1] AS DOUBLE)
      ) AS ride_distance,
     (unix_timestamp(r.endtime) - unix_timestamp(r.starttime)) / 60 AS ride_duration,
     r.rideid,
     usrdim.user_sk AS user_sk,
     COALESCE(dimlk.lockid, 0) AS start_lock_id,
     COALESCE(dimlok.lockid, 0) AS end_lock_id,
     dd.dateSK as date_sk,
     dv.vehicle_SK as vehicle_sk,
     COALESCE(wapi.weather, 0) AS weather_id
  FROM rides r
  JOIN subscriptions sub ON sub.subscriptionid = r.subscriptionid

  JOIN dimUser usrdim ON sub.userid = usrdim.userid

  LEFT OUTER JOIN dimLock dimlk ON dimlk.lockid = r.startlockid
  LEFT OUTER JOIN dimLock dimlok ON dimlok.lockid = r.endlockid


  JOIN dimDate dd ON to_date(r.starttime) = dd.date

  JOIN vehicles veh on r.vehicleid = veh.vehicleid
  JOIN bikelots bkl on veh.bikelotid = bkl.bikelotid
  JOIN dimVehicle dv on bkl.biketypeid = dv.vehicle_SK

  LEFT OUTER JOIN weather_api wapi ON wapi.rideid = r.rideid
)
SELECT
   ride_distance,
   ride_duration,
   rideid,
   user_sk,
   start_lock_id,
   end_lock_id,
   date_sk,
   vehicle_sk,
   weather_id,
   md5(CONCAT(
       CAST(ride_distance AS STRING),
       CAST(ride_duration AS STRING),
       rideid,
       user_sk,
       CAST(weather_id AS STRING),
       CAST(start_lock_id AS STRING),
       CAST(end_lock_id AS STRING),
       CAST(date_sk AS STRING),
       CAST(vehicle_sk AS STRING)
   )) AS md5_hash
FROM base
""")

rides_fact_df.show(50)

+-------------------+-------------------+------+--------------------+-------------+-----------+-------+----------+----------+--------------------+
|      ride_distance|      ride_duration|rideid|             user_sk|start_lock_id|end_lock_id|date_sk|vehicle_sk|weather_id|            md5_hash|
+-------------------+-------------------+------+--------------------+-------------+-----------+-------+----------+----------+--------------------+
| 0.6990513194391876|  2.783333333333333|    17|7bb40033-08ee-477...|         2046|       1951|   1462|         1|         3|de0910aad562535a6...|
| 1.0614201762594762|  4.516666666666667|    25|605fd1a0-6bbe-4c1...|          985|       2148|   1462|         1|         3|211f7cb3059617df7...|
|  5.575740415478196|               19.6|    27|ff2042c7-6063-4c4...|         5619|       2717|   1462|         2|         3|bbfea59c83faad8bc...|
|0.48963521951239497| 1.9166666666666667|    33|72459a6f-b347-494...|         6038|       5980|   1462|         2|    

In [21]:
# removing the outliers
quantiles_df = rides_fact_df.selectExpr(
    "percentile_approx(ride_distance, array(0.25, 0.75)) as distance_quantiles",
    "percentile_approx(ride_duration, array(0.25, 0.75)) as duration_quantiles"
)

# Extract values
quantiles = quantiles_df.collect()[0]
distance_q1 = quantiles["distance_quantiles"][0]
distance_q3 = quantiles["distance_quantiles"][1]
distance_iqr = distance_q3 - distance_q1

duration_q1 = quantiles["duration_quantiles"][0]
duration_q3 = quantiles["duration_quantiles"][1]
duration_iqr = duration_q3 - duration_q1

# Apply the outlier filter (commonly 1.5 * IQR from Q1 and Q3)
rides_fact_df_no_outliers = rides_fact_df.filter(
    (col("ride_distance") >= distance_q1 - 1.5 * distance_iqr) &
    (col("ride_distance") <= distance_q3 + 1.5 * distance_iqr) &
    (col("ride_duration") >= duration_q1 - 1.5 * duration_iqr) &
    (col("ride_duration") <= duration_q3 + 1.5 * duration_iqr)
)

# formatting to make it to with 2 decimal places
rides_fact_df_final = rides_fact_df_no_outliers.withColumn(
    "ride_distance", round(col("ride_distance"), 2)
).withColumn(
    "ride_duration", round(col("ride_duration"), 2)
)



In [22]:
rides_fact_df_final.groupBy("vehicle_sk").count().show()

+----------+-------+
|vehicle_sk|  count|
+----------+-------+
|         2| 807710|
|         4|  63602|
|         1|2706079|
|         3| 256784|
+----------+-------+



In [None]:
rides_fact_df_final.count()
# result: "3834175"

In [23]:
rides_fact_df_final.show(21)

+-------------+-------------+------+--------------------+-------------+-----------+-------+----------+----------+--------------------+
|ride_distance|ride_duration|rideid|             user_sk|start_lock_id|end_lock_id|date_sk|vehicle_sk|weather_id|            md5_hash|
+-------------+-------------+------+--------------------+-------------+-----------+-------+----------+----------+--------------------+
|          0.7|         2.78|    17|7bb40033-08ee-477...|         2046|       1951|   1462|         1|         3|de0910aad562535a6...|
|         1.06|         4.52|    25|605fd1a0-6bbe-4c1...|          985|       2148|   1462|         1|         3|211f7cb3059617df7...|
|         5.58|         19.6|    27|ff2042c7-6063-4c4...|         5619|       2717|   1462|         2|         3|bbfea59c83faad8bc...|
|         0.49|         1.92|    33|72459a6f-b347-494...|         6038|       5980|   1462|         2|         3|f023390f5bdf7ad65...|
|         2.35|        11.28|    34|4e8cf62a-c4a3-4b8..

In [24]:
rides_fact_df_final.createOrReplaceTempView("rides_fact")


In [25]:
# rides_fact_df_final.write.format("delta").mode("overwrite").saveAsTable("ridesFact")
rides_fact_df_final.repartition(1).write.format("parquet").mode("overwrite").saveAsTable("ridesFact_pq")


In [26]:
spark.stop()