In [1]:
import os
from pyspark.sql import SparkSession
from pyspark import SparkConf
from pyspark import SparkContext
import pyspark.sql.functions as F

# Environment variables: extra JARs for the PySpark launcher, plus the MinIO
# (S3-compatible) endpoint and credentials that the Spark config below reads back.
# NOTE(review): the credentials are hard-coded in the notebook — consider
# injecting them from the container environment instead.
os.environ.update({
    'PYSPARK_SUBMIT_ARGS': (
        '--packages org.apache.hadoop:hadoop-aws:3.2.0,org.apache.hadoop:hadoop-common:3.2.0,io.trino:trino-jdbc:422 '
        'pyspark-shell'
    ),
    'S3_ENDPOINT': "http://minio:9000",
    'AWS_ACCESS_KEY_ID': "minio",
    'AWS_SECRET_ACCESS_KEY': "minio123",
})

# Same S3A configuration as the one used in the other file of the pipeline.
spark = (
    SparkSession.builder
    .appName("spark-silver-to-gold")
    # Where the object store lives and how to authenticate against it.
    .config("spark.hadoop.fs.s3a.endpoint", os.environ["S3_ENDPOINT"])
    .config("spark.hadoop.fs.s3a.access.key", os.environ["AWS_ACCESS_KEY_ID"])
    .config("spark.hadoop.fs.s3a.secret.key", os.environ["AWS_SECRET_ACCESS_KEY"])
    # Plain HTTP endpoint addressed with path-style URLs.
    .config("spark.hadoop.fs.s3a.connection.ssl.enabled", "false")
    .config("spark.hadoop.fs.s3a.path.style.access", "true")
    # A single attempt and short timeouts for S3A connections.
    .config("spark.hadoop.fs.s3a.attempts.maximum", "1")
    .config("spark.hadoop.fs.s3a.connection.establish.timeout", "5000")
    .config("spark.hadoop.fs.s3a.connection.timeout", "10000")
    .getOrCreate()
)

spark.sparkContext.setLogLevel("WARN")

# The silver table is stored as Parquet, so it is read directly, the same way
# the JSON files were read before.
silver_path = "s3a://velib/silver/velib-disponibilite-en-temps-reel"
df_silver = spark.read.format("parquet").load(silver_path)

# Sanity checks: schema, a few untruncated rows, and the total row count.
print("Schema:")
df_silver.printSchema()
print("\nSample data:")
df_silver.show(n=5, truncate=False)
print(f"\nTotal records: {df_silver.count()}")

Ivy Default Cache set to: /home/jovyan/.ivy2/cache
The jars for the packages stored in: /home/jovyan/.ivy2/jars
org.apache.hadoop#hadoop-aws added as a dependency
org.apache.hadoop#hadoop-common added as a dependency
io.trino#trino-jdbc added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-e5117e13-46b7-4cb4-b5db-0e3bce11c304;1.0
	confs: [default]


:: loading settings :: url = jar:file:/usr/local/spark-3.1.2-bin-hadoop3.2/jars/ivy-2.4.0.jar!/org/apache/ivy/core/settings/ivysettings.xml


	found org.apache.hadoop#hadoop-aws;3.2.0 in central
	found com.amazonaws#aws-java-sdk-bundle;1.11.375 in central
	found org.apache.hadoop#hadoop-common;3.2.0 in central
	found org.apache.hadoop#hadoop-annotations;3.2.0 in central
	found com.google.guava#guava;11.0.2 in central
	found com.google.code.findbugs#jsr305;3.0.0 in central
	found commons-cli#commons-cli;1.2 in central
	found org.apache.commons#commons-math3;3.1.1 in central
	found org.apache.httpcomponents#httpclient;4.5.2 in central
	found org.apache.httpcomponents#httpcore;4.4.4 in central
	found commons-logging#commons-logging;1.1.3 in central
	found commons-codec#commons-codec;1.11 in central
	found commons-io#commons-io;2.5 in central
	found commons-net#commons-net;3.6 in central
	found commons-collections#commons-collections;3.2.2 in central
	found javax.servlet#javax.servlet-api;3.1.0 in central
	found org.eclipse.jetty#jetty-server;9.3.24.v20180605 in central
	found org.eclipse.jetty#jetty-http;9.3.24.v20180605 in cen

Schema:
root
 |-- capacity: long (nullable = true)
 |-- duedate: string (nullable = true)
 |-- ebike: long (nullable = true)
 |-- is_installed: string (nullable = true)
 |-- is_renting: string (nullable = true)
 |-- is_returning: string (nullable = true)
 |-- mechanical: long (nullable = true)
 |-- name: string (nullable = true)
 |-- nom_arrondissement_communes: string (nullable = true)
 |-- numbikesavailable: long (nullable = true)
 |-- numdocksavailable: long (nullable = true)
 |-- stationcode: string (nullable = true)
 |-- fill_ratio: double (nullable = true)
 |-- part_minute: string (nullable = true)
 |-- lat: double (nullable = true)
 |-- lon: double (nullable = true)
 |-- part_day: date (nullable = true)


Sample data:
+--------+-------------------------+-----+------------+----------+------------+----------+-----------------------------------+---------------------------+-----------------+-----------------+-----------+-------------------+----------------+-----------------+--------




Total records: 977519


                                                                                

## Preprocess data
- Data from before 28 October is discarded: many of those records are erroneous
- Create a `time` column (`HH:mm`) for easier visualisation

In [2]:
# Keep only rows whose part_day is on or after 2024-10-28, then derive an
# "HH:mm" `time` column from part_minute.
df_silver = (
    df_silver
    .where(F.col("part_day") >= "2024-10-28")
    .withColumn("time", F.date_format("part_minute", "HH:mm"))
)

df_silver.show()

+--------+--------------------+-----+------------+----------+------------+----------+--------------------+---------------------------+-----------------+-----------------+-----------+--------------------+----------------+------------------+------------------+----------+-----+
|capacity|             duedate|ebike|is_installed|is_renting|is_returning|mechanical|                name|nom_arrondissement_communes|numbikesavailable|numdocksavailable|stationcode|          fill_ratio|     part_minute|               lat|               lon|  part_day| time|
+--------+--------------------+-----+------------+----------+------------+----------+--------------------+---------------------------+-----------------+-----------------+-----------+--------------------+----------------+------------------+------------------+----------+-----+
|      35|2024-10-30T10:08:...|    1|         OUI|       OUI|         OUI|         2|Benjamin Godard -...|                      Paris|                3|               32|  

## Turnover rate par station par range de temps (à la requête)

In [3]:
from pyspark.sql.window import Window
 
# One window per station name, rows ordered by `duedate`.
window_spec = Window.partitionBy("name").orderBy("duedate")

# Turnover = absolute change in available bikes between two consecutive rows
# of the same station, summed per (station, day, HH:mm).
# The first row of each station has no previous row, so its difference is null.
df_turnover = (
    df_silver
    .withColumn(
        "turnover_rate",
        F.abs(F.col("numbikesavailable") - F.lag("numbikesavailable").over(window_spec)),
    )
    .groupBy("name", "part_day", "time")
    .agg(F.sum("turnover_rate").alias("total_turnover"))
)

df_turnover.show()
 



+--------------------+----------+-----+--------------+
|                name|  part_day| time|total_turnover|
+--------------------+----------+-----+--------------+
|Camille Groult - ...|2024-10-29|10:07|            10|
|Camille Groult - ...|2024-10-29|12:08|             2|
|Camille Groult - ...|2024-10-29|13:09|            20|
|Camille Groult - ...|2024-10-29|14:09|            20|
|Camille Groult - ...|2024-10-29|15:10|             9|
|Camille Groult - ...|2024-10-30|08:12|             7|
|Camille Groult - ...|2024-10-30|09:12|            33|
|Camille Groult - ...|2024-10-30|10:11|            13|
|Camille Groult - ...|2024-10-30|12:12|             2|
|Camille Groult - ...|2024-10-30|13:13|             1|
|Camille Groult - ...|2024-10-30|14:13|            11|
|Camille Groult - ...|2024-10-30|15:14|             1|
|Camille Groult - ...|2024-10-31|07:07|            12|
|Camille Groult - ...|2024-10-31|09:13|             5|
|Camille Groult - ...|2024-10-31|10:01|             1|
|Camille G

                                                                                

## Stats de base sur l'ensemble des stations par range de temps (5 minutes)

In [5]:
from pyspark.sql.window import Window
import pyspark.sql.functions as F
from pyspark.sql.types import IntegerType

# Statistics over all stations pooled together, per 5-minute bucket.
df_stats_all = (
    df_silver
    # Drop rows with a missing name, a literal "none" name, or a duedate before
    # 2024-10-29. The name is lower-cased before the comparison, so the literal
    # must be lower-case too; one predicate covers every exclusion rule.
    # NOTE(review): the preprocessing cell keeps part_day >= 2024-10-28 while
    # this cut-off is 2024-10-29 — confirm which date is intended.
    .filter(
        ~(
            F.col("name").isNull()
            | (F.lower(F.col("name")) == "none")
            | (F.to_date(F.substring(F.col("duedate"), 1, 10), "yyyy-MM-dd") < "2024-10-29")
        )
    )
    # Replace part_minute with the parsed duedate timestamp.
    .withColumn("part_minute", F.to_timestamp("duedate"))
    # Floor the epoch seconds to a multiple of 300 s (5 minutes) and turn the
    # result back into a timestamp: this is the bucket start.
    .withColumn(
        "min_5",
        F.to_timestamp(F.floor(F.unix_timestamp("part_minute") / 300) * 300),
    )
    .groupBy("min_5")
    .agg(
        F.mean("numbikesavailable").alias("avg_bikes"),
        F.expr("percentile(numbikesavailable, 0.5)").alias("median_bikes"),
        F.min("numbikesavailable").alias("min_bikes"),
        F.max("numbikesavailable").alias("max_bikes"),
        F.mean("fill_ratio").alias("avg_fill_ratio"),
    )
    .orderBy("min_5")
)


In [6]:
# Preview the first 20 rows of the network-wide statistics (truncated cells).
df_stats_all.show(n=20, truncate=True)



+-------------------+------------------+------------+---------+---------+--------------------+
|              min_5|         avg_bikes|median_bikes|min_bikes|max_bikes|      avg_fill_ratio|
+-------------------+------------------+------------+---------+---------+--------------------+
|2024-10-29 10:00:00| 4.782251082251082|         2.0|        0|       46|  0.1685101396756692|
|2024-10-29 10:05:00| 9.766564879879327|         6.0|        0|       65|  0.3186371888021693|
|2024-10-29 10:10:00| 17.13543738910694|        14.0|        0|       74| 0.49606101935808405|
|2024-10-29 12:00:00| 4.433928571428571|         2.0|        0|       25| 0.16109986256218375|
|2024-10-29 12:05:00| 9.066532258064516|         5.0|        0|       56| 0.30904024885069004|
|2024-10-29 12:10:00|15.671435499515034|        12.0|        0|       74|  0.4510865535192377|
|2024-10-29 12:50:00|              0.74|         1.0|        0|        1|0.025517241379310347|
|2024-10-29 13:00:00|5.1321276595744685|         3

                                                                                

## Stats de base par station par range de temps (5 minutes)

In [7]:
# Statistics per station (name + coordinates), per 5-minute bucket.
df_stats_station = (
    df_silver
    # Drop rows with a missing name, a literal "none" name, or a duedate before
    # 2024-10-29. The name is lower-cased before the comparison, so the literal
    # must be lower-case too; one predicate covers every exclusion rule.
    # NOTE(review): the preprocessing cell keeps part_day >= 2024-10-28 while
    # this cut-off is 2024-10-29 — confirm which date is intended.
    .filter(
        ~(
            F.col("name").isNull()
            | (F.lower(F.col("name")) == "none")
            | (F.to_date(F.substring(F.col("duedate"), 1, 10), "yyyy-MM-dd") < "2024-10-29")
        )
    )
    # Replace part_minute with the parsed duedate timestamp.
    .withColumn("part_minute", F.to_timestamp("duedate"))
    # Floor the epoch seconds to a multiple of 300 s (5 minutes) and turn the
    # result back into a timestamp: this is the bucket start.
    .withColumn(
        "min_5",
        F.to_timestamp(F.floor(F.unix_timestamp("part_minute") / 300) * 300),
    )
    .groupBy("name", "min_5", "lat", "lon")
    .agg(
        F.mean("numbikesavailable").alias("avg_bikes"),
        F.expr("percentile(numbikesavailable, 0.5)").alias("median_bikes"),
        F.min("numbikesavailable").alias("min_bikes"),
        F.max("numbikesavailable").alias("max_bikes"),
        F.mean("fill_ratio").alias("avg_fill_ratio"),
    )
    .orderBy("name", "min_5")
)

In [8]:
# Preview the first 20 rows of the per-station statistics (truncated cells).
df_stats_station.show(n=20, truncate=True)



+--------------------+-------------------+-----------------+------------------+-------------------+------------+---------+---------+--------------------+
|                name|              min_5|              lat|               lon|          avg_bikes|median_bikes|min_bikes|max_bikes|      avg_fill_ratio|
+--------------------+-------------------+-----------------+------------------+-------------------+------------+---------+---------+--------------------+
|11 Novembre 1918 ...|2024-10-29 10:05:00|48.80889533774377|2.5382421165704727|               11.0|        11.0|       11|       11|  0.3055555555555555|
|11 Novembre 1918 ...|2024-10-29 12:05:00|48.80889533774377|2.5382421165704727|               11.0|        11.0|       11|       11|  0.3055555555555556|
|11 Novembre 1918 ...|2024-10-29 13:05:00|48.80889533774377|2.5382421165704727|               11.0|        11.0|       11|       11| 0.30555555555555547|
|11 Novembre 1918 ...|2024-10-29 14:05:00|48.80889533774377|2.53824211657047

                                                                                

### Observing for a station

In [9]:
# Show the 5-minute statistics of one station, selected by its exact name.
df_stats_station.where(df_stats_station["name"] == "Argenteuil - Voltaire").show()



+--------------------+-------------------+-----------------+------------------+------------------+------------+---------+---------+-------------------+
|                name|              min_5|              lat|               lon|         avg_bikes|median_bikes|min_bikes|max_bikes|     avg_fill_ratio|
+--------------------+-------------------+-----------------+------------------+------------------+------------+---------+---------+-------------------+
|Argenteuil - Volt...|2024-10-29 10:10:00|48.91864865955558|2.2814112156629562|3.2857142857142856|         3.0|        3|        4|0.20535714285714285|
|Argenteuil - Volt...|2024-10-29 12:00:00|48.91864865955558|2.2814112156629562| 7.333333333333333|         7.0|        7|        8| 0.4583333333333333|
|Argenteuil - Volt...|2024-10-29 13:05:00|48.91864865955558|2.2814112156629562|              4.26|         3.5|        2|        8|            0.26625|
|Argenteuil - Volt...|2024-10-29 14:05:00|48.91864865955558|2.2814112156629562| 3.672727

                                                                                

## Update hive metastore

In [14]:
import trino

# Module-level Trino connection and cursor.
# NOTE(review): createTable opens (and closes) its own connection, so this one
# is not used by the code visible in this cell and is never closed here.
host = 'trino-coordinator'
port = 8080
user = 'trino'
conn = trino.dbapi.connect(host=host, port=port, user=user)
cur = conn.cursor()

def createTable(df, catalog, schema_name, table, schema_location, schema, partitioned_by, external_location):
    """Write `df` as partitioned Parquet and (re)register it as an external Trino table.

    Steps, in order:
      1. Overwrite `external_location` with `df`, partitioned by `partitioned_by`.
      2. Through Trino: create the schema if missing, drop and re-create the
         table on top of `external_location`, sync the partition metadata,
         then print the first 5 rows as a smoke test.

    Each Trino statement is best-effort: a failure is printed and the next
    statement is still attempted.

    Args:
        df: Spark DataFrame to persist.
        catalog: Trino catalog name (e.g. 'minio').
        schema_name: Trino schema to create/use.
        table: Table name inside the schema.
        schema_location: Location given to CREATE SCHEMA.
        schema: Column definitions (SQL text) placed inside CREATE TABLE (...).
            NOTE(review): Trino's Hive connector expects partition columns to
            be declared last — confirm against the connector documentation.
        partitioned_by: Partition column name, or a list/tuple of column names.
        external_location: Path the Parquet files are written to and the table
            points at.
    """
    # Kept so that anything reading S3_OUTPUT_PATH afterwards still sees it.
    os.environ['S3_OUTPUT_PATH'] = external_location

    spark.sparkContext.setLogLevel("WARN")

    # Accept either one column name or several.
    if isinstance(partitioned_by, str):
        partition_cols = [partitioned_by]
    else:
        partition_cols = list(partitioned_by)
    # Rendered as the content of a SQL ARRAY[...]: 'a', 'b'
    partition_array = ", ".join(f"'{col}'" for col in partition_cols)

    # Write DataFrame to S3
    (
        df.write
        .partitionBy(*partition_cols)
        .format("parquet")
        .mode("overwrite")
        .save(external_location)
    )

    queries = [
        f"CREATE SCHEMA IF NOT EXISTS {catalog}.{schema_name} WITH (location = '{schema_location}')",
        f"DROP TABLE IF EXISTS {catalog}.{schema_name}.{table}",
        f"""
        CREATE TABLE IF NOT EXISTS {catalog}.{schema_name}.{table} (
        {schema}
        )
        WITH (
            format = 'PARQUET',
            partitioned_by = ARRAY[{partition_array}],
            external_location = '{external_location}'
        )
        """,
        f"USE {catalog}.{schema_name}",
        f"CALL system.sync_partition_metadata('{schema_name}', '{table}', 'ADD')",
        f"SELECT * FROM {catalog}.{schema_name}.{table} LIMIT 5"
    ]

    host, port, user = 'trino-coordinator', 8080, 'trino'
    conn = trino.dbapi.connect(host=host, port=port, user=user)
    try:
        cur = conn.cursor()
        try:
            # Execute each query in the list
            for query in queries:
                try:
                    cur.execute(query)
                    # Only SELECT statements have rows worth printing; tolerate
                    # leading whitespace and lower-case keywords.
                    if query.lstrip().upper().startswith("SELECT"):
                        for row in cur.fetchall():
                            print(row)
                    else:
                        print(f"Executed: {query}")
                except Exception as e:
                    print(f"Error executing query: {query}. Error: {e}")
        finally:
            cur.close()
    finally:
        # Always release the connection, even if something unexpected escapes.
        conn.close()
    
# Column definitions injected into CREATE TABLE (...) by createTable.
# In each schema the partition column is listed last.

# Turnover table, partitioned by `name`.
schema_turnover = (
    "\n"
    "    part_day DATE,\n"
    "    time VARCHAR,\n"
    "    total_turnover BIGINT,\n"
    "    name VARCHAR\n"
)

# Network-wide statistics table, partitioned by `min_5`.
schema_stats_all = (
    "\n"
    "    avg_bikes DOUBLE,\n"
    "    median_bikes DOUBLE,\n"
    "    min_bikes BIGINT,\n"
    "    max_bikes BIGINT,\n"
    "    avg_fill_ratio DOUBLE,\n"
    "    min_5 TIMESTAMP\n"
)

# Per-station statistics table, partitioned by `name`.
schema_stats_station = (
    "\n"
    "    min_5 TIMESTAMP,\n"
    "    lat DOUBLE,\n"
    "    lon DOUBLE,\n"
    "    avg_bikes DOUBLE,\n"
    "    median_bikes DOUBLE,\n"
    "    min_bikes BIGINT,\n"
    "    max_bikes BIGINT,\n"
    "    avg_fill_ratio DOUBLE,\n"
    "    name VARCHAR\n"
)

# Publish the three gold DataFrames: each call writes the Parquet files and
# registers the matching table in Trino.
createTable(
    df=df_turnover,
    catalog='minio',
    schema_name='velib_gold_turnover',
    table='velib_disponibilite_en_temps_reel_turnover',
    schema_location='s3a://velib/gold/',
    schema=schema_turnover,
    partitioned_by='name',
    external_location='s3a://velib/gold/velib-disponibilite-en-temps-reel-turnover',
)
createTable(
    df=df_stats_all,
    catalog='minio',
    schema_name='velib_gold_stats_all',
    table='velib_disponibilite_en_temps_reel_stats_all',
    schema_location='s3a://velib/gold/',
    schema=schema_stats_all,
    partitioned_by='min_5',
    external_location='s3a://velib/gold/velib-disponibilite-en-temps-reel-stats_all',
)
createTable(
    df=df_stats_station,
    catalog='minio',
    schema_name='velib_gold_stats_station',
    table='velib_disponibilite_en_temps_reel_stats_station',
    schema_location='s3a://velib/gold/',
    schema=schema_stats_station,
    partitioned_by='name',
    external_location='s3a://velib/gold/velib-disponibilite-en-temps-reel-stats_station',
)

24/10/31 15:54:44 WARN MemoryManager: Total allocation exceeds 95.00% (1,020,054,720 bytes) of heap memory
Scaling row group sizes to 95.00% for 8 writers
24/10/31 15:54:44 WARN S3AInstrumentation: Closing output stream statistics while data is still marked as pending upload in OutputStreamStatistics{blocksSubmitted=1, blocksInQueue=1, blocksActive=0, blockUploadsCompleted=0, blockUploadsFailed=0, bytesPendingUpload=1101, bytesUploaded=0, blocksAllocated=1, blocksReleased=1, blocksActivelyAllocated=0, exceptionsInMultipartFinalize=0, transferDuration=0 ms, queueDuration=0 ms, averageQueueTime=0 ms, totalUploadDuration=0 ms, effectiveBandwidth=0.0 bytes/s}
                                                                                

Executed: CREATE SCHEMA IF NOT EXISTS minio.velib_gold_turnover WITH (location = 's3a://velib/gold/')
Executed: DROP TABLE IF EXISTS minio.velib_gold_turnover.velib_disponibilite_en_temps_reel_turnover
Executed: 
        CREATE TABLE IF NOT EXISTS minio.velib_gold_turnover.velib_disponibilite_en_temps_reel_turnover (
        
    part_day DATE,
    time VARCHAR,
    total_turnover BIGINT,
    name VARCHAR

        )
        WITH (
            format = 'PARQUET',
            partitioned_by = ARRAY['name'],
            external_location = 's3a://velib/gold/velib-disponibilite-en-temps-reel-turnover'
        )
        
Executed: USE minio.velib_gold_turnover
Executed: CALL system.sync_partition_metadata('velib_gold_turnover', 'velib_disponibilite_en_temps_reel_turnover', 'ADD')
[datetime.date(2024, 10, 29), '10:01', 6, 'Béranger - Félix Faure']
[datetime.date(2024, 10, 29), '10:05', 13, 'Bas - Gabriel Péri']
[datetime.date(2024, 10, 29), '10:09', 19, 'Alexandre Parodi - Quai de Valmy']
[d

                                                                                

Executed: CREATE SCHEMA IF NOT EXISTS minio.velib_gold_stats_all WITH (location = 's3a://velib/gold/')
Executed: DROP TABLE IF EXISTS minio.velib_gold_stats_all.velib_disponibilite_en_temps_reel_stats_all
Executed: 
        CREATE TABLE IF NOT EXISTS minio.velib_gold_stats_all.velib_disponibilite_en_temps_reel_stats_all (
        
    avg_bikes DOUBLE,
    median_bikes DOUBLE,
    min_bikes BIGINT,
    max_bikes BIGINT,
    avg_fill_ratio DOUBLE,
    min_5 TIMESTAMP

        )
        WITH (
            format = 'PARQUET',
            partitioned_by = ARRAY['min_5'],
            external_location = 's3a://velib/gold/velib-disponibilite-en-temps-reel-stats_all'
        )
        
Executed: USE minio.velib_gold_stats_all
Executed: CALL system.sync_partition_metadata('velib_gold_stats_all', 'velib_disponibilite_en_temps_reel_stats_all', 'ADD')
[1.2275280898876404, 1.0, 0, 6, 0.04456010416564916, datetime.datetime(2024, 10, 31, 9, 25)]
[12.295228977182065, 10.0, 0, 56, 0.3940967682511678, 

24/10/31 15:56:16 WARN MemoryManager: Total allocation exceeds 95.00% (1,020,054,720 bytes) of heap memory
Scaling row group sizes to 95.00% for 8 writers
24/10/31 15:56:16 WARN MemoryManager: Total allocation exceeds 95.00% (1,020,054,720 bytes) of heap memory
Scaling row group sizes to 95.00% for 8 writers
24/10/31 15:56:16 WARN MemoryManager: Total allocation exceeds 95.00% (1,020,054,720 bytes) of heap memory
Scaling row group sizes to 95.00% for 8 writers
24/10/31 15:56:16 WARN MemoryManager: Total allocation exceeds 95.00% (1,020,054,720 bytes) of heap memory
Scaling row group sizes to 95.00% for 8 writers
24/10/31 15:56:16 WARN MemoryManager: Total allocation exceeds 95.00% (1,020,054,720 bytes) of heap memory
Scaling row group sizes to 95.00% for 8 writers
24/10/31 15:56:16 WARN MemoryManager: Total allocation exceeds 95.00% (1,020,054,720 bytes) of heap memory
Scaling row group sizes to 95.00% for 8 writers
24/10/31 15:56:16 WARN MemoryManager: Total allocation exceeds 95.00% 

Executed: CREATE SCHEMA IF NOT EXISTS minio.velib_gold_stats_station WITH (location = 's3a://velib/gold/')
Executed: DROP TABLE IF EXISTS minio.velib_gold_stats_station.velib_disponibilite_en_temps_reel_stats_station
Executed: 
        CREATE TABLE IF NOT EXISTS minio.velib_gold_stats_station.velib_disponibilite_en_temps_reel_stats_station (
        
    min_5 TIMESTAMP,
    lat DOUBLE,
    lon DOUBLE,
    avg_bikes DOUBLE,
    median_bikes DOUBLE,
    min_bikes BIGINT,
    max_bikes BIGINT,
    avg_fill_ratio DOUBLE,
    name VARCHAR

        )
        WITH (
            format = 'PARQUET',
            partitioned_by = ARRAY['name'],
            external_location = 's3a://velib/gold/velib-disponibilite-en-temps-reel-stats_station'
        )
        
Executed: USE minio.velib_gold_stats_station
Executed: CALL system.sync_partition_metadata('velib_gold_stats_station', 'velib_disponibilite_en_temps_reel_stats_station', 'ADD')
[datetime.datetime(2024, 10, 29, 15, 10), 48.862534941806, 2.3