In [7]:
## 0. Create Spark session

import os
from pyspark.sql import SparkSession
from pyspark import SparkConf
from pyspark import SparkContext
import pyspark.sql.functions as F

# Variables d'environnement
os.environ['PYSPARK_SUBMIT_ARGS'] = (
    '--packages org.apache.hadoop:hadoop-aws:3.2.0,org.apache.hadoop:hadoop-common:3.2.0,io.trino:trino-jdbc:422 '
    'pyspark-shell'
)
os.environ['S3_ENDPOINT'] = "http://minio:9000"
os.environ['AWS_ACCESS_KEY_ID'] = "minio"
os.environ['AWS_SECRET_ACCESS_KEY'] = "minio123"

# On reprend la config S3 de l'autre fichier
spark = (
    SparkSession.builder
    .appName("spark-silver-to-gold")
    .config("spark.hadoop.fs.s3a.access.key", os.getenv("AWS_ACCESS_KEY_ID"))
    .config("spark.hadoop.fs.s3a.secret.key", os.getenv("AWS_SECRET_ACCESS_KEY"))
    .config("spark.hadoop.fs.s3a.endpoint", os.getenv("S3_ENDPOINT"))
    .config("spark.hadoop.fs.s3a.connection.ssl.enabled", "false")
    .config("spark.hadoop.fs.s3a.path.style.access", "true")
    .config("spark.hadoop.fs.s3a.attempts.maximum", "1")
    .config("spark.hadoop.fs.s3a.connection.establish.timeout", "5000")
    .config("spark.hadoop.fs.s3a.connection.timeout", "10000")
    .getOrCreate()
)

spark.sparkContext.setLogLevel("WARN")


silver_path = "s3a://velib/silver/velib-disponibilite-en-temps-reel"
df_silver = spark.read.parquet(silver_path)
date_coupure = "2024-10-29"
df_silver = df_silver.filter(F.col("duedate") >= date_coupure)
# Verifications
print("Schema:")
df_silver.printSchema()
print("\nSample data:")
df_silver.show(5, truncate=False)
print("\nTotal records:", df_silver.count())

Schema:
root
 |-- capacity: long (nullable = true)
 |-- duedate: string (nullable = true)
 |-- ebike: long (nullable = true)
 |-- is_installed: string (nullable = true)
 |-- is_renting: string (nullable = true)
 |-- is_returning: string (nullable = true)
 |-- mechanical: long (nullable = true)
 |-- name: string (nullable = true)
 |-- nom_arrondissement_communes: string (nullable = true)
 |-- numbikesavailable: long (nullable = true)
 |-- numdocksavailable: long (nullable = true)
 |-- stationcode: string (nullable = true)
 |-- fill_ratio: double (nullable = true)
 |-- part_minute: string (nullable = true)
 |-- lat: double (nullable = true)
 |-- lon: double (nullable = true)
 |-- part_day: date (nullable = true)


Sample data:
+--------+-------------------------+-----+------------+----------+------------+----------+-----------------------------------+---------------------------+-----------------+-----------------+-----------+-------------------+----------------+-----------------+--------




Total records: 347392


                                                                                

In [25]:

# Calcul du taux de rotation par station
from pyspark.sql import Window
time_interval = "1 hour"  
df_silver = df_silver.withColumn("timestamp", F.to_timestamp("duedate")) 
df_rotations = (
    df_silver
    .withColumn("lag", F.lag("numbikesavailable").over(Window.partitionBy("stationcode").orderBy("timestamp")))
    .withColumn("change", F.col("numbikesavailable") - F.col("lag"))
    .withColumn("rotation", F.when(F.col("change") > 0, F.col("change"))  # Arrivées
                               .when(F.col("change") < 0, -F.col("change"))  # Départs
                               .otherwise(0))
    .withColumn("part_day", F.substring(F.col("duedate"), 1, 10))
    .groupBy(F.window("timestamp", time_interval), "stationcode", "part_day")
    .agg(F.sum("rotation").alias("total_rotation"),
             F.avg("capacity").alias("capacity"),
            F.coalesce(F.avg("numbikesavailable"), F.lit(0)).alias("numbikesavailable2") )
    
)

# Stats de base sur l'ensemble des stations
df_stats_global = (
    df_silver
    .withColumn("part_day", F.substring(F.col("duedate"), 1, 10))
    .groupBy(F.window("timestamp", time_interval))
    .agg(
        F.avg("numbikesavailable").alias("avg_bikes"),
        F.expr("percentile_approx(numbikesavailable, 0.5)").alias("median_bikes"),
        F.min("numbikesavailable").alias("min_bikes"),
        F.max("numbikesavailable").alias("max_bikes")
    )
)

# Stats de base par station
df_stats_station = (
    df_silver
    .withColumn("part_day", F.substring(F.col("duedate"), 1, 10))
    .groupBy("stationcode", F.window("timestamp", time_interval))
    
    
    .agg(
        F.avg("numbikesavailable").alias("avg_bikes"),
        F.expr("percentile_approx(numbikesavailable, 0.5)").alias("median_bikes"),
        F.min("numbikesavailable").alias("min_bikes"),
        F.max("numbikesavailable").alias("max_bikes"),
        F.first("name").alias("name")
    )
)

# Écriture des résultats dans S3
df_rotations_sorted = df_rotations.orderBy(F.col("total_rotation").desc())


df_station_8002 = df_rotations.filter(F.col("stationcode") == 8002)
df_rotations.limit(30).toPandas()



                                                                                

Unnamed: 0,window,stationcode,part_day,total_rotation,capacity,numbikesavailable2
0,"(2024-10-29 09:00:00, 2024-10-29 10:00:00)",26005,2024-10-29,2,30.0,7.25
1,"(2024-10-29 10:00:00, 2024-10-29 11:00:00)",26005,2024-10-29,39,30.0,6.311111
2,"(2024-10-29 12:00:00, 2024-10-29 13:00:00)",26005,2024-10-29,5,30.0,2.142857
3,"(2024-10-29 13:00:00, 2024-10-29 14:00:00)",26005,2024-10-29,45,30.0,1.8
4,"(2024-10-29 14:00:00, 2024-10-29 15:00:00)",26005,2024-10-29,21,30.0,3.017241
5,"(2024-10-29 15:00:00, 2024-10-29 16:00:00)",26005,2024-10-29,8,30.0,1.410256
6,"(2024-10-30 08:00:00, 2024-10-30 09:00:00)",26005,2024-10-30,24,30.0,16.677419
7,"(2024-10-30 09:00:00, 2024-10-30 10:00:00)",26005,2024-10-30,0,30.0,17.0
8,"(2024-10-29 09:00:00, 2024-10-29 10:00:00)",30002,2024-10-29,0,40.0,14.0
9,"(2024-10-29 10:00:00, 2024-10-29 11:00:00)",30002,2024-10-29,0,40.0,14.0


In [6]:
df_stats_global.toPandas()

                                                                                

Unnamed: 0,window,avg_bikes,median_bikes,min_bikes,max_bikes
0,"(2024-10-29 12:00:00, 2024-10-29 13:00:00)",11.835305,7,0,74
1,"(2024-10-29 10:00:00, 2024-10-29 11:00:00)",12.072469,7,0,72
2,"(2024-10-29 13:00:00, 2024-10-29 14:00:00)",11.920472,7,0,75
3,"(2024-10-29 09:00:00, 2024-10-29 10:00:00)",12.153993,7,0,70
4,"(2024-10-29 15:00:00, 2024-10-29 16:00:00)",11.533213,7,0,75
5,"(2024-10-30 08:00:00, 2024-10-30 09:00:00)",11.908617,8,0,66
6,"(2024-10-29 14:00:00, 2024-10-29 15:00:00)",11.776882,7,0,75
7,"(2024-10-30 09:00:00, 2024-10-30 10:00:00)",12.69017,9,0,67


In [7]:
df_stats_station.toPandas()

                                                                                

Unnamed: 0,stationcode,window,avg_bikes,median_bikes,min_bikes,max_bikes
0,10021,"(2024-10-29 10:00:00, 2024-10-29 11:00:00)",1.822222,2,1,4
1,10039,"(2024-10-30 08:00:00, 2024-10-30 09:00:00)",2.451613,2,2,3
2,10039,"(2024-10-30 09:00:00, 2024-10-30 10:00:00)",3.000000,3,3,3
3,1006,"(2024-10-29 15:00:00, 2024-10-29 16:00:00)",26.512821,26,24,29
4,10202,"(2024-10-29 09:00:00, 2024-10-29 10:00:00)",2.000000,2,2,2
...,...,...,...,...,...,...
11530,8019,"(2024-10-30 09:00:00, 2024-10-30 10:00:00)",9.000000,9,9,9
11531,8036,"(2024-10-29 13:00:00, 2024-10-29 14:00:00)",18.740000,18,18,22
11532,8048,"(2024-10-30 09:00:00, 2024-10-30 10:00:00)",20.000000,20,20,20
11533,8057,"(2024-10-30 09:00:00, 2024-10-30 10:00:00)",16.000000,16,16,16


In [26]:
# Définir le chemin de sortie S3
os.environ['S3_OUTPUT_PATH'] = "s3a://velib/gold/velib-disponibilite-en-temps-reel"
output_path = os.getenv('S3_OUTPUT_PATH')

# Ajout de la colonne hour_start à partir de la fenêtre
df_rotations = df_rotations.withColumn("hour_start", F.col("window.start"))
df_stats_global = df_stats_global.withColumn("hour_start", F.col("window.start"))
df_stats_station = df_stats_station.withColumn("hour_start", F.col("window.start"))

# Supprimer la colonne `window` 
df_rotations = df_rotations.drop("window")
df_stats_global = df_stats_global.drop("window")
df_stats_station = df_stats_station.drop("window")

# Écrire le DataFrame `df_rotations` dans S3 en partitionnant par `hour_start`
(
    df_rotations.write
    .partitionBy("hour_start")  # Utilisation de `hour_start` pour le partitionnement
    .format("parquet")
    .mode("overwrite")
    .save(f"{output_path}/turnover")
)

# Écrire les statistiques globales avec partitionnement par `hour_start`
(
    df_stats_global.write
    .partitionBy("hour_start")
    .format("parquet")
    .mode("overwrite")
    .save(f"{output_path}/stats_global")
)

# Écrire les statistiques par station avec partitionnement par `name`
(
    df_stats_station.write
    .partitionBy("name")
    .format("parquet")
    .mode("overwrite")
    .save(f"{output_path}/stats_station")
)

24/10/31 09:17:07 WARN MemoryManager: Total allocation exceeds 95.00% (1,020,054,720 bytes) of heap memory
Scaling row group sizes to 95.00% for 8 writers
24/10/31 09:17:07 WARN MemoryManager: Total allocation exceeds 95.00% (1,020,054,720 bytes) of heap memory
Scaling row group sizes to 84.44% for 9 writers
24/10/31 09:17:07 WARN MemoryManager: Total allocation exceeds 95.00% (1,020,054,720 bytes) of heap memory
Scaling row group sizes to 76.00% for 10 writers
24/10/31 09:17:07 WARN MemoryManager: Total allocation exceeds 95.00% (1,020,054,720 bytes) of heap memory
Scaling row group sizes to 69.09% for 11 writers
24/10/31 09:17:07 WARN MemoryManager: Total allocation exceeds 95.00% (1,020,054,720 bytes) of heap memory
Scaling row group sizes to 63.33% for 12 writers
24/10/31 09:17:07 WARN MemoryManager: Total allocation exceeds 95.00% (1,020,054,720 bytes) of heap memory
Scaling row group sizes to 58.46% for 13 writers
24/10/31 09:17:07 WARN MemoryManager: Total allocation exceeds 95.

In [19]:
!pip install trino



In [37]:
import os
import trino
from pyspark.sql import functions as F
from pyspark.sql.window import Window

# Connection
def get_trino_connection():
    host, port, user = 'trino-coordinator', 8080, 'trino'
    return trino.dbapi.connect(host=host, port=port, user=user)

# Définition des schémas
schema_rotations = """
    part_day VARCHAR,
    total_rotation BIGINT,
    capacity DOUBLE,
    numbikesavailable2 DOUBLE,
    timestamp TIMESTAMP,
    stationcode VARCHAR,
    hour_start TIMESTAMP
    
"""

schema_stats_global = """
 
    avg_bikes DOUBLE,
    median_bikes BIGINT,
    min_bikes BIGINT,
    max_bikes BIGINT,
    hour_start TIMESTAMP
"""

schema_stats_station = """
   stationcode VARCHAR,
    avg_bikes DOUBLE,
    median_bikes BIGINT,
    min_bikes BIGINT,
    max_bikes BIGINT,
    name VARCHAR
"""

# Fonction pour créer la table Trino
def create_trino_table(catalog, schema_name, table, schema, partitioned_by, external_location):
    conn = get_trino_connection()
    cur = conn.cursor()
    queries = [
        f"CREATE SCHEMA IF NOT EXISTS {catalog}.{schema_name} WITH (location = '{external_location}')",
        f"DROP TABLE IF EXISTS {catalog}.{schema_name}.{table}",
        f"""
        CREATE TABLE IF NOT EXISTS {catalog}.{schema_name}.{table} (
        {schema}
        )
        WITH (
            format = 'PARQUET',
            partitioned_by = ARRAY['{partitioned_by}'],
            external_location = '{external_location}'
        )
        """,
        f"CALL {catalog}.system.sync_partition_metadata('{schema_name}', '{table}', 'ADD')",
        f"SELECT * FROM {catalog}.{schema_name}.{table} LIMIT 5"
    ]
    
    for query in queries:
        try:
            cur.execute(query)
            if query.startswith("SELECT"):
                results = cur.fetchall()
                for row in results:
                    print(row)
            else:
                print(f"Executed: {query}")
        except Exception as e:
            print(f"Error executing query: {query}. Error: {e}")
    
    cur.close()
    conn.close()

# Chemin de l'emplacement externe
output_path = 's3a://velib/gold/velib-disponibilite-en-temps-reel'

# Appel de la fonction
create_trino_table('minio', 'velib_gold_turnover', 'velib_disponibilite_en_temps_reel_rotations', 
                   schema_rotations, 'hour_start', output_path + '/turnover')

create_trino_table('minio', 'velib_gold_stats_all', 'velib_disponibilite_en_temps_reel_stats_global', 
                   schema_stats_global, 'hour_start', output_path + '/stats_global')

create_trino_table('minio', 'velib_gold_stats_station', 'velib_disponibilite_en_temps_reel_stats_station', 
                   schema_stats_station, 'name', output_path + '/stats_station')


Executed: CREATE SCHEMA IF NOT EXISTS minio.velib_gold_turnover WITH (location = 's3a://velib/gold/velib-disponibilite-en-temps-reel/turnover')
Executed: DROP TABLE IF EXISTS minio.velib_gold_turnover.velib_disponibilite_en_temps_reel_rotations
Executed: 
        CREATE TABLE IF NOT EXISTS minio.velib_gold_turnover.velib_disponibilite_en_temps_reel_rotations (
        
    part_day VARCHAR,
    total_rotation BIGINT,
    capacity DOUBLE,
    numbikesavailable2 DOUBLE,
    timestamp TIMESTAMP,
    stationcode VARCHAR,
    hour_start TIMESTAMP
    

        )
        WITH (
            format = 'PARQUET',
            partitioned_by = ARRAY['hour_start'],
            external_location = 's3a://velib/gold/velib-disponibilite-en-temps-reel/turnover'
        )
        
Executed: CALL minio.system.sync_partition_metadata('velib_gold_turnover', 'velib_disponibilite_en_temps_reel_rotations', 'ADD')
['2024-10-30', 16, 34.0, 4.548387096774194, None, '17021', datetime.datetime(2024, 10, 30, 8, 0)]

In [35]:
df_station_8002 = df_rotations.filter(F.col("stationcode") == 13123)
df_station_8002.toPandas()

                                                                                

Unnamed: 0,stationcode,part_day,total_rotation,capacity,numbikesavailable2,hour_start
0,13123,2024-10-29,0,42.0,70.0,2024-10-29 09:00:00
1,13123,2024-10-29,11,42.0,70.577778,2024-10-29 10:00:00
2,13123,2024-10-29,3,42.0,74.0,2024-10-29 12:00:00
3,13123,2024-10-29,15,42.0,74.68,2024-10-29 13:00:00
4,13123,2024-10-29,12,42.0,74.810345,2024-10-29 14:00:00
5,13123,2024-10-29,31,42.0,73.820513,2024-10-29 15:00:00
6,13123,2024-10-30,83,42.0,28.741935,2024-10-30 08:00:00
7,13123,2024-10-30,6,42.0,33.0,2024-10-30 09:00:00
