In [2]:
import os
from pyspark.sql import SparkSession
from pyspark import SparkConf
from pyspark import SparkContext
import pyspark.sql.functions as F

# Environment variables
# PYSPARK_SUBMIT_ARGS is read by PySpark when it launches the JVM, so it is set
# before the session is built; it pulls in the S3A and Trino JDBC packages.
os.environ['PYSPARK_SUBMIT_ARGS'] = (
    '--packages org.apache.hadoop:hadoop-aws:3.2.0,org.apache.hadoop:hadoop-common:3.2.0,io.trino:trino-jdbc:422 '
    'pyspark-shell'
)
# Local MinIO defaults. setdefault() keeps any value that the environment
# already provides, so real endpoints/credentials can be injected from outside
# instead of being overwritten by the values written in the notebook.
os.environ.setdefault('S3_ENDPOINT', "http://minio:9000")
os.environ.setdefault('AWS_ACCESS_KEY_ID', "minio")
os.environ.setdefault('AWS_SECRET_ACCESS_KEY', "minio123")

# Same S3 configuration as in the other file
spark = (
    SparkSession.builder
    # .master("spark://spark:7077")
    .appName("spark-silver-to-gold-1hour")
    # S3A credentials and endpoint come from the environment variables above.
    .config("spark.hadoop.fs.s3a.access.key", os.getenv("AWS_ACCESS_KEY_ID"))
    .config("spark.hadoop.fs.s3a.secret.key", os.getenv("AWS_SECRET_ACCESS_KEY"))
    .config("spark.hadoop.fs.s3a.endpoint", os.getenv("S3_ENDPOINT"))
    # Plain HTTP endpoint with path-style bucket addressing.
    .config("spark.hadoop.fs.s3a.connection.ssl.enabled", "false")
    .config("spark.hadoop.fs.s3a.path.style.access", "true")
    # Single attempt and short timeouts for S3A connections.
    .config("spark.hadoop.fs.s3a.attempts.maximum", "1")
    .config("spark.hadoop.fs.s3a.connection.establish.timeout", "5000")
    .config("spark.hadoop.fs.s3a.connection.timeout", "10000")
    .getOrCreate()
)

#spark.sparkContext.setLogLevel("WARN")

# The silver table can simply be read as Parquet, the same way the JSON was read
silver_path = "s3a://velib/silver/velib-disponibilite-en-temps-reel"
df_silver = spark.read.parquet(silver_path)

# Sanity checks
#print("Schema:")
#df_silver.printSchema()
print("\nTotal records:", df_silver.count())




Total records: 860721


                                                                                

In [None]:
#df_silver.limit(10).toPandas().head()

In [3]:
from pyspark.sql.window import Window
import pyspark.sql.functions as F
from pyspark.sql.types import IntegerType

# Turnover (bike movements) per hour and per station
# Readings are compared with the previous reading of the same station,
# ordered by their duedate value.
turnover_window = Window.partitionBy("stationcode").orderBy("duedate")

# Absolute change in available bikes since the previous reading of the station.
_previous_bikes = F.lag("numbikesavailable", 1).over(turnover_window)
_bike_delta = F.abs(_previous_bikes - F.col("numbikesavailable"))

_turnover_per_reading = (
    df_silver
    .withColumn("part_minute_dt", F.to_timestamp("duedate"))  # string -> timestamp
    .withColumn("turnover", _bike_delta)
    .withColumn("hour", F.date_trunc("hour", "part_minute_dt"))  # hourly bucket
)

# One row per station and hour; name/lat/lon are carried as grouping keys.
df_gold_turnover = (
    _turnover_per_reading
    .groupBy("stationcode", "name", "hour", "lat", "lon")
    .agg(F.sum("turnover").alias("total_turnover"))
)
    

# Hourly statistics: overall and per station.
# Rows dated before this day are left out of both statistics tables.
STATS_START_DATE = "2024-10-29"


def _hourly_bike_stats(df, group_cols, order_cols):
    """Aggregate bike availability and fill ratio for each group of ``group_cols``.

    Rows whose ``name`` is null or the text "none" (in any letter case), or
    whose ``duedate`` day is before ``STATS_START_DATE``, are dropped first.
    An ``hour`` column (``duedate`` truncated to the hour) is then added so it
    can be used as a grouping key.

    Args:
        df: Source DataFrame; the code reads its ``name``, ``duedate``,
            ``numbikesavailable`` and ``fill_ratio`` columns.
        group_cols: Column names to group by (may include ``"hour"``).
        order_cols: Column names used to sort the result.

    Returns:
        A DataFrame with one row per group holding the mean, approximate
        median, min and max of ``numbikesavailable`` and ``fill_ratio``.
    """
    # The day is taken from the first 10 characters of the duedate string.
    reading_day = F.to_date(F.substring(F.col("duedate"), 1, 10), "yyyy-MM-dd")
    # The name is lower-cased before comparison, so the literal must be
    # lower-case too (a comparison with "None" could never match).
    unusable_row = (
        F.col("name").isNull()
        | (F.lower(F.col("name")) == "none")
        | (reading_day < STATS_START_DATE)
    )
    return (
        df
        .filter(~unusable_row)
        .withColumn("part_minute_dt", F.to_timestamp("duedate"))
        .withColumn("hour", F.date_trunc("hour", "part_minute_dt"))
        .groupBy(*group_cols)
        .agg(
            F.mean("numbikesavailable").alias("avg_bikes"),
            F.expr("percentile_approx(numbikesavailable, 0.5)").cast(IntegerType()).alias("median_bikes"),
            F.min("numbikesavailable").alias("min_bikes"),
            F.max("numbikesavailable").alias("max_bikes"),
            F.mean("fill_ratio").alias("avg_fill_ratio"),
            # NOTE(review): the integer cast truncates a fractional ratio (the
            # sample rows printed by this notebook show 0). It is kept because
            # the Trino table definitions declare median_fill_ratio as INTEGER;
            # switching to a double requires changing those definitions too.
            F.expr("percentile_approx(fill_ratio, 0.5)").cast(IntegerType()).alias("median_fill_ratio"),
            F.min("fill_ratio").alias("min_fill_ratio"),
            F.max("fill_ratio").alias("max_fill_ratio"),
        )
        .orderBy(*order_cols)
    )


# Hourly statistics over all stations together
df_gold_stats = _hourly_bike_stats(df_silver, ["hour"], ["hour"])

# Hourly statistics for each individual station
df_gold_station_stats = _hourly_bike_stats(
    df_silver,
    ["stationcode", "name", "hour", "lat", "lon"],
    ["stationcode", "hour"],
)
#df_gold_station_stats.printSchema()

In [None]:
#df_gold_stats.printSchema()

In [None]:
#df_gold_turnover.printSchema()

In [4]:
# Print the column names and types of the per-station hourly statistics DataFrame.
df_gold_station_stats.printSchema()

root
 |-- stationcode: string (nullable = true)
 |-- name: string (nullable = true)
 |-- hour: timestamp (nullable = true)
 |-- lat: double (nullable = true)
 |-- lon: double (nullable = true)
 |-- avg_bikes: double (nullable = true)
 |-- median_bikes: integer (nullable = true)
 |-- min_bikes: long (nullable = true)
 |-- max_bikes: long (nullable = true)
 |-- avg_fill_ratio: double (nullable = true)
 |-- median_fill_ratio: integer (nullable = true)
 |-- min_fill_ratio: double (nullable = true)
 |-- max_fill_ratio: double (nullable = true)



In [None]:
# Write the gold tables to S3
#gold_output_path = "s3a://velib/gold/"
#df_gold_turnover.write.parquet(f"{gold_output_path}turnover_rate")
#df_gold_stats.write.parquet(f"{gold_output_path}overall_station_stats")
#df_gold_station_stats.write.parquet(f"{gold_output_path}per_station_stats")

In [5]:
import trino
# Notebook-level Trino connection and cursor. createTable() opens its own
# connection, so these two objects are not used by it.
host = 'trino-coordinator'
port = 8080
user = 'trino'
conn = trino.dbapi.connect(host=host, port=port, user=user)
cur = conn.cursor()
def createTable(df, catalog, schema_name, table, schema_location, schema, partitioned_by, external_location):
    """Write ``df`` to S3 as partitioned Parquet and expose it as a Trino table.

    The DataFrame is saved in overwrite mode under ``external_location``. The
    Trino schema is then created if missing, the table is dropped and
    re-created on top of the written files, its partitions are registered and
    the first five rows are printed.

    A statement that fails is reported with ``print`` and the remaining
    statements are still attempted; no exception is propagated from the SQL
    part.

    Args:
        df: DataFrame to persist; only its ``write`` attribute is used.
        catalog: Trino catalog name.
        schema_name: Trino schema name inside ``catalog``.
        table: Table name.
        schema_location: Location given to the schema when it is created.
        schema: Column definitions inserted verbatim into CREATE TABLE.
            The callers in this notebook list the partition column last.
        partitioned_by: Partition column name, or a list/tuple of names.
        external_location: S3 URI the data is written to and the table reads.

    Returns:
        None.
    """
    # A single column name is still accepted; a list or tuple enables
    # partitioning on several columns.
    if isinstance(partitioned_by, str):
        partition_cols = [partitioned_by]
    else:
        partition_cols = list(partitioned_by)
    partition_array = ", ".join(f"'{col}'" for col in partition_cols)

    # Kept from the previous implementation: the target path is also exported
    # as an environment variable.
    os.environ['S3_OUTPUT_PATH'] = external_location
    #spark.sparkContext.setLogLevel("WARN")
    # Write DataFrame to S3
    (
        df.write
        .partitionBy(*partition_cols)
        .format("parquet")
        .mode("overwrite")
        .save(external_location)
    )

    queries = [
        f"CREATE SCHEMA IF NOT EXISTS {catalog}.{schema_name} WITH (location = '{schema_location}')",
        f"DROP TABLE IF EXISTS {catalog}.{schema_name}.{table}",
        f"""
        CREATE TABLE IF NOT EXISTS {catalog}.{schema_name}.{table} (
        {schema}
        )
        WITH (
            format = 'PARQUET',
            partitioned_by = ARRAY[{partition_array}],
            external_location = '{external_location}'
        )
        """,
        f"USE {catalog}.{schema_name}",
        f"CALL system.sync_partition_metadata('{schema_name}', '{table}', 'ADD')",
        # f"CALL system.sync_partition_metadata('{catalog}.{schema_name}.{table}', 'ADD')",
        f"SELECT * FROM {catalog}.{schema_name}.{table} LIMIT 5"
    ]

    host, port, user = 'trino-coordinator', 8080, 'trino'
    conn = trino.dbapi.connect(host=host, port=port, user=user)
    try:
        cur = conn.cursor()
        try:
            # Execute each query in the list
            for query in queries:
                try:
                    cur.execute(query)
                    # Only the SELECT returns rows worth printing
                    if query.startswith("SELECT"):
                        for row in cur.fetchall():
                            print(row)
                    else:
                        print(f"Executed: {query}")
                except Exception as e:
                    print(f"Error executing query: {query}. Error: {e}")
        finally:
            # Runs even when the loop is interrupted (e.g. KeyboardInterrupt)
            cur.close()
    finally:
        conn.close()

In [8]:
# Column definitions of the Trino turnover table; the partition column
# (stationcode) is listed last.
schema_turnover = """
    name VARCHAR,
    hour TIMESTAMP,
    lat DOUBLE,
    lon DOUBLE,
    total_turnover BIGINT,
    stationcode VARCHAR
"""

# Write the hourly turnover per station and register it in Trino.
createTable(
    df=df_gold_turnover,
    catalog='minio',
    schema_name='velib_gold',
    table='velib_turnover',
    schema_location='s3a://velib/gold/',
    schema=schema_turnover,
    partitioned_by='stationcode',
    external_location='s3a://velib/gold/velib_turnover',
)
 

24/10/31 11:11:25 WARN MemoryManager: Total allocation exceeds 95.00% (1,020,054,720 bytes) of heap memory
Scaling row group sizes to 95.00% for 8 writers
24/10/31 11:11:25 WARN MemoryManager: Total allocation exceeds 95.00% (1,020,054,720 bytes) of heap memory
Scaling row group sizes to 84.44% for 9 writers
24/10/31 11:11:25 WARN MemoryManager: Total allocation exceeds 95.00% (1,020,054,720 bytes) of heap memory
Scaling row group sizes to 76.00% for 10 writers
24/10/31 11:11:25 WARN MemoryManager: Total allocation exceeds 95.00% (1,020,054,720 bytes) of heap memory
Scaling row group sizes to 69.09% for 11 writers
24/10/31 11:11:25 WARN MemoryManager: Total allocation exceeds 95.00% (1,020,054,720 bytes) of heap memory
Scaling row group sizes to 63.33% for 12 writers
24/10/31 11:11:25 WARN MemoryManager: Total allocation exceeds 95.00% (1,020,054,720 bytes) of heap memory
Scaling row group sizes to 58.46% for 13 writers
24/10/31 11:11:25 WARN MemoryManager: Total allocation exceeds 95.

Executed: CREATE SCHEMA IF NOT EXISTS minio.velib_gold WITH (location = 's3a://velib/gold/')
Executed: DROP TABLE IF EXISTS minio.velib_gold.velib_turnover
Executed: 
        CREATE TABLE IF NOT EXISTS minio.velib_gold.velib_turnover (
        
    name VARCHAR,
    hour TIMESTAMP,
    lat DOUBLE,
    lon DOUBLE,
    total_turnover BIGINT,
    stationcode VARCHAR

        )
        WITH (
            format = 'PARQUET',
            partitioned_by = ARRAY['stationcode'],
            external_location = 's3a://velib/gold/velib_turnover'
        )
        
Executed: USE minio.velib_gold
Executed: CALL system.sync_partition_metadata('velib_gold', 'velib_turnover', 'ADD')
['Place André Malraux', datetime.datetime(2024, 10, 29, 10, 0), 48.863925158727, 2.3356226831675, 0, '1015']
['Louis Blanc - La Chapelle', datetime.datetime(2024, 10, 29, 10, 0), 48.884008964514, 2.3599335551262, 4, '10034']
['Jacquard - Ternaux', datetime.datetime(2024, 10, 29, 10, 0), 48.8646605, 2.373162, 34, '11031']
[

In [9]:
# Column definitions of the Trino table with hourly statistics over all
# stations; the partition column (hour) is listed last.
schema_stats_all = """
    avg_bikes DOUBLE,
    median_bikes INTEGER,
    min_bikes BIGINT,
    max_bikes BIGINT,
    avg_fill_ratio DOUBLE,
    median_fill_ratio INTEGER,
    min_fill_ratio DOUBLE,
    max_fill_ratio DOUBLE,
    hour TIMESTAMP
"""

# Write the overall hourly statistics and register them in Trino.
createTable(
    df=df_gold_stats,
    catalog='minio',
    schema_name='velib_gold',
    table='velib_stats_all',
    schema_location='s3a://velib/gold/',
    schema=schema_stats_all,
    partitioned_by='hour',
    external_location='s3a://velib/gold/velib_stats_all',
)

24/10/31 11:15:23 WARN MemoryManager: Total allocation exceeds 95.00% (1,020,054,720 bytes) of heap memory
Scaling row group sizes to 95.00% for 8 writers
24/10/31 11:15:23 WARN MemoryManager: Total allocation exceeds 95.00% (1,020,054,720 bytes) of heap memory
Scaling row group sizes to 84.44% for 9 writers
24/10/31 11:15:23 WARN MemoryManager: Total allocation exceeds 95.00% (1,020,054,720 bytes) of heap memory
Scaling row group sizes to 76.00% for 10 writers
24/10/31 11:15:23 WARN MemoryManager: Total allocation exceeds 95.00% (1,020,054,720 bytes) of heap memory
Scaling row group sizes to 69.09% for 11 writers
24/10/31 11:15:23 WARN MemoryManager: Total allocation exceeds 95.00% (1,020,054,720 bytes) of heap memory
Scaling row group sizes to 63.33% for 12 writers
24/10/31 11:15:23 WARN MemoryManager: Total allocation exceeds 95.00% (1,020,054,720 bytes) of heap memory
Scaling row group sizes to 58.46% for 13 writers
24/10/31 11:15:23 WARN MemoryManager: Total allocation exceeds 95.

Executed: CREATE SCHEMA IF NOT EXISTS minio.velib_gold WITH (location = 's3a://velib/gold/')
Executed: DROP TABLE IF EXISTS minio.velib_gold.velib_stats_all
Executed: 
        CREATE TABLE IF NOT EXISTS minio.velib_gold.velib_stats_all (
        
    avg_bikes DOUBLE,
    median_bikes INTEGER,
    min_bikes BIGINT,
    max_bikes BIGINT,
    avg_fill_ratio DOUBLE,
    median_fill_ratio INTEGER,
    min_fill_ratio DOUBLE,
    max_fill_ratio DOUBLE,
    hour TIMESTAMP

        )
        WITH (
            format = 'PARQUET',
            partitioned_by = ARRAY['hour'],
            external_location = 's3a://velib/gold/velib_stats_all'
        )
        
Executed: USE minio.velib_gold
Executed: CALL system.sync_partition_metadata('velib_gold', 'velib_stats_all', 'ADD')
[12.269397079220038, 8, 0, 69, 0.3769421216516648, 0, 0.0, 1.76, datetime.datetime(2024, 10, 30, 10, 0)]
[4.0, 4, 4, 4, 0.13333333333333333, 0, 0.13333333333333333, 0.13333333333333333, datetime.datetime(2024, 10, 29, 23, 0)]

In [13]:
# Column definitions of the Trino table with hourly statistics per station;
# the partition column (stationcode) is listed last.
schema_stats_station = """
    name VARCHAR,
    hour TIMESTAMP,
    lat DOUBLE,
    lon DOUBLE,
    avg_bikes DOUBLE,
    median_bikes INTEGER,
    min_bikes BIGINT,
    max_bikes BIGINT,
    avg_fill_ratio DOUBLE,
    median_fill_ratio INTEGER,
    min_fill_ratio DOUBLE,
    max_fill_ratio DOUBLE,
    stationcode VARCHAR
"""

# Write the per-station hourly statistics and register them in Trino.
createTable(
    df=df_gold_station_stats,
    catalog='minio',
    schema_name='velib_gold',
    table='velib_stats_station',
    schema_location='s3a://velib/gold/',
    schema=schema_stats_station,
    partitioned_by='stationcode',
    external_location='s3a://velib/gold/velib_stats_station',
)

ERROR:py4j.java_gateway:An error occurred while trying to connect to the Java server (127.0.0.1:41735)
Traceback (most recent call last):
  File "/usr/local/spark/python/lib/py4j-0.10.9-src.zip/py4j/java_gateway.py", line 977, in _get_connection
    connection = self.deque.pop()
IndexError: pop from an empty deque

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/usr/local/spark/python/lib/py4j-0.10.9-src.zip/py4j/java_gateway.py", line 1115, in start
    self.socket.connect((self.address, self.port))
ConnectionRefusedError: [Errno 111] Connection refused


Py4JNetworkError: An error occurred while trying to connect to the Java server (127.0.0.1:41735)