In [None]:
# NOTE: disabled draft, kept for reference only -- every line in this cell is
# commented out. It sketches reading the bronze JSON files from MinIO (s3a)
# and appending them to a Trino table through Spark's JDBC writer. The
# <PLACEHOLDER> values must be filled in before it could run.

# from pyspark.sql import SparkSession

# # Initialize Spark session
# spark = SparkSession.builder \
#     .appName("Load JSON from MinIO to Trino") \
#     .config("spark.hadoop.fs.s3a.endpoint", "http://<MINIO-ENDPOINT>") \
#     .config("spark.hadoop.fs.s3a.access.key", "<MINIO-ACCESS-KEY>") \
#     .config("spark.hadoop.fs.s3a.secret.key", "<MINIO-SECRET-KEY>") \
#     .config("spark.hadoop.fs.s3a.path.style.access", "true") \
#     .getOrCreate()

# # Path to the JSON files in MinIO
# path = "s3a://<BUCKET_NAME>/bronze/velib-disponibilite-en-temps-reel/"

# # Read JSON files into DataFrame
# df = spark.read.json(path)

# # Define Trino connection properties
# trino_url = "jdbc:trino://<TRINO-ENDPOINT>:<PORT>/<CATALOG>/<SCHEMA>"
# trino_properties = {
#     "user": "<TRINO-USER>",
#     "password": "<TRINO-PASSWORD>",
#     "driver": "io.trino.jdbc.TrinoDriver"
# }

# # Write DataFrame to Trino table
# df.write \
#     .format("jdbc") \
#     .option("url", trino_url) \
#     .option("dbtable", "<TRINO_TABLE_NAME>") \
#     .options(**trino_properties) \
#     .mode("append") \
#     .save()

# # Stop Spark session
# spark.stop()


## 0. Create Spark session

In [1]:
import os
from pyspark.sql import SparkSession
from pyspark import SparkConf
from pyspark import SparkContext

# Extra JARs resolved at JVM start-up: the S3A connector (hadoop-aws with the
# matching hadoop-common) and the Trino JDBC driver. This must be set before
# the first SparkSession is created in this kernel.
os.environ['PYSPARK_SUBMIT_ARGS'] = (
    '--packages org.apache.hadoop:hadoop-aws:3.2.0,org.apache.hadoop:hadoop-common:3.2.0,io.trino:trino-jdbc:422 '
    'pyspark-shell'
)

# MinIO connection settings. `setdefault` keeps any value already exported in
# the kernel's environment (e.g. by the container or a secrets manager); the
# literals below are only fallbacks for the local development stack and must
# not be relied on outside of it.
os.environ.setdefault('S3_ENDPOINT', "http://minio:9000")
os.environ.setdefault('AWS_ACCESS_KEY_ID', "minio")
os.environ.setdefault('AWS_SECRET_ACCESS_KEY', "minio123")

spark = (
    SparkSession.builder
    .appName("spark-read-minio")
    # Credentials and endpoint of the S3-compatible object store
    .config("spark.hadoop.fs.s3a.access.key", os.environ["AWS_ACCESS_KEY_ID"])
    .config("spark.hadoop.fs.s3a.secret.key", os.environ["AWS_SECRET_ACCESS_KEY"])
    .config("spark.hadoop.fs.s3a.endpoint", os.environ["S3_ENDPOINT"])
    # The endpoint is plain HTTP and buckets are addressed by path, not by host name
    .config("spark.hadoop.fs.s3a.connection.ssl.enabled", "false")
    .config("spark.hadoop.fs.s3a.path.style.access", "true")
    # Single attempt and short timeouts so an unreachable store fails quickly
    .config("spark.hadoop.fs.s3a.attempts.maximum", "1")
    .config("spark.hadoop.fs.s3a.connection.establish.timeout", "5000")
    .config("spark.hadoop.fs.s3a.connection.timeout", "10000")
    .getOrCreate()
)



:: loading settings :: url = jar:file:/usr/local/spark-3.1.2-bin-hadoop3.2/jars/ivy-2.4.0.jar!/org/apache/ivy/core/settings/ivysettings.xml


Ivy Default Cache set to: /home/jovyan/.ivy2/cache
The jars for the packages stored in: /home/jovyan/.ivy2/jars
org.apache.hadoop#hadoop-aws added as a dependency
org.apache.hadoop#hadoop-common added as a dependency
io.trino#trino-jdbc added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-6ab569bf-cbd2-41d4-bf84-ab5065f2fdfd;1.0
	confs: [default]
	found org.apache.hadoop#hadoop-aws;3.2.0 in central
	found com.amazonaws#aws-java-sdk-bundle;1.11.375 in central
	found org.apache.hadoop#hadoop-common;3.2.0 in central
	found org.apache.hadoop#hadoop-annotations;3.2.0 in central
	found com.google.guava#guava;11.0.2 in central
	found com.google.code.findbugs#jsr305;3.0.0 in central
	found commons-cli#commons-cli;1.2 in central
	found org.apache.commons#commons-math3;3.1.1 in central
	found org.apache.httpcomponents#httpclient;4.5.2 in central
	found org.apache.httpcomponents#httpcore;4.4.4 in central
	found commons-logging#commons-logging;1.1.3 in central
	f

## 1. Extract from object storage

In [2]:
# Only surface WARN-and-above messages from Spark while the data is read.
spark.sparkContext.setLogLevel("WARN")

# Bronze layer: every object two directory levels below the dataset prefix.
os.environ['S3_INPUT_PATH'] = "s3a://velib/bronze/velib-disponibilite-en-temps-reel/*/*"
bronze_path = os.environ['S3_INPUT_PATH']

# Read the JSON records (schema inferred by Spark), then inspect the result:
# schema, one untruncated sample row, and the total row count as cell output.
df = spark.read.json(bronze_path)
df.printSchema()
df.show(n=1, truncate=False)
df.count()

24/10/30 09:16:29 WARN MetricsConfig: Cannot locate configuration: tried hadoop-metrics2-s3a-file-system.properties,hadoop-metrics2.properties
                                                                                

root
 |-- capacity: long (nullable = true)
 |-- code_insee_commune: string (nullable = true)
 |-- coordonnees_geo: struct (nullable = true)
 |    |-- lat: double (nullable = true)
 |    |-- lon: double (nullable = true)
 |-- duedate: string (nullable = true)
 |-- ebike: long (nullable = true)
 |-- is_installed: string (nullable = true)
 |-- is_renting: string (nullable = true)
 |-- is_returning: string (nullable = true)
 |-- mechanical: long (nullable = true)
 |-- name: string (nullable = true)
 |-- nom_arrondissement_communes: string (nullable = true)
 |-- numbikesavailable: long (nullable = true)
 |-- numdocksavailable: long (nullable = true)
 |-- stationcode: string (nullable = true)

+--------+------------------+---------------------+-------------------------+-----+------------+----------+------------+----------+-----------------------------+---------------------------+-----------------+-----------------+-----------+
|capacity|code_insee_commune|coordonnees_geo      |duedate       

                                                                                

357853

## 2. Transform

In [3]:
import pyspark.sql.functions as F

# Build the silver-layer frame from the raw bronze records.
#
# `nom_arrondissement_communes` is intentionally kept: the former
# `.drop("nom_arrondissement")` matched no column (DataFrame.drop silently
# ignores unknown names), and the Trino table declared later in this notebook
# lists that column. The no-op drop is removed so the code says what it does.
df_transformed = (
    df
    # Bikes available relative to the station's capacity
    .withColumn("fill_ratio", F.col("numbikesavailable") / F.col("capacity"))
    # `duedate` is a string; its first 10 / 16 characters serve as day / minute
    # keys (presumably an ISO-8601 timestamp -- TODO confirm against the feed)
    .withColumn("part_day", F.substring(F.col("duedate"), 1, 10))
    .withColumn("part_minute", F.substring(F.col("duedate"), 1, 16))
    # Flatten the coordinate struct into two top-level columns
    .withColumn("lat", F.col("coordonnees_geo.lat"))
    .withColumn("lon", F.col("coordonnees_geo.lon"))
    .drop("code_insee_commune", "coordonnees_geo")
    # String comparison on the day key: keep rows dated 2024-01-01 or later
    .filter(F.col("part_day") >= "2024-01-01")
)
df_transformed.printSchema()
df_transformed.show(1, False)
df_transformed.count()

root
 |-- capacity: long (nullable = true)
 |-- duedate: string (nullable = true)
 |-- ebike: long (nullable = true)
 |-- is_installed: string (nullable = true)
 |-- is_renting: string (nullable = true)
 |-- is_returning: string (nullable = true)
 |-- mechanical: long (nullable = true)
 |-- name: string (nullable = true)
 |-- nom_arrondissement_communes: string (nullable = true)
 |-- numbikesavailable: long (nullable = true)
 |-- numdocksavailable: long (nullable = true)
 |-- stationcode: string (nullable = true)
 |-- fill_ratio: double (nullable = true)
 |-- part_day: string (nullable = true)
 |-- part_minute: string (nullable = true)
 |-- lat: double (nullable = true)
 |-- lon: double (nullable = true)

+--------+-------------------------+-----+------------+----------+------------+----------+-----------------------------+---------------------------+-----------------+-----------------+-----------+-------------------+----------+----------------+---------+--------+
|capacity|duedate    

                                                                                

356407

## 3. Load into object storage

In [4]:
# os.environ['TRINO_ENDPOINT'] = "jdbc:trino://localhost:8079"
# os.environ['TRINO_CATALOG'] = "default_catalog"
# os.environ['TRINO_SCHEMA'] = "velib"
# os.environ['TRINO_TABLE'] = "velib-disponibilite-en-temps-reel"
# os.environ['TRINO_USR'] = "toto"
# os.environ['TRINO_PWD'] = "tata"

# trino_properties = {
#     "user": os.getenv("TRINO_USR"),
#     "password": os.getenv("TRINO_PWD"),
#     "driver": "io.trino.jdbc.TrinoDriver",
#     "url": f'{os.getenv("TRINO_ENDPOINT")}/{os.getenv("TRINO_CATALOG")}/{os.getenv("TRINO_SCHEMA")}',
#     "dbtable": os.getenv("TRINO_TABLE")
# }

# Write DataFrame to Trino table
# (
#     df_transformed.write
#     .format("jdbc")
#     .options(**trino_properties)
#     .mode("append")
#     .save()
# )

# Only surface WARN-and-above messages from Spark during the write.
spark.sparkContext.setLogLevel("WARN")

# Silver layer destination for the transformed dataset.
os.environ['S3_OUTPUT_PATH'] = "s3a://velib/silver/velib-disponibilite-en-temps-reel"
silver_path = os.environ['S3_OUTPUT_PATH']

# Parquet output, one directory per `part_day` value; "overwrite" replaces
# whatever is already stored under the destination path.
silver_writer = (
    df_transformed.write
    .format("parquet")
    .partitionBy("part_day")
    .mode("overwrite")
)
silver_writer.save(silver_path)

24/10/30 09:17:02 WARN MemoryManager: Total allocation exceeds 95.00% (1,020,054,720 bytes) of heap memory
Scaling row group sizes to 95.00% for 8 writers
24/10/30 09:17:02 WARN MemoryManager: Total allocation exceeds 95.00% (1,020,054,720 bytes) of heap memory
Scaling row group sizes to 84.44% for 9 writers
24/10/30 09:17:02 WARN MemoryManager: Total allocation exceeds 95.00% (1,020,054,720 bytes) of heap memory
Scaling row group sizes to 76.00% for 10 writers
24/10/30 09:17:02 WARN MemoryManager: Total allocation exceeds 95.00% (1,020,054,720 bytes) of heap memory
Scaling row group sizes to 69.09% for 11 writers
24/10/30 09:17:02 WARN MemoryManager: Total allocation exceeds 95.00% (1,020,054,720 bytes) of heap memory
Scaling row group sizes to 63.33% for 12 writers
24/10/30 09:17:02 WARN MemoryManager: Total allocation exceeds 95.00% (1,020,054,720 bytes) of heap memory
Scaling row group sizes to 58.46% for 13 writers
24/10/30 09:17:02 WARN MemoryManager: Total allocation exceeds 95.

## 4. Update Hive metastore

In [6]:
!pip install trino



In [7]:
import trino

# Register the silver parquet dataset as an external, partitioned table in the
# `minio` catalog and make its partitions visible to Trino.
host, port, user = 'trino-coordinator', 8080, 'trino'

catalog, schema, table = 'minio', 'velib_silver', 'velib_disponibilite_en_temps_reel'
schema_location = 's3a://velib/silver/'
partitioned_by = 'part_day'
external_location = 's3a://velib/silver/velib-disponibilite-en-temps-reel'

# Statements run in order and each relies on the previous ones having
# succeeded. The partition column (`part_day`) is declared last in the column
# list. Every identifier comes from the variables above so the statements
# cannot drift apart.
queries = [
    f"CREATE SCHEMA IF NOT EXISTS {catalog}.{schema} WITH (location = '{schema_location}')",
    f"DROP TABLE IF EXISTS {catalog}.{schema}.{table}",
    f"""
    CREATE TABLE IF NOT EXISTS {catalog}.{schema}.{table} (
        capacity BIGINT,
        duedate VARCHAR,
        ebike BIGINT,
        is_installed VARCHAR,
        is_renting VARCHAR,
        is_returning VARCHAR,
        mechanical BIGINT,
        name VARCHAR,
        nom_arrondissement_communes VARCHAR,
        numbikesavailable BIGINT,
        numdocksavailable BIGINT,
        stationcode VARCHAR,
        fill_ratio DOUBLE,
        part_minute VARCHAR,
        lat DOUBLE,
        lon DOUBLE,
        part_day VARCHAR
    ) 
    WITH (
        format = 'PARQUET',
        partitioned_by = ARRAY['{partitioned_by}'],
        external_location = '{external_location}'
    )
    """,
    # USE sets the session catalog/schema that the unqualified CALL resolves against
    f"USE {catalog}.{schema}",
    f"CALL system.sync_partition_metadata('{schema}', '{table}', 'ADD')",
    f"SELECT * FROM {catalog}.{schema}.{table} LIMIT 5",
]

conn = trino.dbapi.connect(host=host, port=port, user=user)
try:
    cur = conn.cursor()
    try:
        for query in queries:
            try:
                cur.execute(query)
                # Only SELECT statements have rows worth printing; tolerate
                # leading whitespace and lower-case keywords when detecting them.
                if query.lstrip().upper().startswith("SELECT"):
                    results = cur.fetchall()
                    for row in results:
                        print(row)
                else:
                    print(f"Executed: {query}")
            except Exception as e:
                # Report which statement failed, then stop: the remaining
                # statements depend on this one, so continuing would only
                # produce follow-up errors and hide the failure from Run All.
                print(f"Error executing query: {query}. Error: {e}")
                raise
    finally:
        # Always release the cursor, even on failure or interruption
        cur.close()
finally:
    # Always release the connection, even if the cursor could not be created
    conn.close()


Executed: CREATE SCHEMA IF NOT EXISTS minio.velib_silver WITH (location = 's3a://velib/silver/')
Executed: DROP TABLE IF EXISTS minio.velib_silver.velib_disponibilite_en_temps_reel
Executed: 
    CREATE TABLE IF NOT EXISTS minio.velib_silver.velib_disponibilite_en_temps_reel (
        capacity BIGINT,
        duedate VARCHAR,
        ebike BIGINT,
        is_installed VARCHAR,
        is_renting VARCHAR,
        is_returning VARCHAR,
        mechanical BIGINT,
        name VARCHAR,
        nom_arrondissement_communes VARCHAR,
        numbikesavailable BIGINT,
        numdocksavailable BIGINT,
        stationcode VARCHAR,
        fill_ratio DOUBLE,
        part_minute VARCHAR,
        lat DOUBLE,
        lon DOUBLE,
        part_day VARCHAR
    ) 
    WITH (
        format = 'PARQUET',
        partitioned_by = ARRAY['part_day'],
        external_location = 's3a://velib/silver/velib-disponibilite-en-temps-reel'
    )
    
Executed: USE minio.velib_silver
Executed: CALL system.sync_partit