## 📦 Étape 0 : Installation et upload du fichier

In [None]:
# Installation de boto3
!pip install boto3 -q
print("‚úÖ boto3 install√©")

In [None]:
import os

import boto3
from botocore.client import Config

# Configure the S3 client that talks to MinIO.
# The endpoint and credentials can be overridden with the MINIO_ENDPOINT,
# MINIO_ACCESS_KEY and MINIO_SECRET_KEY environment variables. The fallbacks
# are the values this notebook has always used, so it runs unchanged when
# none of them is set.
s3 = boto3.client(
    's3',
    endpoint_url=os.environ.get('MINIO_ENDPOINT', 'http://minio:9000'),
    aws_access_key_id=os.environ.get('MINIO_ACCESS_KEY', 'minioadmin'),
    aws_secret_access_key=os.environ.get('MINIO_SECRET_KEY', 'minioadmin'),
    config=Config(signature_version='s3v4'),
    region_name='us-east-1'
)

print("‚úÖ Client boto3 configur√©")

In [None]:
# Make sure the `datalake` bucket exists, creating it only when it is missing.
bucket = 'datalake'

try:
    s3.head_bucket(Bucket=bucket)
    print(f"‚úÖ Bucket '{bucket}' existe")
except s3.exceptions.ClientError as err:
    # Only a "not found" answer means the bucket has to be created. Any other
    # service error (access denied, bad credentials, ...) is re-raised instead
    # of being hidden behind a create_bucket attempt.
    if err.response.get('Error', {}).get('Code') not in ('404', 'NoSuchBucket'):
        raise
    s3.create_bucket(Bucket=bucket)
    print(f"‚úÖ Bucket '{bucket}' cr√©√©")

In [None]:
# Build a small sales CSV in memory, upload it to the bronze area of the
# data lake, then read it back to confirm what was stored.
csv_content = """id,produit,prix,quantite
1,Stylo,1.20,10
2,Cahier,2.50,5
3,Gomme,0.80,20
4,Stylo,1.20,15"""

# The upload and the read-back check address the same bucket/key pair.
csv_location = {'Bucket': 'datalake', 'Key': 'bronze/ventes/ventes.csv'}

s3.put_object(Body=csv_content, ContentType='text/csv', **csv_location)

print("‚úÖ Fichier CSV upload√© dans datalake/bronze/ventes/ventes.csv")

# Read-back check: download the object and print its decoded text.
obj = s3.get_object(**csv_location)
content = obj['Body'].read().decode('utf-8')
print("\nüìÑ Contenu du fichier upload√© :")
print(content)

## ✨ Étape 1 : Connexion PySpark ↔ MinIO

In [None]:
from pyspark.sql import SparkSession

# Build a local SparkSession that can read and write MinIO through s3a:// URLs.
# The endpoint and credentials can be overridden with the MINIO_ENDPOINT,
# MINIO_ACCESS_KEY and MINIO_SECRET_KEY environment variables. The fallbacks
# are the values this notebook has always used.
spark = (
    SparkSession.builder
    .master("local[*]")
    .appName("TP Bronze Silver Gold")
    # hadoop-aws ships the S3AFileSystem class referenced below.
    # NOTE(review): 3.3.4 presumably matches the Hadoop version bundled with
    # the installed Spark - confirm before upgrading either one.
    .config("spark.jars.packages", "org.apache.hadoop:hadoop-aws:3.3.4")
    .config("spark.hadoop.fs.s3a.endpoint",
            os.environ.get("MINIO_ENDPOINT", "http://minio:9000"))
    .config("spark.hadoop.fs.s3a.access.key",
            os.environ.get("MINIO_ACCESS_KEY", "minioadmin"))
    .config("spark.hadoop.fs.s3a.secret.key",
            os.environ.get("MINIO_SECRET_KEY", "minioadmin"))
    .config("spark.hadoop.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")
    # Address buckets as <endpoint>/<bucket>/<key> rather than through
    # bucket-named hostnames.
    .config("spark.hadoop.fs.s3a.path.style.access", "true")
    .getOrCreate()
)

print(f"‚úÖ SparkSession cr√©√©e - Version : {spark.version}")
print("üìä Spark UI : http://localhost:4040")

## 🥉 Étape 2 : Layer Bronze - Ingestion brute

In [None]:
# Bronze layer: load the raw CSV from MinIO exactly as stored.
# Schema inference is switched off, so every column is read as a string;
# typing is left to the silver step.
df_bronze = (
    spark.read
    .options(header=True, inferSchema=False)
    .csv("s3a://datalake/bronze/ventes/ventes.csv")
)

print("ü•â BRONZE - Donn√©es brutes :")
df_bronze.show()
df_bronze.printSchema()
print(f"\nüìä {df_bronze.count()} lignes")

In [None]:
# Persist the bronze layer as Parquet, replacing any previous output at that path.
df_bronze.write.parquet("s3a://datalake/bronze/ventes/parquet/", mode="overwrite")
print("‚úÖ Bronze sauvegard√© en Parquet")

## 🥈 Étape 3 : Layer Silver - Nettoyage + typage

In [None]:
from pyspark.sql.functions import col
from pyspark.sql.types import IntegerType, DoubleType

# Silver layer: cast the string columns to numeric types, discard rows that
# contain a null in any column, then add the per-line total (price x quantity).
df_silver = (
    df_bronze
    .withColumn("id", col("id").cast("int"))
    .withColumn("prix", col("prix").cast("double"))
    .withColumn("quantite", col("quantite").cast("int"))
    .na.drop()
    .withColumn("montant_total", col("prix") * col("quantite"))
)

print("ü•à SILVER - Donn√©es nettoy√©es :")
df_silver.show()
df_silver.printSchema()
print(f"\nüìä {df_silver.count()} lignes")

In [None]:
# Persist the silver layer as Parquet, replacing any previous output at that path.
df_silver.write.parquet("s3a://datalake/silver/ventes/", mode="overwrite")
print("‚úÖ Silver sauvegard√© en Parquet")

## 🥇 Étape 4 : Layer Gold - Agrégation (CA par produit)

In [None]:
from pyspark.sql.functions import sum as _sum, count, round as _round

# Gold layer: revenue per product.
# For each product: total revenue rounded to 2 decimals, total quantity sold
# and number of sale lines, sorted by revenue from highest to lowest.
df_gold = (
    df_silver
    .groupBy("produit")
    .agg(
        _round(_sum("montant_total"), 2).alias("chiffre_affaires"),
        _sum("quantite").alias("quantite_totale"),
        count("id").alias("nombre_ventes"),
    )
    .orderBy("chiffre_affaires", ascending=False)
)

print("ü•á GOLD - CA par produit :")
df_gold.show()
print(f"\nüìä {df_gold.count()} produits")

In [None]:
# Persist the gold layer as Parquet, replacing any previous output at that path.
df_gold.write.parquet("s3a://datalake/gold/ca_par_produit/", mode="overwrite")
print("‚úÖ Gold sauvegard√© en Parquet")

## 🎉 Étape 5 : Vérification finale

In [None]:
# Print every object stored under each layer prefix of the `datalake` bucket.
print("=" * 70)
print("üìÇ STRUCTURE COMPL√àTE DANS MinIO")
print("=" * 70)

# A single list_objects_v2 call returns at most 1000 keys, so the listing goes
# through a paginator to be sure no object is left out.
paginator = s3.get_paginator('list_objects_v2')

for prefix in ['bronze/', 'silver/', 'gold/']:
    print(f"\nüìÅ {prefix}")
    try:
        found = False
        for page in paginator.paginate(Bucket='datalake', Prefix=prefix):
            # Pages without any matching key carry no 'Contents' entry.
            for obj in page.get('Contents', []):
                found = True
                size_kb = obj['Size'] / 1024
                print(f"   - {obj['Key']} ({size_kb:.2f} KB)")
        if not found:
            print("   ‚ö†Ô∏è  Vide")
    except Exception as e:
        # Best effort: report the failure for this prefix and move on to the next.
        print(f"   ‚ùå Erreur : {e}")

# Closing banner, per-layer row counts, then Spark shutdown.
rule = "=" * 70
print("\n" + rule)
print("‚úÖ PIPELINE ETL TERMIN√â AVEC SUCC√àS !")
print(rule)

print("\nüìä R√©sum√© :")
# Each count is evaluated right before its own line is printed.
for template, frame in (
    ("   Bronze : {} lignes", df_bronze),
    ("   Silver : {} lignes", df_silver),
    ("   Gold   : {} produits", df_gold),
):
    print(template.format(frame.count()))

# Stop the Spark session.
spark.stop()
print("\n‚úÖ SparkSession arr√™t√©e")