In [1]:
 

#imports
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, year, month, dayofmonth, count, mean, stddev, regexp_extract, avg, round, sum as _sum, floor,size,split,to_date
from pyspark.sql.window import Window

import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd


# Create (or reuse) the Spark session for the batch ETL pipeline.
# Local master on all cores; the default filesystem points at the HDFS namenode.
spark = (
    SparkSession.builder
    .appName("BatchETLPipelineEnhanced")
    .master("local[*]")
    .config("spark.executor.memory", "4g")
    .config("spark.driver.memory", "4g")
    .config("spark.sql.shuffle.partitions", "200")
    .config("spark.hadoop.fs.defaultFS", "hdfs://namenode:9000")
    .getOrCreate()
)


# Load the raw MovieLens CSV files from HDFS.
# Both files have a header row; column types are inferred by Spark.
movies = spark.read.csv(
    path="hdfs://namenode:9000/movie-lens/rawdata/movie.csv",
    inferSchema=True,
    header=True,
)

ratings = spark.read.csv(
    path="hdfs://namenode:9000/movie-lens/rawdata/rating.csv",
    inferSchema=True,
    header=True,
)

# --- Initial data cleaning ---
# Movies: drop rows containing nulls, then keep a single row per movieId.
movies = movies.dropna().dropDuplicates(["movieId"])

# Ratings: drop rows containing nulls, then keep one rating per (user, movie) pair.
ratings = ratings.dropna().dropDuplicates(["userId", "movieId"])

# Derive calendar columns from the rating timestamp.
# NOTE(review): assumes `timestamp` was inferred as a date/timestamp-like column - confirm.
ratings = (
    ratings
    .withColumn("rating_date", to_date("timestamp"))
    .withColumn("rating_year", year("timestamp"))
    .withColumn("rating_month", month("timestamp"))
    .withColumn("rating_day", dayofmonth("timestamp"))
)

# Keep only ratings inside the 0.5-5.0 range (bounds included).
ratings = ratings.filter(
    (ratings["rating"] >= 0.5) & (ratings["rating"] <= 5.0)
)

# Keep only movies that have at least 10 ratings.
from pyspark.sql.functions import count

movie_rating_counts = (
    ratings
    .groupBy("movieId")
    .agg(count("rating").alias("rating_count"))
)
# The left join leaves rating_count null for unrated movies; the filter drops those rows too.
movies = movies.join(movie_rating_counts, on="movieId", how="left")
movies = movies.filter(movies["rating_count"] >= 10)

# Keep only ratings from users who rated at least 20 movies (threshold to tune).
user_rating_counts = (
    ratings
    .groupBy("userId")
    .agg(count("rating").alias("user_rating_count"))
)
ratings = ratings.join(user_rating_counts, on="userId", how="left")
ratings = ratings.filter(ratings["user_rating_count"] >= 20)

# Per-movie statistics: mean rating and number of ratings.
movie_stats = (
    ratings
    .groupBy("movieId")
    .agg(
        avg("rating").alias("avg_rating"),
        count("rating").alias("num_ratings"),
    )
)

# Per-user activity: number of ratings given and mean rating.
user_stats = (
    ratings
    .groupBy("userId")
    .agg(
        count("rating").alias("user_total_ratings"),
        avg("rating").alias("user_avg_rating"),
    )
)

# Attach movie information, then the per-movie and per-user statistics, to every rating.
enriched_ratings = (
    ratings
    .join(movies, on="movieId", how="left")
    .join(movie_stats, on="movieId", how="left")
    .join(user_stats, on="userId", how="left")
)

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/04/30 15:37:18 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
                                                                                

In [2]:
#imports
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, year, month, dayofmonth, count, mean, stddev, regexp_extract,avg, count
from pyspark.sql.window import Window

import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd

In [3]:
# Create (or reuse) the Spark session for the batch ETL pipeline.
# Local master on all cores; the default filesystem points at the HDFS namenode.
spark = (
    SparkSession.builder
    .appName("BatchETLPipelineEnhanced")
    .master("local[*]")
    .config("spark.executor.memory", "4g")
    .config("spark.driver.memory", "4g")
    .config("spark.sql.shuffle.partitions", "200")
    .config("spark.hadoop.fs.defaultFS", "hdfs://namenode:9000")
    .getOrCreate()
)




In [4]:
def safe_load_dataframe(func, *args, error_msg="Erreur lors du chargement des données", **kwargs):
    """Call a loader and stop the run if it raises.

    Args:
        func: Callable performing the load (for example ``spark.read.csv``).
        *args: Positional arguments forwarded to ``func``.
        error_msg: Prefix printed before the exception text on failure.
        **kwargs: Keyword arguments forwarded to ``func``.

    Returns:
        Whatever ``func(*args, **kwargs)`` returns.

    Raises:
        SystemExit: With exit code 1 when ``func`` raises any ``Exception``;
            the message is printed to standard output first.
    """
    try:
        result = func(*args, **kwargs)
    except Exception as exc:
        import sys

        print(f"{error_msg}: {exc}")
        sys.exit(1)
    else:
        return result


In [5]:
# Load a sample of the movies (capped at 100,000 rows)
movies_minimal = (
    safe_load_dataframe(
        spark.read.csv,
        "hdfs://namenode:9000/movie-lens/rawdata/movie.csv",
        inferSchema=True,
        header=True,
    )
    .limit(100_000)  # keep at most 100,000 rows
    .cache()         # mark the sample for caching so it can be reused
)

In [6]:
# Load a sample of the ratings: at most 1,000,000 rows, marked for caching.
ratings_minimal = (
    safe_load_dataframe(
        spark.read.csv,
        "hdfs://namenode:9000/movie-lens/rawdata/rating.csv",
        inferSchema=True,
        header=True,
    )
    .limit(1_000_000)
    .cache()
)

                                                                                

In [5]:
# Disabled code: the full-dataset movies load is kept inside a string literal,
# so nothing is read here; the cleaning cell uses `movies_minimal` instead.
'''#chargement du csv movies
movies = spark.read.csv(
    "hdfs://namenode:9000/movie-lens/rawdata/movie.csv",
    header=True, inferSchema=True
)'''

                                                                                

In [6]:
# Disabled code: the full-dataset ratings load is kept inside a string literal,
# so nothing is read here; the cleaning cell uses `ratings_minimal` instead.
'''#chargement du csv rating
ratings = spark.read.csv(
    "hdfs://namenode:9000/movie-lens/rawdata/rating.csv",
    header=True, inferSchema=True
)'''

                                                                                

In [7]:
# Initial data cleaning.
# Each step is chained on the previous result: starting both steps from the
# *_minimal frame would silently discard the null filtering.

# Movies: drop rows with missing values, then keep a single row per movieId.
movies = movies_minimal.dropna().dropDuplicates(["movieId"])

# Ratings: drop rows with missing values, then keep one rating per (user, movie) pair.
ratings = ratings_minimal.dropna().dropDuplicates(["userId", "movieId"])

In [8]:
from pyspark.sql.functions import to_date, year, month, dayofmonth


In [9]:
# Derive calendar columns (date, year, month, day of month) from the rating timestamp.
# NOTE(review): assumes `timestamp` was inferred as a date/timestamp-like column - confirm.
ratings = (
    ratings
    .withColumn("rating_date", to_date("timestamp"))
    .withColumn("rating_year", year("timestamp"))
    .withColumn("rating_month", month("timestamp"))
    .withColumn("rating_day", dayofmonth("timestamp"))
)

In [10]:
# Keep only ratings inside the 0.5-5.0 range (bounds included).
ratings = ratings.filter(
    (ratings["rating"] >= 0.5) & (ratings["rating"] <= 5.0)
)

In [11]:
# Keep only movies that have at least 10 ratings.
from pyspark.sql.functions import count

movie_rating_counts = (
    ratings
    .groupBy("movieId")
    .agg(count("rating").alias("rating_count"))
)
# The left join leaves rating_count null for unrated movies; the filter drops those rows too.
movies = movies.join(movie_rating_counts, on="movieId", how="left")
movies = movies.filter(movies["rating_count"] >= 10)

In [12]:
# Keep only ratings from users who rated at least 20 movies (threshold to tune).
user_rating_counts = (
    ratings
    .groupBy("userId")
    .agg(count("rating").alias("user_rating_count"))
)
ratings = ratings.join(user_rating_counts, on="userId", how="left")
ratings = ratings.filter(ratings["user_rating_count"] >= 20)

In [13]:
# Per-movie statistics: mean rating and number of ratings.
movie_stats = (
    ratings
    .groupBy("movieId")
    .agg(
        avg("rating").alias("avg_rating"),
        count("rating").alias("num_ratings"),
    )
)

# Per-user activity: number of ratings given and mean rating.
user_stats = (
    ratings
    .groupBy("userId")
    .agg(
        count("rating").alias("user_total_ratings"),
        avg("rating").alias("user_avg_rating"),
    )
)

# Attach movie information, then the per-movie and per-user statistics, to every rating.
enriched_ratings = (
    ratings
    .join(movies, on="movieId", how="left")
    .join(movie_stats, on="movieId", how="left")
    .join(user_stats, on="userId", how="left")
)

In [14]:
from pyspark.ml.recommendation import ALS
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator
from pyspark.streaming import StreamingContext
from pyspark.sql.functions import from_json, col, struct, rand
from pyspark.sql.types import StructType, StructField, IntegerType, FloatType, StringType, TimestampType
import time
import os
import matplotlib.pyplot as plt
import numpy as np

# 1. Data preparation for the ALS model
# NOTE: no column selection is performed here; ALS reads only userId, movieId and rating through its column parameters.

In [15]:
# Shuffle with a fixed seed and cache the result before splitting.
# randomSplit builds each split by lazily re-evaluating its parent; with an
# unseeded rand() ordering the parent can differ between those evaluations,
# so a rating could land in both train and test (or in neither) and the split
# would not be reproducible despite its own seed.
ratings_randomized = ratings.orderBy(rand(seed=42)).cache()

# 80/20 train/test split, seeded for reproducibility.
train, test = ratings_randomized.randomSplit([0.8, 0.2], seed=42)


In [16]:
# Configure the ALS estimator used to train the recommendation model.
als = ALS(
    userCol="userId",
    itemCol="movieId",
    ratingCol="rating",
    rank=20,
    maxIter=10,
    regParam=0.1,
    coldStartStrategy="drop",  # avoid NaN values in the predictions
    nonnegative=True,
    seed=42,  # explicit seed: without it the fitted model may differ between runs
)




In [19]:
# Fit the configured ALS estimator on the training split.
model = als.fit(dataset=train)

                                                                                

In [20]:
# Score the test split and report the root-mean-square error of the predictions.
evaluator = RegressionEvaluator(
    labelCol="rating",
    predictionCol="prediction",
    metricName="rmse",
)

predictions = model.transform(test)
rmse = evaluator.evaluate(predictions)

print(f"RMSE du modèle ALS sur le test set : {rmse:.4f}")


[Stage 215:>                                                        (0 + 8) / 8]

RMSE du modèle ALS sur le test set : 0.7516


                                                                                

## Sauvegarde HDFS

In [21]:
# Persist the fitted ALS model to HDFS, overwriting any model already saved at this path.
(
    model.write()
    .overwrite()
    .save("hdfs://namenode:9000/movie-lens/models/als_model")
)

                                                                                