In [2]:
#imports
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, year, month, dayofmonth, count, mean, stddev, regexp_extract,avg, count
from pyspark.sql.window import Window

import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd

In [3]:
# Spark session setup: local master, HDFS namenode as the default filesystem.
spark = (
    SparkSession.builder
    .appName("BatchETLPipelineEnhanced")
    .master("local[*]")
    .config("spark.executor.memory", "4g")
    .config("spark.driver.memory", "4g")
    .config("spark.sql.shuffle.partitions", "200")
    .config("spark.hadoop.fs.defaultFS", "hdfs://namenode:9000")
    .getOrCreate()
)




Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/05/02 09:22:14 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [4]:
def safe_load_dataframe(func, *args, error_msg="Erreur lors du chargement des données", **kwargs):
    """Call a loader and turn any failure into a descriptive ``RuntimeError``.

    Args:
        func: Callable that performs the load (e.g. ``spark.read.csv``).
        *args: Positional arguments forwarded to ``func``.
        error_msg: Prefix of the message raised when ``func`` fails.
        **kwargs: Keyword arguments forwarded to ``func``.

    Returns:
        Whatever ``func(*args, **kwargs)`` returns.

    Raises:
        RuntimeError: If ``func`` raises. The message is
            ``"<error_msg>: <original message>"`` and the original exception
            is chained as ``__cause__``.
    """
    try:
        return func(*args, **kwargs)
    except Exception as e:
        # Raise instead of calling sys.exit(): the original traceback stays
        # attached to the error, and an uncaught exception still ends a
        # script with a non-zero status.
        raise RuntimeError(f"{error_msg}: {str(e)}") from e


In [5]:
  # Load the movies CSV from HDFS (header row present, column types inferred).
movies_minimal = safe_load_dataframe(
    spark.read.csv,
    "hdfs://namenode:9000/movie-lens/rawdata/movie.csv",
    inferSchema=True,
    header=True,
)

                                                                                

In [6]:
# Load the ratings CSV from HDFS, keep at most 1,000,000 rows and cache
# that subset for the steps that follow.
ratings_minimal = (
    safe_load_dataframe(
        spark.read.csv,
        "hdfs://namenode:9000/movie-lens/rawdata/rating.csv",
        inferSchema=True,
        header=True,
    )
    .limit(1000000)
    .cache()
)

                                                                                

In [7]:
# NOTE(review): the triple-quoted string below is disabled code kept as a bare
# string literal. It loads nothing, and because it is the last expression of
# the cell it is echoed back as the cell output.
'''#chargement du csv movies
movies = spark.read.csv(
    "hdfs://namenode:9000/movie-lens/rawdata/movie.csv",
    header=True, inferSchema=True
)'''

'#chargement du csv movies\nmovies = spark.read.csv(\n    "hdfs://namenode:9000/movie-lens/rawdata/movie.csv",\n    header=True, inferSchema=True\n)'

In [8]:
# NOTE(review): the triple-quoted string below is disabled code kept as a bare
# string literal. It loads nothing, and because it is the last expression of
# the cell it is echoed back as the cell output.
'''#chargement du csv rating
ratings = spark.read.csv(
    "hdfs://namenode:9000/movie-lens/rawdata/rating.csv",
    header=True, inferSchema=True
)'''

'#chargement du csv rating\nratings = spark.read.csv(\n    "hdfs://namenode:9000/movie-lens/rawdata/rating.csv",\n    header=True, inferSchema=True\n)'

In [9]:
# Initial data cleaning.
# Movies: discard rows with missing values, then keep one row per movieId.
movies = movies_minimal.dropna().dropDuplicates(['movieId'])

# Ratings: discard rows with missing values, then keep a single rating per
# (userId, movieId) pair.
ratings = ratings_minimal.dropna().dropDuplicates(['userId', 'movieId'])

In [10]:
from pyspark.sql.functions import to_date, year, month, dayofmonth


In [11]:
# Derive calendar columns (date, year, month, day of month) from `timestamp`.
ratings = (
    ratings
    .withColumn('rating_date', to_date('timestamp'))
    .withColumn('rating_year', year('timestamp'))
    .withColumn('rating_month', month('timestamp'))
    .withColumn('rating_day', dayofmonth('timestamp'))
)

In [12]:
# Keep only the ratings that fall inside the inclusive range [0.5, 5.0].
ratings = ratings.filter(col("rating").between(0.5, 5.0))

In [13]:
# Keep only the movies that received at least 10 ratings.
from pyspark.sql.functions import count
movie_rating_counts = (
    ratings
    .groupBy('movieId')
    .agg(count('rating').alias('rating_count'))
)
# After the left join, movies without any rating have a null rating_count;
# the threshold filter removes those rows as well.
movies = (
    movies
    .join(movie_rating_counts, on='movieId', how='left')
    .filter(col('rating_count') >= 10)
)

In [14]:
# Keep only the ratings made by users who rated at least 20 movies
# (threshold to be tuned).
user_rating_counts = (
    ratings
    .groupBy('userId')
    .agg(count('rating').alias('user_rating_count'))
)
ratings = (
    ratings
    .join(user_rating_counts, on='userId', how='left')
    .filter(col('user_rating_count') >= 20)
)

In [15]:
# Per-movie statistics: mean rating and number of ratings.
movie_stats = ratings.groupBy("movieId").agg(
    avg("rating").alias("avg_rating"),
    count("rating").alias("num_ratings"),
)

# Per-user activity: number of ratings and mean rating given.
user_stats = ratings.groupBy("userId").agg(
    count("rating").alias("user_total_ratings"),
    avg("rating").alias("user_avg_rating"),
)

# Attach the movie information, then the movie and user statistics, to
# every rating row.
# NOTE(review): these are left joins, so ratings whose movie is absent from
# `movies` are kept with null movie columns — confirm this is intended.
enriched_ratings = (
    ratings
    .join(movies, on="movieId", how="left")
    .join(movie_stats, on="movieId", how="left")
    .join(user_stats, on="userId", how="left")
)

In [16]:
from pyspark.ml.recommendation import ALS
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator
from pyspark.streaming import StreamingContext
from pyspark.sql.functions import from_json, col, struct, rand
from pyspark.sql.types import StructType, StructField, IntegerType, FloatType, StringType, TimestampType
import time
import os
import matplotlib.pyplot as plt
import numpy as np

### Modèle ALS

In [17]:
# Shuffle the rows and keep only the (userId, movieId, rating) columns that
# ALS consumes.
# rand() is given an explicit seed: without one a new random seed is drawn on
# every run, so the row order would differ between runs even though the split
# that follows uses a fixed seed.
training_data = (
    enriched_ratings
    .orderBy(rand(seed=42))
    .select("userId", "movieId", "rating")
)

In [18]:


# 80/20 train/test split with a fixed seed.
train, test = training_data.randomSplit(weights=[0.8, 0.2], seed=42)


In [19]:
# ALS estimator configuration (the fit itself happens in a later cell).
als = ALS(
    rank=20,
    maxIter=10,
    regParam=0.1,
    nonnegative=True,
    userCol="userId",
    itemCol="movieId",
    ratingCol="rating",
    coldStartStrategy="drop",  # avoids NaN values in the predictions
)




In [20]:
# Fit the ALS estimator on the training split.
model = als.fit(train)

25/05/02 09:23:42 WARN BlockManager: Block rdd_27_0 already exists on this machine; not re-adding it
25/05/02 09:24:20 WARN InstanceBuilder: Failed to load implementation from:dev.ludovic.netlib.blas.JNIBLAS
                                                                                

In [21]:
# Model evaluation: RMSE between the `rating` and `prediction` columns,
# computed on the test split.
evaluator = RegressionEvaluator(
    labelCol="rating",
    predictionCol="prediction",
    metricName="rmse",
)

predictions = model.transform(test)
rmse = evaluator.evaluate(predictions)
print(f"RMSE du modèle ALS sur le test set : {rmse:.4f}")


[Stage 139:>                                                      (0 + 12) / 12]

RMSE du modèle ALS sur le test set : 0.7522


                                                                                

In [22]:
# Save the fitted model to HDFS; overwrite() replaces a model already stored
# at this path.
model.write().overwrite().save("hdfs://namenode:9000/movie-lens/models/als_model")

                                                                                

In [23]:
# Display the first 10 predictions, ordered by userId then movieId.
(
    predictions
    .select("userId", "movieId", "rating", "prediction")
    .orderBy("userId", "movieId")
    .show(10)
)


[Stage 253:>                                                      (0 + 12) / 12]

+------+-------+------+----------+
|userId|movieId|rating|prediction|
+------+-------+------+----------+
|     1|    112|   3.5| 3.4432836|
|     1|    260|   4.0|  4.172689|
|     1|    318|   4.0|  4.203028|
|     1|   1097|   4.0| 3.6698084|
|     1|   1198|   4.5| 4.1139584|
|     1|   1222|   3.5| 3.7203972|
|     1|   1304|   3.0| 3.6708007|
|     1|   1525|   3.0| 2.8368628|
|     1|   1920|   3.5| 3.0198135|
|     1|   1997|   3.5| 3.5845442|
+------+-------+------+----------+
only showing top 10 rows



                                                                                

In [24]:
# Top 10 recommended movies for every user.
user_recommendations = model.recommendForAllUsers(numItems=10)
user_recommendations.show(truncate=False)




+------+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|userId|recommendations                                                                                                                                                                                        |
+------+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|28    |[{83182, 5.178395}, {1685, 5.0803757}, {6160, 5.0638304}, {88678, 5.0529585}, {78653, 4.9082255}, {82931, 4.8076205}, {105246, 4.797654}, {96592, 4.797654}, {6818, 4.787118}, {87234, 4.7412353}]     |
|31    |[{66915, 6.6571393}, {44949, 5.9532495}, {6311, 5.8647633}, {6600, 5.813042}, {4763, 5.5262346}, {52845, 5.339627}, {59295, 5.3141274}, {1817, 5.294243}, {2

                                                                                

In [25]:
# Top 10 recommended users for every movie.
movie_recommendations = model.recommendForAllItems(numUsers=10)
movie_recommendations.show(truncate=False)




+-------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|movieId|recommendations                                                                                                                                                                             |
+-------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|28     |[{2572, 5.048024}, {862, 4.999926}, {4539, 4.9911613}, {1362, 4.934133}, {1943, 4.9096384}, {6354, 4.902889}, {722, 4.8665247}, {741, 4.841887}, {3654, 4.8143783}, {2180, 4.8024755}]      |
|31     |[{3493, 4.8696265}, {5071, 4.7865405}, {3939, 4.7798247}, {3615, 4.7748055}, {2424, 4.744201}, {434, 4.7315307}, {3926, 4.713034}, {6678, 4.677958}, {3354, 4.656839}, {6126, 4.648616}]    |
|34  

                                                                                

In [26]:
# Recommendations for one specific user (userId 123).
single_user = (
    ratings
    .filter(col("userId") == 123)
    .select("userId")
    .distinct()
)

recommendations_123 = model.recommendForUserSubset(single_user, numItems=10)
recommendations_123.show(truncate=False)


+------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|userId|recommendations                                                                                                                                                                          |
+------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|123   |[{727, 4.712902}, {6823, 4.664118}, {82931, 4.570569}, {7077, 4.498721}, {88678, 4.4797106}, {5484, 4.452812}, {5251, 4.450688}, {27369, 4.4186525}, {4261, 4.408932}, {95776, 4.404784}]|
+------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+

