In [1]:
import os
import sys

# Point both PySpark interpreter variables (workers and driver) at the
# interpreter that is running this notebook.
os.environ.update({
    'PYSPARK_PYTHON': sys.executable,
    'PYSPARK_DRIVER_PYTHON': sys.executable,
})

In [2]:
from pyspark.sql import SparkSession

# Create (or reuse) the Spark session for this notebook.
# NOTE(review): a `.config("spark.jars.packages", "graphframes:graphframes")` option was
# left commented out here; the GraphFrames package is presumably supplied by the
# environment instead -- confirm before running on a fresh cluster.
spark = (
    SparkSession.builder
    .appName("PageRank Recommendation System with Link Prediction")
    .getOrCreate()
)

In [3]:
# Load the ratings CSV from HDFS (first row is the header, column types are inferred).
ratings = (
    spark.read
    .option("header", True)
    .option("inferSchema", True)
    .csv("hdfs://localhost:9000/test/ratings.csv")
)

In [4]:
# Load the movies CSV from HDFS (first row is the header, column types are inferred).
movies = (
    spark.read
    .option("header", True)
    .option("inferSchema", True)
    .csv("hdfs://localhost:9000/test/movies.csv")
)

In [5]:
# Attach the movie columns to every rating with an inner join on the shared movieId key.
# NOTE: `ratings` is rebound to the joined frame, so re-running this cell without
# re-running the load cell joins `movies` onto the already-joined frame again.
ratings = ratings.join(movies, on=["movieId"], how="inner")

In [6]:
# Rating -> signed edge weight lookup table.
# Low ratings map to negative weights, middling ratings to zero, high ratings to
# positive weights.
mapping_score = {
    0.5: -1.0, 1: -1.0, 1.5: -0.5,          # negative
    2: 0.0, 2.5: 0.0, 3: 0.0,               # neutral
    3.5: 0.5, 4: 1.0, 4.5: 1.1, 5: 1.2,     # positive
}

In [7]:
from pyspark.sql.types import FloatType
from pyspark.sql.functions import col

# Translate each rating into its signed weight with a Python UDF (also registered for
# SQL under the name "map_score").
# The lambda must return a real Python float: when a UDF returns a value whose Python
# type does not match the declared return type (e.g. the int 0 for FloatType), PySpark
# yields null instead of converting it. Hence the 0.0 fallback and the float() cast.
map_score_udf = spark.udf.register(
    "map_score",
    lambda x: float(mapping_score.get(x, 0.0)),
    FloatType(),
)
ratings = ratings.withColumn("weight", map_score_udf(col("rating")))

In [8]:
from pyspark.sql.types import StringType

# Build the graph edges (user -> movie): one edge per rating row, carrying the mapped
# weight. User ids are cast to strings; movies are identified by their title.
edges = ratings.select(
    col("userId").cast("string").alias("src"),
    col("title").alias("dst"),
    "weight",
)

In [9]:
from pyspark.sql.functions import lit
# Build the vertices (users and movies).
# User vertices: one row per distinct user id (as a string), tagged bipartite = 0.
vertices_users = (
    ratings
    .select(col("userId").cast("string").alias("id"))
    .distinct()
    .withColumn("bipartite", lit(0))
)

In [10]:
# Movie vertices: one row per distinct title, tagged bipartite = 1.
vertices_movies = (
    ratings
    .select(col("title").alias("id"))
    .distinct()
    .withColumn("bipartite", lit(1))
)

In [11]:
# Stack user and movie vertices into a single vertex table; both frames expose the
# same (id, bipartite) columns, so matching them by name is equivalent to by position.
vertices = vertices_users.unionByName(vertices_movies)

In [12]:
from graphframes import GraphFrame

# Build the bipartite user-movie graph from the vertex table (users and movies) and the
# user -> movie edge table.
user_movie_graph = GraphFrame(vertices, edges)



In [13]:
# Graph projections: user-user (this cell) and movie-movie (next cell).
# Two users are linked when they rated the same movie. The self-join on "dst" produces
# one edge per shared movie and per direction (a -> b and b -> a); the edge weight is
# the sum of the two users' weights for that movie. Self-pairs are filtered out.
user_user_edges = user_movie_graph.edges.alias("e1") \
    .join(user_movie_graph.edges.alias("e2"), col("e1.dst") == col("e2.dst")) \
    .select(
        col("e1.src").alias("src"),
        col("e2.src").alias("dst"),
        (col("e1.weight") + col("e2.weight")).alias("weight")
    ).filter(col("src") != col("dst"))

# Only user vertices (bipartite == 0) belong to the user-user projection; passing the
# full bipartite vertex table would leave every movie in the graph as an isolated node.
user_user_graph = GraphFrame(
    user_movie_graph.vertices.filter(col("bipartite") == 0),
    user_user_edges,
)

In [14]:
# Movie-movie projection: two movies are linked when the same user rated both.
# The self-join on "src" produces one edge per shared user and per direction; the edge
# weight is the sum of that user's weights for the two movies. Self-pairs are dropped.
movie_movie_edges = user_movie_graph.edges.alias("e1") \
    .join(user_movie_graph.edges.alias("e2"), col("e1.src") == col("e2.src")) \
    .select(
        col("e1.dst").alias("src"),
        col("e2.dst").alias("dst"),
        (col("e1.weight") + col("e2.weight")).alias("weight")
    ).filter(col("src") != col("dst"))

# Only movie vertices (bipartite == 1) belong to the movie-movie projection. With the
# full bipartite vertex table, user ids would also be vertices of this graph, receive a
# PageRank score and could be returned as "movies" by the PageRank-based ranking.
movie_movie_graph = GraphFrame(
    user_movie_graph.vertices.filter(col("bipartite") == 1),
    movie_movie_edges,
)

In [15]:
def common_neighbors_similarity(movie1, movie2, user_movie_graph):
    """Return the common-neighbours similarity of two movies.

    The similarity is the number of distinct "src" values (users) that have an edge to
    both `movie1` and `movie2` in `user_movie_graph.edges`.

    Args:
        movie1: "dst" value identifying the first movie.
        movie2: "dst" value identifying the second movie.
        user_movie_graph: graph whose edges have "src" and "dst" columns.

    Returns:
        int: size of the intersection of the two movies' user sets.
    """
    def raters_of(title):
        # Collect to the driver the users with an edge pointing at `title`.
        rows = user_movie_graph.edges.filter(col("dst") == title).select("src").collect()
        return {row[0] for row in rows}

    return len(raters_of(movie1) & raters_of(movie2))

In [16]:
def predict_links(user_id, movie_id, user_movie_graph):
    """Return the 10 movies most similar to `movie_id` by common-neighbour count.

    The similarity of a movie to `movie_id` is the number of distinct users that have
    an edge to both. All similarities are computed in a single Spark aggregation
    instead of two driver-side collects per catalogue movie.

    Args:
        user_id: unused; kept so existing callers keep working.
        movie_id: vertex id ("dst" value) of the reference movie.
        user_movie_graph: bipartite graph whose vertices have a "bipartite" column
            (1 for movies) and whose edges have "src" (user) and "dst" (movie) columns.

    Returns:
        list[tuple]: up to 10 `(movie, score)` pairs sorted by descending score.
        Movies sharing no user with `movie_id` get score 0; ties keep the order in
        which the movie vertices were collected.
    """
    edges = user_movie_graph.edges

    # Users that have an edge to the reference movie.
    target_users = edges.filter(col("dst") == movie_id).select("src")

    # For every movie, count the distinct users it shares with the reference movie.
    # The semi-join keeps only edges whose user also rated `movie_id`; rows with a
    # null "src" never match the join.
    shared_counts = {
        row["dst"]: row["count"]
        for row in (
            edges.select("src", "dst").distinct()
            .join(target_users, on="src", how="left_semi")
            .groupBy("dst").count()
            .collect()
        )
    }

    movies = [
        row[0]
        for row in user_movie_graph.vertices.filter(col("bipartite") == 1).select("id").collect()
    ]

    similarity_scores = [
        (movie, shared_counts.get(movie, 0))
        for movie in movies
        if movie != movie_id
    ]

    # Stable sort: most similar movies first.
    similarity_scores.sort(key=lambda x: x[1], reverse=True)
    return similarity_scores[:10]  # the 10 most similar movies

In [17]:
def create_preference_vector(user_id, user_movie_graph):
    """Build the preference vector of a user.

    Prints the first 10 raw `(movie, weight)` pairs of the user, then returns:

    * `{movie: weight / total}` over the user's edges when the weights sum to a
      positive total;
    * otherwise a uniform vector `{movie: 1 / n}` over all `n` movie vertices
      (bipartite == 1), or `{}` when the graph has no movie vertex.

    Null edge weights are treated as 0.0 so they neither break the sum nor the
    normalisation.

    Args:
        user_id: "src" value identifying the user.
        user_movie_graph: bipartite graph with "src"/"dst"/"weight" edge columns and
            "id"/"bipartite" vertex columns.

    Returns:
        dict: movie id -> preference weight.
    """
    edges = user_movie_graph.edges.filter(col("src") == user_id).rdd.map(lambda row: (row["dst"], row["weight"])).collect()
    print(f"Preference vector for user {user_id}: {list(edges)[:10]}")

    # A null weight would make sum() raise TypeError; count it as neutral.
    weighted = [(movie, 0.0 if weight is None else weight) for movie, weight in edges]
    tot = sum(weight for _, weight in weighted)

    if tot > 0:
        return {movie: weight / tot for movie, weight in weighted}

    # No net positive preference: fall back to a uniform vector over all movies.
    movies = user_movie_graph.vertices.filter(col("bipartite") == 1).select("id").rdd.map(lambda row: row[0]).collect()
    if not movies:
        # Avoid dividing by zero on a graph without movie vertices.
        return {}
    uniform = 1 / len(movies)
    return {movie: uniform for movie in movies}

In [18]:
def predict_user(user_id, user_movie_graph, movie_movie_graph,
                 reset_probability=0.95, max_iter=20, top_n=10):
    """Recommend movies to a user via link prediction, topped up with PageRank.

    Steps:
      1. Build (and print) the user's preference vector.
      2. For every movie the user rated with a positive weight, fetch its most
         similar movies with `predict_links`; keep the best score per candidate and
         drop every movie the user has already rated (whatever its weight).
      3. If fewer than `top_n` candidates were found, run PageRank on
         `movie_movie_graph` and append the remaining unrated movies by descending
         rank.

    Args:
        user_id: "src" value identifying the user.
        user_movie_graph: bipartite user-movie graph ("src"/"dst"/"weight" edges,
            "id"/"bipartite" vertices).
        movie_movie_graph: movie-movie projection used for PageRank.
        reset_probability: `resetProbability` passed to `pageRank`.
            NOTE(review): 0.95 is kept for backward compatibility; it is far above the
            0.15 usually used as the reset probability (0.85 damping) -- confirm it is
            intended.
        max_iter: `maxIter` passed to `pageRank`.
        top_n: maximum number of recommendations returned.

    Returns:
        list: up to `top_n` entries. Link-prediction entries are `(movie, score)`
        tuples; PageRank fallback entries are bare movie ids (same shapes as before).
    """
    p_vec = create_preference_vector(user_id, user_movie_graph)

    # Everything the user has rated, with its weight. This is the real "already seen"
    # set: the preference vector alone is not reliable for that purpose, because it
    # becomes a uniform vector over the whole catalogue when the weights do not sum to
    # a positive total.
    rated = {
        row["dst"]: row["weight"]
        for row in user_movie_graph.edges.filter(col("src") == user_id).select("dst", "weight").collect()
    }

    # Seeds for link prediction: rated movies with a positive weight, in preference
    # vector order (a null weight counts as 0).
    liked = [movie for movie in p_vec if (rated.get(movie) or 0) > 0]

    # Predict the movies the user might like; keep the best score per candidate so the
    # same movie is not recommended several times.
    best_score = {}
    for movie in liked:
        for movie_id, score in predict_links(user_id, movie, user_movie_graph):
            if movie_id in rated:
                continue
            if movie_id not in best_score or score > best_score[movie_id]:
                best_score[movie_id] = score

    # Stable sort: highest link-prediction score first.
    recommendations = sorted(best_score.items(), key=lambda x: x[1], reverse=True)

    # PageRank is expensive; only compute it when the link predictions do not already
    # fill the result.
    if len(recommendations) < top_n:
        pagerank_results = movie_movie_graph.pageRank(resetProbability=reset_probability, maxIter=max_iter)

        # Keep only genuine movie vertices so that user ids present in the projection's
        # vertex table can never be recommended.
        movie_ids = user_movie_graph.vertices.filter(col("bipartite") == 1).select("id")
        item_rank = {
            row["id"]: row["pagerank"]
            for row in (
                pagerank_results.vertices
                .join(movie_ids, on="id", how="left_semi")
                .select("id", "pagerank")
                .collect()
            )
        }

        # Append the remaining unrated, not-yet-suggested movies by descending PageRank.
        recommendations += sorted(
            (movie for movie in item_rank if movie not in rated and movie not in best_score),
            key=lambda x: item_rank[x], reverse=True
        )

    return recommendations[:top_n]

In [None]:
# Produce and print recommendations for one sample user; the id is a string because
# the graph's "src" column holds user ids cast to strings.
user = "10"
recommended_movies = predict_user(
    user_id=user,
    user_movie_graph=user_movie_graph,
    movie_movie_graph=movie_movie_graph,
)
print(f"Recommended movies for user {user}: {recommended_movies[:10]}")

Preference vector for user 10: [('Pulp Fiction (1994)', -1.0), ('Forrest Gump (1994)', 0.5), ('Aladdin (1992)', 1.0), ('Pretty Woman (1990)', 0.5), ('Casablanca (1942)', 1.0), ('Mary Poppins (1964)', -1.0), ('Dirty Dancing (1987)', 0.0), ('Graduate, The (1967)', 0.0), ('When Harry Met Sally... (1989)', 0.0), ('As Good as It Gets (1997)', 0.5)]


In [None]:
from pyspark.sql.functions import col, lit, size, collect_list
from pyspark.sql.types import IntegerType, FloatType, StringType