USARE LINK PREDICTION CON ADAMIC ADAR PER PAGE RANK

In [3]:
# pip install hdfs
# ./hdfscli.cfg

#     [global]
#     default.alias = dev

#     [dev.alias]
#     url = http://localhost:9870

# from hdfs import Config

# client = Config().get_client('dev')
# test = client.list('/test')
# print(test)

# with client.read('/test/movies.csv') as reader:
#     movies = reader.read()

In [4]:
# findspark.init() runs before the first `import pyspark` below —
# presumably so the local Spark installation is importable; confirm for this environment.
import findspark
findspark.init()

In [5]:
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, lit
import networkx as nx
import matplotlib.pyplot as plt
import numpy as np
from pyspark.sql.functions import udf
from pyspark.sql.types import FloatType
from graphframes import GraphFrame

# Creazione della SparkSession
# spark = SparkSession.builder.appName("Spark GraphFrames Example").getOrCreate()

try:
    # Configure the SparkSession to connect to the standalone master
    spark = SparkSession.builder \
        .appName("Spark GraphFrames Example") \
        .master("spark://192.168.0.112:7077") \
        .config("spark.hadoop.fs.defaultFS", "hdfs://localhost:9001") \
        .getOrCreate()

    # Report the Spark version
    print(f"Spark Version: {spark.version}")

    # Report which master the session is configured for
    master_config = spark.conf.get("spark.master")
    print(f"Connected to master: {master_config}")

    # Smoke test: create and show an empty DataFrame
    test_df = spark.createDataFrame([], schema="id INT, value STRING")
    test_df.show()

    print("Connection successful, Spark is ready!")

except Exception as e:
    print("Failed to connect to Spark master.")
    print(f"Error: {e}")
    # Every later cell needs `spark`; re-raise so a failed connection stops the
    # run here instead of surfacing later as an unrelated NameError.
    raise


Spark Version: 3.5.3
Connected to master: spark://192.168.0.112:7077
+---+-----+
| id|value|
+---+-----+
+---+-----+

Connection successful, Spark is ready!


In [6]:
# # Verifica accesso a HDFS
# hdfs_files = spark.read.format("text").load("hdfs://localhost:9001/test")
# hdfs_files.show()

In [7]:
# Load the ratings and movies CSV files from HDFS (header row, inferred schema).
# NOTE(review): these URLs use port 50010 while the session's fs.defaultFS is
# configured as hdfs://localhost:9001 — confirm which NameNode address is intended.
ratings, movies = (
    spark.read.csv(hdfs_path, header=True, inferSchema=True)
    for hdfs_path in (
        'hdfs://localhost:50010/test/ratings.csv',
        'hdfs://localhost:50010/test/movies.csv',
    )
)

In [8]:
# Inner-join every rating with its movie metadata on movieId, then preview 5 rows
user_movie_matrix = ratings.join(movies, "movieId", "inner")
user_movie_matrix.show(n=5)

+-------+------+------+---------+--------------------+--------------------+
|movieId|userId|rating|timestamp|               title|              genres|
+-------+------+------+---------+--------------------+--------------------+
|      1|     1|   4.0|964982703|    Toy Story (1995)|Adventure|Animati...|
|      3|     1|   4.0|964981247|Grumpier Old Men ...|      Comedy|Romance|
|      6|     1|   4.0|964982224|         Heat (1995)|Action|Crime|Thri...|
|     47|     1|   5.0|964983815|Seven (a.k.a. Se7...|    Mystery|Thriller|
|     50|     1|   5.0|964982931|Usual Suspects, T...|Crime|Mystery|Thr...|
+-------+------+------+---------+--------------------+--------------------+
only showing top 5 rows



In [9]:
# # Controllo per film comuni valutati da più utenti
# common_movies = user_movie_matrix.groupBy("title").count()
# common_movies.filter(col("count") > 1).show(10)

In [10]:
# Mapping from raw rating value to edge weight.
# Ratings up to 1.5 map to negative weights, 2-3 map to 0.0, and 3.5-5 map to
# positive weights (4.5 and 5 are boosted slightly above 1.0).

mapping_score = {
        0.5: -1.0,
        1: -1.0,
        1.5: -0.5,
        2: 0.0,
        2.5: 0.0,
        3: 0.0,
        3.5: 0.5,
        4: 1.0,
        4.5: 1.1,
        5: 1.2
}

In [11]:
# UDF translating a raw rating into its edge weight via `mapping_score`.
# The fallback must be a Python float: a FloatType UDF that returns an int
# (the previous default `0`) is not coerced and produces NULL in the column.
map_score_udf = spark.udf.register(
    "map_score",
    lambda x: float(mapping_score.get(x, 0.0)),
    FloatType(),
)
user_movie_matrix = user_movie_matrix.withColumn("weight", map_score_udf(col("rating")))

In [12]:
# Build the edge table: one user -> movie edge per rating, carrying its weight
edge_columns = [
    col("userId").cast("string").alias("src"),
    col("movieId").cast("string").alias("dst"),
    col("weight"),
]
edges = user_movie_matrix.select(*edge_columns)

In [13]:
# Build the vertex table: distinct users tagged bipartite=0, distinct movies bipartite=1.
# NOTE(review): both id kinds are plain string casts, so a user and a movie with
# the same numeric id (e.g. user 1 / movie 1) share the vertex id "1" and appear
# twice in `vertices` — consider prefixing ids if they must be unique.
user_ids = user_movie_matrix.select(col("userId").cast("string").alias("id")).distinct()
movie_ids = user_movie_matrix.select(col("movieId").cast("string").alias("id")).distinct()

user_vertices = user_ids.withColumn("bipartite", lit(0))
movie_vertices = movie_ids.withColumn("bipartite", lit(1))
vertices = user_vertices.union(movie_vertices)

In [14]:
# Build the bipartite user-movie graph from the vertex and edge tables
user_movie_graph = GraphFrame(vertices, edges)



In [15]:
# User-user projection
def project_user_user_graph(user_movie_graph):
    """Project the bipartite user-movie graph onto its users.

    Two users are linked once for every movie both have an edge to; the edge
    weight is the sum of the two users' weights for that movie. Both
    directions (a, b) and (b, a) are produced, and a pair sharing several
    movies gets one edge per shared movie.

    :param user_movie_graph: GraphFrame whose edges have columns ``src``
        (user id), ``dst`` (movie id) and ``weight``.
    :return: GraphFrame with the projected user-user edges. When the vertex
        table has a ``bipartite`` column, only user vertices (``bipartite == 0``)
        are kept so movie vertices do not leak into the projection.
    """
    left_edges = user_movie_graph.edges.alias("e1")
    right_edges = user_movie_graph.edges.alias("e2")
    user_user_edges = left_edges \
        .join(right_edges, col("e1.dst") == col("e2.dst")) \
        .select(
            col("e1.src").alias("src"),
            col("e2.src").alias("dst"),
            (col("e1.weight") + col("e2.weight")).alias("weight")
        ).filter(col("src") != col("dst"))  # drop self-loops

    projected_vertices = user_movie_graph.vertices
    if "bipartite" in projected_vertices.columns:
        # Keep only the user partition of the bipartite graph.
        projected_vertices = projected_vertices.filter(col("bipartite") == 0)
    return GraphFrame(projected_vertices, user_user_edges)

user_user_graph = project_user_user_graph(user_movie_graph)

In [16]:
# Movie-movie projection
def project_movie_movie_graph(user_movie_graph):
    """Project the bipartite user-movie graph onto its movies.

    Two movies are linked once for every user that has an edge to both; the
    edge weight is the sum of that user's weights for the two movies. Both
    directions (a, b) and (b, a) are produced, and a pair shared by several
    users gets one edge per shared user.

    :param user_movie_graph: GraphFrame whose edges have columns ``src``
        (user id), ``dst`` (movie id) and ``weight``.
    :return: GraphFrame with the projected movie-movie edges. When the vertex
        table has a ``bipartite`` column, only movie vertices (``bipartite == 1``)
        are kept so user vertices do not leak into the projection.
    """
    left_edges = user_movie_graph.edges.alias("e1")
    right_edges = user_movie_graph.edges.alias("e2")
    movie_movie_edges = left_edges \
        .join(right_edges, col("e1.src") == col("e2.src")) \
        .select(
            col("e1.dst").alias("src"),
            col("e2.dst").alias("dst"),
            (col("e1.weight") + col("e2.weight")).alias("weight")
        ).filter(col("src") != col("dst"))  # drop self-loops

    projected_vertices = user_movie_graph.vertices
    if "bipartite" in projected_vertices.columns:
        # Keep only the movie partition of the bipartite graph.
        projected_vertices = projected_vertices.filter(col("bipartite") == 1)
    return GraphFrame(projected_vertices, movie_movie_edges)

movie_movie_graph = project_movie_movie_graph(user_movie_graph)

In [17]:
# Build a user's preference vector
def create_preference_vector(user_id, user_movie_graph):
    """Return a ``{movie_id: preference}`` dict for one user.

    If the user's edge weights sum to a positive total, each rated movie gets
    ``weight / total``. Otherwise every movie vertex (``bipartite == 1``) gets
    the same uniform share.

    :param user_id: value compared against the edge ``src`` column.
    :param user_movie_graph: GraphFrame with edge columns ``src``, ``dst``,
        ``weight`` and a vertex column ``bipartite``.
    :return: dict mapping movie id to preference; empty if the fallback is
        needed but the graph has no movie vertices.
    """
    user_rows = user_movie_graph.edges \
        .filter(col("src") == user_id) \
        .select("dst", "weight") \
        .collect()
    # A NULL weight is treated as 0.0 so the total below cannot fail.
    rated = [
        (row["dst"], row["weight"] if row["weight"] is not None else 0.0)
        for row in user_rows
    ]

    tot = sum(weight for _, weight in rated)
    if tot > 0:
        return {movie: weight / tot for movie, weight in rated}

    movie_rows = user_movie_graph.vertices \
        .filter(col("bipartite") == 1) \
        .select("id") \
        .collect()
    movies = [row["id"] for row in movie_rows]
    if not movies:
        # No movies at all: nothing to spread the preference over.
        return {}
    uniform = 1 / len(movies)
    return {movie: uniform for movie in movies}

**Page Rank**

In [18]:
# Prediction function

def predict_user(user_id, user_movie_graph, movie_movie_graph,
                 reset_probability=0.95, max_iter=20, top_n=10):
    """Recommend the highest-PageRank movies the user has no edge to yet.

    A movie counts as already seen when the user has any edge to it,
    regardless of the edge weight. Candidates are the movie vertices
    (``bipartite == 1``) of ``user_movie_graph`` that are not already seen.

    NOTE(review): the ranking is a global PageRank over ``movie_movie_graph``;
    no per-user preference vector is passed to it. The default
    ``reset_probability`` of 0.95 is kept from the original code but is much
    higher than commonly used values — confirm it is intended.

    :param user_id: id of the user; compared as a string against edge ``src``.
    :param user_movie_graph: bipartite GraphFrame (edges ``src``/``dst``,
        vertices with a ``bipartite`` column).
    :param movie_movie_graph: GraphFrame on which PageRank is computed.
    :param reset_probability: ``resetProbability`` passed to ``pageRank``.
    :param max_iter: ``maxIter`` passed to ``pageRank``.
    :param top_n: maximum number of recommendations returned.
    :return: list of up to ``top_n`` movie ids, highest PageRank first; empty
        when there is no unseen movie left.
    """
    # Everything the user already has an edge to.
    seen = user_movie_graph.edges \
        .filter(col("src") == str(user_id)) \
        .select(col("dst").alias("id")) \
        .distinct()

    # Movie vertices that the user has not seen yet.
    candidates = user_movie_graph.vertices \
        .filter(col("bipartite") == 1) \
        .select("id") \
        .distinct() \
        .join(seen, on="id", how="left_anti")

    # Nothing left to recommend: skip the PageRank computation.
    if len(candidates.head(1)) == 0:
        return []

    # PageRank over the movie projection
    pagerank_results = movie_movie_graph.pageRank(
        resetProbability=reset_probability, maxIter=max_iter
    )

    # Keep only candidate movies and sort by descending PageRank on the cluster
    item_rank = pagerank_results.vertices.select("id", "pagerank") \
        .join(candidates, on="id", how="left_semi") \
        .orderBy(col("pagerank").desc())

    # Bring only the top results back to the driver
    return [row["id"] for row in item_rank.limit(top_n).collect()]




**Link Prediction**

In [19]:
#VERA CALCULATE ADAMIC ADAR

from pyspark.sql.functions import collect_list
def calculate_adamic_adar(graph):
    """Score pairs of source nodes with the Adamic-Adar index.

    For two nodes v1 and v2 the index is the sum, over every neighbour z they
    share, of ``1 / log(deg(z))``. Here a node's neighbours are the ``dst``
    values of its outgoing edges, and ``deg(z)`` is the number of distinct
    ``src`` nodes with an edge to z.

    The previous implementation weighted every common neighbour by
    ``1 / log(len(N(v1)) + len(N(v2)))``, which is not the Adamic-Adar index,
    and compared all node pairs through a Python UDF.

    :param graph: GraphFrame whose edges have ``src`` and ``dst`` columns.
    :return: DataFrame with columns ``v1``, ``v2`` (``v1 < v2``) and a float
        ``score``; only pairs with a positive score are returned.
    """
    from pyspark.sql import functions as F

    # One row per distinct (node, neighbour) link.
    links = graph.edges.select("src", "dst").distinct()

    # Contribution of each potential common neighbour z: 1 / log(deg(z)).
    # Only z with degree > 1 can be shared by two different nodes, which also
    # keeps log(degree) strictly positive.
    neighbour_weights = links.groupBy("dst") \
        .agg(F.count("src").alias("degree")) \
        .filter(col("degree") > 1) \
        .select(
            col("dst").alias("z"),
            (1.0 / F.log(col("degree"))).alias("inv_log_degree")
        )

    # Unordered node pairs (v1 < v2) together with each neighbour z they share.
    shared = links.alias("n1").join(
        links.alias("n2"),
        (col("n1.dst") == col("n2.dst")) & (col("n1.src") < col("n2.src"))
    ).select(
        col("n1.src").alias("v1"),
        col("n2.src").alias("v2"),
        col("n1.dst").alias("z")
    )

    # Sum the contributions of all shared neighbours for every pair.
    adamic_adar_scores = shared.join(neighbour_weights, on="z") \
        .groupBy("v1", "v2") \
        .agg(F.sum("inv_log_degree").cast("float").alias("score")) \
        .filter(col("score") > 0)

    return adamic_adar_scores.select("v1", "v2", "score")


In [20]:
#Funzione che fa venire più collegamenti

# from pyspark.sql.functions import when, col

# def calculate_adamic_adar(graph):
#     from pyspark.sql.functions import lit

#     # Trova tutti i vicini per ciascun nodo (utente o film)
#     neighbors = graph.edges.groupBy("src").agg(collect_list("dst").alias("neighbors"))

#     # Genera coppie di nodi (film-utente, utente-utente, etc.)
#     neighbors_df = neighbors.alias("n1").join(
#         neighbors.alias("n2"), col("n1.src") < col("n2.src")
#     ).select(
#         col("n1.src").alias("v1"),
#         col("n2.src").alias("v2"),
#         col("n1.neighbors").alias("neighbors_v1"),
#         col("n2.neighbors").alias("neighbors_v2")
#     )

#     # Aggiungi il tipo dei nodi usando il campo bipartite dai vertici
#     vertices_with_type = graph.vertices.select(col("id"), col("bipartite"))
#     neighbors_with_type = neighbors_df \
#         .join(vertices_with_type.alias("v1_type"), col("v1") == col("v1_type.id")) \
#         .join(vertices_with_type.alias("v2_type"), col("v2") == col("v2_type.id")) \
#         .select(
#             col("v1"), col("v2"),
#             col("neighbors_v1"), col("neighbors_v2"),
#             col("v1_type.bipartite").alias("v1_type"),
#             col("v2_type.bipartite").alias("v2_type")
#         )

#     # Funzione per calcolare l'indice di Adamic-Adar
#     def compute_adamic_adar(neighbors_v1, neighbors_v2):
#         common_neighbors = set(neighbors_v1).intersection(set(neighbors_v2))
#         if not common_neighbors:
#             return 0.0
#         return float(sum(1 / np.log(len(neighbors_v1) + len(neighbors_v2)) for _ in common_neighbors))

#     # Creazione dell'udf per calcolare l'indice
#     compute_adamic_adar_udf = udf(compute_adamic_adar, FloatType())

#     # Calcola l'Adamic-Adar index
#     adamic_adar_scores = neighbors_with_type.withColumn(
#         "score", compute_adamic_adar_udf(col("neighbors_v1"), col("neighbors_v2"))
#     ).filter(col("score") > 0)  # Filtro per evitare punteggi nulli

#     # Aggiungi una colonna che indica il tipo di link (questa parte era sbagliata)
#     adamic_adar_scores = adamic_adar_scores.withColumn(
#         "link_type",
#         when((col("v1_type") == 0) & (col("v2_type") == 0), "user-user")
#         .when((col("v1_type") == 1) & (col("v2_type") == 1), "movie-movie")
#         .otherwise("user-movie")
#     )

#     return adamic_adar_scores.select("v1", "v2", "score", "link_type")


In [21]:
# Plot histogram for Adamic-Adar Index
def plot_adamic_adar_histogram(adamic_adar_scores, bin_width=0.01):
    """Plot a histogram of Adamic-Adar scores.

    :param adamic_adar_scores: either a Spark DataFrame with a ``score``
        column, or an iterable of rows/tuples whose third element is the score.
    :param bin_width: width of each histogram bin.
    :return: None. Prints a message and draws nothing when there are no scores.
    """
    if hasattr(adamic_adar_scores, "select"):
        # Spark DataFrame: fetch only the score column.
        scores = [row[0] for row in adamic_adar_scores.select("score").collect()]
    else:
        scores = [score[2] for score in adamic_adar_scores]
    scores = [s for s in scores if s is not None]

    if not scores:
        print("No Adamic-Adar scores to plot.")
        return

    # Build bin edges that always contain the maximum score and always form
    # at least one bin, even when every score is below bin_width.
    n_bins = max(1, int(np.ceil(max(scores) / bin_width)))
    bin_edges = np.linspace(0, n_bins * bin_width, n_bins + 1)

    plt.hist(scores, bins=bin_edges, edgecolor='black', alpha=0.7)
    plt.xlabel('Adamic-Adar Index')
    plt.ylabel('Frequency')
    plt.title('Histogram of Adamic-Adar Index for Predicted Edges')
    plt.show()


In [22]:
# Link prediction
# Compute the Adamic-Adar scores on the bipartite user-movie graph
adamic_adar_scores = calculate_adamic_adar(user_movie_graph)

In [26]:
from pyspark.sql.functions import when, count

def count_link_types(predicted_edges, vertices):
    """
    Count the predicted links by type (user-user, movie-movie, user-movie).

    :param predicted_edges: DataFrame of predicted links with columns 'v1', 'v2', 'score'.
    :param vertices: DataFrame of nodes with columns 'id' and 'bipartite'.
    :return: A dict with the count for each link type.
    """
    # Attach the node type (bipartite flag) of v1 and v2
    # NOTE(review): if `vertices` holds the same id under both bipartite values,
    # these left joins fan out and dropDuplicates keeps an arbitrary match, so
    # the per-type counts may not be stable — confirm vertex ids are unique.
    edges_with_types = predicted_edges \
        .join(vertices.select(col("id").alias("v1_id"), col("bipartite").alias("v1_type")), predicted_edges["v1"] == col("v1_id"), "left") \
        .join(vertices.select(col("id").alias("v2_id"), col("bipartite").alias("v2_type")), predicted_edges["v2"] == col("v2_id"), "left") \
        .dropDuplicates(["v1", "v2"])  # Remove duplicates introduced by the joins

    # Classify each link from the two endpoint types (0 = user, 1 = movie)
    link_counts = edges_with_types.withColumn(
        "link_type",
        when((col("v1_type") == 0) & (col("v2_type") == 0), "user-user")
        .when((col("v1_type") == 1) & (col("v2_type") == 1), "movie-movie")
        .when((col("v1_type") != col("v2_type")), "user-movie")
        .otherwise("unknown")
    ).groupBy("link_type").agg(count("*").alias("count"))

    # Collect the per-type counts into a dict
    link_counts_dict = {row["link_type"]: row["count"] for row in link_counts.collect()}

    # Sanity check: classified total should equal the number of predicted links
    # (predicted_edges.count() runs a separate Spark action)
    total_count = sum(link_counts_dict.values())
    print(f"Total predicted edges: {predicted_edges.count()}, Classified edges: {total_count}")

    return link_counts_dict

In [27]:
# Count the predicted links per endpoint-type combination and print the result
link_type_counts = count_link_types(adamic_adar_scores, user_movie_graph.vertices)
print("Link type counts:", link_type_counts)

Total predicted edges: 164054, Classified edges: 164054
Link type counts: {'movie-movie': 121118, 'user-user': 3198, 'user-movie': 39738}


In [None]:
# Print the number of predicted edges.
# count() is evaluated on the cluster; collecting every row to the driver just
# to take len() is unnecessary.
print(adamic_adar_scores.count())

164054


In [28]:
## # PROVA per stampare la tipologia di link trovate con adamic-adar (del tipo "quanti link user-user ho, e quanti link movie-movie ho")

In [29]:
# #plot to find a correct threshold
# plot_adamic_adar_histogram(adamic_adar_scores)

In [30]:
def add_predicted_links(graph, predicted_edges, threshold):
    """Return a copy of ``graph`` extended with the high-scoring predicted links.

    :param graph: GraphFrame whose edges have columns ``src``, ``dst``, ``weight``.
    :param predicted_edges: DataFrame with columns ``v1``, ``v2``, ``score``.
    :param threshold: only predicted links with ``score > threshold`` are added.
    :return: GraphFrame with the original vertices and the original edges plus
        the selected predicted links (their score becomes the edge weight).
    """
    # Filter and rename on the cluster instead of collecting all predictions to
    # the driver; this also works when no link passes the threshold.
    new_edges = predicted_edges \
        .filter(col("score") > threshold) \
        .select(
            col("v1").alias("src"),
            col("v2").alias("dst"),
            col("score").alias("weight")
        )

    # Append the new edges to the existing ones, matching columns by name
    all_edges = graph.edges.select("src", "dst", "weight").unionByName(new_edges)

    return GraphFrame(graph.vertices, all_edges)


In [22]:
# Extend the bipartite graph with the predicted links whose score exceeds 0.5
user_movie_graph_extended = add_predicted_links(user_movie_graph, adamic_adar_scores, 0.5)

**Prediction**

In [None]:
# Predict movies for a user, using the extended bipartite graph and the
# movie-movie projection; print at most the first 10 recommendations
user =10
recommended_movies = predict_user(user, user_movie_graph_extended, movie_movie_graph)
print(f"Recommended movies for user {user}: {recommended_movies[:10]}")

In [None]:
# Stop the SparkSession now that the analysis is finished
spark.stop()