Usare la link prediction con l'indice di Adamic-Adar per PageRank

In [1]:
import pandas as pd
import networkx as nx
from collections import defaultdict

# Load the ratings and movies tables of the `ml-latest-small` dataset.
# The dataset directory is named once so both reads always stay in sync.
DATA_DIR = "ml-latest-small/ml-latest-small"
ratings = pd.read_csv(f"{DATA_DIR}/ratings.csv")
movies = pd.read_csv(f"{DATA_DIR}/movies.csv")

# Attach the movie columns to every rating row (inner join on movieId).
# validate="many_to_one" makes pandas raise a MergeError if a movieId appears
# more than once in `movies`, which would otherwise silently duplicate ratings.
ratings = pd.merge(ratings, movies, on="movieId", validate="many_to_one")
print(ratings.head())

   userId  movieId  rating   timestamp             title  \
0       1        1     4.0   964982703  Toy Story (1995)   
1       5        1     4.0   847434962  Toy Story (1995)   
2       7        1     4.5  1106635946  Toy Story (1995)   
3      15        1     2.5  1510577970  Toy Story (1995)   
4      17        1     4.5  1305696483  Toy Story (1995)   

                                        genres  
0  Adventure|Animation|Children|Comedy|Fantasy  
1  Adventure|Animation|Children|Comedy|Fantasy  
2  Adventure|Animation|Children|Comedy|Fantasy  
3  Adventure|Animation|Children|Comedy|Fantasy  
4  Adventure|Animation|Children|Comedy|Fantasy  


In [2]:
# Sanity check: titles that were rated more than once
ratings_per_title = ratings.groupby("title").size()
common_movies = ratings_per_title.reset_index(name='count')
rated_more_than_once = common_movies['count'] > 1
print(common_movies[rated_more_than_once].head(10))  # Debug

                             title  count
2           'Round Midnight (1986)      2
4        'Til There Was You (1997)      2
6               'burbs, The (1989)     17
8      (500) Days of Summer (2009)     42
9   *batteries not included (1987)      7
11   ...And Justice for All (1979)      3
14                       10 (1979)      4
15           10 Cent Pistol (2015)      2
16      10 Cloverfield Lane (2016)     14
17         10 Items or Less (2006)      3


In [3]:
# Rating -> score lookup.
# Ratings up to 1.5 map to negative scores, ratings from 2 to 3 map to zero
# and ratings from 3.5 upwards map to positive scores (at most 1.2).
mapping_score = dict(
    [
        # negative scores
        (0.5, -1),
        (1, -1),
        (1.5, -0.5),
        # zero scores
        (2, 0),
        (2.5, 0),
        (3, 0),
        # positive scores
        (3.5, 0.5),
        (4, 1),
        (4.5, 1.1),
        (5, 1.2),
    ]
)

# Not present in the original code:
# ratings["weight"] = ratings["rating"].map(mapping_score)
# print(ratings.head())

In [4]:
# Create the bipartite user-movie graph: user ids on side 0, movie titles on
# side 1, one edge per rating row weighted through `mapping_score`.
user_movie_graph = nx.Graph()
# Iterate over the three needed columns directly instead of DataFrame.iterrows(),
# which builds a full Series for every row. Rows are visited in the same order,
# so the node and edge insertion order is unchanged.
for user_id, title, rating in zip(ratings["userId"], ratings["title"], ratings["rating"]):
    user_movie_graph.add_node(user_id, bipartite=0)
    # NOTE(review): movie nodes are keyed by title, so two movies sharing a
    # title would collapse into one node - confirm titles are unique per movieId.
    user_movie_graph.add_node(title, bipartite=1)
    user_movie_graph.add_edge(user_id, title, weight=mapping_score[rating])

# Debug
print(f"Nodes in the graph: {list(user_movie_graph.nodes(data=True))[:10]}")
print(f"Edges in the graph: {list(user_movie_graph.edges(data=True))[:10]}")

Nodes in the graph: [(1, {'bipartite': 0}), ('Toy Story (1995)', {'bipartite': 1}), (5, {'bipartite': 0}), (7, {'bipartite': 0}), (15, {'bipartite': 0}), (17, {'bipartite': 0}), (18, {'bipartite': 0}), (19, {'bipartite': 0}), (21, {'bipartite': 0}), (27, {'bipartite': 0})]
Edges in the graph: [(1, 'Toy Story (1995)', {'weight': 1}), (1, 'Grumpier Old Men (1995)', {'weight': 1}), (1, 'Heat (1995)', {'weight': 1}), (1, 'Seven (a.k.a. Se7en) (1995)', {'weight': 1.2}), (1, 'Usual Suspects, The (1995)', {'weight': 1.2}), (1, 'From Dusk Till Dawn (1996)', {'weight': 0}), (1, 'Bottle Rocket (1996)', {'weight': 1.2}), (1, 'Braveheart (1995)', {'weight': 1}), (1, 'Rob Roy (1995)', {'weight': 1.2}), (1, 'Canadian Bacon (1995)', {'weight': 1.2})]


In [5]:
# Projection of the bipartite graph onto each of its two sides.
# NOTE: `movies` is rebound here from the movies DataFrame to the set of
# movie-title nodes.
node_sides = user_movie_graph.nodes(data="bipartite")
users = {node for node, side in node_sides if side == 0}
movies = {node for node, side in node_sides if side == 1}

# One projected graph per side of the bipartite graph
user_user_graph = nx.bipartite.weighted_projected_graph(user_movie_graph, users)
movie_movie_graph = nx.bipartite.weighted_projected_graph(user_movie_graph, movies)

# Debug output for the projections
print(f"Nodes in user_user_graph: {list(user_user_graph.nodes(data=True))[:10]}")
print(f"Edges in user_user_graph: {list(user_user_graph.edges(data=True))[:10]}")
print(f"Nodes in movie_movie_graph: {list(movie_movie_graph.nodes(data=True))[:10]}")
print(f"Edges in movie_movie_graph: {list(movie_movie_graph.edges(data=True))[:10]}")

Nodes in user_user_graph: [(1, {'bipartite': 0}), (2, {'bipartite': 0}), (3, {'bipartite': 0}), (4, {'bipartite': 0}), (5, {'bipartite': 0}), (6, {'bipartite': 0}), (7, {'bipartite': 0}), (8, {'bipartite': 0}), (9, {'bipartite': 0}), (10, {'bipartite': 0})]
Edges in user_user_graph: [(1, 2, {'weight': 2}), (1, 3, {'weight': 7}), (1, 4, {'weight': 45}), (1, 5, {'weight': 13}), (1, 6, {'weight': 33}), (1, 7, {'weight': 26}), (1, 8, {'weight': 15}), (1, 9, {'weight': 5}), (1, 10, {'weight': 6}), (1, 11, {'weight': 16})]
Nodes in movie_movie_graph: [('Denise Calls Up (1995)', {'bipartite': 1}), ('Nine Lives of Tomas Katz, The (2000)', {'bipartite': 1}), ('Bank Job, The (2008)', {'bipartite': 1}), ('Crippled Avengers (Can que) (Return of the 5 Deadly Venoms) (1981)', {'bipartite': 1}), ('Bling Ring, The (2013)', {'bipartite': 1}), ('Beerfest (2006)', {'bipartite': 1}), ('Other Sister, The (1999)', {'bipartite': 1}), ('Dead Again (1991)', {'bipartite': 1}), ('The Cave of the Golden Rose (199

In [6]:
# Function that computes the preference vector of a user
def create_preference_vector(user_id: int, user_movie_graph: "nx.Graph"):
    """Build the preference vector of one user over all movie nodes.

    Every movie node (``bipartite == 1``) of ``user_movie_graph`` gets an
    entry: the weight of the user's edge to that movie divided by the sum of
    all of the user's edge weights, or 0 when the user has no edge to it.
    When that sum is not positive, every movie gets the same value 1 instead.

    The movie nodes are read from the graph that is passed in rather than from
    a notebook-level ``movies`` variable, so the result depends only on the
    arguments.

    Args:
        user_id: Node of ``user_movie_graph`` whose edges are read.
        user_movie_graph: Graph whose nodes carry a ``bipartite`` attribute and
            whose edges carry a ``weight`` attribute.

    Returns:
        dict mapping each movie node to its preference value. Entries are
        negative for movies reached through negatively weighted edges.
    """
    rated = {movie: weight for _, movie, weight in user_movie_graph.edges(user_id, data="weight")}
    movie_nodes = [
        node for node, side in user_movie_graph.nodes(data="bipartite") if side == 1
    ]
    tot = sum(rated.values())
    if tot > 0:
        return {movie: rated.get(movie, 0) / tot for movie in movie_nodes}
    # No positive total to normalise by: fall back to equal values for all movies.
    return {movie: 1 for movie in movie_nodes}


**PageRank**

In [7]:
def predict_user(user_id, user_movie_graph: "nx.Graph", movie_movie_graph: "nx.Graph"):
    """Rank the movies a user has not rated yet with personalized PageRank.

    PageRank is run on ``movie_movie_graph`` with the user's preference vector
    as personalization; the nodes are sorted by descending rank and every
    movie the user already has an edge to in ``user_movie_graph`` is removed.

    "Already seen" means every neighbour of the user in the bipartite graph,
    whatever the edge weight, so movies with a zero or negative weight are not
    recommended again and users whose weights do not sum to a positive value
    still get recommendations.

    Args:
        user_id: User node in ``user_movie_graph``.
        user_movie_graph: Bipartite user-movie graph with weighted edges.
        movie_movie_graph: Graph PageRank is computed on (``weight`` edge
            attribute is used).

    Returns:
        list of nodes of ``movie_movie_graph`` ordered from highest to lowest
        rank, without the user's already rated movies. Empty when the user is
        not in the graph or has no edges.
    """
    if user_id not in user_movie_graph:
        return []
    # A set gives O(1) membership tests in the final filter.
    already_seen = set(user_movie_graph.neighbors(user_id))
    if not already_seen:
        return []

    p_vec = create_preference_vector(user_id, user_movie_graph)
    item_rank = nx.pagerank(movie_movie_graph, personalization=p_vec, alpha=0.95, weight="weight")
    ranked = sorted(
        movie_movie_graph.nodes(),
        key=lambda movie: item_rank.get(movie, 0),
        reverse=True,
    )
    return [movie for movie in ranked if movie not in already_seen]

**Link Prediction**

In [8]:
# 1. Calculate Link Prediction using Adamic-Adar Index
def link_prediction(graph: "nx.Graph", ebunch=None):
    """Score candidate edges of ``graph`` with the Adamic-Adar index.

    Args:
        graph: Graph to run ``nx.adamic_adar_index`` on.
        ebunch: Optional iterable of ``(u, v)`` node pairs to score. With the
            default ``None`` networkx scores every pair of non-adjacent nodes,
            which grows quadratically with the number of nodes; pass an
            explicit candidate list to keep large graphs tractable.

    Returns:
        list of ``(u, v, score)`` tuples, one per scored pair.
    """
    return list(nx.adamic_adar_index(graph, ebunch))

In [9]:
import matplotlib.pyplot as plt
import numpy as np

def plot_adamic_adar_histogram(predicted_edges, bin_width: float = 0.01):
    """Plot a histogram of the scores of the predicted edges.

    Args:
        predicted_edges: Iterable of ``(u, v, score)`` tuples; only the third
            element of each item is used.
        bin_width: Width of each histogram bin (default 0.01).

    Raises:
        ValueError: If ``predicted_edges`` is empty or ``bin_width`` is not
            positive.
    """
    if bin_width <= 0:
        raise ValueError("bin_width must be positive")
    scores = [edge[2] for edge in predicted_edges]
    if not scores:
        raise ValueError("predicted_edges is empty: there are no scores to plot")

    # Build the bin edges so that the last edge is never below the largest
    # score. np.arange(0, max, width) stops short of the maximum, which drops
    # the highest scores from the histogram and yields fewer than two edges
    # when the maximum is at most one bin width.
    n_bins = max(1, int(max(scores) // bin_width) + 1)
    bin_edges = np.linspace(0.0, n_bins * bin_width, n_bins + 1)

    plt.hist(scores, bins=bin_edges, edgecolor='black', alpha=0.7)
    plt.xlabel('Adamic-Adar Index')
    plt.ylabel('Frequency')
    plt.title('Histogram of Adamic-Adar Index for Predicted Edges')
    plt.show()

In [None]:
# Plot the histogram of the scores obtained from the Adamic-Adar index
bipartite_predictions = link_prediction(user_movie_graph)
plot_adamic_adar_histogram(bipartite_predictions)

In [None]:
# Report how many predicted edges link_prediction returns for the bipartite graph
n_predicted_edges = len(link_prediction(user_movie_graph))
print(n_predicted_edges)

In [66]:
def add_predicted_links(graph: "nx.Graph", predicted_edges, threshold: float, verbose: bool = False):
    """Return a copy of ``graph`` extended with the high-scoring predicted edges.

    ``graph`` itself is not modified: the edges are added to ``graph.copy()``
    and that copy is returned, so the caller must keep the return value.

    Args:
        graph: Graph to extend.
        predicted_edges: Iterable of ``(u, v, score)`` tuples.
        threshold: Only predictions with ``score > threshold`` (strictly) are
            added; the score is stored as the edge ``weight``.
        verbose: When True, print one debug line per added edge. Off by
            default because a large prediction list would flood the output.

    Returns:
        The extended copy of ``graph``.
    """
    extended_graph = graph.copy()
    for u, v, score in predicted_edges:
        if score > threshold:  # keep only predictions above the threshold
            extended_graph.add_edge(u, v, weight=score)
            if verbose:
                print(f"Added predicted edge: ({u}, {v}) with weight {score}")  # Debug
    return extended_graph

In [67]:
# Apply Link Prediction to the movie-movie graph.
# add_predicted_links() adds the edges to a copy and returns that copy, so the
# result has to be bound to a name or the predicted edges are lost. Rebinding
# `movie_movie_graph` lets the later cells that read this name (the PageRank
# prediction) see the predicted edges.
# NOTE: the cell is therefore not idempotent - re-running it extends the
# already extended graph; re-run the projection cell first to start over.
AA_THRESHOLD = 0.5  # minimum Adamic-Adar score for a predicted edge to be added
movie_movie_graph = add_predicted_links(
    movie_movie_graph, link_prediction(movie_movie_graph), AA_THRESHOLD
)

KeyboardInterrupt: 

**Prediction**

In [None]:
# Rank movies for a single user with PageRank over `movie_movie_graph`
# and show the ten best-ranked titles.
user = 10
s_t = predict_user(
    user_id=user,
    user_movie_graph=user_movie_graph,
    movie_movie_graph=movie_movie_graph,
)
print(f"Predicted movies for user {user}: {s_t[:10]}")