In [2]:
import pandas as pd
import numpy as np
import networkx as nx
import matplotlib.pyplot as plt

# Caricamento dei dati
ratings = pd.read_csv('../ml-latest-small/ratings.csv')
movies = pd.read_csv('../ml-latest-small/movies.csv')


In [3]:
# Merge dei dataset ratings e movies
user_movie_matrix = ratings.merge(movies, on="movieId", how="inner")
print(user_movie_matrix.head())

   userId  movieId  rating  timestamp                        title  \
0       1        1     4.0  964982703             Toy Story (1995)   
1       1        3     4.0  964981247      Grumpier Old Men (1995)   
2       1        6     4.0  964982224                  Heat (1995)   
3       1       47     5.0  964983815  Seven (a.k.a. Se7en) (1995)   
4       1       50     5.0  964982931   Usual Suspects, The (1995)   

                                        genres  
0  Adventure|Animation|Children|Comedy|Fantasy  
1                               Comedy|Romance  
2                        Action|Crime|Thriller  
3                             Mystery|Thriller  
4                       Crime|Mystery|Thriller  


In [4]:
# Mappatura dei punteggi
mapping_score = {
    0.5: -1.0,
    1: -1.0,
    1.5: -0.5,
    2: 0.0,
    2.5: 0.0,
    3: 0.0,
    3.5: 0.5,
    4: 1.0,
    4.5: 1.1,
    5: 1.2
}

In [5]:
user_movie_matrix['weight'] = user_movie_matrix['rating'].map(mapping_score)

In [6]:
# Creazione degli edge
edges = user_movie_matrix[['userId', 'movieId', 'weight']].rename(
    columns={'userId': 'src', 'movieId': 'dst'}
)
edges['src'] = edges['src'].astype(str)
edges['dst'] = edges['dst'].astype(str)

In [7]:
print(edges.head())

  src dst  weight
0   1   1     1.0
1   1   3     1.0
2   1   6     1.0
3   1  47     1.2
4   1  50     1.2


In [8]:
# Creazione dei vertici
user_vertices = pd.DataFrame(user_movie_matrix['userId'].astype(str).unique(), columns=['id'])
user_vertices['bipartite'] = 0

movie_vertices = pd.DataFrame(user_movie_matrix['movieId'].astype(str).unique(), columns=['id'])
movie_vertices['bipartite'] = 1

vertices = pd.concat([user_vertices, movie_vertices], ignore_index=True)

#debug print
print(vertices.head())


  id  bipartite
0  1          0
1  2          0
2  3          0
3  4          0
4  5          0


In [9]:
# Creazione del grafo bipartito
user_movie_graph = nx.Graph()
for _, edge in edges.iterrows():
    user_movie_graph.add_edge(edge['src'], edge['dst'], weight=edge['weight'])


In [10]:
# Proiezione user-user
def project_user_user_graph(user_movie_graph):
    user_user_edges = []
    users = [node for node, data in user_movie_graph.nodes(data=True) if data.get('bipartite') == 0]
    for i, u1 in enumerate(users):
        for u2 in users[i+1:]:
            common_movies = set(user_movie_graph.neighbors(u1)).intersection(user_movie_graph.neighbors(u2))
            weight = sum(
                user_movie_graph[u1][movie]['weight'] + user_movie_graph[u2][movie]['weight']
                for movie in common_movies
            )
            if weight > 0:
                user_user_edges.append((u1, u2, weight))
    user_user_graph = nx.Graph()
    user_user_graph.add_weighted_edges_from(user_user_edges)
    return user_user_graph

user_user_graph = project_user_user_graph(user_movie_graph)


In [11]:
# Proiezione movie-movie
def project_movie_movie_graph(user_movie_graph):
    movie_movie_edges = []
    movies = [node for node, data in user_movie_graph.nodes(data=True) if data.get('bipartite') == 1]
    for i, m1 in enumerate(movies):
        for m2 in movies[i+1:]:
            common_users = set(user_movie_graph.neighbors(m1)).intersection(user_movie_graph.neighbors(m2))
            weight = sum(
                user_movie_graph[user][m1]['weight'] + user_movie_graph[user][m2]['weight']
                for user in common_users
            )
            if weight > 0:
                movie_movie_edges.append((m1, m2, weight))
    movie_movie_graph = nx.Graph()
    movie_movie_graph.add_weighted_edges_from(movie_movie_edges)
    return movie_movie_graph

movie_movie_graph = project_movie_movie_graph(user_movie_graph)

In [12]:
# Funzione per calcolare il vettore di preferenze
def create_preference_vector(user_id, user_movie_graph):
    user_node = str(user_id)
    edges = [(neighbor, user_movie_graph[user_node][neighbor]['weight']) for neighbor in user_movie_graph.neighbors(user_node)]
    tot = sum(weight for _, weight in edges)
    if tot > 0:
        return {movie: weight / tot for movie, weight in edges}
    else:
        movies = [n for n, data in user_movie_graph.nodes(data=True) if data.get('bipartite') == 1]
        return {movie: 1 / len(movies) for movie in movies}


**Page Rank**

In [13]:
# Funzione di predizione
def predict_user(user_id, user_movie_graph, movie_movie_graph):
    p_vec = create_preference_vector(user_id, user_movie_graph)
    already_seen = [movie for movie, weight in p_vec.items() if weight > 0]
    if len(already_seen) == len(p_vec):
        return []
    pagerank = nx.pagerank(movie_movie_graph, alpha=0.95)
    item_rank = sorted(
        [(movie, rank) for movie, rank in pagerank.items() if movie not in already_seen],
        key=lambda x: x[1], reverse=True
    )
    return [movie for movie, _ in item_rank[:10]]

**Link Prediction**

In [14]:
# Calcolo dell'indice di Adamic-Adar
def calculate_adamic_adar(graph):
    scores = []
    for u, v in nx.non_edges(graph):
        common_neighbors = set(nx.common_neighbors(graph, u, v))
        if common_neighbors:
            score = sum(1 / np.log(len(list(graph.neighbors(w)))) for w in common_neighbors)
            scores.append((u, v, score))
    return scores

In [15]:
#plot histogram for Adamic-Adar index 
def plot_histogram(scores):
    plt.hist([score for _, _, score in scores], bins=100)
    plt.title('Adamic-Adar index')
    plt.xlabel('Index')
    plt.ylabel('Frequency')
    plt.show()

In [16]:
# Uso delle funzioni
adamic_adar_scores = calculate_adamic_adar(user_movie_graph)

In [42]:
#     # :param predicted_edges: DataFrame dei link predetti con colonne 'v1', 'v2', 'score'.
#     # :param vertices: DataFrame dei nodi con colonne 'id' e 'bipartite'.
#     # :return: Un dizionario con i conti per ciascuna tipologia di link.

# def count_link_types(predicted_edges, vertices):
#     link_types = {}
#     for _, edge in predicted_edges.iterrows():
#         v1_bipartite = vertices.loc[vertices['id'] == edge['v1'], 'bipartite'].values[0]
#         v2_bipartite = vertices.loc[vertices['id'] == edge['v2'], 'bipartite'].values[0]
#         link_type = (v1_bipartite, v2_bipartite)
#         link_types[link_type] = link_types.get(link_type, 0) + 1
#     return link_types

# # Calcolo del numero di link predetti per ciascuna tipologia
# predicted_edges = pd.DataFrame(adamic_adar_scores, columns=['v1', 'v2', 'score'])
# link_types = count_link_types(predicted_edges, vertices)
# print(link_types)

def count_link_types(predicted_edges, vertices):
    """
    Conta le tipologie di link predetti (user-user, movie-movie, user-movie).
    
    :param predicted_edges: Lista di tuple (u, v, score) con i link predetti.
    :param vertices: Dizionario con ID dei nodi e il loro tipo (0 = user, 1 = movie).
    :return: Un dizionario con i conti per ciascuna tipologia di link.
    """
    # Dizionario per contare le tipologie di link
    link_counts = {
        "user-user": 0,
        "movie-movie": 0,
        "user-movie": 0,
        "unknown": 0
    }

    counter = 0
    # Itera attraverso i link predetti e classifica
    for u, v, _ in predicted_edges:
        v1_type = vertices.get(u, -1)  # Tipo del nodo u
        v2_type = vertices.get(v, -1)  # Tipo del nodo v
        
        if counter % 100 == 0:
            print(u, v)
            print(v1_type, v2_type)

        if v1_type == 0 and v2_type == 0:
            link_counts["user-user"] += 1
        elif v1_type == 1 and v2_type == 1:
            link_counts["movie-movie"] += 1
        elif v1_type != v2_type:
            link_counts["user-movie"] += 1
        else:
            link_counts["unknown"] += 1  # Caso di tipo sconosciuto
        counter += 1
    # Stampa il totale
    print(f"Total predicted edges: {len(predicted_edges)}, Classified edges: {sum(link_counts.values())}")

    return link_counts


In [38]:
# Create a directed graph
user_movie_graph1 = nx.Graph()

for _, row in user_movie_matrix.iterrows():
    user_movie_graph1.add_node(row["userId"], bipartite=0)
    user_movie_graph1.add_node(row["movieId"], bipartite=1, genre=row["genres"], title=row["title"])
    user_movie_graph1.add_edge(row["userId"], row["movieId"], weight=row["rating"])
    # user_movie_graph1.add_edge(row["userId"], row["title"], weight=mapping_score[row["rating"]])

In [39]:
vertices1 = {n: d["bipartite"] for n, d in user_movie_graph1.nodes(data=True)}

In [40]:
print(len(vertices1))

9811


In [41]:
for k, v in vertices1.items():
    print(k, v)

1 1
3 1
6 1
47 1
50 1
70 1
101 1
110 1
151 1
157 1
163 1
216 1
223 1
231 1
235 1
260 1
296 1
316 1
333 1
349 1
356 1
362 1
367 1
423 1
441 1
457 1
480 1
500 1
527 1
543 1
552 1
553 1
590 1
592 1
593 1
596 1
608 1
648 1
661 1
673 1
733 1
736 1
780 1
804 1
919 1
923 1
940 1
943 1
954 1
1009 1
1023 1
1024 1
1025 1
1029 1
1030 1
1031 1
1032 1
1042 1
1049 1
1060 1
1073 1
1080 1
1089 1
1090 1
1092 1
1097 1
1127 1
1136 1
1196 1
1197 1
1198 1
1206 1
1208 1
1210 1
1213 1
1214 1
1219 1
1220 1
1222 1
1224 1
1226 1
1240 1
1256 1
1258 1
1265 1
1270 1
1275 1
1278 1
1282 1
1291 1
1298 1
1348 1
1377 1
1396 1
1408 1
1445 1
1473 1
1500 1
1517 1
1552 1
1573 1
1580 1
1587 1
1617 1
1620 1
1625 1
1644 1
1676 1
1732 1
1777 1
1793 1
1804 1
1805 1
1920 1
1927 1
1954 1
1967 1
2000 1
2005 1
2012 1
2018 1
2028 1
2033 1
2046 1
2048 1
2054 1
2058 1
2078 1
2090 1
2093 1
2094 1
2096 1
2099 1
2105 1
2115 1
2116 1
2137 1
2139 1
2141 1
2143 1
2161 1
2174 1
2193 1
2253 1
2268 1
2273 1
2291 1
2329 1
2338 1
2353 1
2366 1
2

In [43]:
link_type_counts = count_link_types(adamic_adar_scores, vertices1)
print("Link type counts:", link_type_counts)

281 2174
-1 -1
281 2100
-1 -1
281 3398
-1 -1
281 70
-1 -1
281 1259
-1 -1
281 68237
-1 -1
281 102
-1 -1
281 1286
-1 -1
281 230
-1 -1
281 3809
-1 -1
281 560
-1 -1
281 80831
-1 -1
281 3471
-1 -1
281 8
-1 -1
281 994
-1 -1
281 5060
-1 -1
281 54281
-1 -1
281 26726
-1 -1
281 105844
-1 -1
281 85796
-1 -1
281 584
-1 -1
281 1641
-1 -1
281 5792
-1 -1
281 1242
-1 -1
281 60684
-1 -1
281 202
-1 -1
281 60291
-1 -1
281 95167
-1 -1
281 2688
-1 -1
281 43908
-1 -1
281 5198
-1 -1
51357 1
-1 -1
51357 208
-1 -1
51357 91658
-1 -1
51357 150548
-1 -1
51357 96488
-1 -1
4317 1333
-1 -1
4317 553
-1 -1
4317 2088
-1 -1
4317 833
-1 -1
4317 2804
-1 -1
4317 8949
-1 -1
4317 227
-1 -1
4317 1095
-1 -1
4317 45440
-1 -1
4317 4127
-1 -1
4317 6687
-1 -1
4317 3916
-1 -1
4317 212
-1 -1
4317 46970
-1 -1
4317 4643
-1 -1
4317 441
-1 -1
93270 95207
-1 -1
93270 3978
-1 -1
93270 570
-1 -1
93270 90888
-1 -1
93270 6294
-1 -1
93270 48738
-1 -1
93270 7099
-1 -1
93270 67087
-1 -1
93270 66371
-1 -1
93270 104211
-1 -1
93270 88140
-1 -1
932

: 

In [None]:
print("The link predicted are # ", len(adamic_adar_scores))

In [None]:
# Aggiunta di link predetti
def add_predicted_links(graph, predicted_edges, threshold):
    for u, v, score in predicted_edges:
        if score > threshold:
            graph.add_edge(u, v, weight=score)
    return graph


In [None]:
user_movie_graph_extended = add_predicted_links(user_movie_graph, adamic_adar_scores, 0.5)

In [None]:
# Predizione per un utente
user = 10
recommended_movies = predict_user(user, user_movie_graph_extended, movie_movie_graph)
print(f"Recommended movies for user {user}: {recommended_movies[:10]}")