In [1]:
import pandas as pd
import numpy as np
import networkx as nx
import matplotlib.pyplot as plt

# Load the MovieLens "ml-latest-small" ratings and movie metadata.
# The dataset directory is kept in a single constant so the notebook can be
# pointed at another location by editing one line.
DATA_DIR = '../ml-latest-small/ml-latest-small'
ratings = pd.read_csv(f'{DATA_DIR}/ratings.csv')
movies = pd.read_csv(f'{DATA_DIR}/movies.csv')


In [2]:
# Attach each movie's title and genres to every rating row
# (inner join on the shared `movieId` column), then preview the result.
user_movie_matrix = pd.merge(ratings, movies, how="inner", on="movieId")
print(user_movie_matrix.head())

   userId  movieId  rating   timestamp             title  \
0       1        1     4.0   964982703  Toy Story (1995)   
1       5        1     4.0   847434962  Toy Story (1995)   
2       7        1     4.5  1106635946  Toy Story (1995)   
3      15        1     2.5  1510577970  Toy Story (1995)   
4      17        1     4.5  1305696483  Toy Story (1995)   

                                        genres  
0  Adventure|Animation|Children|Comedy|Fantasy  
1  Adventure|Animation|Children|Comedy|Fantasy  
2  Adventure|Animation|Children|Comedy|Fantasy  
3  Adventure|Animation|Children|Comedy|Fantasy  
4  Adventure|Animation|Children|Comedy|Fantasy  


In [3]:
# Rating -> edge-weight lookup table.
# Ratings up to 1 map to -1.0, 1.5 to -0.5, 2 through 3 to 0.0,
# 3.5 to 0.5, and 4 / 4.5 / 5 to 1.0 / 1.1 / 1.2 respectively.
mapping_score = dict([
    (0.5, -1.0),
    (1, -1.0),
    (1.5, -0.5),
    (2, 0.0),
    (2.5, 0.0),
    (3, 0.0),
    (3.5, 0.5),
    (4, 1.0),
    (4.5, 1.1),
    (5, 1.2),
])

In [4]:
# Translate every rating into its edge weight through `mapping_score`
# (a rating missing from the table becomes NaN, as `Series.map` does).
user_movie_matrix = user_movie_matrix.assign(
    weight=user_movie_matrix['rating'].map(mapping_score)
)

In [5]:
# Build the edge list: one (src=user, dst=movie, weight) row per rating,
# with both endpoints stored as strings.
# NOTE(review): user IDs and movie IDs are both plain strings here, so the
# same value (e.g. '1') can denote a user in `src` and a movie in `dst`.
edges = pd.DataFrame({
    'src': user_movie_matrix['userId'].astype(str),
    'dst': user_movie_matrix['movieId'].astype(str),
    'weight': user_movie_matrix['weight'],
})

In [6]:
# Preview the first five rows of the edge list.
print(edges.head(n=5))

  src dst  weight
0   1   1     1.0
1   5   1     1.0
2   7   1     1.1
3  15   1     0.0
4  17   1     1.1


In [10]:
# Build the vertex table: one row per distinct user (bipartite = 0) followed
# by one row per distinct movie (bipartite = 1), IDs stored as strings.
# NOTE(review): because both kinds of ID are plain strings, the same `id`
# value can appear once in each partition; `id` alone is not a unique key.
user_vertices = pd.DataFrame(
    {'id': user_movie_matrix['userId'].astype(str).unique()}
).assign(bipartite=0)

movie_vertices = pd.DataFrame(
    {'id': user_movie_matrix['movieId'].astype(str).unique()}
).assign(bipartite=1)

vertices = pd.concat([user_vertices, movie_vertices], ignore_index=True)

# Debug preview of the first rows
print(vertices.head())


   id  bipartite
0   1          0
1   5          0
2   7          0
3  15          0
4  17          0


In [11]:
# Build the bipartite user-movie graph.
#
# Two problems of the plain edge-by-edge construction are addressed here:
#   * `edges['src']` and `edges['dst']` are both bare ID strings, so user '1'
#     and movie '1' would become the SAME node (the very first edge,
#     '1' -> '1', would be a self-loop). Movie nodes are therefore given a
#     prefix, while user nodes keep the plain `str(userId)` label that
#     `create_preference_vector` looks up.
#   * The projection helpers select nodes through the `bipartite` node
#     attribute, which must be set explicitly (0 = user, 1 = movie).
#
# Movie nodes are labelled 'm<movieId>'; strip MOVIE_NODE_PREFIX to recover
# the raw movieId.
MOVIE_NODE_PREFIX = 'm'

user_nodes = edges['src'].astype(str)
movie_nodes = MOVIE_NODE_PREFIX + edges['dst'].astype(str)

user_movie_graph = nx.Graph()
user_movie_graph.add_nodes_from(user_nodes.unique(), bipartite=0)
user_movie_graph.add_nodes_from(movie_nodes.unique(), bipartite=1)
# Bulk insertion avoids the per-row Series construction of `iterrows`.
user_movie_graph.add_weighted_edges_from(
    zip(user_nodes, movie_nodes, edges['weight'])
)


In [12]:
# User-user projection
def project_user_user_graph(user_movie_graph):
    """Project the bipartite graph onto its user nodes.

    Users are the nodes whose ``bipartite`` attribute equals 0. For every
    unordered pair of users, the weights of both users' edges to each movie
    they share are summed; the pair is linked with that total only when it
    is strictly positive.

    :param user_movie_graph: graph whose nodes carry a ``bipartite``
        attribute and whose edges carry a ``weight`` attribute.
    :return: a new ``nx.Graph`` containing the weighted user-user edges.
    """
    user_nodes = [
        node
        for node, attrs in user_movie_graph.nodes(data=True)
        if attrs.get('bipartite') == 0
    ]

    projection = nx.Graph()
    for offset, first in enumerate(user_nodes, start=1):
        # Neighbour set of the first user, built once per outer iteration.
        first_movies = set(user_movie_graph.neighbors(first))
        for second in user_nodes[offset:]:
            shared = first_movies.intersection(user_movie_graph.neighbors(second))
            total = 0
            for movie in shared:
                total += (
                    user_movie_graph[first][movie]['weight']
                    + user_movie_graph[second][movie]['weight']
                )
            if total > 0:
                projection.add_edge(first, second, weight=total)
    return projection

user_user_graph = project_user_user_graph(user_movie_graph)


In [13]:
# Movie-movie projection
def project_movie_movie_graph(user_movie_graph):
    """Project the bipartite graph onto its movie nodes.

    Movies are the nodes whose ``bipartite`` attribute equals 1. For every
    unordered pair of movies, the weights of the edges from each shared
    user to both movies are summed; the pair is linked with that total only
    when it is strictly positive.

    :param user_movie_graph: graph whose nodes carry a ``bipartite``
        attribute and whose edges carry a ``weight`` attribute.
    :return: a new ``nx.Graph`` containing the weighted movie-movie edges.
    """
    movie_nodes = [
        node
        for node, attrs in user_movie_graph.nodes(data=True)
        if attrs.get('bipartite') == 1
    ]

    projection = nx.Graph()
    for offset, first in enumerate(movie_nodes, start=1):
        # Neighbour set of the first movie, built once per outer iteration.
        first_users = set(user_movie_graph.neighbors(first))
        for second in movie_nodes[offset:]:
            shared = first_users.intersection(user_movie_graph.neighbors(second))
            total = 0
            for user in shared:
                total += (
                    user_movie_graph[user][first]['weight']
                    + user_movie_graph[user][second]['weight']
                )
            if total > 0:
                projection.add_edge(first, second, weight=total)
    return projection

movie_movie_graph = project_movie_movie_graph(user_movie_graph)

In [14]:
# Preference-vector computation
def create_preference_vector(user_id, user_movie_graph):
    """Return a user's preference vector as a ``{node: value}`` dict.

    The user node is ``str(user_id)``. When the weights of its incident
    edges sum to a strictly positive total, each neighbour is mapped to its
    edge weight divided by that total. Otherwise the result is a uniform
    vector over every node whose ``bipartite`` attribute equals 1.

    :param user_id: user identifier; converted with ``str`` for the lookup.
    :param user_movie_graph: graph with ``weight`` edge attributes and
        ``bipartite`` node attributes.
    :return: dict mapping node labels to preference values.
    """
    node = str(user_id)
    raw_weights = {}
    for neighbor in user_movie_graph.neighbors(node):
        raw_weights[neighbor] = user_movie_graph[node][neighbor]['weight']

    total = sum(raw_weights.values())
    if not total > 0:
        # No positive overall preference: fall back to a uniform vector.
        catalogue = [
            n for n, attrs in user_movie_graph.nodes(data=True)
            if attrs.get('bipartite') == 1
        ]
        return {movie: 1 / len(catalogue) for movie in catalogue}

    return {movie: value / total for movie, value in raw_weights.items()}


**PageRank**

In [15]:
# Prediction function
def predict_user(user_id, user_movie_graph, movie_movie_graph, top_n=10, alpha=0.95):
    """Recommend the highest-PageRank movies the user has not interacted with.

    The PageRank is computed on ``movie_movie_graph`` (it is not
    personalised); the preference vector is only used to know which nodes to
    exclude from the ranking.

    Fixes over the previous version:
      * it returned ``[]`` whenever every entry of the preference vector was
        positive, i.e. for any user whose ratings all map to positive
        weights, although unseen movies were still available;
      * movies the user rated with a zero or negative weight were not
        treated as seen and could be recommended back.

    :param user_id: user identifier, forwarded to ``create_preference_vector``.
    :param user_movie_graph: bipartite user-movie graph.
    :param movie_movie_graph: movie-movie projection to rank.
    :param top_n: maximum number of recommendations returned (default 10).
    :param alpha: PageRank damping factor (default 0.95).
    :return: list of at most ``top_n`` node labels, best ranked first;
        empty when no candidate is left.
    """
    p_vec = create_preference_vector(user_id, user_movie_graph)
    # Every key of the preference vector counts as already seen. When the
    # vector is the uniform fallback over all movies this excludes everything
    # and the result is [] (unchanged from the previous behaviour).
    already_seen = set(p_vec)

    candidates = [movie for movie in movie_movie_graph if movie not in already_seen]
    if not candidates:
        return []

    pagerank = nx.pagerank(movie_movie_graph, alpha=alpha)
    # Stable sort: ties keep the graph's node order, as before.
    candidates.sort(key=pagerank.__getitem__, reverse=True)
    return candidates[:top_n]


**Link Prediction**

In [16]:
# Adamic-Adar index computation
def calculate_adamic_adar(graph):
    """Score every unconnected node pair that has at least one common neighbour.

    For each such pair the score is the sum, over the common neighbours
    ``w``, of ``1 / log(number of neighbours of w)``.

    :param graph: graph to scan for non-edges.
    :return: list of ``(u, v, score)`` tuples, in ``nx.non_edges`` order.
    """
    results = []
    for u, v in nx.non_edges(graph):
        shared = set(nx.common_neighbors(graph, u, v))
        if not shared:
            continue
        total = 0
        for w in shared:
            neighbour_count = len(list(graph.neighbors(w)))
            total += 1 / np.log(neighbour_count)
        results.append((u, v, total))
    return results


In [17]:
# Histogram of the Adamic-Adar index values
def plot_histogram(scores):
    """Display a 100-bin histogram of the score in each (u, v, score) triple.

    :param scores: iterable of 3-tuples whose last element is the score.
    """
    index_values = [value for _src, _dst, value in scores]
    plt.hist(index_values, bins=100)
    plt.title('Adamic-Adar index')
    plt.xlabel('Index')
    plt.ylabel('Frequency')
    plt.show()

In [None]:
# Apply the helper: score the candidate links of the user-movie graph
# with the Adamic-Adar index.
adamic_adar_scores = calculate_adamic_adar(graph=user_movie_graph)

In [None]:
#     # :param predicted_edges: DataFrame dei link predetti con colonne 'v1', 'v2', 'score'.
#     # :param vertices: DataFrame dei nodi con colonne 'id' e 'bipartite'.
#     # :return: Un dizionario con i conti per ciascuna tipologia di link.

# def count_link_types(predicted_edges, vertices):
#     link_types = {}
#     for _, edge in predicted_edges.iterrows():
#         v1_bipartite = vertices.loc[vertices['id'] == edge['v1'], 'bipartite'].values[0]
#         v2_bipartite = vertices.loc[vertices['id'] == edge['v2'], 'bipartite'].values[0]
#         link_type = (v1_bipartite, v2_bipartite)
#         link_types[link_type] = link_types.get(link_type, 0) + 1
#     return link_types

# # Calcolo del numero di link predetti per ciascuna tipologia
# predicted_edges = pd.DataFrame(adamic_adar_scores, columns=['v1', 'v2', 'score'])
# link_types = count_link_types(predicted_edges, vertices)
# print(link_types)

def count_link_types(predicted_edges, vertices):
    """
    Count predicted links by the kind of nodes they connect
    (user-user, movie-movie, user-movie, unknown).

    A link is "user-movie" only when one endpoint is a user and the other a
    movie. Any link with an endpoint of unknown type is counted as "unknown"
    (previously a known/unknown pair was miscounted as "user-movie").

    :param predicted_edges: Sized iterable of (u, v, score) tuples.
    :param vertices: Either a mapping ``node id -> type`` (0 = user,
        1 = movie) or a DataFrame with 'id' and 'bipartite' columns.
        With a DataFrame, a repeated id keeps the type of its last row.
    :return: Dict with the count for each link type.
    """
    if isinstance(vertices, pd.DataFrame):
        # `DataFrame.get` looks up column labels, not row values, so convert
        # the table to a plain id -> type mapping first.
        node_types = dict(zip(vertices['id'].tolist(), vertices['bipartite'].tolist()))
    else:
        node_types = vertices

    link_counts = {
        "user-user": 0,
        "movie-movie": 0,
        "user-movie": 0,
        "unknown": 0
    }

    for u, v, _ in predicted_edges:
        # -1 marks a node that has no known type.
        endpoint_types = {node_types.get(u, -1), node_types.get(v, -1)}

        if endpoint_types == {0}:
            link_counts["user-user"] += 1
        elif endpoint_types == {1}:
            link_counts["movie-movie"] += 1
        elif endpoint_types == {0, 1}:
            link_counts["user-movie"] += 1
        else:
            link_counts["unknown"] += 1

    # Print the totals
    print(f"Total predicted edges: {len(predicted_edges)}, Classified edges: {sum(link_counts.values())}")

    return link_counts


In [None]:
# `count_link_types` resolves node types with `.get(node, -1)`. Called on the
# `vertices` DataFrame, `.get` would look up a column label and always yield
# -1, so an explicit id -> type mapping is built instead.
vertex_types = dict(zip(vertices['id'], vertices['bipartite']))
# Types stored on the graph nodes themselves, when present, take precedence
# over the table (whose `id` column is not unique across users and movies).
vertex_types.update(nx.get_node_attributes(user_movie_graph, 'bipartite'))

link_type_counts = count_link_types(adamic_adar_scores, vertex_types)
print("Link type counts:", link_type_counts)

In [None]:
# Report how many candidate links received an Adamic-Adar score.
predicted_link_count = len(adamic_adar_scores)
print("The link predicted are # ", predicted_link_count)

In [None]:
# Adding predicted links
def add_predicted_links(graph, predicted_edges, threshold, inplace=False):
    """Return a graph extended with the predicted links scoring above ``threshold``.

    By default the input graph is left untouched and the links are added to
    a copy. Previously the caller's graph was mutated and returned, so the
    "extended" graph and the original were the same object. Pass
    ``inplace=True`` to get that mutation explicitly.

    :param graph: graph to extend; must provide ``copy()`` and ``add_edge()``.
    :param predicted_edges: iterable of ``(u, v, score)`` tuples.
    :param threshold: only links with ``score > threshold`` are added.
    :param inplace: when True, add the links to ``graph`` itself.
    :return: the extended graph (``graph`` itself when ``inplace`` is True).
    """
    extended = graph if inplace else graph.copy()
    for u, v, score in predicted_edges:
        if score > threshold:
            # The prediction score is stored as the new edge's weight.
            extended.add_edge(u, v, weight=score)
    return extended


In [None]:
# Minimum Adamic-Adar score a predicted link needs in order to be added.
ADAMIC_ADAR_THRESHOLD = 0.5

# Extend a copy, so that `user_movie_graph` keeps only the observed ratings
# and `user_movie_graph_extended` is a distinct graph rather than an alias.
user_movie_graph_extended = add_predicted_links(
    user_movie_graph.copy(), adamic_adar_scores, ADAMIC_ADAR_THRESHOLD
)

In [None]:
# Recommendation for a single example user
user = 10
recommended_movies = predict_user(
    user_id=user,
    user_movie_graph=user_movie_graph_extended,
    movie_movie_graph=movie_movie_graph,
)
print(f"Recommended movies for user {user}: {recommended_movies[:10]}")