In [1]:
utente = 10

In [2]:
import pandas as pd
ratings = pd.read_csv('../ml-latest-small/ratings.csv')

In [3]:
movies = pd.read_csv("../ml-latest-small/movies.csv")

In [4]:
user_movie_matrix = pd.merge(ratings, movies, on='movieId')

In [5]:
print(user_movie_matrix.head())

   userId  movieId  rating  timestamp                        title  \
0       1        1     4.0  964982703             Toy Story (1995)   
1       1        3     4.0  964981247      Grumpier Old Men (1995)   
2       1        6     4.0  964982224                  Heat (1995)   
3       1       47     5.0  964983815  Seven (a.k.a. Se7en) (1995)   
4       1       50     5.0  964982931   Usual Suspects, The (1995)   

                                        genres  
0  Adventure|Animation|Children|Comedy|Fantasy  
1                               Comedy|Romance  
2                        Action|Crime|Thriller  
3                             Mystery|Thriller  
4                       Crime|Mystery|Thriller  


In [6]:
user_movie_matrix_pivot = user_movie_matrix.pivot(index='userId', columns='movieId', values='rating').fillna(0)
print(user_movie_matrix_pivot.head())

movieId  1       2       3       4       5       6       7       8       \
userId                                                                    
1           4.0     0.0     4.0     0.0     0.0     4.0     0.0     0.0   
2           0.0     0.0     0.0     0.0     0.0     0.0     0.0     0.0   
3           0.0     0.0     0.0     0.0     0.0     0.0     0.0     0.0   
4           0.0     0.0     0.0     0.0     0.0     0.0     0.0     0.0   
5           4.0     0.0     0.0     0.0     0.0     0.0     0.0     0.0   

movieId  9       10      ...  193565  193567  193571  193573  193579  193581  \
userId                   ...                                                   
1           0.0     0.0  ...     0.0     0.0     0.0     0.0     0.0     0.0   
2           0.0     0.0  ...     0.0     0.0     0.0     0.0     0.0     0.0   
3           0.0     0.0  ...     0.0     0.0     0.0     0.0     0.0     0.0   
4           0.0     0.0  ...     0.0     0.0     0.0     0.0     0.0     0

In [7]:
already_rated = user_movie_matrix_pivot.loc[utente][user_movie_matrix_pivot.loc[utente] > 0].index.tolist()
print(len(already_rated))

140


In [8]:
not_seen = user_movie_matrix_pivot.columns[~user_movie_matrix_pivot.columns.isin(already_rated)].tolist()
print(len(not_seen))

9584


In [9]:
from scipy.sparse import csr_matrix

sparse_user_movie_matrix = csr_matrix(user_movie_matrix_pivot.values)
print(sparse_user_movie_matrix)

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 100836 stored elements and shape (610, 9724)>
  Coords	Values
  (0, 0)	4.0
  (0, 2)	4.0
  (0, 5)	4.0
  (0, 43)	5.0
  (0, 46)	5.0
  (0, 62)	3.0
  (0, 89)	5.0
  (0, 97)	4.0
  (0, 124)	5.0
  (0, 130)	5.0
  (0, 136)	5.0
  (0, 184)	5.0
  (0, 190)	3.0
  (0, 197)	5.0
  (0, 201)	4.0
  (0, 224)	5.0
  (0, 257)	3.0
  (0, 275)	3.0
  (0, 291)	5.0
  (0, 307)	4.0
  (0, 314)	4.0
  (0, 320)	5.0
  (0, 325)	4.0
  (0, 367)	3.0
  (0, 384)	4.0
  :	:
  (609, 9238)	5.0
  (609, 9246)	4.5
  (609, 9256)	4.0
  (609, 9268)	5.0
  (609, 9274)	3.5
  (609, 9279)	3.5
  (609, 9282)	3.0
  (609, 9288)	3.0
  (609, 9304)	3.0
  (609, 9307)	2.5
  (609, 9312)	4.5
  (609, 9317)	3.0
  (609, 9324)	3.0
  (609, 9339)	4.0
  (609, 9341)	4.0
  (609, 9348)	3.5
  (609, 9371)	3.5
  (609, 9372)	3.5
  (609, 9374)	5.0
  (609, 9415)	4.0
  (609, 9416)	4.0
  (609, 9443)	5.0
  (609, 9444)	5.0
  (609, 9445)	5.0
  (609, 9485)	3.0


In [10]:
import numpy as np
from scipy.sparse.linalg import svds

# SVD decomposition
latent_features = 20
U, sigma, Vt = svds(sparse_user_movie_matrix, k=latent_features)
sigma = np.diag(sigma)
print('U.shape', U.shape)
print('sigma.shape', sigma.shape)
print('Vt.shape', Vt.shape)

U.shape (610, 20)
sigma.shape (20, 20)
Vt.shape (20, 9724)


In [11]:
# from scipy.linalg import sqrtm

# root_sigma = sqrtm(sigma)
# print('root_sigma.shape', root_sigma.shape)
# U = np.dot(U, root_sigma)
# Vt = np.dot(root_sigma, Vt)

In [12]:
user_movie_matrix_hat = np.dot(np.dot(U, sigma), Vt)

In [13]:
# Undo the pivot done earlier to obtain a dataframe with user_id, movie_id and rating as columns
user_movie_matrix_hat = pd.DataFrame(user_movie_matrix_hat, columns=user_movie_matrix_pivot.columns, index=user_movie_matrix_pivot.index)
print(user_movie_matrix_hat.head())

movieId    1         2         3         4         5         6         7       \
userId                                                                          
1        2.290336  1.460203  1.033507 -0.061334 -0.002275  1.243261  0.029650   
2        0.038570  0.015272  0.016968  0.002944  0.019201 -0.005821 -0.025436   
3       -0.015220  0.049067  0.047202 -0.004936 -0.035349  0.052758 -0.012911   
4        2.238621  0.060011  0.039384  0.066455  0.221806  0.487591  0.318594   
5        1.358363  0.970071  0.340939  0.121053  0.479936  0.628346  0.504583   

movieId    8         9         10      ...    193565    193567    193571  \
userId                                 ...                                 
1        0.056161  0.036220  1.442856  ... -0.008584 -0.007358 -0.009810   
2        0.000918  0.010531 -0.117149  ...  0.010662  0.009139  0.012186   
3        0.010422 -0.002532 -0.014094  ...  0.000029  0.000025  0.000033   
4       -0.057422  0.016371  0.234273  ...  0.002029

In [14]:
print(user_movie_matrix_hat.columns)

Index([     1,      2,      3,      4,      5,      6,      7,      8,      9,
           10,
       ...
       193565, 193567, 193571, 193573, 193579, 193581, 193583, 193585, 193587,
       193609],
      dtype='int64', name='movieId', length=9724)


In [15]:
# get name of thw rows of the dataframe
print(user_movie_matrix_hat.index.name)

userId


In [16]:
user_movie_matrix_hat_unpivot = user_movie_matrix_hat.reset_index().melt(id_vars='userId', var_name='movieId', value_name='rating')

In [17]:
user_movie_matrix_hat_unpivot = user_movie_matrix_hat_unpivot.merge(user_movie_matrix[['userId', 'movieId', 'timestamp', 'title', 'genres']], on=['userId', 'movieId'], how='left')

In [18]:
user_movie_matrix_hat_unpivot = user_movie_matrix_hat_unpivot[['userId', 'movieId', 'rating', 'timestamp', 'title', 'genres']]

In [19]:
print(user_movie_matrix_hat_unpivot.head())

   userId movieId    rating    timestamp             title  \
0       1       1  2.290336  964982703.0  Toy Story (1995)   
1       2       1  0.038570          NaN               NaN   
2       3       1 -0.015220          NaN               NaN   
3       4       1  2.238621          NaN               NaN   
4       5       1  1.358363  847434962.0  Toy Story (1995)   

                                        genres  
0  Adventure|Animation|Children|Comedy|Fantasy  
1                                          NaN  
2                                          NaN  
3                                          NaN  
4  Adventure|Animation|Children|Comedy|Fantasy  


In [12]:
user_hat = user_movie_matrix_hat[utente]

In [23]:
import networkx as nx

user_movie_graph = nx.Graph()

# Add nodes and edges
for _, row in user_movie_matrix_hat_unpivot.iterrows():
    if row["userId"] == utente and pd.isna(row["title"]):
        user_movie_graph.add_node(row["userId"], bipartite=0)
        user_movie_graph.add_node(row["movieId"], bipartite=1)
        user_movie_graph.add_edge(row["userId"], row["movieId"], weight=row["rating"])
        # user_movie_graph.add_edge(row["userId"], row["movieId"], weight=mapping_score[row["rating"]])

In [24]:
print(len(user_movie_graph.nodes()))

9584


In [26]:
# Project the graph using weights
movie_movie_graph = nx.bipartite.weighted_projected_graph(user_movie_graph, {node for node, node_data in user_movie_graph.nodes(data=True) if node_data['bipartite'] == 1})
# Debug print to check the projected graphs
print(f"Nodes in movie_movie_graph: {list(movie_movie_graph.nodes(data=True))[:10]}")
print(f"Edges in movie_movie_graph: {list(movie_movie_graph.edges(data=True))[:10]}")

Nodes in movie_movie_graph: [(1, {'bipartite': 1}), (2, {'bipartite': 1}), (3, {'bipartite': 1}), (4, {'bipartite': 1}), (5, {'bipartite': 1}), (6, {'bipartite': 1}), (7, {'bipartite': 1}), (8, {'bipartite': 1}), (9, {'bipartite': 1}), (32770, {'bipartite': 1})]
Edges in movie_movie_graph: [(1, 2, {'weight': 1}), (1, 3, {'weight': 1}), (1, 4, {'weight': 1}), (1, 5, {'weight': 1}), (1, 6, {'weight': 1}), (1, 7, {'weight': 1}), (1, 8, {'weight': 1}), (1, 9, {'weight': 1}), (1, 10, {'weight': 1}), (1, 11, {'weight': 1})]


In [27]:
def filter_nodes(graph: nx.Graph, node_type: int):
    return [n for n, d in graph.nodes(data=True) if d["bipartite"] == node_type]

In [28]:
def create_preference_vector(user_id: int, user_movie_graph: nx.Graph):
    edges = {m: v for _, m, v in user_movie_graph.edges(user_id, data="weight")}
    print(f"Edges for user {user_id}: {list(edges)[:10]}")  # Debug print
    tot = sum(edges.values())
    print(f"Total for user {user_id}: {tot}")  # Debug print
    if tot > 0:
        return {
            movie: edges.get(movie, 0) / tot
            for movie in filter_nodes(user_movie_graph, 1) # 1 : Movie
        }
    else:
        return {
            movie: 1 for movie in filter_nodes(user_movie_graph, 1)
        }

In [29]:
def predict_user(user_id, user_movie_graph: nx.Graph, movie_movie_graph: nx.Graph):
    p_vec = create_preference_vector(user_id, user_movie_graph)
    print(f"Preference vector for user {user_id}: {list(p_vec)[:10]}")  # Debug print
    already_seen = [movie for movie, p in p_vec.items() if p > 0]
    print(f"Already seen movies for user {user_id}: {list(already_seen)[:10]}")  # Debug print
    if len(already_seen) < 1:
        return []
    item_rank = nx.pagerank(movie_movie_graph, personalization=p_vec, alpha=0.95, weight="weight")
    print(f"Item rank for user {user_id}: {list(item_rank)[:10]}")  # Debug print
    s_t = [
        x for x in sorted(
            movie_movie_graph.nodes(), key=lambda x: item_rank[x] if x in item_rank else 0, reverse=True
            )
        if x not in already_seen
        ]
    
    return s_t

In [None]:
user = 10
s_t = predict_user(user, user_movie_graph, movie_movie_graph)
# convert movieId to movie title
print(f"Predicted movies for user {user}: {s_t[:10]}")

Edges for user 10: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
Total for user 10: 269.96268531187997
Preference vector for user 10: [1, 2, 3, 4, 5, 6, 7, 8, 9, 11]
Already seen movies for user 10: [1, 2, 4, 5, 7, 11, 13, 17, 18, 19]
Item rank for user 10: [1, 2, 3, 4, 5, 6, 7, 8, 9, 32770]


TypeError: 'set' object is not subscriptable

In [39]:
top_ten = s_t[:10]
print(top_ten)

[ 3  6  8  9 10 12 14 15 16 20]


In [None]:
top_ten_titles = [movies[movies['movieId'] == movie].iloc[0]['title'] for movie in top_ten]
print(top_ten_titles)

['GoldenEye (1995)', 'Prison Break: The Final Break (2009)', 'House at the End of the Street (2012)', 'Byzantium (2012)', 'Spiral (2007)', 'Amer (2009)', 'Ménage (Tenue de soirée) (1986)', 'À nous la liberté (Freedom for Us) (1931)', 'Stop! Or My Mom Will Shoot (1992)', 'For the Love of Benji (1977)']
