In [1]:
import numpy as np
import pandas as pd
from scipy import sparse

In [2]:
# Read the two raw CSV tables (movie metadata and individual ratings).
movies_csv_path = "./datasets/ml-latest/movies.csv"
ratings_csv_path = "./datasets/ml-latest/ratings.csv"

movies = pd.read_csv(movies_csv_path)
ratings = pd.read_csv(ratings_csv_path)

In [3]:
# Per-movie rating statistics: number of ratings and mean rating.
aggregates = (
    ratings.groupby("movieId")
    .agg(
        ratings=pd.NamedAgg(column="rating", aggfunc="size"),
        mean_rating=pd.NamedAgg(column="rating", aggfunc="mean"),
    )
    .reset_index()
)

# Attach the statistics to the movie table.
# - Dropping any previously merged statistic columns first keeps this cell
#   idempotent: without it, a second run would produce `ratings_x`/`ratings_y`
#   suffixed columns and the later sort on "ratings" would fail.
# - The left join keeps movies without any rating; their statistics come back
#   as NaN and are replaced by 0 in a single, non-in-place fill.
movies = (
    movies.drop(columns=["ratings", "mean_rating"], errors="ignore")
    .merge(aggregates, on="movieId", how="left")
    .fillna({"ratings": 0, "mean_rating": 0})
)

In [4]:
# Display the movie table with the merged `ratings` and `mean_rating` columns.
movies

Unnamed: 0,movieId,title,genres,ratings,mean_rating
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,76813.0,3.893508
1,2,Jumanji (1995),Adventure|Children|Fantasy,30209.0,3.278179
2,3,Grumpier Old Men (1995),Comedy|Romance,15820.0,3.171271
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,3028.0,2.868395
4,5,Father of the Bride Part II (1995),Comedy,15801.0,3.076957
...,...,...,...,...,...
86532,288967,State of Siege: Temple Attack (2021),Action|Drama,1.0,3.500000
86533,288971,Ouija Japan (2021),Action|Horror,1.0,0.500000
86534,288975,The Men Who Made the Movies: Howard Hawks (1973),Documentary,1.0,4.000000
86535,288977,Skinford: Death Sentence (2023),Crime|Thriller,1.0,3.000000


In [5]:
# Order movies from most-rated to least-rated.
movies_sorted = movies.sort_values("ratings", ascending=False)
movies_sorted

Unnamed: 0,movieId,title,genres,ratings,mean_rating
314,318,"Shawshank Redemption, The (1994)",Crime|Drama,122296.0,4.416792
351,356,Forrest Gump (1994),Comedy|Drama|Romance|War,113581.0,4.068189
292,296,Pulp Fiction (1994),Comedy|Crime|Drama|Thriller,108756.0,4.191778
2480,2571,"Matrix, The (1999)",Action|Sci-Fi|Thriller,107056.0,4.160631
585,593,"Silence of the Lambs, The (1991)",Crime|Horror|Thriller,101802.0,4.150287
...,...,...,...,...,...
33769,143291,Seven from Thebes (1964),Adventure,0.0,0.000000
74772,244912,A Rather English Marriage (1998),Comedy|Drama|Romance,0.0,0.000000
33739,143225,The Man Who Wanted to Live Forever (1970),Thriller,0.0,0.000000
33737,143221,Find the Lady (1976),(no genres listed),0.0,0.000000


In [6]:
# Keep the 10,000 most-rated movies and renumber the rows from 0.
movies_10000 = movies_sorted.head(10000).reset_index(drop=True)
movies_10000

Unnamed: 0,movieId,title,genres,ratings,mean_rating
0,318,"Shawshank Redemption, The (1994)",Crime|Drama,122296.0,4.416792
1,356,Forrest Gump (1994),Comedy|Drama|Romance|War,113581.0,4.068189
2,296,Pulp Fiction (1994),Comedy|Crime|Drama|Thriller,108756.0,4.191778
3,2571,"Matrix, The (1999)",Action|Sci-Fi|Thriller,107056.0,4.160631
4,593,"Silence of the Lambs, The (1991)",Crime|Horror|Thriller,101802.0,4.150287
...,...,...,...,...,...
9995,88248,Quarantine 2: Terminal (2011),Horror|Mystery|Sci-Fi,170.0,2.867647
9996,88746,Terri (2011),Comedy,170.0,3.285294
9997,5832,Left Behind II: Tribulation Force (2002),Drama,170.0,2.241176
9998,100882,Journey to the West: Conquering the Demons (Da...,Adventure|Comedy|Fantasy|Romance|IMAX,170.0,3.576471


In [8]:
# Movie ids of the selected movies as a NumPy array (same order as movies_10000).
movies_ids_10000 = movies_10000["movieId"].to_numpy()
movies_ids_10000

array([   318,    356,    296, ...,   5832, 100882,  70992])

In [9]:
# Keep only the ratings that belong to the selected movies.
# The explicit .copy() makes ratings_10000 an independent frame, so the later
# column assignment on it does not raise SettingWithCopyWarning.
selected_movie_mask = ratings["movieId"].isin(movies_10000["movieId"])
ratings_10000 = ratings[selected_movie_mask].copy()

In [26]:
# Re-index user ids to consecutive integers 0..n_users-1, numbered in order of
# first appearance. pd.factorize does this in one vectorized pass and returns
# both the per-row codes and the unique original ids (in the same order that
# Series.unique() would give).
user_codes, users_ids_unique = pd.factorize(ratings_10000["userId"].to_numpy())

# Lookup table original id -> new id. It lives under its own name: previously
# the dict was bound to `new_users_ids` and then overwritten by the result
# array, which left update_user_id unusable after this cell had run.
user_id_to_index = {
    original_id: new_id for new_id, original_id in enumerate(users_ids_unique)
}


def update_user_id(x):
    """Return the re-indexed id for user id `x`, or `x` itself if it is unknown."""
    return user_id_to_index.get(x, x)


# Element-wise wrapper kept for callers that map arbitrary arrays of ids.
vectorized_update_user_id = np.vectorize(update_user_id)

# One new id per row of ratings_10000, aligned with its row order.
new_users_ids = user_codes

In [27]:
# Replace the userId column with the re-indexed ids.
# assign() returns a new frame instead of writing through .loc into a frame
# that was produced by filtering `ratings`, which avoids SettingWithCopyWarning.
ratings_10000 = ratings_10000.assign(userId=new_users_ids)

In [28]:
# Number of distinct users remaining after filtering.
ratings_10000["userId"].nunique()

330593

In [None]:
# Build the (movie x user) ratings matrix directly in sparse CSR form.
#
# Row i corresponds to movies_ids[i] (the order of movies_10000); column j is
# the userId value j, so userId is expected to already be re-indexed to
# 0..n_users-1. Unrated cells are implicit zeros.
#
# A dense array of this shape (n_movies x n_users float64) needs tens of GB and
# was only converted to CSR afterwards; filling it also required one full scan
# of ratings_10000 per movie. Constructing the CSR matrix from coordinate
# triplets needs memory proportional to the number of ratings and one pass.
movies_ids = movies_10000["movieId"].unique()

n_users = ratings_10000["userId"].nunique()
# Derived from movies_ids so the row count always matches the row mapping below.
n_movies = len(movies_ids)

# The CSR constructor sums duplicate (row, col) entries; keep only the last
# rating per (movie, user) pair to preserve overwrite semantics.
ratings_unique = ratings_10000.drop_duplicates(
    subset=["movieId", "userId"], keep="last"
)

# Row position of every rating's movie; -1 marks a movie that is not in movies_ids.
row_idx = pd.Index(movies_ids).get_indexer(ratings_unique["movieId"])
col_idx = ratings_unique["userId"].to_numpy()
rating_values = ratings_unique["rating"].to_numpy(dtype=np.float64)

known_movie = row_idx >= 0

ratings_matrix = sparse.csr_matrix(
    (rating_values[known_movie], (row_idx[known_movie], col_idx[known_movie])),
    shape=(n_movies, n_users),
)
# Normalise the storage: no explicitly stored zeros, column indices sorted per row.
ratings_matrix.eliminate_zeros()
ratings_matrix.sort_indices()

In [32]:
# Sanity check: the matrix shape should be (n_movies, n_users).
ratings_matrix.shape

(10000, 330593)

In [33]:
from scipy import sparse

In [34]:
# Convert the ratings matrix to SciPy's CSR (compressed sparse row) format.
sparse_ratings_matrix = sparse.csr_matrix(ratings_matrix)

In [36]:
# Persist the sparse ratings matrix as an .npz file.
sparse_matrix_path = "./datasets/custom-ml/sparse_ratings_matrix_10000.npz"
sparse.save_npz(sparse_matrix_path, sparse_ratings_matrix)

In [37]:
# Preview the first rows of the selected-movies table before exporting it.
movies_10000.head()

Unnamed: 0,movieId,title,genres,ratings,mean_rating
0,318,"Shawshank Redemption, The (1994)",Crime|Drama,122296.0,4.416792
1,356,Forrest Gump (1994),Comedy|Drama|Romance|War,113581.0,4.068189
2,296,Pulp Fiction (1994),Comedy|Crime|Drama|Thriller,108756.0,4.191778
3,2571,"Matrix, The (1999)",Action|Sci-Fi|Thriller,107056.0,4.160631
4,593,"Silence of the Lambs, The (1991)",Crime|Horror|Thriller,101802.0,4.150287


In [38]:
# Preview the first rows of the filtered ratings table (userId already re-indexed).
ratings_10000.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,0,1,4.0,1225734739
1,0,110,4.0,1225865086
2,0,158,4.0,1225733503
3,0,260,4.5,1225735204
4,0,356,5.0,1225735119


In [39]:
# Export the selected movies without the DataFrame index column.
movies_csv_out = "./datasets/custom-ml/movies_10000.csv"
movies_10000.to_csv(movies_csv_out, index=False)

In [45]:
# The rating timestamp is not needed in the exported file.
ratings_10000 = ratings_10000.drop(columns=["timestamp"])

In [46]:
# Export the filtered, re-indexed ratings without the DataFrame index column.
ratings_csv_out = "./datasets/custom-ml/ratings_10000.csv"
ratings_10000.to_csv(ratings_csv_out, index=False)