In [2]:
import pandas as pd
import numpy as np
from scipy import sparse

In [3]:
# Load the ratings CSV into a DataFrame; later cells read its "movieId",
# "userId" and "rating" columns.
ratings = pd.read_csv("./datasets/custom-ml/ratings_10000.csv")

In [4]:
# Read the movies table and give the unnamed leading CSV column an explicit
# name, "index", so it can be selected as a regular column further down.
movies_10000_full = (
    pd.read_csv("./datasets/custom-ml/movies_10000_full.csv")
    .rename(columns={"Unnamed: 0": "index"})
)

In [5]:
# Attach each rating's movie "index" value (looked up through movieId), expose
# it as "movieIndex", and drop the movieId column that was only needed as the
# join key.
ratings_with_movies_index = (
    ratings
    .merge(movies_10000_full[["index", "movieId"]], on="movieId", how="left")
    .rename(columns={"index": "movieIndex"})
    .drop(columns="movieId")
)

In [6]:
# Extract the three parallel arrays used to build the sparse matrix:
# rating values, user ids (used as columns) and movie indices (used as rows).
data, column_indices, row_indices = (
    ratings_with_movies_index[column_name].values
    for column_name in ("rating", "userId", "movieIndex")
)

In [9]:
# One row per entry in the movies table.
n_movies = movies_10000_full.shape[0]

# userId values are used directly as column positions, so the matrix has to be
# wide enough for the largest id. Counting distinct ids only yields that width
# when the ids are exactly 0..n-1; with 1-based or gapped ids the count is too
# small and csr_matrix rejects the out-of-range column index. Sizing from the
# maximum id gives the same width whenever the distinct-count approach worked.
n_users = int(column_indices.max()) + 1 if len(column_indices) else 0

# Movies x users rating matrix built from (value, (row, column)) triplets.
sparse_from_ratings = sparse.csr_matrix(
    (data, (row_indices, column_indices)),
    shape=(n_movies, n_users),
)

In [12]:
# Report how many entries the sparse rating matrix stores.
print("# of ratings:",sparse_from_ratings.nnz)

# of ratings: 32765490


In [13]:
# Keep every row (movie) but only the first 5000 columns (users) of the matrix.
sparse_ratings_5000_users = sparse_from_ratings[:,:5000]

In [16]:
# Pull the three raw CSR arrays out of the sliced matrix.
data_5000_users = sparse_ratings_5000_users.data
column_indices_5000_users = sparse_ratings_5000_users.indices
indptr_5000_users = sparse_ratings_5000_users.indptr

# Expand the compressed row pointer into one explicit row index per stored
# value: row i owns indptr[i + 1] - indptr[i] consecutive entries, so repeating
# each row number by that count produces the per-entry row array without a
# Python-level loop.
row_indices_5000_users = np.repeat(
    np.arange(len(indptr_5000_users) - 1, dtype=int),
    np.diff(indptr_5000_users),
)

print("Data:",data_5000_users)
print("Column indices:",column_indices_5000_users)
print("Row indices:",row_indices_5000_users)

Data: [5. 5. 5. ... 2. 3. 4.]
Column indices: [   1    2    3 ... 4885 4195 4242]
Row indices: [   0    0    0 ... 9997 9999 9999]


In [17]:
# Round-trip check: rebuild a CSR matrix from the extracted value / row /
# column arrays so it can be compared with the slice they were taken from.
sparse_ratings_5000_users_rebuilt = sparse.csr_matrix(
    (data_5000_users, (row_indices_5000_users, column_indices_5000_users)),
    shape=sparse_ratings_5000_users.shape,
)

In [18]:
# Element-wise comparison of the slice and its rebuilt copy; the cell displays
# True when the comparison result has no stored entries.
(sparse_ratings_5000_users != sparse_ratings_5000_users_rebuilt).nnz == 0

True

### Insert into MongoDB

In [30]:
from pymongo import MongoClient

# Connect to the MongoDB server on localhost and select the database and the
# collection that will receive the rating values.
client = MongoClient("mongodb://localhost:27017/")
db = client.get_database("cinematch")
collection_ratings_data = db.get_collection("ratings_data")

In [31]:
# Parent document for this ratings slice: the matrix dimensions plus the flat
# array of stored rating values. insert_one returns a result object, so despite
# its name `ratings_data_id` holds that result; the generated document id is
# `ratings_data_id.inserted_id`.
# NOTE(review): all values go into a single document, and MongoDB limits a
# BSON document to 16 MiB - confirm larger user slices still fit.
ratings_data_id = collection_ratings_data.insert_one({
    # Take the user count from the slice itself instead of repeating the
    # literal 5000, so the stored width stays correct if fewer users exist.
    "n_users": int(sparse_ratings_5000_users.shape[1]),
    "n_movies": n_movies,
    "data": data_5000_users.tolist(),
})

In [33]:
# Display the id of the document created by the insert_one call above.
ratings_data_id.inserted_id

ObjectId('66319b299c06afb0e54e82b4')

In [34]:
# The column and row index arrays each go into their own collection; every
# document carries "ratings_data_id" so it can be matched with the document
# holding the rating values.
collection_column_indices = db.get_collection("ratings_column_indices")
collection_row_indices = db.get_collection("ratings_row_indices")

collection_column_indices.insert_one(
    {
        "ratings_data_id": ratings_data_id.inserted_id,
        "data": column_indices_5000_users.tolist(),
    }
)

# Kept as the final expression of the cell so its insert result is displayed.
collection_row_indices.insert_one(
    {
        "ratings_data_id": ratings_data_id.inserted_id,
        "data": row_indices_5000_users.tolist(),
    }
)

InsertOneResult(ObjectId('66319c069c06afb0e54e82b6'), acknowledged=True)