In [29]:
import pandas as pd
import numpy as np
from tqdm.notebook import tqdm
from scipy.linalg import sqrtm
from sklearn.metrics import mean_squared_error as mse

In [2]:
users = pd.read_csv("../data/interim/users.csv", index_col=0)
movies = pd.read_csv("../data/interim/items.csv", index_col=0)
ratings = pd.read_csv("../data/interim/ratings.csv", index_col=0)

In [3]:
users.shape, movies.shape

((943, 5), (1682, 22))

In [4]:
user2item = np.zeros((users.shape[0], movies.shape[0]))

In [5]:
check = ratings.groupby(["user_id", "item_id"]).count()
check[check["rating"] != 1].shape, check.shape

((0, 1), (100000, 1))

In [6]:
def construct_matrix(ratings, user2item):
    for rating in ratings.iterrows():
        user_id = rating[1]["user_id"]
        item_id = rating[1]["item_id"]
        r = rating[1]["rating"]
        user2item[user_id - 1][item_id - 1] = r
    return user2item

In [7]:
user2item = construct_matrix(ratings, user2item)

In [8]:
zeros = 0
for i in range(user2item.shape[0]):
    for j in range(user2item.shape[1]):
        if user2item[i][j] == 0:
            zeros += 1

In [9]:
assert zeros + ratings.shape[0] == users.shape[0] * movies.shape[0]

In [10]:
np.save("../data/interim/user2item.npy", user2item)

In [11]:
print(
    f"{zeros/(users.shape[0] * movies.shape[0]) * 100:.2f}% of elements in the matrix are empty"
)

93.70% of elements in the matrix are empty


In [12]:
from sklearn.model_selection import train_test_split

user2item_train = np.zeros((users.shape[0], movies.shape[0]))
user2item_test = np.zeros((users.shape[0], movies.shape[0]))

r_train, r_test = train_test_split(ratings, test_size=0.2)
user2item_train = construct_matrix(r_train, user2item_train)
user2item_test = construct_matrix(r_test, user2item_test)

np.save("../data/interim/user2item_train.npy", user2item_train)
np.save("../data/interim/user2item_test.npy", user2item_test)

In [187]:
def svd(train, k):
    # Convert input 'train' into a numpy array
    util_mat = np.array(train)

    # Create a mask for missing values in 'util_mat'
    mask = np.isnan(util_mat)

    # Mask the array using the created mask
    masked_arr = np.ma.masked_array(util_mat, mask)

    # Calculate mean of each column (item) in the masked array
    item_means = np.mean(masked_arr, axis=0)

    # Fill masked values in 'util_mat' with the respective column means
    util_mat = masked_arr.filled(item_means)

    # Subtract column means from each element in the matrix
    x = np.tile(item_means, (util_mat.shape[0], 1))
    util_mat = util_mat - x

    # Perform Singular Value Decomposition (SVD) on the modified utility matrix
    U, s, V = np.linalg.svd(util_mat, full_matrices=False)

    # Convert s to a diagonal matrix and keep only the top 'k' singular values
    s = np.diag(s)
    s = s[0:k, 0:k]

    # Reduce dimensions of U and V matrices to 'k' dimensions
    U = U[:, 0:k]
    V = V[0:k, :]

    # Calculate square root of the diagonal matrix 's'
    s_root = sqrtm(s)

    # Calculate Usk and skV using the reduced matrices
    Usk = np.dot(U, s_root)
    skV = np.dot(s_root, V)

    # Compute the product of Usk and skV to approximate the original matrix
    UsV = np.dot(Usk, skV)

    # Add back the mean values to the reconstructed matrix
    UsV = UsV + x

    return UsV  # Return the reconstructed matrix

In [185]:
def create_utility_matrix(data):
    userList = data.iloc[:, 0].tolist()
    itemList = data.iloc[:, 1].tolist()
    valueList = data.iloc[:, 2].tolist()

    users = data["user_id"].unique()
    items = data["item_id"].unique()

    users_index = {users[i]: i for i in range(len(users))}
    pd_dict = {item: [np.nan for i in range(len(users))] for item in items}

    for i in range(0, len(data)):
        pd_dict[itemList[i]][users_index[userList[i]]] = valueList[i]
    X = pd.DataFrame(pd_dict)
    X.index = users

    itemcols = list(X.columns)
    items_index = {itemcols[i]: i for i in range(len(itemcols))}
    return X, users_index, items_index

In [123]:
t = r_test.copy()
true_movies_test = t.groupby('user_id')['item_id'].apply(list).reset_index()

In [189]:
no_of_features = [8, 10, 12, 14, 17, 943]
util_mat, users_index, items_index = create_utility_matrix(r_train)
best_matrix = None
K = 100
best_rmse = np.Inf
for f in tqdm(no_of_features):
    svdout = svd(util_mat, k=f)
    pred = []
    for _, row in r_test.iterrows():
        user = row["user_id"]
        item = row["item_id"]
        u_index = users_index[user]

        if item in items_index:
            i_index = items_index[item]
            pred_rating = svdout[u_index, i_index]
        else:
            pred_rating = np.mean(svdout[u_index, :])
        pred.append(pred_rating)
    pred = np.array(pred)
    rmse = np.sqrt(mse(r_test.rating, pred))

    intersection = 0
    for row in true_movies_test.iterrows():
        user_id = row[1]['user_id']
        user_scores = svdout[u_index]
        true = row[1]['item_id']
        pred = user_scores.argsort()[-K:]
        true_set = set(true)
        pred_set = set(pred)
        intersection += len(true_set.intersection(pred_set))
    mean_intersection = intersection / (len(true_movies_test))
    print(f"intersections - {mean_intersection}")
    print(f"rmse - {rmse}")
    print('-'*20)
    if rmse < best_rmse:
        best_rmse = rmse
        best_matrix = svdout

  0%|          | 0/6 [00:00<?, ?it/s]

intersections - 1.6945917285259808
rmse - 0.9804504585904057
--------------------
intersections - 1.6712619300106044
rmse - 0.9799286549793254
--------------------
intersections - 1.623541887592789
rmse - 0.9794069932066453
--------------------
intersections - 1.7327677624602333
rmse - 0.9798167810803191
--------------------
intersections - 1.6595970307529162
rmse - 0.9817691523893289
--------------------
intersections - 1.6288441145281018
rmse - 1.026158342901167
--------------------


In [182]:
n = 91
user_id = true_movies_test.iloc[n]['user_id']
user_scores = best_matrix[u_index]
true = true_movies_test.iloc[n]['item_id']
pred = user_scores.argsort()[-K:]
true_set = set(true)
pred_set = set(pred)
intersection = len(true_set.intersection(pred_set))
intersection

6