In [1]:
import numpy as np
import pandas as pd
from tqdm import tqdm
from dataPreprocessor import DataPreprocessor
%load_ext autoreload
%autoreload 2

In [2]:
# Read the training CSV and hand it to the project's DataPreprocessor.
train = pd.read_csv("data/train.csv")
data = DataPreprocessor(train)
# Matrix exposed by the preprocessor; presumably rows are users and columns are
# items (going by the attribute name) — TODO confirm against DataPreprocessor.
user_item_matrix = data.user_item_matrix
# Transpose of the matrix above, so the two axes are swapped.
item_user_matrix = user_item_matrix.T

In [3]:
# Similarity matrix produced by the preprocessor's fast_cosine_similarity helper
# with kind='item'; the shape printout recorded below reports a square
# (15712, 15712) ndarray, i.e. one row/column per column of user_item_matrix.
item_similarity = data.fast_cosine_similarity(user_item_matrix, kind='item')

In [4]:
# Sanity check: report the shape and type of user_item_matrix, item_user_matrix,
# and item_similarity before they are used for prediction.
print(f"user_item_matrix shape: {user_item_matrix.shape}, type: {type(user_item_matrix)}")
print(f"item_user_matrix shape: {item_user_matrix.shape}, type: {type(item_user_matrix)}")
print(f"item_similarity shape: {item_similarity.shape}, type: {type(item_similarity)}")

user_item_matrix shape: (18905, 15712), type: <class 'numpy.ndarray'>
item_user_matrix shape: (15712, 18905), type: <class 'numpy.ndarray'>
item_similarity shape: (15712, 15712), type: <class 'numpy.ndarray'>


In [5]:
def predict_with_matrix(ratings, similarity):
    """Predict a score for every (user, item) pair as a similarity-weighted average.

    For row ``u`` of ``ratings`` and column ``i`` of ``similarity`` the result is::

        sum_j ratings[u, j] * similarity[j, i]
        --------------------------------------
        sum_{j : ratings[u, j] > 0} |similarity[j, i]|

    Entries of ``ratings`` that are ``> 0`` count as "rated"; everything else
    contributes nothing to the denominator. Where the denominator is exactly
    zero it is replaced by 1, so the raw numerator is returned unchanged
    instead of dividing by zero.

    Parameters
    ----------
    ratings : array, shape (n_users, n_items)
        Rating matrix; values ``> 0`` are treated as observed ratings.
    similarity : array, shape (n_items, n_items)
        Item-item similarity matrix.

    Returns
    -------
    array, shape (n_users, n_items)
        Predicted scores. Neither input is modified.
    """
    # 1.0 where the user has rated the item, 0.0 elsewhere.
    has_rating = (ratings > 0).astype(np.float64)

    # Numerator: each rating weighted by the similarity to the target item.
    numerator = ratings.dot(similarity)

    # Denominator: total absolute similarity over the items the user rated.
    denominator = has_rating.dot(np.abs(similarity))

    # A zero denominator would blow up the division; dividing by 1 instead
    # passes the numerator through unchanged.
    denominator[denominator == 0] = 1

    return numerator / denominator


# Score every (user, item) pair using the item-item similarity matrix and show
# the (NumPy-truncated) result.
# NOTE(review): in the recorded output every displayed value within a row is
# identical. That pattern would arise if item_similarity were (near-)constant or
# dominated by a smoothing term — TODO confirm the similarity matrix before
# trusting these predictions.
item_prediction = predict_with_matrix(user_item_matrix, item_similarity)
print(item_prediction)

[[1.35056789 1.35056789 1.35056789 ... 1.35056789 1.35056789 1.35056789]
 [1.8529818  1.8529818  1.8529818  ... 1.8529818  1.8529818  1.8529818 ]
 [1.51893664 1.51893664 1.51893664 ... 1.51893664 1.51893664 1.51893664]
 ...
 [4.         4.         4.         ... 4.         4.         4.        ]
 [2.         2.         2.         ... 2.         2.         2.        ]
 [5.         5.         5.         ... 5.         5.         5.        ]]
