In [10]:
import numpy as np
import pandas as pd
import scipy.sparse as sp
from typing import Optional, Tuple, Union

In [2]:
ITEM_SHRINK_TERM = 5
K = 10

In [28]:
item_feature_df = pd.read_csv('data_ICM_metadata.csv')
target_users_df = pd.read_csv('data_target_users_test.csv')
train_df = pd.read_csv('data_train.csv')

train_df

Unnamed: 0,user_id,item_id,data
0,0,0,1.0
1,0,2,1.0
2,0,120,1.0
3,0,128,1.0
4,0,211,1.0
...,...,...,...
1764602,35735,37802,1.0
1764603,35735,37803,1.0
1764604,35735,37805,1.0
1764605,35735,38000,1.0


In [30]:
def preprocess_data(users: pd.DataFrame):
    unique_users = users.user_id.unique()
    unique_items = users.item_id.unique()

    num_users, min_user_id, max_user_id = (
        unique_users.size,
        unique_users.min(),
        unique_users.max(),
    )
    num_items, min_item_id, max_item_id = (
        unique_items.size,
        unique_items.min(),
        unique_items.max(),
    )

    print(num_users, min_user_id, max_user_id)
    print(num_items, min_item_id, max_item_id)

    mapping_user_id = pd.DataFrame(
        {"mapped_user_id": np.arange(num_users), "user_id": unique_users}
    )
    mapping_item_id = pd.DataFrame(
        {"mapped_item_id": np.arange(num_items), "item_id": unique_items}
    )

    users = pd.merge(left=users, right=mapping_user_id, how="inner", on="user_id")

    users = pd.merge(left=users, right=mapping_item_id, how="inner", on="item_id")

    return users

users = preprocess_data(train_df)
users

35736 0 35735
38121 0 38120


Unnamed: 0,user_id,item_id,data,mapped_user_id,mapped_item_id
0,0,0,1.0,0,0
1,5,0,1.0,5,0
2,6,0,1.0,6,0
3,7,0,1.0,7,0
4,9,0,1.0,9,0
...,...,...,...,...,...
1764602,34967,37744,1.0,34967,38120
1764603,35040,37744,1.0,35040,38120
1764604,35308,37744,1.0,35308,38120
1764605,35431,37744,1.0,35431,38120


In [8]:

# Convert into sparse matrix items x features
item_idx, item_map = pd.factorize(item_feature_df["item_id"])
feature_idx, feature_map = pd.factorize(item_feature_df["feature_id"])

items_sparse_matrix = sp.csr_matrix((item_feature_df['data'], (item_idx, feature_idx)), 
                           shape=(len(item_map), len(feature_map)))
items_sparse_matrix

<38121x94331 sparse matrix of type '<class 'numpy.float64'>'
	with 2940040 stored elements in Compressed Sparse Row format>

In [32]:
from sklearn.model_selection import train_test_split


def dataset_splits(
    users,
    num_users,
    num_items,
    validation_percentage: float,
    testing_percentage: float,
):
    seed = 1234

    (
        user_ids_training,
        user_ids_test,
        item_ids_training,
        item_ids_test,
        data_training,
        data_test,
    ) = train_test_split(
        users.mapped_user_id,
        users.mapped_item_id,
        users.data,
        test_size=testing_percentage,
        shuffle=True,
        random_state=seed,
    )

    (
        user_ids_training,
        user_ids_validation,
        item_ids_training,
        item_ids_validation,
        data_training,
        data_validation,
    ) = train_test_split(
        user_ids_training,
        item_ids_training,
        data_training,
        test_size=validation_percentage,
    )

    urm_train = sp.csr_matrix(
        (data_training, (user_ids_training, item_ids_training)),
        shape=(num_users, num_items),
    )

    urm_validation = sp.csr_matrix(
        (data_validation, (user_ids_validation, item_ids_validation)),
        shape=(num_users, num_items),
    )

    urm_test = sp.csr_matrix(
        (data_test, (user_ids_test, item_ids_test)), shape=(num_users, num_items)
    )

    return urm_train, urm_validation, urm_test

urm_train, urm_validation, urm_test = dataset_splits(
    users,
    num_users=35736,
    num_items=38121,
    validation_percentage=0.10,
    testing_percentage=0.20,
)
urm_train

<35736x38121 sparse matrix of type '<class 'numpy.float64'>'
	with 1270516 stored elements in Compressed Sparse Row format>

In [33]:
def vector_similarity(urm: sp.csc_matrix, shrink: int):
    item_weights = np.sqrt(np.sum(urm.power(2), axis=0)).A.flatten()

    num_items = urm.shape[1]
    urm_t = urm.T
    weights = np.empty(shape=(num_items, num_items))
    for item_id in range(num_items):
        numerator = urm_t.dot(urm[:, item_id]).A.flatten()
        denominator = item_weights[item_id] * item_weights + shrink + 1e-6

        weights[item_id] = numerator / denominator

    np.fill_diagonal(weights, 0.0)
    return weights


def matrix_similarity(urm: sp.csc_matrix, shrink: int):
    item_weights = np.sqrt(np.sum(urm.power(2), axis=0)).A

    numerator = urm.T.dot(urm)
    denominator = item_weights.T.dot(item_weights) + shrink + 1e-6
    weights = numerator / denominator
    np.fill_diagonal(weights, 0.0)

    return weights

In [41]:
class CFItemKNN(object):
    def __init__(self, shrink: int):
        self.shrink = shrink
        self.weights = None

    def fit(self, urm_train: sp.csc_matrix, similarity_function):
        if not sp.isspmatrix_csc(urm_train):
            raise TypeError(f"We expected a CSC matrix, we got {type(urm_train)}")

        self.weights = similarity_function(urm_train, self.shrink)

    def recommend(
        self,
        user_id: int,
        urm_train: sp.csr_matrix,
        at: Optional[int] = None,
        remove_seen: bool = True,
    ):
        user_profile = urm_train[user_id]

        ranking = user_profile.dot(self.weights).A.flatten()

        if remove_seen:
            user_profile_start = urm_train.indptr[user_id]
            user_profile_end = urm_train.indptr[user_id + 1]

            seen_items = urm_train.indices[user_profile_start:user_profile_end]

            ranking[seen_items] = -np.inf

        ranking = np.flip(np.argsort(ranking))
        return ranking[:at]

itemknn_recommender = CFItemKNN(shrink=50)
itemknn_recommender

<__main__.CFItemKNN at 0x25717c5d2b0>

In [42]:
%%time

itemknn_recommender.fit(urm_train.tocsc()[:1000,:1000], matrix_similarity)

CPU times: total: 31.2 ms
Wall time: 28.1 ms


In [43]:
for user_id in range(10):
    print(
        itemknn_recommender.recommend(
            user_id=user_id, urm_train=urm_train, at=10, remove_seen=True
        )
    )

ValueError: dimension mismatch