In [1]:
import sys
import pandas as pd 
import numpy as np 
import cornac
from cornac.models import UserKNN, ItemKNN
from cornac.eval_methods import RatioSplit
from cornac.utils import cache

FM model is only supported on Linux.
Windows executable can be found at http://www.libfm.org.


In [2]:
# Notebook-wide switches: SEED fixes the train/test split, VERBOSE turns on
# cornac's progress/log output for the split and the models.
SEED, VERBOSE = 42, True

# Record the environment the results below were produced with.
print(
    f"System version: {sys.version}",
    f"Cornac version: {cornac.__version__}",
    sep="\n",
)

System version: 3.7.11 (default, Jul 27 2021, 07:03:16) 
[Clang 10.0.0 ]
Cornac version: 1.14.1


In [3]:
# The ratings file has no header row and uses '::' as delimiter, so the column
# names are supplied at read time.
RATINGS_COLUMNS = ['UserId', 'MovieId', 'Rating', 'Timestamp']
sample_df = pd.read_csv(
    '../data/datasets/ml-1m/ratings.dat',
    sep='::',
    header=None,
    names=RATINGS_COLUMNS,
    engine='python',  # multi-character separators are handled by the Python parser
)
sample_df.head()

Unnamed: 0,UserId,MovieId,Rating,Timestamp
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291


In [4]:
def rec2mat(df, replace_zeros='skip', add_meanR_col=False):
    """Turn rating records into a dense user x item rating table.

    Parameters
    ----------
    df : pandas.DataFrame
        Rating records; its rows are handed positionally to
        ``cornac.data.Dataset.from_uir`` (presumably user id, item id,
        rating -- confirm against the cornac documentation).
    replace_zeros : scalar or 'skip', default 'skip'
        Value written into cells that are 0 in the dense matrix (i.e. pairs
        without a stored rating). ``'skip'`` leaves the zeros in place.
    add_meanR_col : bool, default False
        When True, append a ``"Mean Rating"`` column holding each user's
        average over the cells that actually carry a rating.

    Returns
    -------
    pandas.DataFrame
        One row per user and one column per item. Labels ("User 1",
        "Item 1", ...) are 1-based positions in the cornac dataset, not the
        raw ids found in ``df``.
    """
    # from_uir only reads the tuples, so no defensive copy of ``df`` is needed.
    dataset = cornac.data.Dataset.from_uir(df.itertuples(index=False))

    # ``toarray()`` is the documented way to densify a SciPy sparse matrix
    # (the ``.A`` shorthand is deprecated in recent SciPy releases).
    mat = pd.DataFrame(
        dataset.matrix.toarray(),
        index=[f"User {u + 1}" for u in range(dataset.num_users)],
        columns=[f"Item {i + 1}" for i in range(dataset.num_items)],
    )

    unrated = mat == 0

    # Average only over rated cells. Taking the mean after (or without) the
    # placeholder substitution would drag it towards 0 / the placeholder.
    mean_rating = mat.mask(unrated).mean(axis=1) if add_meanR_col else None

    if replace_zeros != 'skip':
        mat = mat.mask(unrated, replace_zeros)

    if add_meanR_col:
        mat["Mean Rating"] = mean_rating

    return mat


In [5]:
# Keep only the (user, item, rating) triple; the timestamp is not used below.
uir_columns = ['UserId', 'MovieId', 'Rating']
test_df = sample_df.loc[:, uir_columns]

In [6]:
# Dense user x item table with NaN for unrated cells plus a per-user mean column.
rating_mat = rec2mat(
    df=test_df,
    replace_zeros=np.nan,
    add_meanR_col=True,
)

In [7]:
def rec2MCmat(df, replace_zeros='skip'):
    """Build a mean-centred dense user x item rating table.

    Every stored rating has the mean of that user's ratings subtracted from
    it; user/item pairs without a rating are NaN unless ``replace_zeros``
    says otherwise.

    Parameters
    ----------
    df : pandas.DataFrame
        Rating records; its rows are handed positionally to
        ``cornac.data.Dataset.from_uir`` (presumably user id, item id,
        rating -- confirm against the cornac documentation).
    replace_zeros : scalar or 'skip', default 'skip'
        Value written into the unrated cells (those that are 0 in the dense
        matrix before centring). ``'skip'`` leaves them as NaN.

    Returns
    -------
    pandas.DataFrame
        One row per user and one column per item, labelled "User 1",
        "Item 1", ... by 1-based dataset position. No helper columns are
        included.
    """
    dataset = cornac.data.Dataset.from_uir(df.itertuples(index=False))
    mat = pd.DataFrame(
        dataset.matrix.toarray(),
        index=[f"User {u + 1}" for u in range(dataset.num_users)],
        columns=[f"Item {i + 1}" for i in range(dataset.num_items)],
    )

    # Remember which cells carry no rating *before* centring: afterwards a 0
    # means "rated exactly at the user's mean", which must not be overwritten.
    unrated = mat == 0
    observed = mat.mask(unrated)

    # Row-wise centring; NaN cells are ignored by the mean and stay NaN.
    centered = observed.sub(observed.mean(axis=1), axis=0)

    if replace_zeros != 'skip':
        centered = centered.mask(unrated, replace_zeros)

    return centered


In [8]:
# Mean-centred counterpart of the rating table (default handling of unrated cells).
norm_rat_mat = rec2MCmat(df=test_df)

In [21]:
class UserBasedKNN(object):
    """Convenience wrapper around cornac's ``UserKNN`` recommender.

    Holds the rating records in the list form handed to ``RatioSplit``,
    builds the model, and offers ``train`` / ``predict`` helpers.

    Parameters
    ----------
    data : pandas.DataFrame
        Rating records whose first two columns are the user and item ids and
        whose remaining column(s) are passed through unchanged. The frame is
        not modified.
    k : int, default 50
        Forwarded to ``UserKNN`` as ``k``.
    similarity : str, default 'cosine'
        Forwarded to ``UserKNN`` as ``similarity``.
    weighting : str, default 'skip'
        Forwarded to ``UserKNN`` as ``weighting``; ``'skip'`` omits the
        argument so the library default applies.
    split_ratio : float, default 0.1
        Used as ``test_size`` of the ``RatioSplit`` created in ``train``.
    """

    def __init__(self, data, k=50, similarity='cosine', weighting='skip', split_ratio=0.1):
        self.split_ratio = split_ratio

        # Cast the id columns to str on a *new* frame. Writing strings into
        # the existing integer columns via ``iloc`` relies on implicit dtype
        # upcasting, which newer pandas versions deprecate.
        id_columns = list(data.columns[:2])
        records = data.astype({column: str for column in id_columns})
        self.dataset = records.values.tolist()

        # Only pass ``weighting`` when the caller asked for one.
        model_kwargs = {} if weighting == 'skip' else {'weighting': weighting}
        self.model = UserKNN(
            k=k,
            similarity=similarity,
            name='user-base KNN model',
            verbose=VERBOSE,
            **model_kwargs,
        )

    def train(self):
        """Run a cornac experiment on a seeded ratio split of the data.

        The experiment is given this instance's model and reports RMSE and
        NDCG. Nothing is returned; cornac prints the result table.
        """
        ratio_split = RatioSplit(self.dataset, test_size=self.split_ratio, seed=SEED, verbose=VERBOSE)
        cornac.Experiment(
            eval_method=ratio_split,
            models=[self.model],
            metrics=[cornac.metrics.RMSE(), cornac.metrics.NDCG()]
        ).run()

    def predict(self, user_ids, k=10):
        """Return the first ``k`` ranked item ids for each requested user.

        Parameters
        ----------
        user_ids : iterable
            Raw user ids; each is converted with ``str()`` before lookup.
        k : int, default 10
            Number of items kept from the front of the model's ranking.

        Returns
        -------
        dict
            Maps every entry of ``user_ids`` (as given) to a list of raw item
            ids. No filtering of items the user already rated is done here.

        Raises
        ------
        RuntimeError
            If the model has no training set yet (``train`` was not run).
        KeyError
            If a user id is not present in the training set.
        """
        train_set = getattr(self.model, 'train_set', None)
        if train_set is None:
            raise RuntimeError("Model has no training set yet; call train() before predict().")

        user_id2idx = train_set.uid_map
        item_idx2id = list(train_set.item_ids)

        recs = {}
        for ID in user_ids:
            key = str(ID)
            if key not in user_id2idx:
                raise KeyError(f"User id {ID!r} was not found in the training set")
            # rank() yields (ranked item indices, scores); only the order is needed.
            ranked_items, _ = self.model.rank(user_id2idx[key])
            recs[ID] = [item_idx2id[i] for i in ranked_items[:k]]

        return recs

In [22]:
# User-based neighbourhood model: 50 neighbours, cosine similarity, 'idf' weighting.
userKnn = UserBasedKNN(
    data=test_df,
    k=50,
    similarity='cosine',
    weighting='idf',
)

In [23]:
# Fit and evaluate the user-based model on the seeded ratio split; in the
# recorded run below the ranking pass accounts for most of the test time.
userKnn.train()

rating_threshold = 1.0
exclude_unknowns = True
---
Training data:
Number of users = 6040
Number of items = 3691
Number of ratings = 900188
Max rating = 5.0
Min rating = 1.0
Global mean = 3.6
---
Test data:
Number of users = 5958
Number of items = 3298
Number of ratings = 100004
Number of unknown users = 0
Number of unknown items = 0
---
Total users = 6040
Total items = 3691

[user-base KNN model] Training started!


100%|██████████| 6040/6040 [00:02<00:00, 2265.94it/s]



[user-base KNN model] Evaluation started!


Rating: 100%|██████████| 100004/100004 [00:18<00:00, 5526.40it/s]
Ranking: 100%|██████████| 5958/5958 [05:41<00:00, 17.44it/s]


TEST:
...
                    |   RMSE | NDCG@-1 | Train (s) | Test (s)
------------------- + ------ + ------- + --------- + --------
user-base KNN model | 0.8995 |  0.2475 |    4.6544 | 360.4012






In [24]:
# Top-10 ranked item ids for three sample users.
userKnn.predict(user_ids=[5412, 425, 824], k=10)

{5412: ['3382',
  '557',
  '989',
  '130',
  '1830',
  '758',
  '2503',
  '3245',
  '3517',
  '3323'],
 425: ['3382',
  '557',
  '989',
  '130',
  '1830',
  '758',
  '2503',
  '3245',
  '3517',
  '2858'],
 824: ['3382',
  '557',
  '989',
  '130',
  '1830',
  '758',
  '3245',
  '2503',
  '3517',
  '3323']}

In [25]:
class ItemBasedKNN(object):
    """Convenience wrapper around cornac's ``ItemKNN`` recommender.

    Holds the rating records in the list form handed to ``RatioSplit``,
    builds the model, and offers ``train`` / ``predict`` helpers.

    Parameters
    ----------
    data : pandas.DataFrame
        Rating records whose first two columns are the user and item ids and
        whose remaining column(s) are passed through unchanged. The frame is
        not modified.
    k : int, default 50
        Forwarded to ``ItemKNN`` as ``k``.
    similarity : str, default 'cosine'
        Forwarded to ``ItemKNN`` as ``similarity``.
    weighting : str, default 'skip'
        Forwarded to ``ItemKNN`` as ``weighting``; ``'skip'`` omits the
        argument so the library default applies.
    split_ratio : float, default 0.1
        Used as ``test_size`` of the ``RatioSplit`` created in ``train``.
    """

    def __init__(self, data, k=50, similarity='cosine', weighting='skip', split_ratio=0.1):
        self.split_ratio = split_ratio

        # Cast the id columns to str on a *new* frame. Writing strings into
        # the existing integer columns via ``iloc`` relies on implicit dtype
        # upcasting, which newer pandas versions deprecate.
        id_columns = list(data.columns[:2])
        records = data.astype({column: str for column in id_columns})
        self.dataset = records.values.tolist()

        # Only pass ``weighting`` when the caller asked for one.
        model_kwargs = {} if weighting == 'skip' else {'weighting': weighting}
        self.model = ItemKNN(
            k=k,
            similarity=similarity,
            name='item-base KNN model',
            verbose=VERBOSE,
            **model_kwargs,
        )

    def train(self):
        """Run a cornac experiment on a seeded ratio split of the data.

        The experiment is given this instance's model and reports RMSE and
        NDCG. Nothing is returned; cornac prints the result table.
        """
        ratio_split = RatioSplit(self.dataset, test_size=self.split_ratio, seed=SEED, verbose=VERBOSE)
        cornac.Experiment(
            eval_method=ratio_split,
            models=[self.model],
            metrics=[cornac.metrics.RMSE(), cornac.metrics.NDCG()]
        ).run()

    def predict(self, user_ids, k=10):
        """Return the first ``k`` ranked item ids for each requested user.

        Parameters
        ----------
        user_ids : iterable
            Raw user ids; each is converted with ``str()`` before lookup.
        k : int, default 10
            Number of items kept from the front of the model's ranking.

        Returns
        -------
        dict
            Maps every entry of ``user_ids`` (as given) to a list of raw item
            ids. No filtering of items the user already rated is done here.

        Raises
        ------
        RuntimeError
            If the model has no training set yet (``train`` was not run).
        KeyError
            If a user id is not present in the training set.
        """
        train_set = getattr(self.model, 'train_set', None)
        if train_set is None:
            raise RuntimeError("Model has no training set yet; call train() before predict().")

        user_id2idx = train_set.uid_map
        item_idx2id = list(train_set.item_ids)

        recs = {}
        for ID in user_ids:
            key = str(ID)
            if key not in user_id2idx:
                raise KeyError(f"User id {ID!r} was not found in the training set")
            # rank() yields (ranked item indices, scores); only the order is needed.
            ranked_items, _ = self.model.rank(user_id2idx[key])
            recs[ID] = [item_idx2id[i] for i in ranked_items[:k]]

        return recs

In [26]:
# Item-based neighbourhood model: 30 neighbours, Pearson similarity, 'bm25' weighting.
itemKnn = ItemBasedKNN(
    data=test_df,
    k=30,
    similarity='pearson',
    weighting='bm25',
)

In [27]:
# Fit and evaluate the item-based model on the seeded ratio split; in the
# recorded run below the ranking pass accounts for most of the test time.
itemKnn.train()

rating_threshold = 1.0
exclude_unknowns = True
---
Training data:
Number of users = 6040
Number of items = 3691
Number of ratings = 900188
Max rating = 5.0
Min rating = 1.0
Global mean = 3.6
---
Test data:
Number of users = 5958
Number of items = 3298
Number of ratings = 100004
Number of unknown users = 0
Number of unknown items = 0
---
Total users = 6040
Total items = 3691

[item-base KNN model] Training started!


100%|██████████| 3691/3691 [00:01<00:00, 2574.09it/s]



[item-base KNN model] Evaluation started!


Rating: 100%|██████████| 100004/100004 [00:14<00:00, 6872.09it/s]
Ranking: 100%|██████████| 5958/5958 [05:51<00:00, 16.97it/s]


TEST:
...
                    |   RMSE | NDCG@-1 | Train (s) | Test (s)
------------------- + ------ + ------- + --------- + --------
item-base KNN model | 0.9372 |  0.2355 |    2.3993 | 366.4086






In [28]:
# Top-10 ranked item ids for the same three sample users, from the item-based model.
itemKnn.predict(user_ids=[5412, 425, 824], k=10)

{5412: ['858',
  '2258',
  '3888',
  '3065',
  '1218',
  '2690',
  '3265',
  '1252',
  '449',
  '3288'],
 425: ['3290',
  '1709',
  '813',
  '133',
  '624',
  '3280',
  '3505',
  '2314',
  '3096',
  '1185'],
 824: ['3280',
  '3888',
  '1747',
  '1189',
  '2039',
  '3127',
  '2020',
  '2244',
  '2679',
  '912']}