## Item Based Collaborative Filtering (IBCF)

### Importing Libraries

In [1]:
# Import the standard library, the scientific stack, and the Recommenders utilities used below
import sys
import logging
import scipy
import numpy as np
import pandas as pd

from recommenders.datasets import movielens
from recommenders.datasets.python_splitters import python_stratified_split
from recommenders.evaluation.python_evaluation import map_at_k, ndcg_at_k, precision_at_k, recall_at_k, serendipity, diversity, catalog_coverage, distributional_coverage, novelty
from recommenders.models.sar import SAR

# Record the interpreter and core library versions alongside the results.
print(f"System version: {sys.version}")
print(f"Pandas version: {pd.__version__}")
print(f"NumPy version: {np.__version__}")
print(f"SciPy version: {scipy.__version__}")

System version: 3.8.10 (tags/v3.8.10:3d8993a, May  3 2021, 11:48:03) [MSC v.1928 64 bit (AMD64)]
Pandas version: 1.4.4
NumPy version: 1.23.5
SciPy version: 1.9.1


### Loading the Dataset

In [2]:
# Number of items to recommend (k); also passed as the cutoff `k` to the ranking metrics.
TOP_K = 10

In [5]:
# Load the interaction CSV, rename its columns to the userID / itemID / rating
# names used in the rest of the notebook, and keep only the columns we need.
df = (
    pd.read_csv('cf_final2.csv')
    .rename(columns={"user": "userID", "item": "itemID", "label": "rating", "song_by": "song"})
    .loc[:, ["userID", "itemID", "rating", "timestamp", "song"]]
    # Store ratings as 32-bit floats in order to reduce memory consumption.
    .astype({"rating": np.float32})
)
df.head()

Unnamed: 0,userID,itemID,rating,timestamp,song
0,577,30377,0.172815,1447978000.0,The Safety Dance by Men Without Hats
1,147,15910,0.127811,1441325000.0,Endless Summer by Grizfolk
2,690,40737,0.097224,1413331000.0,Castaway by Zac Brown Band
3,59,40737,0.103762,1404950000.0,Castaway by Zac Brown Band
4,415,49732,0.102779,1402963000.0,Islands In the Stream by Dolly Parton


In [6]:
# Column-name mapping; the user and item entries are reused by the splitter below.
header = dict(
    col_user="userID",
    col_item="itemID",
    col_rating="rating",
    col_timestamp="timestamp",
    col_prediction="Prediction",
)

# Stratified split of the dataset into 75% train and 25% test, with a fixed seed.
train, test = python_stratified_split(
    df,
    ratio=0.75,
    col_user=header["col_user"],
    col_item=header["col_item"],
    seed=42,
)

### Training the Model

In [7]:
# Build the SAR recommender with Jaccard similarity, chosen (per the original
# note) to balance recommending popular items against obscure ones.
# The time-decay formula is enabled with a coefficient of 30; time_now is left as None.
model = SAR(
    **header,
    similarity_type="jaccard",
    timedecay_formula=True,
    time_decay_coefficient=30,
    time_now=None,
)

In [8]:
# Fit the SAR model on the training split; per the original note, this step computes the model's matrices.
model.fit(train)

### Prediction and Evaluation

In [9]:
# Predict the TOP_K items for every user, using the test split as input.
# remove_seen=True is passed so that items a user has already rated are not recommended.
top_k = model.recommend_k_items(test, top_k=TOP_K, remove_seen=True)

  return self._with_data(self.data * other)


In [10]:
# Attach the song title to each recommended item ID.
# The lookup is de-duplicated on itemID alone (not on the (itemID, song) pair) so it
# holds exactly one row per item; otherwise an item that appears with more than one
# title string would multiply the recommendation rows in the join.
# The inner join keeps only recommendations whose itemID has a title in df.
top_k_with_titles = top_k.join(
    df[["itemID", "song"]].drop_duplicates(subset="itemID").set_index("itemID"),
    on="itemID",
    how="inner",
).sort_values(by=["userID", "Prediction"], ascending=False)

# Preview the first rows (sorted by userID, then Prediction, both descending).
top_k_with_titles.head(10)

Unnamed: 0,userID,itemID,Prediction,song
8590,860,30714,0.002303,My Mind by ShortRound
8591,860,47524,0.002303,Black Pullet by Steve Aoki
8592,860,25930,0.002303,Harder State Of Mind by D-Block & S-te-Fan
8593,860,44360,0.002303,Seven Nation Army by THNDERZ
8594,860,25243,0.002303,Burn Up by Teknoclash
8595,860,27257,0.002303,Dreaming by Will Sparks
8596,860,46795,0.002303,Heaven Is A Place On Earth by W&W
8597,860,1084,0.002303,When We Were Still Young by Klaas
8598,860,52298,0.002303,Break It Down by Olly James
8599,860,8441,0.002303,I Wanna Rave by ShortRound


In [11]:
# Inputs shared by all ranking metrics: the test split and the recommendations as
# positional arguments, plus the same column names and cutoff as keyword arguments.
args = [test, top_k]
kwargs = {
    "col_user": "userID",
    "col_item": "itemID",
    "col_rating": "rating",
    "col_prediction": "Prediction",
    "relevancy_method": "top_k",
    "k": TOP_K,
}

#### Ranking Metrics

##### MAP

Mean Average Precision (MAP) is the average precision computed for each user, averaged over all users.

In [12]:
# MAP at k, computed from the shared ranking-metric arguments.
eval_map = map_at_k(*args, **kwargs)
print(f"MAP: {eval_map}")

MAP: 0.006474504757133408


##### NDCG

Normalized Discounted Cumulative Gain (NDCG) - evaluates how well the predicted items for a user are ranked based on relevance

In [13]:
# NDCG at k, computed from the shared ranking-metric arguments.
eval_ndcg = ndcg_at_k(*args, **kwargs)
print(f"NDCG: {eval_ndcg}")

NDCG: 0.03482738980357478


##### Precision and Recall

Precision - this measures the proportion of recommended items that are relevant

Recall - this measures the proportion of relevant items that are recommended

In [14]:
# Precision and recall at k, computed from the shared ranking-metric arguments.
eval_precision = precision_at_k(*args, **kwargs)
eval_recall = recall_at_k(*args, **kwargs)
print(f"Precision: {eval_precision} \nRecall: {eval_recall}")

Precision: 0.03290697674418605 
Recall: 0.013482475319192364


#### Diversity Metrics

##### Coverage

In [15]:
# Both coverage metrics take the training split and the recommendations,
# identified by the user and item columns.
cov_args = [train, top_k]
cov_kwargs = {"col_user": "userID", "col_item": "itemID"}

cat_coverage = catalog_coverage(*cov_args, **cov_kwargs)
dist_coverage = distributional_coverage(*cov_args, **cov_kwargs)
print(f"Catalog Coverage: {cat_coverage} \nDistributional Coverage: {dist_coverage}")

Catalog Coverage: 0.1386640114290662 
Distributional Coverage: 12.33902569722968


##### Diversity

In [16]:
# Diversity is computed from the training split and the recommendations.
div_args = [train, top_k]
div_kwargs = {"col_user": "userID", "col_item": "itemID"}

diversity_eval = diversity(*div_args, **div_kwargs)
print(f"Diversity: {diversity_eval}")

Diversity: 0.24165682829636229


##### Novelty

In [17]:
# Novelty is computed from the training split and the recommendations.
nov_args = [train, top_k]
nov_kwargs = {"col_user": "userID", "col_item": "itemID"}

novelty_eval = novelty(*nov_args, **nov_kwargs)
print(f"Novelty: {novelty_eval}")

Novelty: 15.29584124277557


##### Serendipity

In [18]:
# Serendipity is computed from the training split and the recommendations.
ser_args = [train, top_k]
ser_kwargs = {"col_user": "userID", "col_item": "itemID"}

ser_eval = serendipity(*ser_args, **ser_kwargs)
print(f"Serendipity: {ser_eval}")

Serendipity: 0.9547009748825526


### Summary of Ranking Metrics

<center>

|Metric|Range|Selection criteria|Limitation|
|------|-------------------------------|---------|----------|
|Precision|$\geq 0$ and $\leq 1$|Higher the better.|Only for hits in recommendations.|
|Recall|$\geq 0$ and $\leq 1$|Higher the better.|Only for hits in the ground truth.|
|NDCG|$\geq 0$ and $\leq 1$|Higher the better.|Does not penalize bad or missing items, and does not perform well when several items are equally good.|
|MAP|$\geq 0$ and $\leq 1$|Higher the better.|Depends on variable distributions.|

</center>