In [1]:
import pandas as pd
import numpy as np

In [2]:
# Read the pre-split interaction tables (train first, then test) from the
# processed-data directory.
# NOTE(review): the previews of both frames show an `Unnamed: 0` column, which
# suggests the CSVs were written with their index - consider `index_col=0`.
train, test = map(
    pd.read_csv,
    ("../data/processed/train.csv", "../data/processed/test.csv"),
)

In [3]:
# Preview the first rows of the training interactions.
train.head()

Unnamed: 0,user_id,item_id,interaction,timestamp
0,A7RV1KU5O0II9,B00000JDKU,1,931996800
1,A7RV1KU5O0II9,B00000JFMK,1,931996800
2,A7RV1KU5O0II9,B00000JYWQ,1,940896000
3,A1JSO7PPEA0W72,B00000J4LQ,1,941500800
4,AD50TWQOM8W4G,B00000K3RI,1,943660800


In [4]:
# Preview the first rows of the held-out (test) interactions.
test.head()

Unnamed: 0,user_id,item_id,interaction,timestamp
0,AT8TIN5JKHO2V,B00001OPJV,1,1183507200
1,AT8TIN5JKHO2V,B00001OPJQ,1,1183507200
2,AT8TIN5JKHO2V,B000001OMT,1,1183507200
3,AT8TIN5JKHO2V,B000001OL6,1,1185148800
4,A2R6RA8FRBS608,B00000J1EP,1,1186272000


### Popularity Baseline: Find the Top 10 Most Popular Items

In [5]:
# Popularity baseline: count the training rows per item, order the counts
# from largest to smallest, and keep the ids of the first ten.
item_counts = train.groupby("item_id").size()
top_items = item_counts.sort_values(ascending=False).head(10).index.tolist()

In [6]:
# Show the ten selected item ids, highest training count first.
top_items

['B00004SB92',
 'B00004S9AK',
 'B00000JSGF',
 'B00002JXBI',
 'B00004TEN2',
 'B00004TS16',
 'B00004YMBK',
 'B00002SWHH',
 'B00002JXFH',
 'B00000J3II']

### Evaluate Popularity Model (Recall@K)

In [7]:
def recall_at_k(test, recommendations, k=10):
    """Mean per-user recall of one shared recommendation list.

    The same first ``k`` entries of ``recommendations`` are scored against
    every user in ``test``, so this fits non-personalised baselines.

    Parameters
    ----------
    test : pandas.DataFrame
        Held-out interactions; must have ``user_id`` and ``item_id`` columns.
    recommendations : sequence
        Ranked item ids, best first. Only the first ``k`` are used.
    k : int, default 10
        Cut-off applied to ``recommendations``.

    Returns
    -------
    float
        Average over users of
        ``len(held_out_items & top_k) / len(held_out_items)``.
        ``0.0`` when ``test`` contains no users.
    """
    # The recommended set is the same for every user, so build it once.
    recs = set(recommendations[:k])

    recalls = []
    # A single groupby pass replaces re-filtering the whole frame per user;
    # sort=False keeps users in order of first appearance.
    for _, items in test.groupby("user_id", sort=False)["item_id"]:
        true_items = set(items)

        if not true_items:
            continue

        recalls.append(len(true_items & recs) / len(true_items))

    # No users to average over: report 0.0 instead of dividing by zero.
    if not recalls:
        return 0.0

    return sum(recalls) / len(recalls)

In [8]:
# Score the popularity list against the held-out interactions; the same ten
# items are used as the recommendation for every test user.
pop_recall = recall_at_k(test, top_items, k=10)

In [10]:
# Display Recall@10 for the popularity baseline.
pop_recall

0.006666666666666666

### User-Based Collaborative Filtering

In [11]:
# Build the user x item matrix: one row per training user, one column per
# training item. Each cell aggregates `interaction` for that pair using
# pivot_table's default aggregation; pairs that never occur are filled with 0.
user_item = pd.pivot_table(
    train,
    index="user_id",
    columns="item_id",
    values="interaction",
    fill_value=0,
)

In [13]:
# Preview the first rows of the user x item matrix.
user_item.head()

item_id,B000001OKH,B00000DM9W,B00000G20L,B00000J05A,B00000J060,B00000J061,B00000J1E6,B00000J1EJ,B00000J1EP,B00000J1GA,...,B00004Z0BO,B00004Z5BQ,B00004Z5H3,B00004Z5L8,B00004Z5M1,B00004Z5QU,B00004Z62J,B00004Z62S,B00004Z677,B00004Z6PI
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
A11ED8O95W2103,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
A14JBDSWKPKTZA,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
A1522TN5FVJL0Y,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
A1ISUNUWG0K02V,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
A1JSO7PPEA0W72,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [14]:
from sklearn.metrics.pairwise import cosine_similarity

# Pairwise cosine similarity between the rows (users) of the interaction
# matrix; entry [i, j] compares the i-th and j-th rows of `user_item`.
user_sim = cosine_similarity(user_item)

### Evaluate User-Based Model (Recall@K)

In [15]:
def recommend_user_cf(user_idx, user_item, user_sim, k=10, exclude_seen=False):
    """Recommend the top-``k`` items for one user via user-based CF.

    Every item is scored by the similarity-weighted sum of all users'
    interaction values, and the highest-scoring items are returned.

    Parameters
    ----------
    user_idx : int
        Row position of the target user in ``user_item`` / ``user_sim``.
    user_item : pandas.DataFrame
        User x item interaction matrix; columns are item ids.
    user_sim : numpy.ndarray
        Square user x user similarity matrix whose rows follow the row order
        of ``user_item``.
    k : int, default 10
        Maximum number of items to return.
    exclude_seen : bool, default False
        When True, items with a positive value in the target user's own row
        of ``user_item`` are dropped from the ranking, so only items the user
        has not interacted with are returned. The default keeps the original
        behaviour, where already-seen items may be recommended.

    Returns
    -------
    list
        Item ids ordered from highest to lowest score. May hold fewer than
        ``k`` entries when fewer candidate items exist.
    """
    item_matrix = user_item.values

    # One score per item: sum over users of similarity * interaction value.
    weighted_scores = user_sim[user_idx] @ item_matrix

    # argsort is ascending, so reverse it to rank best-first.
    ranked = np.argsort(weighted_scores)[::-1]

    if exclude_seen:
        # Filter after ranking so the relative order of the rest is unchanged.
        seen = item_matrix[user_idx] > 0
        ranked = ranked[~seen[ranked]]

    return user_item.columns[ranked[:k]].tolist()

In [16]:
# Map each user id to its row position in `user_item` (the same row order
# `user_sim` was computed from), then echo the mapping.
user_index = dict(zip(user_item.index, range(len(user_item.index))))
user_index

{'A11ED8O95W2103': 0,
 'A14JBDSWKPKTZA': 1,
 'A1522TN5FVJL0Y': 2,
 'A1ISUNUWG0K02V': 3,
 'A1JSO7PPEA0W72': 4,
 'A1M542G46C8C7N': 5,
 'A1NVD0TKNS1GT5': 6,
 'A1ORUSHRRG0VWN': 7,
 'A1RPTVW5VEOSI': 8,
 'A1WVMDRJU19AFD': 9,
 'A1ZR6RT42KHUIF': 10,
 'A204ETWOV23HO4': 11,
 'A231WM2Z2JL0U3': 12,
 'A23ZO1BVFFLGHO': 13,
 'A243HY69GIAHFI': 14,
 'A2AEZQ3DGBBLPR': 15,
 'A2BGZ52M908MJY': 16,
 'A2BLITJITO97N5': 17,
 'A2G2QNKDL1Y6AC': 18,
 'A2K5FK58JSWXJ9': 19,
 'A2L7WYA5OENV03': 20,
 'A2PKU0R1QOSF9Y': 21,
 'A2R6RA8FRBS608': 22,
 'A2XA9KKAAFXQMH': 23,
 'A36T7WFA475ZOT': 24,
 'A37Z65SZVT0TVB': 25,
 'A3A15L96IYUO6V': 26,
 'A3BYHAAOGDU5RE': 27,
 'A3FTI86WAVJOLG': 28,
 'A3IBOQ8R44YG9L': 29,
 'A3JRAKUG0TB81C': 30,
 'A3MEIR72XKQY88': 31,
 'A3PLX6PTM2ERKL': 32,
 'A3QV0B4DIOB1PG': 33,
 'A3S87ZOPB3UM9N': 34,
 'A3TIJC6L8USJ6Q': 35,
 'A3TRPVAGT3NWBS': 36,
 'A3UWDEBSKQFM7V': 37,
 'A5JLAU2ARJ0BO': 38,
 'A6FIAB28IS79': 39,
 'A6ZPLVAUQ6695': 40,
 'A7RV1KU5O0II9': 41,
 'A7Y6AVS576M03': 42,
 'A94O1XOG5H69F': 43,
 'AD50

In [17]:
# Per-user Recall@10 for the user-based CF model. `recalls` gets one entry per
# test user who also has a row in the training matrix.
recalls = []

# Walk the test set one user at a time with a single groupby pass instead of
# re-filtering the whole frame inside the loop; sort=False keeps users in
# order of first appearance.
for user, user_rows in test.groupby("user_id", sort=False)["item_id"]:
    # Users absent from training have no similarity row, so they are skipped.
    if user not in user_index:
        continue

    idx = user_index[user]
    recs = recommend_user_cf(idx, user_item, user_sim, k=10)

    true_items = set(user_rows)

    # Fraction of this user's held-out items found in the recommendations.
    recall = len(true_items & set(recs)) / len(true_items)
    recalls.append(recall)

In [18]:
# Mean recall over the evaluated users. Falls back to 0.0 when no test user
# appears in the training matrix, instead of dividing by zero.
user_cf_recall = sum(recalls) / len(recalls) if recalls else 0.0
user_cf_recall

0.09375