In [1]:
import pandas as pd
import numpy as np

In [2]:
train = pd.read_csv("../data/processed/train.csv")
test = pd.read_csv("../data/processed/test.csv")

### Item-based Collaborative Filtering

In [3]:
item_user = (
    train.pivot_table(
        index="item_id",
        columns="user_id",
        values="interaction",
        fill_value=0
    )
)

In [4]:
item_user.head()

user_id,A11ED8O95W2103,A14JBDSWKPKTZA,A1522TN5FVJL0Y,A1ISUNUWG0K02V,A1JSO7PPEA0W72,A1M542G46C8C7N,A1NVD0TKNS1GT5,A1ORUSHRRG0VWN,A1RPTVW5VEOSI,A1WVMDRJU19AFD,...,AJJV9Z17KV6GR,ALUNVOQRXOZIA,AMIU8UVMZJLER,AN30G4IKL1BMZ,AQ6XS10RR7CE0,ARXU3FESTWMJJ,AT4AV7XIQDKQP,AT8TIN5JKHO2V,AY8Q1X7G96HV5,AZ0SIZRQWN7RC
item_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
B000001OKH,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
B00000DM9W,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
B00000G20L,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
B00000J05A,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
B00000J060,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [5]:
from sklearn.metrics.pairwise import cosine_similarity

item_sim = cosine_similarity(item_user)

In [6]:
def recommend_item_cf(user_id, train, item_user, item_sim, k=10):
    user_items = train[train.user_id == user_id].item_id.tolist()

    scores = np.zeros(item_user.shape[0])

    for item in user_items:
        if item not in item_user.index:
            continue
        idx = item_user.index.get_loc(item)
        scores += item_sim[idx]

    top_indices = np.argsort(scores)[::-1][:k]
    return item_user.index[top_indices].tolist()

### Evaluate Item-Based CF (Recall@K)

In [7]:
def recall_at_k_model(test, train, recommender, k=10):
    recalls = []

    for user in test.user_id.unique():
        true_items = set(test[test.user_id == user].item_id)
        if not true_items:
            continue

        recs = recommender(user)[:k]
        recall = len(true_items & set(recs)) / len(true_items)
        recalls.append(recall)

    return sum(recalls) / len(recalls)

In [8]:
item_cf_recall = recall_at_k_model(
    test,
    train,
    lambda u: recommend_item_cf(u, train, item_user, item_sim),
    k=10
)

In [9]:
item_cf_recall


0.029000000000000005

### Matrix Factorization

In [10]:
from sklearn.preprocessing import LabelEncoder

user_encoder = LabelEncoder()
item_encoder = LabelEncoder()

In [11]:
train["user_idx"] = user_encoder.fit_transform(train.user_id)
train["item_idx"] = item_encoder.fit_transform(train.item_id)

In [12]:
from scipy.sparse import csr_matrix

interaction_matrix = csr_matrix(
    (
        train.interaction,
        (train.user_idx, train.item_idx)
    )
)


In [13]:
from sklearn.decomposition import TruncatedSVD

svd = TruncatedSVD(
    n_components=50,
    random_state=42
)


In [14]:
user_factors = svd.fit_transform(interaction_matrix)
item_factors = svd.components_.T

### Recommendation Fucntion

In [15]:
def recommend_mf(user_id, k=10):
    if user_id not in user_encoder.classes_:
        return []

    u_idx = user_encoder.transform([user_id])[0]
    scores = user_factors[u_idx] @ item_factors.T
    top_items = scores.argsort()[::-1][:k]

    return item_encoder.inverse_transform(top_items)

### Evalute Matrix Factorization

In [16]:
mf_recall = recall_at_k_model(
    test,
    train,
    recommend_mf,
    k=10
)

In [17]:
mf_recall

0.0

### Fix Matrix Facorization

In [18]:
np.max(user_factors), np.max(item_factors)

(np.float64(5.18261229231044), np.float64(0.5773502691896261))

In [19]:
common_users = set(train.user_id.unique()) & set(test.user_id.unique())
test_mf = test[test.user_id.isin(common_users)]

In [20]:
user_ids = train.user_id.unique()
item_ids = train.item_id.unique()

user2idx = {u: i for i, u in enumerate(user_ids)}
item2idx = {i: j for j, i in enumerate(item_ids)}

idx2item = {j: i for i, j in item2idx.items()}

In [21]:
print(user_factors.shape, item_factors.shape)

(58, 50) (243, 50)


In [22]:
print(len(user2idx))
print(len(item2idx))

58
243


In [23]:
def recommend_mf(user_id, k=10):
    if user_id not in user2idx:
        return []

    u_idx = user2idx[user_id]
    scores = item_factors @ user_factors[u_idx]

    seen_items = set(train[train.user_id == user_id].item_id)

    ranked_items = [
        idx2item[i]
        for i in np.argsort(scores)[::-1]
        if idx2item[i] not in seen_items
    ]

    return ranked_items[:k]

In [24]:
u = test_mf.user_id.iloc[0]

print("True items:", 
      test_mf[test_mf.user_id == u].item_id.tolist())

print("MF recs:", recommend_mf(u))


True items: ['B00001OPJV', 'B00001OPJQ', 'B000001OMT', 'B000001OL6']
MF recs: ['B00004TJ7O', 'B00004UE8S', 'B00004SU73', 'B00002JXFH', 'B00000J1E6', 'B00000JCTO', 'B00000JD4T', 'B00004Y7CF', 'B00000J579', 'B00004RID2']


In [25]:
def recall_at_k(test_df, recommend_fn, k=10):
    """
    test_df: DataFrame with columns [user_id, item_id]
    recommend_fn: function(user_id) -> list of recommended item_ids
    k: cutoff
    """
    recalls = []

    for user_id in test_df.user_id.unique():
        true_items = set(test_df[test_df.user_id == user_id].item_id)

        if not true_items:
            continue

        recs = recommend_fn(user_id)

        if not recs:
            continue

        recs_k = set(recs[:k])
        recall = len(true_items & recs_k) / len(true_items)
        recalls.append(recall)

    return sum(recalls) / len(recalls) if recalls else 0.0


In [26]:
mf_recall = recall_at_k(
    test_mf,
    lambda u: recommend_mf(u, k=10),
    k=10
)

mf_recall

0.125

In [30]:
import pickle

# Save MF matrices
pickle.dump(
    (user_factors, item_factors),
    open("../src/models/mf_model.pkl", "wb")
)

In [32]:
pickle.dump(user_factors, open("../src/models/user_map.pkl", "wb"))
pickle.dump(item_factors, open("../src/models/item_map.pkl", "wb"))

In [35]:
user_ids = train.user_id.unique()
user_map = {u: i for i, u in enumerate(user_ids)}

In [36]:
item_ids = train.item_id.unique()
item_map = {i: j for j, i in enumerate(item_ids)}


In [37]:
assert user_factors.shape[0] == len(user_map)
assert item_factors.shape[0] == len(item_map)

In [38]:
reverse_item_map = {v: k for k, v in item_map.items()}
pickle.dump(reverse_item_map, open("../src/models/reverse_item_map.pkl", "wb"))

In [40]:
type(item_map)

dict

In [41]:
print(user_factors.shape[0], len(user_map))
print(item_factors.shape[0], len(item_map))


58 58
243 243


In [43]:
import pickle

pickle.dump(
    (user_factors, item_factors),
    open("../src/models/mf_model.pkl", "wb")
)

pickle.dump(user_map, open("../src/models/user_map.pkl", "wb"))
pickle.dump(item_map, open("../src/models/item_map.pkl", "wb"))
pickle.dump(reverse_item_map, open("../src/models/reverse_item_map.pkl", "wb"))


In [44]:
import pickle
type(pickle.load(open("../src/models/user_map.pkl", "rb")))


dict