# Recommendation Systems Project

Mate Balogh

E28H7B

**Item** recommendations for **visitors** of an online store, based on *implicit feedback*, to provide a better user experience and to boost sales.

## Setup

In [None]:
!pip install cornac adjustText --quiet

In [None]:
import kagglehub
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import cornac
import itertools
import seaborn as sns

from cornac.eval_methods import RatioSplit
from cornac.metrics import Precision, Recall
from cornac.models import WMF
from adjustText import adjust_text
from sklearn.metrics.pairwise import cosine_similarity

In [None]:
# NOTE(review): presumably the random seed for later steps; it is not used in the cells visible here — confirm.
RAND = 10

In [None]:
#@title ### Helpers
def get_idx(ids, collection):
    """Return the position of every label in ``ids`` within ``collection.index``.

    Each label is resolved on its own with ``Index.get_loc``, so the result
    keeps the order of ``ids`` and an unknown label raises ``KeyError``.
    """
    locate = collection.index.get_loc
    return [locate(label) for label in ids]

In [None]:
#@title ### Download dataset
# Fetch the RetailRocket e-commerce dataset through kagglehub; `path` is the
# location of the downloaded files and is reused below to find events.csv.
path = kagglehub.dataset_download("retailrocket/ecommerce-dataset")
print("Path to dataset files:", path)

## Dataset

In [None]:
# Load the raw interaction log with every column dtype pinned explicitly.
events_csv_path = path + "/events.csv"
events_dtypes = {
    'timestamp': int,
    'visitorid': int,
    'itemid': int,
    'event': 'string',
    'transactionid': object,
}
events = pd.read_csv(events_csv_path, dtype=events_dtypes)

## Exploratory Data Analysis

In [None]:
# (rows, columns) of the raw event log.
events.shape

In [None]:
# Column names of the event log.
events.columns.to_list()

In [None]:
# Peek at the first three rows.
events.head(n=3)

In [None]:
# Per column: does it contain at least one missing value?
events.isna().any()

In [None]:
# Summary statistics with pandas' default column selection.
events.describe()

In [None]:
# For each column of interest: number of non-null entries and number of distinct values.
for label, column in [
    ("visitors", 'visitorid'),
    ("items", 'itemid'),
    ("event", 'event'),
    ("transactions", 'transactionid'),
]:
    values = events[column]
    print(f"# of {label} in 'events':", values.count(), "among which", values.nunique(), "is unique.")

In [None]:
# Show the events ordered by transaction ID in descending order; rows without one are placed last (pandas default).
# NOTE(review): `transactionid` is read with dtype=object, so this ordering may be lexicographic rather than numeric — confirm.
events.sort_values(by=['transactionid'], ascending=False)

In [None]:
#@title #### Event frequency distribution

# Number of rows per event type.
events_freqdist = events.groupby('event')['event'].count()

fig, axes = plt.subplots(1, 2, figsize=(10, 5))

# The same bar chart twice: linear y-axis on the left, log y-axis on the right.
for ax, (panel_title, use_log) in zip(axes, [("Normal scale", False), ("Log scale", True)]):
    bars = ax.bar(events_freqdist.index, events_freqdist.values, color="darkkhaki")
    ax.bar_label(bars, fmt="{:,.0f}")
    if use_log:
        ax.set_yscale('log')
    ax.set_title(panel_title)
    ax.set_ylabel('Frequency')
    ax.margins(.15)

plt.suptitle('Event distribution')
plt.tight_layout()
plt.show()

### Transaction events

In [None]:
# Keep only the purchase rows of the event log.
condTransaction = events['event'].eq('transaction')
transaction_events = events.loc[condTransaction]

print("`transaction_events` shape:", transaction_events.shape)
# Sanity check: every purchase row carries a transaction ID.
assert transaction_events['transactionid'].notna().all()

In [None]:
# Number of transaction rows recorded for each item.
purchases_per_item = (
    transaction_events
    .groupby(by='itemid')['transactionid']
    .count()
    .reset_index(drop=True)
)

fig, ax = plt.subplots()
ax.hist(purchases_per_item, bins=40, edgecolor="white", color="olive")
ax.set_title("Item-wise Purchase Histogram (log scale)")
ax.set_xlabel("Number of Purchase")
ax.set_ylabel("Frequency")
ax.set_yscale('log')
plt.show()

In [None]:
# Number of transaction rows recorded for each (visitor, item) pair.
purchases_per_visitor_item = (
    transaction_events
    .groupby(by=['visitorid', 'itemid'])['transactionid']
    .count()
    .reset_index(drop=True)
)

fig, ax = plt.subplots()
ax.hist(purchases_per_visitor_item, bins=40, edgecolor="white", color="olive")
ax.set_title("User-item-wise Purchase Histogram (log scale)")
ax.set_xlabel("Number of Purchase")
ax.set_ylabel("Frequency")
ax.set_yscale('log')
plt.show()

# Implementations

In [None]:
# Visitor used as the running example; it must have at least one purchase on record.
designated_visitor_id = 71586
condDesignatedVisitor = transaction_events['visitorid'].eq(designated_visitor_id)
assert transaction_events.loc[condDesignatedVisitor, 'itemid'].count() > 0

In [None]:
# Passed as `k` (how many most-similar users to consider) to the collaborative-filter recommender.
K = 100
# Passed as `n` (how many items to return) to each recommender's `recommend` call.
N = 5

## `PopularItemRecommender`

In [None]:
class PopularItemRecommender():
    """Recommend the items that occur in the most transaction rows.

    The recommendation is the same for every visitor: items are ranked by
    the number of non-null `transactionid` entries recorded for them.

    Args:
        transactions: DataFrame with `itemid` and `transactionid` columns.
        verb: When true, print a banner on construction and show the
            recommended item IDs (uses the notebook's `display`).
    """

    def __init__(self, transactions, verb=False):
        self.transactions = transactions
        self.verb = verb

        if self.verb:
            self.welcome()

    def welcome(self):
        """Print the recommender's banner."""
        print("Popular Item Recommender", "\n")

    def recommend(self, n):
        """Return the IDs of the `n` most purchased items, most purchased first."""
        purchase_counts = self.transactions.groupby(by='itemid')['transactionid'].count()
        ranked = purchase_counts.sort_values(ascending=False)
        items_recommended = ranked.head(n).index.tolist()

        if self.verb:
            print("Recommended items:", "\n")
            display(pd.Series(items_recommended, name="itemid").to_frame().T)

        return items_recommended

In [None]:
# Top-N most purchased items, with verbose output.
popular_recommender = PopularItemRecommender(transaction_events, verb=True)
items_recommended = popular_recommender.recommend(N)

### Explanation

In [None]:
# designated user: (visitor ID, row position among the purchasing visitors)
# `visitor_item_pivot` is only created in a later cell, so referencing it here
# raised a NameError when the notebook was run top to bottom. The position is
# derived from the sorted unique visitor IDs instead, which should match the
# pivot's row order (pivot_table sorts its index) — confirm if the pivot changes.
purchasing_visitor_ids = pd.Index(np.sort(transaction_events['visitorid'].unique()))
display((designated_visitor_id, purchasing_visitor_ids.get_loc(designated_visitor_id)))

In [None]:
fig, axes = plt.subplots(1, 2, figsize=(10, 5))
hist_ax, bar_ax = axes

# Left panel: distribution of the purchase count over all items.
purchases_per_item = transaction_events.groupby(by='itemid')['transactionid'].count()
hist_ax.hist(purchases_per_item, bins=40, edgecolor="white", color="darkkhaki")
hist_ax.set_title("Purchase histogram of all items (log scale)")
hist_ax.set_xlabel("Number of purchase")
hist_ax.set_ylabel("Frequency")
hist_ax.set_yscale('log')

# Right panel: purchase count of the recommended items only, highest first.
is_recommended = transaction_events['itemid'].isin(items_recommended)
recommended_counts = (
    transaction_events[is_recommended]
    .groupby(by='itemid')['transactionid']
    .count()
    .rename('purchase count')
    .sort_values(ascending=False)
    .reset_index()
)
bar_ax.bar(recommended_counts['itemid'].astype(str), recommended_counts['purchase count'], color="purple")
bar_ax.set_xlabel("Item ID")
bar_ax.set_ylabel("Number of purchase")
bar_ax.set_title("Recommended items")

plt.suptitle('Item popularity based on purchase count')
plt.tight_layout()
plt.show()

## `MeanPopularItemRecommender`

In [None]:
class MeanPopularItemRecommender():
    """Recommend the items with the highest mean purchase count per buyer.

    For every (visitor, item) pair the number of non-null `transactionid`
    entries is counted; an item's score is the mean of those counts over the
    visitors who have a row for it.

    Args:
        transactions: DataFrame with `visitorid`, `itemid` and
            `transactionid` columns.
        verb: When true, print a banner on construction and show the
            recommended item IDs (uses the notebook's `display`).
    """

    def __init__(self, transactions, verb=False):
        self.transactions = transactions
        self.verb = verb

        if self.verb:
            self.welcome()

    def welcome(self):
        """Print the recommender's banner."""
        print("Mean Popular Item Recommender", "\n")

    def recommend(self, n):
        """Return the IDs of the `n` items with the highest mean per-visitor purchase count."""
        per_visitor_counts = (
            self.transactions
            .groupby(by=['visitorid', 'itemid'])['transactionid']
            .count()
            .rename("visitor purchase count")
            .reset_index()
        )
        mean_counts = per_visitor_counts.groupby(by='itemid')['visitor purchase count'].mean()
        items_recommended = mean_counts.sort_values(ascending=False).head(n).index.tolist()

        if self.verb:
            print("Recommended items:", "\n")
            display(pd.Series(items_recommended, name="itemid").to_frame().T)

        return items_recommended

In [None]:
# Top-N items by mean per-visitor purchase count, with verbose output.
mean_popular_recommender = MeanPopularItemRecommender(transaction_events, verb=True)
items_recommended = mean_popular_recommender.recommend(N)

### Explanation

In [None]:
# designated user: (visitor ID, row position among the purchasing visitors)
# `visitor_item_pivot` is only created in a later cell, so referencing it here
# raised a NameError when the notebook was run top to bottom. The position is
# derived from the sorted unique visitor IDs instead, which should match the
# pivot's row order (pivot_table sorts its index) — confirm if the pivot changes.
purchasing_visitor_ids = pd.Index(np.sort(transaction_events['visitorid'].unique()))
display((designated_visitor_id, purchasing_visitor_ids.get_loc(designated_visitor_id)))

In [None]:
def _mean_purchases_per_item(transactions):
    """Mean, over the visitors with a row for an item, of their purchase count for it."""
    per_visitor_counts = (
        transactions
        .groupby(by=['visitorid', 'itemid'])['transactionid']
        .count()
        .rename("visitor purchase count")
        .reset_index()
    )
    return per_visitor_counts.groupby(by='itemid')['visitor purchase count'].mean()


fig, axes = plt.subplots(1, 2, figsize=(10, 5))
hist_ax, bar_ax = axes

# Left panel: distribution of the mean purchase count over all items.
hist_ax.hist(_mean_purchases_per_item(transaction_events), bins=40, edgecolor="white", color="darkkhaki")
hist_ax.set_title("Purchase histogram of all items (log scale)")
hist_ax.set_xlabel("Average number of purchase")
hist_ax.set_ylabel("Frequency")
hist_ax.set_yscale('log')

# Right panel: mean purchase count of the recommended items only, highest first.
is_recommended = transaction_events['itemid'].isin(items_recommended)
recommended_means = (
    _mean_purchases_per_item(transaction_events[is_recommended])
    .rename('mean purchase count')
    .sort_values(ascending=False)
    .reset_index()
)
bar_ax.bar(recommended_means['itemid'].astype(str), recommended_means['mean purchase count'], color="purple")
bar_ax.set_xlabel("Item ID")
bar_ax.set_ylabel("Average number of purchase")
bar_ax.set_title("Recommended items")

plt.suptitle('Item popularity based on mean purchase count')
plt.tight_layout()
plt.show()

## `UserBasedCollabFilterItemRecommender`

In [None]:
# Binary visitor x item purchase matrix: 1 where the visitor has at least one
# transaction row for the item, 0 otherwise.
visitor_item_pivot = transaction_events.pivot_table(index='visitorid', columns='itemid', values='event', aggfunc='any').fillna(0).astype(int)

# Sanity check: the matrix holds exactly one 1 per distinct (visitor, item) pair.
assert visitor_item_pivot.sum().sum() == transaction_events.groupby(by=['visitorid', 'itemid']).ngroups

# Keep the preview as the cell's last expression; as a bare statement in the
# middle of the cell it was evaluated but never shown.
visitor_item_pivot

In [None]:
# Pairwise cosine similarity between the visitor rows of the pivot; row/column i
# of the result corresponds to row i of `visitor_item_pivot`.
visitor_item_similarity = cosine_similarity(visitor_item_pivot)
visitor_item_similarity

In [None]:
# Shapes of the pivot and of the similarity matrix, side by side.
(visitor_item_pivot.shape, visitor_item_similarity.shape)

In [None]:
# Row position of the designated visitor in the pivot, followed by that
# visitor's item row and its row of the similarity matrix.
visitor_loc = visitor_item_pivot.index.get_loc(designated_visitor_id)
selected_visitor_pivot = visitor_item_pivot.iloc[visitor_loc, :]
selected_visitor_similarity = visitor_item_similarity[visitor_loc, :]

In [None]:
class UserBasedCollabFilterItemRecommender():
    """User-based collaborative filtering over a visitor x item matrix.

    `fit` selects a target user; `recommend` then picks that user's most
    similar users and ranks the items they interacted with (that the target
    user has not) by how many of those users interacted with each item.

    Args:
        pivot: DataFrame indexed by user ID with one column per item ID;
            values greater than 0 mark an interaction.
        similarity: Square array of user-to-user similarities whose rows and
            columns follow the row order of `pivot`.
        verb: When true, print progress information and show the
            recommendation (uses the notebook's `display`).
    """

    def __init__(self, pivot, similarity, verb=False):
        self.pivot = pivot
        self.similarity = similarity
        self.verb = verb

        self.fitted = False
        self.recommended = False

        if verb:
            self.welcome()

    def welcome(self):
        """Print the recommender's banner."""
        print("User-based Collaborative Filter Item Recommender")

    def fit(self, userid):
        """Select the target user and remember the items it already interacted with.

        Raises:
            KeyError: If `userid` is not in the pivot's index.

        Returns:
            The recommender itself, so calls can be chained.
        """
        self.userid = userid
        self.user_loc = self.pivot.index.get_loc(self.userid)
        self.user_pivot = self.pivot.iloc[self.user_loc, :]
        self.user_similarity = self.similarity[self.user_loc, :]

        interacted_itemids = self.user_pivot[self.user_pivot.gt(0)].index

        if self.verb:
           print("Item # already interacted with:\n\n", interacted_itemids.to_list(), "\n\n")

        self.interacted_itemids = interacted_itemids
        self.fitted = True

        return self

    def recommend(self, k, n, sim_threshold):
        """Recommend up to `n` unseen items based on up to `k` similar users.

        Args:
            k: Maximum number of most-similar users to consider.
            n: Maximum number of items to return.
            sim_threshold: Minimum similarity a user needs to be considered.

        Raises:
            RuntimeError: If `fit` has not been called yet.

        Returns:
            Series named 'Transaction frequency', indexed by item ID, holding
            the number of selected similar users that interacted with each
            item, sorted in descending order.
        """
        if not self.fitted:
            raise RuntimeError("Model not fitted. Call .fit() first.")

        self.k = k
        self.n = n

        # Work on a copy with the target user masked out *before* taking the
        # top k. Taking the top k+1 and removing the user afterwards returned
        # k+1 users whenever ties kept the user out of that selection.
        scores = np.array(self.user_similarity, dtype=float)
        scores[self.user_loc] = -np.inf

        # Never ask for more users than exist besides the target user;
        # argpartition raises when the requested position is out of bounds.
        top = max(0, min(k, scores.size - 1))
        if top:
            similar_user_locs = np.argpartition(scores, -top)[-top:]
        else:
            similar_user_locs = np.array([], dtype=int)
        similar_user_locs = similar_user_locs[scores[similar_user_locs] >= sim_threshold]

        similar_userids = self.pivot.index[similar_user_locs]

        if self.verb:
           print(f"K={self.k} most similar user (similarity threshold: {sim_threshold}) #:\n\n", similar_userids.to_list(), "\n\n")

        # How many of the selected users interacted with each item.
        frequency = (self.pivot.iloc[similar_user_locs]
                               .sum(axis=0)
                               .astype('int')
                               .rename('Transaction frequency'))

        # Keep items at least one similar user interacted with and the target
        # user has not; a boolean mask keeps the counts as integers throughout.
        is_candidate = (frequency > 0) & ~frequency.index.isin(self.interacted_itemids)
        items_recommended = frequency[is_candidate].sort_values(ascending=False).iloc[:self.n]

        self.similar_userids = similar_userids
        self.items_recommended = items_recommended
        self.recommended = True

        if self.verb:
            print("Recommended items:", "\n")
            display(pd.Series(items_recommended.index.tolist(), name="itemid").to_frame().T)

        return items_recommended

    def get_fit_memory(self):
        """Return the target user ID and the item IDs it already interacted with."""
        assert self.fitted, "Model not fitted. Call .fit() first."
        assert self.interacted_itemids is not None
        return {
            'userid': self.userid,
            'interacted_itemids': self.interacted_itemids.to_list()
        }

    def get_recommend_memory(self):
        """Return the similar users, recommended items, k and n of the last `recommend` call."""
        assert self.recommended, "Model has no recommendation memory. Call .recommend() first."
        assert self.similar_userids is not None
        assert self.items_recommended is not None
        return {
            'similar_userids': self.similar_userids.to_list(),
            'items_recommended': self.items_recommended.index.to_list(),
            'k': self.k,
            'n': self.n
        }

In [None]:
# Build the recommender for the designated visitor (`fit` returns the model itself) ...
model = UserBasedCollabFilterItemRecommender(
    visitor_item_pivot,
    visitor_item_similarity,
    verb=True,
).fit(designated_visitor_id)

# ... and recommend N items from its K most similar users.
recommendation = model.recommend(K, N, sim_threshold=0.1)

### Explanation

In [None]:
# designated user: (visitor ID, row position in the pivot)
display((designated_visitor_id, visitor_item_pivot.index.get_loc(designated_visitor_id)))

In [None]:
class UserBasedCFRVerifier():
    """Consistency checks for a recommender that has been fitted and has recommended.

    The checks compare what the model recorded against the notebook-level
    `transaction_events` table; any violation raises `AssertionError`.
    """

    def __init__(self, model):
        self.model = model

    def run(self, designated_visitor_id):
        """Assert that the model's recorded state is consistent for `designated_visitor_id`."""
        fit_info = self.model.get_fit_memory()
        rec_info = self.model.get_recommend_memory()

        interacted = fit_info['interacted_itemids']
        neighbours = rec_info['similar_userids']
        picks = rec_info['items_recommended']

        # designated user: the model was fitted for this visitor
        assert designated_visitor_id == fit_info['userid']

        # interacted items: as many as the visitor's distinct purchased items
        is_designated = transaction_events['visitorid'] == designated_visitor_id
        purchased_items = transaction_events.loc[is_designated, 'itemid']
        assert purchased_items.nunique() == len(interacted)

        # similar users: at most k of them, and never the visitor itself
        assert len(neighbours) <= rec_info['k']
        assert not np.isin(designated_visitor_id, neighbours)

        # recommended items: none already interacted with, and at most n of them
        assert not np.isin(interacted, picks).any()
        assert len(picks) <= rec_info['n']

In [None]:
# Run the consistency checks; a failed check raises AssertionError.
UserBasedCFRVerifier(model).run(designated_visitor_id)

In [None]:
# Snapshot what the model recorded during fit() and recommend(); used by the cells below.
fit_memory = model.get_fit_memory()
recommend_memory = model.get_recommend_memory()

In [None]:
# interacted items: compare the model's record with the visitor's raw transaction rows
print("Interacted itemids (acc. to model): ", fit_memory['interacted_itemids'])
print("Transaction records (history):")
display(transaction_events[condDesignatedVisitor])

In [None]:
# similar users
# The similarity threshold can leave fewer than k users, so report the actual
# number of IDs and show the requested k separately.
print(f"{len(recommend_memory['similar_userids'])} similar user ids (k={recommend_memory['k']}, acc. to model):\n", recommend_memory['similar_userids'])

# recommended items
# Likewise, fewer than n items may be available.
print(f"{len(recommend_memory['items_recommended'])} recommended item ids (n={recommend_memory['n']}, acc. to model):\n", recommend_memory['items_recommended'])

In [None]:
# similarity of selected users: the designated visitor's row of the similarity
# matrix, restricted to the columns of the similar users (IDs mapped to positions via get_idx)
visitor_item_similarity[
    get_idx([designated_visitor_id], visitor_item_pivot),
    get_idx(recommend_memory['similar_userids'], visitor_item_pivot)
]

In [None]:
# Transaction rows in which a similar user bought one of the recommended items.
condSimilarUsers = transaction_events['visitorid'].isin(recommend_memory['similar_userids'])
condRecommendedItems = transaction_events['itemid'].isin(recommend_memory['items_recommended'])

# Filter with a boolean mask: `where(...).dropna()` blanked the unwanted rows
# to NaN first, which upcast the integer ID columns to float.
# NOTE(review): this selects the same rows as long as transaction rows have no missing values — confirm.
df = (transaction_events[condSimilarUsers & condRecommendedItems]
    .sort_values(by=['itemid', 'visitorid']))

# Every recommended item must have been bought by at least one similar user.
assert df['itemid'].nunique() == len(recommend_memory['items_recommended'])

display(df)

In [None]:
similar_visitor_ids = recommend_memory['similar_userids']

# Every visitor in the pivot except the designated one. Filtering the index
# directly keeps the IDs as integers; the previous
# to_frame/where/dropna round-trip turned them into floats.
all_visitor_ids = visitor_item_pivot.index
other_visitor_ids = all_visitor_ids[all_visitor_ids != designated_visitor_id].tolist()

# Of those, the visitors that were not selected as similar users.
similar_visitor_id_set = set(similar_visitor_ids)
other_visitor_ids_not_in_similar_users = [
    visitor_id for visitor_id in other_visitor_ids
    if visitor_id not in similar_visitor_id_set
]

(len(visitor_item_pivot), len(similar_visitor_ids), len(other_visitor_ids), len(other_visitor_ids_not_in_similar_users))

In [None]:
# Similarity of the designated visitor to every other visitor, indexed by the other visitor's ID.
df = pd.DataFrame(visitor_item_similarity[
    get_idx([designated_visitor_id], visitor_item_pivot),
    get_idx(other_visitor_ids, visitor_item_pivot)
], columns=['similarity'], index=other_visitor_ids)

# Flag the visitors selected as similar users (this column is not used by the plot below).
df['in_similar_users'] = visitor_item_pivot.loc[other_visitor_ids].index.isin(similar_visitor_ids)

# Scatter of similarity against visitor ID: the selected similar users in green, everyone else in blue.
plt.figure(figsize=(6, 6))
plt.scatter(other_visitor_ids_not_in_similar_users, df.loc[other_visitor_ids_not_in_similar_users]['similarity'], alpha=0.75, color="none", edgecolor="cornflowerblue")
plt.scatter(visitor_item_pivot.loc[similar_visitor_ids].index, df.loc[similar_visitor_ids]['similarity'], alpha=0.75, color="none", edgecolor="seagreen", label="K most similar")
plt.xlabel("Other user indices")
plt.ylabel("Cosine similarity")
plt.title(f"Similarities of User#{designated_visitor_id} with others")
plt.legend(loc="lower right")
plt.margins(0.1, 0.2)
plt.show()

In [None]:
# Purchases of the recommended items by the similar users, one row per
# (visitor, item) pair.
# - The boolean mask keeps the ID columns as integers, so they match the
#   integer IDs used to build the full index below.
# - drop_duplicates is required because a visitor may have bought the same
#   item more than once; duplicate index labels make `reindex` raise.
condRecommendedItems = transaction_events['itemid'].isin(recommend_memory['items_recommended'])
df = (transaction_events.loc[condSimilarUsers & condRecommendedItems, ['visitorid', 'itemid']]
    .drop_duplicates()
    .sort_values(by=['itemid', 'visitorid'])
    .assign(interaction=1.0))

all_users = recommend_memory['similar_userids']
all_items = recommend_memory['items_recommended']

# Expand to every (similar user, recommended item) combination; pairs without a purchase get 0.
full_index = pd.MultiIndex.from_product([all_users, all_items], names=['visitorid', 'itemid'])
full_df = df.set_index(['visitorid', 'itemid']).reindex(full_index, fill_value=0).reset_index()
pivot = full_df.pivot(index='visitorid', columns='itemid', values='interaction')

plt.figure(figsize=(4, 5))
sns.heatmap(pivot, cmap='binary_r', cbar=False)
plt.title('Interaction between Similar users\nand Recommended items')
plt.xlabel('Recommended item ID')
plt.ylabel('Similar user ID')
plt.show()