<a href="https://colab.research.google.com/github/manohar029/Ecommerce-Implicit-Data-Recommender-System/blob/master/Ecomm_RecSys.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
# install the necessary packages in your colab session
!pip install implicit
!pip install ml_metrics

In [0]:
import pandas as pd
import numpy as np
from numpy.random import randint
import os
import implicit
import scipy.sparse as sparse
import ml_metrics as metrics
from google.colab import files
from google.colab import drive

In [0]:
# Root folder of the mounted Google Drive, followed by the mount itself
# (force_remount=True re-mounts even if the drive is already attached).
root_dir = "/content/gdrive/My Drive/"
drive.mount('/content/gdrive', force_remount=True)

### Load and Pre-process the data

In [0]:
# Click events. The path is built from root_dir (defined in the mount cell)
# instead of repeating the Drive root as a second hard-coded literal.
# NOTE(review): skiprows=1 discards the first line of the file - confirm that
# line really is a header row and not a data row.
clicks_df = pd.read_csv(
    os.path.join(root_dir, 'Ecomm_RecSys', 'yoochoose-clicks.dat'),
    sep=",",
    skiprows=1,
    names=['sessionID', 'timestamp', 'itemID', 'category'],
)

In [0]:
# Purchase events. The path is built from root_dir (defined in the mount cell)
# instead of repeating the Drive root as a second hard-coded literal.
# NOTE(review): skiprows=1 discards the first line of the file - confirm that
# line really is a header row and not a data row.
buys_df = pd.read_csv(
    os.path.join(root_dir, 'Ecomm_RecSys', 'yoochoose-buys.dat'),
    sep=",",
    skiprows=1,
    names=['sessionID', 'timestamp', 'itemID', 'price', 'quantity'],
)

In [0]:
# Sanity check: (rows, columns) of the click log, then its first five rows.
print((len(clicks_df), len(clicks_df.columns)))
clicks_df.head(5)

In [0]:
# Sanity check: (rows, columns) of the purchase log, then its first five rows.
print((len(buys_df), len(buys_df.columns)))
buys_df.head(5)

In [0]:
# Number of click rows per category value, most frequent first.
clicks_df.loc[:, 'category'].value_counts()

In [0]:
## The recommender is built on a single category: the notebook's intent is to use
## the category with the most clicks, and the filter below keeps category 0.
## Intuitively, it does not make sense to recommend items from one category while
## the user is browsing another, e.g. you should not be shown t-shirts while
## searching for wallets in the 'accessories' category.
## NOTE(review): if read_csv inferred mixed types for this column, string '0'
## values would not match the integer 0 - confirm the column dtype.

clicks_df = clicks_df.loc[clicks_df['category'].eq(0)]
clicks_df.shape

In [0]:
## Only one category is kept, so only purchases made in sessions that appear in
## the filtered click log are relevant.

buys_df = buys_df[buys_df['sessionID'].isin(clicks_df['sessionID'].unique())]

In [0]:
# Both frames are boolean-filtered slices of the frames that were read from disk,
# so they are rebuilt with chained, non-inplace operations rather than mutated in
# place (in-place edits on such slices are what raise SettingWithCopyWarning).
# Purchases: drop price/quantity, parse the timestamp, flag every row with buys=1.
buys_df = (
    buys_df
    .drop(columns=['price', 'quantity'])
    .assign(timestamp=lambda frame: pd.to_datetime(frame['timestamp']), buys=1)
)
# Clicks: drop the (now constant) category column and parse the timestamp.
clicks_df = (
    clicks_df
    .drop(columns=['category'])
    .assign(timestamp=lambda frame: pd.to_datetime(frame['timestamp']))
)

In [0]:
# Stack clicks on top of buys with a fresh 0..n-1 index; click rows have no
# 'buys' value, so that column is NaN for them. Then count the distinct items.
all_df = pd.concat([clicks_df, buys_df], ignore_index=True)
all_df.itemID.nunique()

In [0]:
# The separate click / buy frames are no longer needed once all_df exists.
del clicks_df, buys_df

In [0]:
## NOTE: This is for implementation purposes only.

## The data carries no user information, so synthetic user IDs drawn from
## [1, 500000) are assigned at random to the rows of all_df.
## This spreads clicks and buys uniformly across users, which will not match what
## is seen in practice, and it can produce users who buy an item without ever
## clicking on it. Neither happens in a real system, where the userID is tracked
## and stored together with each click.

## The draw uses a seeded local generator so that the time split, the trained model
## and the metrics are the same on every Restart & Run All.
SEED = 42
userIDs = np.random.RandomState(SEED).randint(1, 500000, all_df.shape[0])

all_df['userID'] = userIDs
# Chronological order is required by the time-based train/test split further down.
all_df = all_df.sort_values(by='timestamp')
print(all_df.shape)
all_df.head()

In [0]:
## The test set is formed by a time-based split, so look at how the rows are
## distributed across calendar months first.

all_df['month'] = all_df.timestamp.dt.month
print(all_df.shape, all_df['month'].value_counts(), sep='\n')

In [0]:
## Time-based split: all_df is sorted by timestamp, so the last 2.5M rows become
## the test set and everything before them the training set. test_df is processed
## further later on to fit the evaluation approach.
n_test_rows = 2500000
train_df = all_df.iloc[:-n_test_rows]
test_df = all_df.iloc[-n_test_rows:]

In [0]:
# Drop the combined frame's name; train_df / test_df are used from here on.
del all_df

In [0]:
# Sanity check: (rows, columns) of the training split, then its first five rows.
print((len(train_df), len(train_df.columns)))
train_df.head(5)

In [0]:
# Click rows have no 'buys' value (NaN after the concat); mark them as 0.
# The column is replaced through assignment rather than
# `train_df['buys'].fillna(0, inplace=True)`: that form mutates a temporary
# selected from a slice, is not guaranteed to write back to train_df, and does
# nothing under pandas copy-on-write, which would leave the click filter below
# (buys == 0) matching no rows.
train_df = train_df.assign(buys=train_df['buys'].fillna(0))

In [0]:
# Separate the training rows into click events (buys == 0) and purchases (buys == 1).
train_clicks = train_df[train_df['buys'].eq(0)]
train_buys = train_df[train_df['buys'].eq(1)]

In [0]:
# train_df has been split into train_clicks / train_buys and is not needed anymore.
del train_df

In [0]:
# Collapse the event rows to one row per (user, item) pair holding the number of
# events: the count is called 'clicks' for click rows and stays 'buys' for purchases.
pair_keys = ['userID', 'itemID']
train_clicks = train_clicks.groupby(pair_keys)['buys'].count().reset_index(name='clicks')
train_buys = train_buys.groupby(pair_keys)['buys'].count().reset_index()

In [0]:
# Number of distinct (user, item) pairs with clicks and with purchases.
print(train_clicks.shape, train_buys.shape, sep='\n')

In [0]:
# Outer join keeps every (user, item) pair seen in either table; a pair missing
# from one side gets NaN in that side's count column.
train_final = train_clicks.merge(train_buys, on=['userID', 'itemID'], how='outer')
print((len(train_final), len(train_final.columns)))
train_final.head(5)

In [0]:
# The per-event-type aggregates are merged into train_final; release them.
del train_clicks, train_buys

In [0]:
# A NaN count left by the outer join means "no such events": store it as 0.
train_final = train_final.fillna(0)

In [0]:
## The implicit signals are combined into a single rating by a weighted sum.
## With more implicit variables (page views, clicks, add-to-carts, transactions,
## ...) the same scheme applies, and the weights can be chosen by cross-validation.

CLICK_WEIGHT = 0.25
BUY_WEIGHT = 1
train_final = (
    train_final
    .assign(rating=CLICK_WEIGHT * train_final['clicks'] + BUY_WEIGHT * train_final['buys'])
    .drop(columns=['clicks', 'buys'])
)

In [0]:
# Turn each ID column into a categorical and store its integer category code in a
# companion '<column>_enc' column (userID_enc first, then itemID_enc).
for id_col in ('userID', 'itemID'):
  train_final[id_col] = train_final[id_col].astype("category")
for id_col in ('userID', 'itemID'):
  train_final[id_col + '_enc'] = train_final[id_col].cat.codes

In [0]:
## Lookup tables between the raw IDs and their integer encodings, in both
## directions (cat_* : code -> raw ID, *_cat : raw ID -> code). They are needed
## at test time.

cat_item_map = {code: item for code, item in enumerate(train_final['itemID'].cat.categories)}
item_cat_map = {item: code for code, item in cat_item_map.items()}
cat_user_map = {code: user for code, user in enumerate(train_final['userID'].cat.categories)}
user_cat_map = {user: code for code, user in cat_user_map.items()}

### Training the Implicit ALS model.

In [0]:
## The ratings are laid out as two sparse CSR matrices: item x user (passed to the
## ALS fit below) and user x item (passed to the recommendation calls later).
sparse_item_user = sparse.csr_matrix(
    (train_final['rating'].astype(float),
     (train_final['itemID_enc'], train_final['userID_enc']))
)
sparse_user_item = sparse.csr_matrix(
    (train_final['rating'].astype(float),
     (train_final['userID_enc'], train_final['itemID_enc']))
)

In [0]:
# Release the rating / encoding columns but keep the raw ID columns under the same
# name: deleting the whole frame here makes any later reference to train_final
# fail with NameError on a fresh Restart & Run All.
train_final = train_final[['userID', 'itemID']]

In [0]:
# ALS hyper-parameters: 20 latent factors, L2 regularization 0.1, 20 iterations.
als_params = {'factors': 20, 'regularization': 0.1, 'iterations': 20}
model = implicit.als.AlternatingLeastSquares(**als_params)

In [0]:
# This cell is only a string literal: running it sets no environment variables.
# The two os.environ lines inside take effect only if they are copied out of the
# string and executed as code.
'''
# If you get any warning regarding 'OPENBLAS_NUM_THREADS' or 'MKL_NUM_THREADS' run the below commands and re-run the above cell.
os.environ['OPENBLAS_NUM_THREADS'] = "1"
os.environ['MKL_NUM_THREADS'] = "1"
'''

In [0]:
# Scale every stored rating by alpha before fitting.
# Caution: the result overwrites sparse_item_user, so running this cell a second
# time multiplies by alpha again.
alpha_val = 15
sparse_item_user = sparse_item_user.multiply(alpha_val).astype('double')

In [0]:
# Train the ALS model on the alpha-scaled item x user matrix, with a progress bar.
model.fit(
    sparse_item_user,
    show_progress=True,
)

### Pre-processing the test_df

In [0]:
## Only users and items that already appear in the training data can be scored
## at test time.
## NOTE that this method cannot handle the cold-start problem.

## The training item IDs are taken from item_cat_map (one key per training item)
## rather than from train_final, which an earlier cell deletes.
known_items = set(item_cat_map)
print(len(known_items))                                   # distinct items in training
print(test_df['itemID'].nunique())                        # distinct items in test
print(len(known_items.intersection(test_df['itemID'])))   # items present in both

In [0]:
## Recommendations are generated with respect to a user's query, so the test set is built as follows: the first occurrence
## of a user in the test data, together with the item of that first occurrence, is taken as that user's query. The items of all the user's
## remaining occurrences are the ground-truth items that the recommender system should try to recommend.

In [0]:
# Keep only test rows whose item AND user were both seen in training: the lookups
# item_cat_map[...] / user_cat_map[...] used during evaluation raise KeyError for
# anything else. The known IDs come from the mapping dicts rather than from
# train_final, which an earlier cell deletes.
known_items = set(item_cat_map)
known_users = set(user_cat_map)
test_df = test_df.loc[
    test_df['itemID'].isin(known_items) & test_df['userID'].isin(known_users)
]
# test_df is in chronological order, so the first row per user is that user's query.
test_user_item_df = test_df.drop_duplicates('userID', keep='first').drop(columns=['buys','sessionID'])

In [0]:
# Ground truth per user = every test row of that user except the query row.
# The query rows are excluded by index label. The previous concat +
# drop_duplicates(keep=False) compared all columns, and test_user_item_df carries a
# 'month' column that the selected test rows lack, so the two copies of a query row
# never matched and the query item remained in the ground truth (twice).
query_rows = test_df.index.isin(test_user_item_df.index)
temp_df = test_df.loc[~query_rows, ['userID', 'itemID']]
temp_df = temp_df.groupby('userID')['itemID'].apply(list).reset_index().rename(columns={'itemID':'itemID_list'})

In [0]:
# One row per user: the query row joined with that user's ground-truth item list.
# The inner join drops users that have no ground-truth list; the timestamp is not
# needed afterwards.
valid_df = pd.merge(test_user_item_df, temp_df, on='userID', how='inner')
valid_df = valid_df.drop(columns=['timestamp'])
print((len(valid_df), len(valid_df.columns)))
valid_df.head(5)

In [0]:
## Baseline: recommend the items most similar to the query item.
## This is pure item-to-item similarity, so it cannot capture the user's behaviour
## or preferences, i.e. the recommendation is not personalized.

n_similar = 6
item_recs = []

for query_item in valid_df['itemID'].values:
  # The first result is skipped, leaving n_similar - 1 neighbours per query.
  neighbours = model.similar_items(item_cat_map[query_item], n_similar)[1:]
  item_recs.append([cat_item_map[pair[0]] for pair in neighbours])

In [0]:
## These recommendations come from the user's latent vector scored against the
## item latent vectors, which makes them personalized to the user.

n_rec = 5
user_recs = []

for uid in valid_df['userID'].values:
  top_n = model.recommend(user_cat_map[uid], sparse_user_item, n_rec)
  user_recs.append([cat_item_map[pair[0]] for pair in top_n])

In [0]:
## The user-based method is personalized, but it ignores the context of the query:
## it implicitly treats every product as equally similar to the query product,
## which is seldom true.
## The ensemble therefore averages, with equal weights, the item-to-query
## similarity scores and the user-item scores.

item_vecs = model.item_factors
user_vecs = model.user_factors
item_norms = np.sqrt((item_vecs * item_vecs).sum(axis=1))
ensemble_recs = []
n_rec = 3

for item_id, user_id in zip(valid_df['itemID'], valid_df['userID']):
  item_code = item_cat_map[item_id]
  user_code = user_cat_map[user_id]

  # Dot product of the query item's factors with every item's factors, divided by
  # each item's own norm.
  item_scores = item_vecs.dot(item_vecs[item_code]) / item_norms

  # Mask is 0.0 for items the user interacted with in training (their rating is
  # > 0, since ratings are weighted event counts) and 1.0 for all other items.
  user_interactions = sparse_user_item[user_code, :].toarray().reshape(-1)
  unseen_mask = np.where(user_interactions > 0, 0.0, 1.0)
  user_scores = unseen_mask * user_vecs[user_code, :].dot(item_vecs.T)

  ensemble_scores = 0.5 * item_scores + 0.5 * user_scores
  # Never recommend the query item itself; the item-similarity baseline drops it
  # too, so the methods are compared on the same footing.
  ensemble_scores[item_code] = -np.inf

  # Top n_rec codes (previously a hard-coded 3 that ignored n_rec), best first.
  best = np.argpartition(ensemble_scores, -n_rec)[-n_rec:]
  ranked = best[np.argsort(-ensemble_scores[best], kind='stable')]
  ensemble_recs.append([cat_item_map[code] for code in ranked])

In [0]:
# Ground-truth item lists, one per row of valid_df and in the same order as the
# recommendation lists built above.
actuals = valid_df['itemID_list'].tolist()

In [0]:
# MAP@3 of the item-similarity baseline.
metrics.mapk(actuals, item_recs, k=3)

In [0]:
# MAP@3 of the personalized (user-factor) recommendations.
metrics.mapk(actuals, user_recs, k=3)

In [0]:
# MAP@3 of the item/user ensemble.
metrics.mapk(actuals, ensemble_recs, k=3)