In [2]:
import numpy as np
import pandas as pd

In [3]:
rating_df = pd.read_csv('data/events.csv')
rating_df.head()

Unnamed: 0,timestamp,visitorid,event,itemid,transactionid
0,1433221373311,781127,view,21989,
1,1433222147345,1076270,view,262799,
2,1433221380636,849453,view,123990,
3,1433223176926,629333,view,128394,
4,1433222897013,492414,view,279976,


In [3]:
# get the number of interactions for each user
rating_df['event_count'] = rating_df['visitorid'].map(rating_df.visitorid.value_counts())
rating_df.head()

Unnamed: 0,timestamp,visitorid,event,itemid,transactionid,event_count
0,1433221373311,781127,view,21989,,1
1,1433222147345,1076270,view,262799,,4
2,1433221380636,849453,view,123990,,1
3,1433223176926,629333,view,128394,,1
4,1433222897013,492414,view,279976,,9


In [4]:
# sort data frame by timestemp for trian/ val split
rating_df = rating_df.sort_values('timestamp')
rating_df.head(10)

Unnamed: 0,timestamp,visitorid,event,itemid,transactionid,event_count
34513,1430622118534,584571,view,436195,,2
34476,1430622162554,837890,view,2519,,1
34471,1430622330806,990356,view,369532,,2
34484,1430622469247,584571,view,436195,,2
34470,1430622609378,1002397,view,77392,,2
34514,1430622790487,1375898,view,64152,,1
34472,1430622933406,823085,view,131879,,3
34477,1430623098261,1061274,view,356129,,1
34485,1430623224021,823085,view,214519,,3
34473,1430623241016,122517,view,232129,,4


In [5]:
# map user id and movie id to integer starting from 0 to N (num of users) and M (num of items)
from sklearn.preprocessing import LabelEncoder
user_encoder = LabelEncoder()
item_encoder = LabelEncoder()

#re-assign the number of visitorid
rating_df['visitorid'] = user_encoder.fit_transform(rating_df.visitorid)
rating_df['itemid'] = item_encoder.fit_transform(rating_df.itemid)

In [6]:
#add 1 for starting from 0
num_users = rating_df.visitorid.max()+1
num_items = rating_df.itemid.max()+1
num_users, num_items

(27372, 9098)

In [7]:
# group by visitorid and call "rank" to know the number of click for that user. We set ascending=false so that the
# last click has appearance = 1

rating_df['appearance'] = rating_df.groupby('visitorid').timestamp.rank(ascending=False)
rating_df.head(15)

Unnamed: 0,timestamp,visitorid,event,itemid,transactionid,event_count,appearance
34513,1430622118534,11315,view,8460,,2,2.0
34476,1430622162554,16213,view,69,,1,1.0
34471,1430622330806,19226,view,7229,,2,2.0
34484,1430622469247,11315,view,8460,,2,1.0
34470,1430622609378,19479,view,1573,,2,2.0
34514,1430622790487,26737,view,1315,,1,1.0
34472,1430622933406,15931,view,2638,,3,3.0
34477,1430623098261,20675,view,6962,,1,1.0
34485,1430623224021,15931,view,4263,,3,2.0
34473,1430623241016,2310,view,4602,,4,4.0


In [8]:
# train / val split
train_df = rating_df.loc[rating_df.appearance>1]
val_df = rating_df.loc[rating_df.appearance==1]
train_df.shape, val_df.shape

((38766, 7), (27371, 7))

In [9]:
#keep the first row as default 
train_df.drop_duplicates(['visitorid','itemid'],inplace=True)
train_df

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


Unnamed: 0,timestamp,visitorid,event,itemid,transactionid,event_count,appearance
34513,1430622118534,11315,view,8460,,2,2.0
34471,1430622330806,19226,view,7229,,2,2.0
34470,1430622609378,19479,view,1573,,2,2.0
34472,1430622933406,15931,view,2638,,3,3.0
34485,1430623224021,15931,view,4263,,3,2.0
...,...,...,...,...,...,...,...
34426,1442534955668,6782,view,8455,,7,3.0
34427,1442536564165,188,view,2481,,3,3.0
34393,1442540963889,19315,view,1793,,109,3.0
34392,1442541893119,19315,view,6726,,109,2.0


In [10]:
val_df

Unnamed: 0,timestamp,visitorid,event,itemid,transactionid,event_count,appearance
34476,1430622162554,16213,view,69,,1,1.0
34484,1430622469247,11315,view,8460,,2,1.0
34514,1430622790487,26737,view,1315,,1,1.0
34477,1430623098261,20675,view,6962,,1,1.0
34480,1430623362825,9763,view,7806,,2,1.0
...,...,...,...,...,...,...,...
34439,1442543328174,9271,view,8101,,1,1.0
34398,1442543539422,21742,view,7075,,1,1.0
34388,1442543684747,23035,view,5152,,2,1.0
34390,1442544741219,19315,view,1793,,109,1.0


In [11]:
#check MRR
def mean_reciprocal_rank(rs):
    rs = (np.asarray(r).nonzero()[0] for r in rs)
    return np.mean([1. / (r[0] + 1) if r.size else 0. for r in rs])

In [12]:
counts = np.zeros(num_items)
item2counts = train_df.itemid.value_counts()
counts[item2counts.keys()] = item2counts.values
counts

array([4., 1., 1., ..., 1., 3., 0.])

In [13]:
# argsort: ascending value and show its index
# flip: reverse np.array
np.flip(np.argsort(counts))

array([4362, 6111, 5706, ..., 3615, 3611, 9097], dtype=int64)

In [14]:
hits = []
rankings = np.flip(np.argsort(counts))
for val_gt in val_df.itemid.values:
    hits.append(rankings == val_gt)
mean_reciprocal_rank(hits)    

0.02096631601316712