In [1]:
import numpy as np
import pandas as pd

In [2]:
rating_df = pd.read_csv('data/events.csv')
rating_df.head()

Unnamed: 0,timestamp,visitorid,event,itemid,transactionid
0,1433221373311,781127,view,21989,
1,1433222147345,1076270,view,262799,
2,1433221380636,849453,view,123990,
3,1433223176926,629333,view,128394,
4,1433222897013,492414,view,279976,


In [21]:
# sort data frame by timestemp for trian/ val split
rating_df = rating_df.sort_values('timestamp')
rating_df.head(10)

Unnamed: 0,timestamp,visitorid,event,itemid,transactionid
34513,1430622118534,584571,view,436195,
34476,1430622162554,837890,view,2519,
34471,1430622330806,990356,view,369532,
34484,1430622469247,584571,view,436195,
34470,1430622609378,1002397,view,77392,
34514,1430622790487,1375898,view,64152,
34472,1430622933406,823085,view,131879,
34477,1430623098261,1061274,view,356129,
34485,1430623224021,823085,view,214519,
34473,1430623241016,122517,view,232129,


In [22]:
# map user id and item id to integer starting from 0 to N (num of users) and M (num of items)
from sklearn.preprocessing import LabelEncoder
user_encoder = LabelEncoder()
item_encoder = LabelEncoder()

rating_df['visitorid'] = user_encoder.fit_transform(rating_df.visitorid)
rating_df['itemid'] = item_encoder.fit_transform(rating_df.itemid)

In [23]:
num_users = rating_df.visitorid.max()+1
num_items = rating_df.itemid.max()+1
num_users, num_items

(27372, 9098)

In [8]:
# group by visitorid and call "rank" to know the number of click for that user. We set ascending=false so that the
# last click has appearance = 1

rating_df['appearance'] = rating_df.groupby('visitorid').timestamp.rank(ascending=False)
rating_df.head(15)

Unnamed: 0,timestamp,visitorid,event,itemid,transactionid,appearance
8,1433191760912,3,view,4,,2.0
7,1433193703544,3,view,5,,1.0
6,1433193877348,0,view,1,,1.0
5,1433194739377,1,view,8,,3.0
0,1433221373311,2,view,0,,2.0
2,1433221380636,4,view,2,,1.0
1,1433222147345,2,view,6,,1.0
4,1433222897013,1,view,7,,2.0
3,1433223176926,1,view,3,,1.0


In [9]:
# train / val split
train_df = rating_df.loc[rating_df.appearance>1]
val_df = rating_df.loc[rating_df.appearance==1]
train_df.shape, val_df.shape

((4, 6), (5, 6))

In [10]:
# remove duplicate (visitorid,itemid) pair

train_df.drop_duplicates(['visitorid','itemid'],inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.


In [13]:
from scipy.sparse import csr_matrix
# construct user-item rating matrix
user2item = csr_matrix( (np.ones(len(train_df)), (train_df.visitorid.values ,train_df.itemid.values)), shape=(num_users, num_items), dtype=np.float32 )
user2item.sum()

4.0

In [15]:
!pip install lightfm
from lightfm import LightFM
# training

model = LightFM(no_components=32, loss='warp')
model.fit(interactions=user2item,epochs=150)

Collecting lightfm
  Downloading lightfm-1.16.tar.gz (310 kB)
Collecting requests
  Downloading requests-2.25.0-py2.py3-none-any.whl (61 kB)
Collecting urllib3<1.27,>=1.21.1
  Downloading urllib3-1.26.2-py2.py3-none-any.whl (136 kB)
Collecting idna<3,>=2.5
  Downloading idna-2.10-py2.py3-none-any.whl (58 kB)
Collecting chardet<4,>=3.0.2
  Using cached chardet-3.0.4-py2.py3-none-any.whl (133 kB)
Building wheels for collected packages: lightfm
  Building wheel for lightfm (setup.py): started
  Building wheel for lightfm (setup.py): finished with status 'done'
  Created wheel for lightfm: filename=lightfm-1.16-cp37-cp37m-win_amd64.whl size=426513 sha256=685c60b26fc556ac456885cfdc989201975a734a85bcdd8f3078469538acd98c
  Stored in directory: c:\users\gpu\appdata\local\pip\cache\wheels\f8\56\28\5772a3bd3413d65f03aa452190b00898b680b10028a1021914
Successfully built lightfm
Installing collected packages: urllib3, idna, chardet, requests, lightfm
Successfully installed chardet-3.0.4 idna-2.10 li

  "LightFM was compiled without OpenMP support. "


<lightfm.lightfm.LightFM at 0x9923b88>

In [16]:
def mean_reciprocal_rank(rs):
    rs = (np.asarray(r).nonzero()[0] for r in rs)
    return np.mean([1. / (r[0] + 1) if r.size else 0. for r in rs])


In [17]:
# obtain the user and item ids in the validation set
val_user_ids = val_df.visitorid.values
val_gts = val_df.itemid.values

In [19]:
!pip install tqdm
from tqdm import tqdm
def evaluate_prediction(model, val_user_ids, val_gts):
    '''
    Return the average mrr for each users
    args:
        predictions: np.array user-item predictions
    returns:
        ndcg: float, computed MRR
    '''

    hits = []
    
    # iterate over user and item pair in the validation set
    for target_user, val_gt in tqdm(zip(val_user_ids, val_gts)):

        
        predictions = model.predict(user_ids= np.array([target_user] * num_items), item_ids=np.arange(num_items) )
        rankings = np.flip(np.argsort(predictions))
        hits.append(rankings == val_gt)

    mrr = mean_reciprocal_rank(hits)

    return mrr
mrr = evaluate_prediction(model, val_user_ids[:100],val_gts[:100])
mrr

5it [00:00, ?it/s]

Collecting tqdm
  Downloading tqdm-4.54.1-py2.py3-none-any.whl (69 kB)
Installing collected packages: tqdm
Successfully installed tqdm-4.54.1





0.14912698412698414