In [1]:
import numpy as np
import pandas as pd

In [2]:
# Load the raw event log from disk and preview its first rows.
rating_df = pd.read_csv(filepath_or_buffer='data/events_small.csv')
rating_df.head(n=5)

Unnamed: 0,timestamp,visitorid,event,itemid,transactionid
0,1433221373311,781127,view,21989,
1,1433222147345,781127,view,262799,
2,1433221380636,849453,view,123990,
3,1433223176926,629333,view,128394,
4,1433222897013,629333,view,279976,


In [3]:
# Order the events chronologically (by timestamp) ahead of the train/validation split.
rating_df = rating_df.sort_values(by='timestamp', ascending=True)
rating_df.head(n=5)

Unnamed: 0,timestamp,visitorid,event,itemid,transactionid
34513,1430622118534,584571,view,436195,
34476,1430622162554,837890,view,2519,
34471,1430622330806,990356,view,369532,
34484,1430622469247,584571,view,436195,
34470,1430622609378,1002397,view,77392,


In [4]:
# Re-index the raw visitor ids and item ids as integer codes starting at 0
# (N distinct users, M distinct items). The fitted encoders are kept so the
# codes can be mapped back to the original ids.
from sklearn.preprocessing import LabelEncoder

user_encoder, item_encoder = LabelEncoder(), LabelEncoder()

rating_df['visitorid'] = user_encoder.fit_transform(rating_df['visitorid'])
rating_df['itemid'] = item_encoder.fit_transform(rating_df['itemid'])

In [5]:
# Vocabulary sizes for the embedding tables: largest encoded id plus one.
num_users = rating_df['visitorid'].max() + 1
num_items = rating_df['itemid'].max() + 1

In [6]:
# Rank each visitor's events from newest to oldest, so that the visitor's
# last click gets appearance == 1, the one before it 2, and so on.
#
# method='first' gives every event a distinct integer rank. With the default
# method='average', two events sharing a visitor's latest timestamp would both
# be ranked 1.5, leaving that visitor with no row where appearance == 1.
rating_df['appearance'] = (
    rating_df.groupby('visitorid')['timestamp']
    .rank(method='first', ascending=False)
)
rating_df.head(15)

Unnamed: 0,timestamp,visitorid,event,itemid,transactionid,appearance
34513,1430622118534,11315,view,8460,,2.0
34476,1430622162554,16213,view,69,,1.0
34471,1430622330806,19226,view,7229,,2.0
34484,1430622469247,11315,view,8460,,1.0
34470,1430622609378,19479,view,1573,,2.0
34514,1430622790487,26737,view,1315,,1.0
34472,1430622933406,15931,view,2638,,3.0
34477,1430623098261,20675,view,6962,,1.0
34485,1430623224021,15931,view,4263,,2.0
34473,1430623241016,2310,view,4602,,4.0


In [7]:
# Train/validation split: rows with appearance == 1 (each visitor's last click)
# are held out for validation; rows with appearance > 1 are used for training.
train_df = rating_df[rating_df['appearance'].gt(1)]
val_df = rating_df[rating_df['appearance'].eq(1)]
train_df.shape, val_df.shape

((38766, 6), (27371, 6))

In [8]:
# Collapse the training events to one row per visitor: the visitor id plus the
# set of distinct items that visitor viewed (column 'viewed_items').
train_df = (
    train_df.groupby('visitorid')['itemid']
    .apply(set)
    .rename('viewed_items')
    .reset_index()
)
train_df.head()

Unnamed: 0,visitorid,viewed_items
0,2,{8791}
1,6,{7523}
2,8,{3361}
3,14,{1890}
4,17,{3784}


In [9]:
# Positive training samples: one (visitor, item) pair per distinct viewed item,
# each labelled 1.
# np.repeat expands every visitor id by the size of that visitor's item set in a
# single vectorised call (no row-wise Python lambda); the ids stay aligned with
# positive_item_ids because both follow the row order of train_df.
positive_user_ids = np.repeat(
    train_df['visitorid'].values,
    train_df['viewed_items'].apply(len).values,
)
positive_item_ids = np.hstack(train_df['viewed_items'].apply(list))
positive_ratings = np.ones(len(positive_item_ids))

In [10]:
# Validation pairs as NumPy arrays: the visitor ids and, aligned with them,
# the ground-truth item ids taken from the validation rows.
val_user_ids = val_df['visitorid'].values
val_gts = val_df['itemid'].values

In [11]:
def mean_reciprocal_rank(rs):
    """Mean reciprocal rank over a collection of relevance vectors.

    Args:
        rs: iterable of sequences, one per query, ordered by rank; a non-zero
            entry marks a relevant result at that position.

    Returns:
        The average over all queries of 1 / (1-based position of the first
        non-zero entry); a query with no non-zero entry contributes 0.
    """
    reciprocal_ranks = []
    for relevance in rs:
        hit_positions = np.asarray(relevance).nonzero()[0]
        if hit_positions.size:
            # Positions are 0-based, ranks are 1-based.
            reciprocal_ranks.append(1. / (hit_positions[0] + 1))
        else:
            reciprocal_ranks.append(0.)
    return np.mean(reciprocal_ranks)


In [12]:
from keras.models import Model
from keras.layers import Input, Dense, Embedding, concatenate, Flatten, Activation, Add, Dropout, Multiply
def get_mlp_model():
    """Build and compile the user/item MLP scoring model.

    Each id is looked up in its own 64-dimensional embedding table (sized by
    the module-level ``num_users`` / ``num_items``). The two flattened vectors
    are concatenated and passed through Dense(128, relu) -> Dropout(0.2) ->
    Dense(64, relu) -> Dropout(0.2) -> Dense(1, sigmoid).

    Returns:
        A Keras ``Model`` with inputs ``[user_input, item_input]``, compiled
        with binary cross-entropy loss and the Adam optimizer.
    """
    def embedding_branch(input_name, embedding_name, vocab_size):
        # One id in, one flattened 64-d embedding vector out.
        ids = Input((1,), name=input_name)
        vectors = Embedding(input_dim=vocab_size, output_dim=64, name=embedding_name)(ids)
        return ids, Flatten()(vectors)

    # The user branch is created before the item branch, as in the layer summary.
    user_inp, user_vec = embedding_branch('user_input', 'user_embedding', num_users)
    item_inp, item_vec = embedding_branch('item_input', 'item_embedding', num_items)

    features = concatenate([user_vec, item_vec])
    features = Dense(128, activation='relu', name='latent1')(features)
    features = Dropout(0.2)(features)
    features = Dense(64, activation='relu', name='latent2')(features)
    features = Dropout(0.2)(features)
    score = Dense(1, activation='sigmoid', name='latent3')(features)

    mlp = Model(inputs=[user_inp, item_inp], outputs=score)
    mlp.compile(loss='binary_crossentropy', optimizer='adam')
    return mlp


model = get_mlp_model()
model.summary()

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
user_input (InputLayer)         (None, 1)            0                                            
__________________________________________________________________________________________________
item_input (InputLayer)         (None, 1)            0                                            
__________________________________________________________________________________________________
user_embedding (Embedding)      (None, 1, 64)        1751808     user_input[0][0]                 
__________________________________________________________________________________________________
item_embedding (Embedding)      (None, 1, 64)        582272      item_input[0][0]                 
__________________________________________________________________________________________________
flatten_1 

In [13]:
# Number of negative items drawn for every positive (viewed) item.
negative_sample_size = 4
# Pool of every encoded item id, from which negatives are sampled.
all_item_ids = set(np.arange(num_items))
def sample_negative(row, candidate_items=None, negatives_per_positive=None):
    '''
    Randomly sample items that the user has never viewed.

    For each viewed item (positive sample), "negatives_per_positive" negative
    samples are drawn, without replacement, from the items the user has not viewed.

    args:
        row: object with a ``viewed_items`` attribute holding the set of item ids
            the user has viewed (e.g. one row of ``train_df``)
        candidate_items: set of item ids to sample from; defaults to the
            module-level ``all_item_ids``
        negatives_per_positive: int, negatives drawn per viewed item; defaults to
            the module-level ``negative_sample_size``
    returns:
        np.array of distinct item ids, none of them in ``row.viewed_items``. Its
        length is ``negatives_per_positive * len(row.viewed_items)``, capped at the
        number of unviewed items.
    '''
    if candidate_items is None:
        candidate_items = all_item_ids
    if negatives_per_positive is None:
        negatives_per_positive = negative_sample_size

    unviewed = list(candidate_items - row.viewed_items)
    if not unviewed:
        # Nothing left to sample from: return an empty id array instead of
        # handing an empty pool to np.random.choice.
        return np.array([], dtype=int)

    # Sampling without replacement cannot return more ids than the pool holds,
    # so cap the request instead of letting np.random.choice raise ValueError.
    n_wanted = negatives_per_positive * len(row.viewed_items)
    return np.random.choice(unviewed, size=min(n_wanted, len(unviewed)), replace=False)

In [14]:
epochs = 20

for _ in range(epochs):

    # Sample a fresh set of negative items for every user at the start of each epoch.
    train_df['negative_samples'] = train_df.apply(sample_negative, axis=1)

    # Flatten the per-user negatives into aligned (user, item, label=0) arrays.
    # np.repeat expands each visitor id once per sampled negative in a single
    # vectorised call; the ids stay aligned with negative_item_ids because both
    # follow the row order of train_df.
    negative_user_ids = np.repeat(
        train_df['visitorid'].values,
        train_df['negative_samples'].apply(len).values,
    )
    negative_item_ids = np.hstack(train_df['negative_samples'])
    negative_ratings = np.zeros(len(negative_item_ids))

    # Concatenate the positive and negative training samples.
    train_user_ids = np.hstack([positive_user_ids, negative_user_ids])
    train_item_ids = np.hstack([positive_item_ids, negative_item_ids])
    train_ratings = np.hstack([positive_ratings, negative_ratings])

    # Train for a single pass; the outer loop provides the epochs so that the
    # negatives can be re-sampled between passes.
    model.fit([train_user_ids, train_item_ids], train_ratings,
              epochs=1, batch_size=512)



Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1


In [15]:
from tqdm import tqdm
def evaluate_prediction(model, val_user_ids, val_gts):
    '''
    Return the mean reciprocal rank (MRR) of the ground-truth items.

    For every (user, ground-truth item) pair in the validation set, all
    ``num_items`` items are scored for that user and ranked by descending score;
    the reciprocal rank of the ground-truth item is averaged over all pairs.

    args:
        model: model whose ``predict`` takes ``[user_ids, item_ids]`` and returns
            one score per (user, item) pair
        val_user_ids: np.array of user ids, one per validation pair
        val_gts: np.array of ground-truth item ids, aligned with val_user_ids
    returns:
        mrr: float, computed MRR
    '''
    # The candidate item list is identical for every user, so build it once.
    candidate_item_ids = np.arange(num_items)
    # Give tqdm a total when the input has a length so it can show real progress.
    total = len(val_user_ids) if hasattr(val_user_ids, '__len__') else None

    def hit_vectors():
        # Iterate over each user/item pair in the validation set and yield its
        # boolean hit vector lazily, so only one vector of length num_items is
        # alive at a time instead of one per validation pair.
        for target_user, val_gt in tqdm(zip(val_user_ids, val_gts), total=total):
            user_column = np.full(num_items, target_user)
            predictions = model.predict([user_column, candidate_item_ids], batch_size=1024).reshape(-1)
            # Item ids ordered from highest to lowest predicted score.
            rankings = np.flip(np.argsort(predictions))
            yield rankings == val_gt

    mrr = mean_reciprocal_rank(hit_vectors())

    return mrr
mrr = evaluate_prediction(model, val_user_ids, val_gts)
mrr

27371it [08:11, 55.67it/s]


0.24179100068337941