In [3]:
import pandas as pd
import numpy as np
import scipy.sparse as sp

### Load the MovieLens dataset

In [4]:
dataset = pd.read_csv("/Users/clementbosc/Downloads/ml-100k/u.data", sep='\t', 
                      names="user_id,item_id,rating,timestamp".split(","))
dataset.head()

Unnamed: 0,user_id,item_id,rating,timestamp
0,196,242,3,881250949
1,186,302,3,891717742
2,22,377,1,878887116
3,244,51,2,880606923
4,166,346,1,886397596


In [5]:
dataset.user_id = dataset.user_id.astype('category').cat.codes.values
dataset.item_id = dataset.item_id.astype('category').cat.codes.values

dataset.head()

Unnamed: 0,user_id,item_id,rating,timestamp
0,195,241,3,881250949
1,185,301,3,891717742
2,21,376,1,878887116
3,243,50,2,880606923
4,165,345,1,886397596


In [6]:
def load_rating_file_as_matrix(filename):
    '''
    Read .rating file and Return dok matrix.
    The first line of .rating file is: num_users\t num_items
    '''
    # Get number of users and items
    num_users, num_items = 0, 0
    with open(filename, "r") as f:
        line = f.readline()
        while line != None and line != "":
            arr = line.split("\t")
            u, i = int(arr[0]), int(arr[1])
            num_users = max(num_users, u)
            num_items = max(num_items, i)
            line = f.readline()
    # Construct matrix
    mat = sp.dok_matrix((num_users+1, num_items+1), dtype=np.float32)
    with open(filename, "r") as f:
        line = f.readline()
        while line != None and line != "":
            arr = line.split("\t")
            user, item, rating = int(arr[0]), int(arr[1]), float(arr[2])
            if (rating > 0):
                mat[user, item] = 1.0
            line = f.readline()    
    return mat

In [7]:
def load_rating_file_as_list(filename):
    ratingList = []
    with open(filename, "r") as f:
        line = f.readline()
        while line != None and line != "":
            arr = line.split("\t")
            user, item = int(arr[0]), int(arr[1])
            ratingList.append([user, item])
            line = f.readline()
    return ratingList

In [8]:
def load_negative_file(filename):
    negativeList = []
    with open(filename, "r") as f:
        line = f.readline()
        while line != None and line != "":
            arr = line.split("\t")
            negatives = []
            for x in arr[1: ]:
                negatives.append(int(x))
            negativeList.append(negatives)
            line = f.readline()
    return negativeList

In [9]:
def get_train_instances(train, num_negatives):
    user_input, item_input, labels = [],[],[]
    num_users = train.shape[0]
    for (u, i) in train.keys():
        # positive instance
        user_input.append(u)
        item_input.append(i)
        labels.append(1)
        # negative instances
        for t in range(num_negatives):
            j = np.random.randint(num_items)
            while (u, j) in train:
                j = np.random.randint(num_items)
            user_input.append(u)
            item_input.append(j)
            labels.append(0)
    return user_input, item_input, labels

In [10]:
train = load_rating_file_as_matrix('neural_collaborative_filtering/Data/ml-1m.train.rating')
testRatings = load_rating_file_as_list("neural_collaborative_filtering/Data/ml-1m.test.rating")
testNegatives = load_negative_file("neural_collaborative_filtering/Data/ml-1m.test.negative")

In [11]:
num_users, num_items = train.shape
print(num_users, 'users')
print(num_items, 'items')

6040 users
3706 items


### Build the model

In [12]:
from keras.layers import Embedding, Input, Dense, merge, Reshape, Flatten
from keras.models import Model
from keras.optimizers import Adagrad, Adam, SGD, RMSprop
from keras.regularizers import l2

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [13]:
num_negatives = 4 # Number of negative instances to pair with a positive instance.
regs=[0,0] # Regularization for user and item embeddings.
num_factors = 8 # Embedding size
epochs = 4 # Number of epochs
batch_size = 256
learning_rate = 0.001
model_out_file = 'train_'

In [14]:
def get_model(num_users, num_items, latent_dim, regs=[0,0]):
    user_input = Input(shape=(1,), dtype='int32', name = 'user_input')
    item_input = Input(shape=(1,), dtype='int32', name = 'item_input')

    MF_Embedding_User = Embedding(input_dim = num_users, output_dim = latent_dim, name = 'user_embedding',
                                  init = 'normal', W_regularizer = l2(regs[0]), input_length=1)
    MF_Embedding_Item = Embedding(input_dim = num_items, output_dim = latent_dim, name = 'item_embedding',
                                  init = 'normal', W_regularizer = l2(regs[1]), input_length=1)

    # Crucial to flatten an embedding vector!
    user_latent = Flatten()(MF_Embedding_User(user_input))
    item_latent = Flatten()(MF_Embedding_Item(item_input))

    # Element-wise product of user and item embeddings 
    predict_vector = merge([user_latent, item_latent], mode = 'mul')

    # Final prediction layer
    prediction = Dense(1, activation='sigmoid', init='lecun_uniform', name = 'prediction')(predict_vector)

    model = Model(input=[user_input, item_input], output=prediction)
    
    return model

In [15]:
# Build model
model = get_model(num_users, num_items, num_factors, regs)

#Compiling model
model.compile(optimizer=SGD(lr=learning_rate), loss='binary_crossentropy')

for epoch in range(epochs):
    # Generate training instances
    user_input, item_input, labels = get_train_instances(train, num_negatives)
    
    # Training
    hist = model.fit([np.array(user_input), np.array(item_input)], #input
                     np.array(labels), # labels 
                     batch_size=batch_size, epochs=1, verbose=0, shuffle=True)
    
    model.save_weights(model_out_file+str(epoch)+'.model', overwrite=True)

  
  
  from ipykernel import kernelapp as app
  name=name)
  


### Predict

In [16]:
import heapq # for retrieval topK
import math

In [20]:
class Evaluate:
    def __init__(self, testRatings, testNegatives, model, K):
        self.testRatings = testRatings
        self.testNegatives = testNegatives
        self.model = model
        self.K = K
        
    def get_results(self):
        hits, ndcgs = [],[]
        
        for idx in range(len(self.testRatings)):
            (hr,ndcg, ranklist) = self.eval_one_rating(idx)
            hits.append(hr)
            ndcgs.append(ndcg)
            
        return (hits, ndcgs, ranklist)
    
    def getHitRatio(self, ranklist, gtItem):
        for item in ranklist:
            if item == gtItem:
                return 1
        return 0

    def getNDCG(self, ranklist, gtItem):
        for i in range(len(ranklist)):
            item = ranklist[i]
            if item == gtItem:
                return math.log(2) / math.log(i+2)
        return 0
    
    def eval_one_rating(self, idx):
        rating = self.testRatings[idx]
        items = self.testNegatives[idx]
        u = rating[0]
        gtItem = rating[1]
        items.append(gtItem)

        # Get prediction scores
        map_item_score = {}
        users = np.full(len(items), u, dtype = 'int32')
        predictions = self.model.predict([users, np.array(items)], 
                                     batch_size=100, verbose=0)
        for i in range(len(items)):
            item = items[i]
            map_item_score[item] = predictions[i]
        items.pop()

        # Evaluate top rank list
        ranklist = heapq.nlargest(self.K, map_item_score, key=map_item_score.get)
        hr = self.getHitRatio(ranklist, gtItem)
        ndcg = self.getNDCG(ranklist, gtItem)
        
        return (hr, ndcg, ranklist)
        

In [21]:
_eval = Evaluate(testRatings, testNegatives, model, 10)
hits, ndcgs, ranklist = _eval.get_results()

In [22]:
ranklist

[532, 824, 807, 1006, 512, 213, 1526, 1812, 1755, 1687]

In [87]:
ndcgs

[0.33333333333333337,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0.3562071871080222,
 0,
 0,
 0,
 0,
 0.43067655807339306,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0.5,
 1.0,
 0,
 0,
 0.3562071871080222,
 0,
 0,
 0,
 0.30102999566398114,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0.2890648263178878,
 0,
 0,
 1.0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0.3868528072345416,
 0.33333333333333337,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0.43067655807339306,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0.5,
 0,
 0,
 0.6309297535714574,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0.43067655807339306,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0.6309297535714574,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0