# Code By Dhruv Panchal

# train.rating:

    Train file.
    Each Line is a training instance: userID\t itemID\t rating\t timestamp (if have)
    Use case: This file is the training set. It contains all user–item interactions except the last one for each user (because last is held out for testing).


# test.rating:

    Test file (positive instances).
    Each Line is a testing instance: userID\t itemID\t rating\t timestamp (if have)
    Use case: This is the positive ground truth for evaluation.
    When we test, we ask: “Can the model rank this item higher than negatives for the user?”


# test.negative

    Test file (negative instances).
    Each line corresponds to the line of test.rating, containing 99 negative samples.
    Each line is in the format: (userID,itemID)\t negativeItemID1\t negativeItemID2 ...
    
    Interpretation (line 1):
        User 0, ground-truth test item = 25.
        Negatives = [1064, 174, 2791, ...] → randomly sampled items user 0 did not rate.

# Big Picture

    Train data = all past user interactions (except last one).

    Test data = last interaction per user.

    Negatives = 99 random unseen items for each user’s test.

    Task = rank the true item above the negatives.


In [7]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import warnings
import logging
from time import time
warnings.filterwarnings("ignore")
logging.basicConfig(level=logging.INFO)

In [6]:
os.getcwd()

'c:\\Users\\dhruv\\Documents\\DA-IICT\\Arpit_rana\\MajorProject\\NCF_Pytorch'

In [None]:
os.chdir("../")
os.getcwd()


'c:\\Users\\dhruv\\Documents\\DA-IICT\\Arpit_rana\\MajorProject'

In [10]:
# train_data = all past user interactions (except last one).

train_df = pd.read_csv(r"previous_code\neural_collaborative_filtering\Data\ml-1m.train.rating", sep="\t", header=None, names=["UserID","ItemID","Rating","Timestamp"])


train_df.head()

Unnamed: 0,UserID,ItemID,Rating,Timestamp
0,0,32,4,978824330
1,0,34,4,978824330
2,0,4,5,978824291
3,0,35,4,978824291
4,0,30,4,978824291


In [82]:
len(train_df)

994169

In [19]:
train_df.to_csv(r"NCF_Pytorch\train_data.csv", index=False)

In [11]:
test_df = pd.read_csv(r"previous_code\neural_collaborative_filtering\Data\ml-1m.test.rating", sep="\t", header=None, names=["UserID","ItemID","Rating","Timestamp"])


test_df.head()

Unnamed: 0,UserID,ItemID,Rating,Timestamp
0,0,25,5,978824351
1,1,133,3,978300174
2,2,207,4,978298504
3,3,208,4,978294282
4,4,222,2,978246585


In [20]:
test_df.to_csv(r"NCF_Pytorch\test_data.csv", index=False)

In [None]:
negative = []

with open(r"previous_code\neural_collaborative_filtering\Data\ml-1m.test.negative", "r") as file:
    for line in file:
        part = line.split("\t")
        user_item = part[0].strip("()").split(",")
        user, pos_items = int(user_item[0]), int(user_item[1])
        negative_items = list(map(int, part[1:]))
        negative.append((user, pos_items, negative_items))
        
test_negative_df = pd.DataFrame(negative, columns=["UserID", "ItemID", "NegativeItems"])
test_negative_df.head()

Unnamed: 0,UserID,ItemID,NegativeItems
0,0,25,"[1064, 174, 2791, 3373, 269, 2678, 1902, 3641,..."
1,1,133,"[1072, 3154, 3368, 3644, 549, 1810, 937, 1514,..."
2,2,207,"[2216, 209, 2347, 3, 1652, 3397, 383, 2905, 22..."
3,3,208,"[3023, 1489, 1916, 1706, 1221, 1191, 2671, 81,..."
4,4,222,"[1794, 3535, 108, 593, 466, 2048, 854, 1378, 1..."


In [22]:
test_negative_df.dtypes

UserID            int64
ItemID            int64
NegativeItems    object
dtype: object

In [25]:
type(test_negative_df.iloc[0,2])

list

In [21]:
test_negative_df.to_csv(r"NCF_Pytorch\test_negative_data.csv", index=False)

In [29]:
import scipy.sparse as sp

In [30]:
class Dataset(object):
    '''
    Dataset Class for making dataset input for the models
    trainMatrix: training Matrix of the data
    testRatings: positive test interactions
    testNegatives: negative test interactions sampled for each user
    '''

    def __init__(self, path):
        '''
        Constructor
        '''
        self.trainMatrix = self.load_rating_file_as_matrix(path + ".train.rating")
        self.testRatings = self.load_rating_file_as_list(path + ".test.rating")
        self.testNegatives = self.load_negative_file(path + ".test.negative")
        assert len(self.testRatings) == len(self.testNegatives)

        self.num_users, self.num_items = self.trainMatrix.shape

    def load_rating_file_as_list(self, filename):
        ratingList = []
        with open(filename, "r") as f:
            line = f.readline()
            while line != None and line != "":
                arr = line.split("\t")
                user, item = int(arr[0]), int(arr[1])
                ratingList.append([user, item])
                line = f.readline()
        return ratingList

    def load_negative_file(self, filename):
        negativeList = []
        with open(filename, "r") as f:
            line = f.readline()
            while line != None and line != "":
                arr = line.split("\t")
                negatives = []
                for x in arr[1: ]:
                    negatives.append(int(x))
                negativeList.append(negatives)
                line = f.readline()
        return negativeList

    def load_rating_file_as_matrix(self, filename):
        '''
        Read .rating file and Return dok matrix.
        The first line of .rating file is: num_users\t num_items
        '''
        # Get number of users and items
        num_users, num_items = 0, 0
        with open(filename, "r") as f:
            line = f.readline()
            while line != None and line != "":
                arr = line.split("\t")
                u, i = int(arr[0]), int(arr[1])
                num_users = max(num_users, u)
                num_items = max(num_items, i)
                line = f.readline()
        # Construct matrix
        mat = sp.dok_matrix((num_users+1, num_items+1), dtype=np.float32)
        with open(filename, "r") as f:
            line = f.readline()
            while line != None and line != "":
                arr = line.split("\t")
                user, item, rating = int(arr[0]), int(arr[1]), float(arr[2])
                if (rating > 0):
                    mat[user, item] = 1.0
                line = f.readline()
        return mat

In [31]:
data_instance = Dataset(r"previous_code\neural_collaborative_filtering\Data\ml-1m")
data_instance.num_users, data_instance.num_items

(6040, 3706)

In [46]:
data_instance.trainMatrix

<Dictionary Of Keys sparse matrix of dtype 'float32'
	with 994169 stored elements and shape (6040, 3706)>

In [49]:
len(data_instance.testNegatives), len(data_instance.testNegatives[0])

(6040, 99)

In [51]:
len(data_instance.testRatings)

6040

## Evaluation

In [52]:
import math
import heapq # for retrieval topK
import multiprocessing
import numpy as np
from time import time
#from numba import jit, autojit

# Global variables that are shared across processes
_model = None
_testRatings = None
_testNegatives = None
_K = None

def evaluate_model(model, testRatings, testNegatives, K, num_thread):
    
    """
    Evaluate the performance (Hit_Ratio, NDCG) of top-K recommendation
    Return: score of each test rating.
    """
    
    global _model
    global _testRatings
    global _testNegatives
    global _K
    _model = model
    _testRatings = testRatings
    _testNegatives = testNegatives
    _K = K

    hits, ndcgs = [],[]
    if(num_thread > 1): # Multi-thread
        pool = multiprocessing.Pool(processes=num_thread)
        res = pool.map(eval_one_rating, range(len(_testRatings)))
        pool.close()
        pool.join()
        hits = [r[0] for r in res]
        ndcgs = [r[1] for r in res]
        return (hits, ndcgs)
    # Single thread
    for idx in range(len(_testRatings)):
        (hr,ndcg) = eval_one_rating(idx)
        hits.append(hr)
        ndcgs.append(ndcg)
    return (hits, ndcgs)

def eval_one_rating(idx):
    rating = _testRatings[idx]
    items = _testNegatives[idx]
    u = rating[0]
    gtItem = rating[1]
    items.append(gtItem)
    # Get prediction scores
    map_item_score = {}
    users = np.full(len(items), u, dtype = 'int32')
    predictions = _model.predict([users, np.array(items)],
                                 batch_size=100, verbose=0)
    for i in range(len(items)):
        item = items[i]
        map_item_score[item] = predictions[i]
    items.pop()

    # Evaluate top rank list
    ranklist = heapq.nlargest(_K, map_item_score, key=map_item_score.get)
    hr = getHitRatio(ranklist, gtItem)
    ndcg = getNDCG(ranklist, gtItem)
    return (hr, ndcg)

def getHitRatio(ranklist, gtItem):
    for item in ranklist:
        if item == gtItem:
            return 1
    return 0

def getNDCG(ranklist, gtItem):
    for i in range(len(ranklist)):
        item = ranklist[i]
        if item == gtItem:
            return math.log(2) / math.log(i+2)
    return 0

In [53]:
topK = 10
evaluation_threads = 1 #mp.cpu_count()

## GMF

In [56]:
from pathlib import Path 

Path(os.getcwd(), "previous_code/neural_collaborative_filtering/Data")

WindowsPath('c:/Users/dhruv/Documents/DA-IICT/Arpit_rana/MajorProject/previous_code/neural_collaborative_filtering/Data')

In [67]:
configurations = {
    'path': Path(os.getcwd(), "previous_code/neural_collaborative_filtering/Data"),
    'dataset': 'ml-1m',
    'regs': [0, 0],
    'lr': 0.001,          ## Learning Rate
    'batch_size': 256,    ## Batch Size
    'epochs': 1,          ## Training Epochs
    'learner': 'adam',
    'num_factors': 10,
    'num_layers': 3,
    'num_neg': 2,
    'verbose': 2,
    'out': True,
}

print('Configurations: ')
for key, value in configurations.items():
  print(f'{key} : {value}')

Configurations: 
path : c:\Users\dhruv\Documents\DA-IICT\Arpit_rana\MajorProject\previous_code\neural_collaborative_filtering\Data
dataset : ml-1m
regs : [0, 0]
lr : 0.001
batch_size : 256
epochs : 1
learner : adam
num_factors : 10
num_layers : 3
num_neg : 2
verbose : 2
out : True


In [58]:
# Loading data
t1 = time()

dataset_path = os.path.join(configurations['path'], configurations['dataset'])
# print(dataset_path)

dataset = Dataset(dataset_path)

train, testRatings, testNegatives = dataset.trainMatrix, dataset.testRatings, dataset.testNegatives
num_users, num_items = train.shape
print("Load data done [%.1f s]. #user=%d, #item=%d, #train=%d, #test=%d"
      %(time()-t1, num_users, num_items, train.nnz, len(testRatings)))


Load data done [18.5 s]. #user=6040, #item=3706, #train=994169, #test=6040


In [71]:
train

<Dictionary Of Keys sparse matrix of dtype 'float32'
	with 994169 stored elements and shape (6040, 3706)>

In [88]:
len(train)

994169

In [78]:
len(testRatings), len(testRatings[0])

(6040, 2)

In [76]:
len(testNegatives), len(testNegatives[0])

(6040, 99)

In [60]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import Input, Model
from tensorflow.keras import initializers, regularizers
from tensorflow.keras.layers import Embedding, Dense, Flatten, concatenate, multiply
from tensorflow.keras.optimizers import Adam, Adagrad, RMSprop, SGD

In [61]:
def get_GMF_model(num_users, num_items, latent_dim, regs=[0,0]):
    # Input variables
    user_input = Input(shape=(1,), dtype='int32', name = 'user_input')
    item_input = Input(shape=(1,), dtype='int32', name = 'item_input')

    MF_Embedding_User = Embedding(input_dim=num_users, output_dim=latent_dim, name='user_embedding',
                                    embeddings_initializer=initializers.RandomNormal(stddev=0.01),
                                    embeddings_regularizer=regularizers.l2(regs[0]))(user_input)
    MF_Embedding_Item = Embedding(input_dim=num_items, output_dim=latent_dim, name='item_embedding',
                                    embeddings_initializer=initializers.RandomNormal(stddev=0.01),
                                    embeddings_regularizer=regularizers.l2(regs[1]))(item_input)

    # Crucial to flatten an embedding vector!
    user_latent = Flatten()(MF_Embedding_User)
    item_latent = Flatten()(MF_Embedding_Item)

    # Element-wise product of user and item embeddings
    predict_vector = multiply([user_latent, item_latent])

    # Final prediction layer
    #prediction = Lambda(lambda x: K.sigmoid(K.sum(x)), output_shape=(1,))(predict_vector)
    prediction = Dense(1, activation='sigmoid', name = 'prediction')(predict_vector)

    model = Model(inputs=[user_input, item_input],
                outputs=prediction)

    return model

In [63]:
def get_train_instances(train, num_negatives):
    user_input, item_input, labels = [],[],[]
    num_users = train.shape[0]
    for (u, i) in train.keys():
        # positive instance
        user_input.append(u)
        item_input.append(i)
        labels.append(1)
        # negative instances
        for t in range(num_negatives):
            j = np.random.randint(num_items)
            while (u, j) in train:
                j = np.random.randint(num_items)
            user_input.append(u)
            item_input.append(j)
            labels.append(0)
    return user_input, item_input, labels

In [64]:
# Build model
model = get_GMF_model(num_users, num_items, configurations['num_factors'], configurations['regs'])
if configurations['learner'].lower() == "adagrad":
    model.compile(optimizer=Adagrad(learning_rate=configurations['lr']), loss='binary_crossentropy')
elif configurations['learner'].lower() == "rmsprop":
    model.compile(optimizer=RMSprop(learning_rate=configurations['lr']), loss='binary_crossentropy')
elif configurations['learner'].lower() == "adam":
    model.compile(optimizer=Adam(learning_rate=configurations['lr']), loss='binary_crossentropy')
else:
    model.compile(optimizer=SGD(learning_rate=configurations['lr']), loss='binary_crossentropy')
print(model.summary())


None


In [65]:
topK = 10
evaluation_threads = 1 #mp.cpu_count()

In [66]:
# Init performance
t1 = time()
(hits, ndcgs) = evaluate_model(model, testRatings, testNegatives, topK, evaluation_threads)
hr, ndcg = np.array(hits).mean(), np.array(ndcgs).mean()
#mf_embedding_norm = np.linalg.norm(model.get_layer('user_embedding').get_weights())+np.linalg.norm(model.get_layer('item_embedding').get_weights())
#p_norm = np.linalg.norm(model.get_layer('prediction').get_weights()[0])
print('Init: HR = %.4f, NDCG = %.4f\t [%.1f s]' % (hr, ndcg, time()-t1))

Init: HR = 0.0940, NDCG = 0.0442	 [530.4 s]


In [89]:
# Train model

best_hr, best_ndcg, best_iter = hr, ndcg, -1

for epoch in range(configurations['epochs']):
    t1 = time()
    # Generate training instances
    print(len(train))
    user_input, item_input, labels = get_train_instances(train, configurations['num_neg'])
    print(len(user_input))
    print(len(item_input))
    # break
    
    # Training
    print([np.array(user_input).shape, np.array(item_input).shape])

    break

    hist = model.fit([np.array(user_input), np.array(item_input)], #input
                      np.array(labels), # labels
                      batch_size=configurations['batch_size'], epochs=1, verbose=0, shuffle=True)
    t2 = time()

    # Evaluation
    if epoch % configurations['verbose'] == 0:
        
        (hits, ndcgs) = evaluate_model(model, testRatings, testNegatives, topK, evaluation_threads)
        
        hr, ndcg, loss = np.array(hits).mean(), np.array(ndcgs).mean(), hist.history['loss'][0]
        
        print('Iteration %d [%.1f s]: HR = %.4f, NDCG = %.4f, loss = %.4f [%.1f s]'
              % (epoch,  t2-t1, hr, ndcg, loss, time()-t2))
        
        if hr > best_hr:
        
            best_hr, best_ndcg, best_iter = hr, ndcg, epoch
            model_out_file = '%s_GMF_%d.weights.h5' %(configurations['dataset'], configurations['num_factors'])
        
            if configurations['out'] > 0:
                model.save_weights(model_out_file, overwrite=True)
        
            print("End. Best Iteration %d:  HR = %.4f, NDCG = %.4f. " %(best_iter, best_hr, best_ndcg))
            
# if configurations['out'] > 0:
    
#     print("The best GMF model is saved to %s" %(model_out_file))

994169
2982507
2982507
[(2982507,), (2982507,)]


In [87]:
994169*2

1988338