In [9]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import tensorflow as tf
import tensorflow.keras as keras
from tensorflow.keras import backend as K

from tensorflow.keras import initializers
from tensorflow.keras.regularizers import l2
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import Dense, Lambda, Activation
from tensorflow.keras.layers import Embedding, Input, Dense, Reshape, Flatten, Dropout, Concatenate
from tensorflow.keras.models import Model
#from tensorflow.keras.constraints import maxnorm
from tensorflow.keras.optimizers import Adagrad, Adam, SGD, RMSprop

class Args(object):
    """A simulator of parser in jupyter notebook"""
    def __init__(self):
        self.path = 'Data/'
        self.dataset = 'ml-1m'
        self.epochs = 100
        self.batch_size = 256
        self.layers = '[64,32,16,8]'
        self.reg_layers = '[0,0,0,0]'
        self.num_neg = 4
        self.lr = 0.001
        self.learner = 'adam'
        self.verbose = 1
        self.out = 1
        self.num_genres = 18

In [25]:
# Get model 
def init_normal(shape=[0,0.05], seed=None):
    mean, stddev = shape
    return initializers.RandomNormal(mean=mean, stddev=stddev, seed=seed)


def get_model(num_users, num_items, num_genres, layers=[20,10], reg_layers=[0,0]):
    assert len(layers) == len(reg_layers)
    num_layer = len(layers) #Number of layers in the MLP
    # Input variables
    user_input = Input(shape=(1,), dtype='int32', name = 'user_input')
    item_input = Input(shape=(1,), dtype='int32', name = 'item_input')
    genre_input = Input(shape=(1,), dtype='int32', name = 'genre_input')
    
    #one_hot_labels = keras.utils.to_categorical(labels, num_classes=10)

    MLP_Embedding_User = Embedding(input_dim = num_users, output_dim = int(layers[0]/2), name = 'user_embedding',
                                  embeddings_initializer = init_normal(), embeddings_regularizer = l2(reg_layers[0]), input_length=1)
    MLP_Embedding_Item = Embedding(input_dim = num_items, output_dim = int(layers[0]/2), name = 'item_embedding',
                                  embeddings_initializer = init_normal(), embeddings_regularizer = l2(reg_layers[0]), input_length=1)   
    
    # Crucial to flatten an embedding vector!
    user_latent =Flatten()(MLP_Embedding_User(user_input))
    item_latent =Flatten()(MLP_Embedding_Item(item_input))
    
    # The 0-th layer is the concatenation of embedding layers
    concat = Concatenate()
    vector = concat([user_latent, item_latent])
    
    # MLP layers
    for idx in range(1, num_layer):
        layer = Dense(layers[idx], kernel_regularizer= l2(reg_layers[idx]), activation='relu', name = 'layer%d' %idx)
        vector = layer(vector)
        
    # Final prediction layer, genres take place
    
    # num_genres + 1 because there would be a default task 
    # we may use average as default in the future.
    vector = Dense(num_genres + 1, activation='sigmoid', kernel_initializer='lecun_uniform', name = 'prediction')(vector)
    
    ## be ware
    
    prediction = Lambda(sel_task)(vector) ## beware
    
    model = Model(inputs=[user_input, item_input, genre_input], 
                  outputs=prediction)
    
    return model

In [21]:
if __name__ == "__main__":
    args = Args()
    path = args.path
    dataset = args.dataset
    layers = eval(args.layers)
    reg_layers = eval(args.reg_layers)
    num_negatives = args.num_neg
    learner = args.learner
    learning_rate = args.lr
    batch_size = args.batch_size
    epochs = args.epochs
    verbose = args.verbose
    num_genres = args.num_genres
    
    
    data = pd.read_csv("movielens/genre_set.csv")
    topK = 10
    evaluation_threads = 1 #mp.cpu_count()
    #print("MLP arguments: %s " %(args))
    #model_out_file = 'Pretrain/%s_MLP_%s_%d.h5' %(args.dataset, args.layers, time())
    
    # num_users and num_items should be determined before splitting!!!
    num_users = len(data['userId'].unique())
    num_items = len(data['itemId'].unique()) 
    # Load data
    X_train, X_test = train_test_split(data, test_size=0.1, random_state=42)

In [6]:
X_test.head()

Unnamed: 0,userId,itemId,rating,genre
19090,127,165,5.0,"[3, 4, 8, 11]"
99678,664,9064,1.0,"[4, 7, 13]"
18455,119,2763,3.0,[4]
35755,256,2190,4.0,"[0, 1, 2, 14]"
66536,467,1333,4.0,"[7, 15]"


In [26]:
# Build model!
model = get_model(num_users, num_items, num_genres ,layers, reg_layers)
if learner.lower() == "adagrad": 
    model.compile(optimizer=Adagrad(lr=learning_rate), loss='binary_crossentropy')
elif learner.lower() == "rmsprop":
    model.compile(optimizer=RMSprop(lr=learning_rate), loss='binary_crossentropy')
elif learner.lower() == "adam":
    model.compile(optimizer=Adam(lr=learning_rate), loss='binary_crossentropy')
else:
    model.compile(optimizer=SGD(lr=learning_rate), loss='binary_crossentropy') 

ValueError: Output tensors to a Model must be the output of a TensorFlow `Layer` (thus holding past layer metadata). Found: Tensor("strided_slice_6:0", shape=(19,), dtype=float32)

In [23]:
num_users

671

In [16]:
len(X_train['userId'].unique())

671

In [20]:
len(X_test['itemId'].unique())

3526

In [25]:
np.mean([12,3,3])

6.0

In [32]:
a = np.array([1,2,3,4,5])
b = [1,2,3]
np.mean(a[b])

3.0

In [29]:
a[1:3]

[2, 3]