In [6]:
import numpy as np
import keras
from keras.models import Sequential, Model
from keras.layers import Dense, Flatten, concatenate, Input, Reshape, Dropout
from keras.layers.embeddings import Embedding
from keras.preprocessing.text import Tokenizer
from keras.preprocessing import sequence
from keras.callbacks import ModelCheckpoint, TensorBoard
from keras.models import load_model
import csv
import os
import errno
import operator
import sys
import pickle

In [7]:
# Load the training triples from data/train.csv.
# Columns 1, 2 and 3 of every data row are read as user id, movie id and rating
# (column 0 is not used); the first row is skipped as a header.
user_ids = []
movie_ids = []
ratings = []
# newline='' is what the csv module documentation asks for when a file is read via csv.reader
with open('data/train.csv', 'rt', newline='') as trainfile:
    reader = csv.reader(trainfile, delimiter=',')
    next(reader, None)  # skip the header row; the default avoids StopIteration on an empty file
    for row in reader:
        # ids stay as strings: they are later passed to Tokenizer.fit_on_texts as texts
        user_ids.append(row[1])
        movie_ids.append(row[2])
        # the rating is the regression target, so store it as a number rather than as text
        ratings.append(float(row[3]))

'\nuser_ids = np.array(user_ids)\nmovie_ids = np.array(movie_ids)\nratings = np.array(ratings)\n'

In [8]:
# Distinct-id counts, users first and then movies, one per line.
# Original author's notes: user ids are continuous; some movie ids are not continuous.
print(len(set(user_ids)), len(set(movie_ids)), sep='\n')

6040
3688


In [9]:
# Original author's note: this cell takes around 15 seconds.
# Every id string is handed to a Keras Tokenizer as a "text"; one tokenizer per id space.

# Users: fit on the list of id strings, then convert that same list to sequences.
user_tokenizer = Tokenizer()
user_tokenizer.fit_on_texts(user_ids)
user_tokens = user_tokenizer.texts_to_sequences(user_ids)

# Movies: the same two steps.
movie_tokenizer = Tokenizer()
movie_tokenizer.fit_on_texts(movie_ids)
movie_tokens = movie_tokenizer.texts_to_sequences(movie_ids)

# texts_to_sequences returns a list of lists, e.g. [[1], [2], [3]];
# put into an embedding layer that becomes [[wordvec], [wordvec], [wordvec]].

In [10]:
# Number of token sequences produced for the user ids (shown as this cell's output).
len(user_tokens)

899873

In [11]:
# Sanity check: show the distinct token values for users, then for movies, one array per line.
print(np.unique(user_tokens), np.unique(movie_tokens), sep='\n')

[   1    2    3 ..., 6038 6039 6040]
[   1    2    3 ..., 3686 3687 3688]


In [12]:
# Matrix-factorisation style rating model:
#   rating_pred = dot(user_embedding, movie_embedding) + user_b + movie_b
batch_size = 50
dimension = 10  # length of each user / movie embedding vector

# User branch: one integer token per sample -> embedding vector.
# input_dim is the number of distinct ids + 1 (the printed token values start at 1).
user_id = Input(shape=(1,))
user_emb = Embedding(input_dim=len(set(user_ids))+1, output_dim=dimension, input_length=1)(user_id)
user_emb = Flatten()(user_emb)  # drop the length-1 sequence axis

# Movie branch: same construction.
movie_id = Input(shape=(1,))
movie_emb = Embedding(input_dim=len(set(movie_ids))+1, output_dim=dimension, input_length=1)(movie_id)
movie_emb = Flatten()(movie_emb)

user_dot_movie = keras.layers.dot(inputs=[user_emb, movie_emb], axes=1)

# Bias terms: each Dense(1) layer is fed an all-zero input, so its output is just
# the layer's bias (0*W + b).
# NOTE(review): that makes user_b and movie_b two global scalars, not per-user /
# per-movie biases — confirm this is intended.
user_b_input = Input(shape=(1,)) # will be np.zeros(1)
movie_b_input = Input(shape=(1,)) # will be np.zeros(1)
user_b = Dense(1, input_shape=(1,))(user_b_input) #  0*W + b
movie_b = Dense(1, input_shape=(1,))(movie_b_input)
rating_pred = keras.layers.add(inputs=[user_dot_movie, user_b, movie_b])

model = Model(inputs=[user_id, movie_id, user_b_input, movie_b_input], outputs=rating_pred)
model.compile(loss='mean_squared_error', optimizer='adam', metrics=['mean_squared_error'])
# summary() prints the table itself and returns None, so wrapping it in print()
# only adds a stray "None" line.
model.summary()

# Build the training arrays once; the targets are made explicitly numeric instead of
# relying on an implicit cast of whatever type `ratings` holds.
user_x = np.array(user_tokens)
movie_x = np.array(movie_tokens)
rating_y = np.array(ratings, dtype='float32')
n_samples = len(user_x)
zero_bias_x = np.zeros((n_samples, 1))  # constant zero input for both bias branches

# validation_split holds out the LAST fraction of the arrays, taken before Keras does
# any shuffling, so shuffle the rows once (fixed seed for reproducibility) in case
# train.csv is ordered.
shuffle_idx = np.random.RandomState(0).permutation(n_samples)

model.fit([user_x[shuffle_idx], movie_x[shuffle_idx], zero_bias_x, zero_bias_x], rating_y[shuffle_idx], validation_split=0.1, epochs=10, batch_size=batch_size)

____________________________________________________________________________________________________
Layer (type)                     Output Shape          Param #     Connected to                     
input_1 (InputLayer)             (None, 1)             0                                            
____________________________________________________________________________________________________
input_2 (InputLayer)             (None, 1)             0                                            
____________________________________________________________________________________________________
embedding_1 (Embedding)          (None, 1, 10)         60410       input_1[0][0]                    
____________________________________________________________________________________________________
embedding_2 (Embedding)          (None, 1, 10)         36890       input_2[0][0]                    
___________________________________________________________________________________________

KeyboardInterrupt: 