In [1]:
import json
import numpy as np
import pandas as pd
import tensorflow as tf

  from ._conv import register_converters as _register_converters


In [2]:
with open('trainset_movie.json','r') as fopen:
    trainset = json.loads(fopen.read())
    
with open('testset_movie.json','r') as fopen:
    testset = json.loads(fopen.read())

In [3]:
movie_info = {}
with open('movies.dat','r',encoding = "ISO-8859-1") as fopen:
    for i in list(filter(None,fopen.read().split('\n'))):
        i = i.split('::')
        movie_info[int(i[0])]={'title':i[1][:-7],'genre':i[2].split('|')}

In [4]:
def fn(creator):
    train_df = {'user_id': [], 'gender_id': [], 'age_id': [], 'job_id': [], 'movie_id': [],
                'category_ids': [], 'movie_title': [], 'score': []}
    for train_sample in creator:
        train_sample = train_sample[1]
        uid = train_sample[0]
        mov_id = train_sample[4]
        mov_dict = movie_info[mov_id]
        train_df['user_id'].append(train_sample[0])
        train_df['gender_id'].append(train_sample[1])
        train_df['age_id'].append(train_sample[2])
        train_df['job_id'].append(train_sample[3])
        train_df['movie_id'].append(train_sample[4])
        category_ids = [str(idx) for idx in train_sample[5]]
        train_df['category_ids'].append(' '.join(category_ids))
        movie_title_idx = [str(idx+1) for idx in train_sample[6]]
        train_df['movie_title'].append(' '.join(movie_title_idx))
        train_df['score'].append(train_sample[7][0])
    return train_df

In [5]:
train_df = pd.DataFrame(fn(trainset))
test_df = pd.DataFrame(fn(testset))

In [6]:
train_df.head()

Unnamed: 0,age_id,category_ids,gender_id,job_id,movie_id,movie_title,score,user_id
0,0,1,1,10,1193,1829 4270 329 989 1177 2285,5.0,1
1,0,10 8 9,1,10,661,1723 1909 989 3295 3852,1.0,1
2,0,9 7,1,10,914,240 742 2373,1.0,1
3,0,1,1,10,3408,4764 3837,3.0,1
4,0,10 8 13,1,10,2355,4869 2236 2942,5.0,1


In [7]:
def get_data(df):

    category_ids = []
    for category_id in df['category_ids'].values:
        temp_li = [0] * 18
        category_id_li = category_id.split()
        for idx in category_id_li:
            temp_li[int(idx)] = 1
        category_ids.append(temp_li)
    
    movie_titles = []
    for mov_title in df['movie_title'].values:
        temp_li = [0] * 10
        mov_title_li = mov_title.split()
        for i in range(len(mov_title_li[:10])):
            temp_li[i] = int(mov_title_li[i])
        movie_titles.append(temp_li)
        
    return df['score'],df['user_id'].values,df['gender_id'].values,df['gender_id'].values,df['job_id'].values,df['movie_id'].values,np.array(category_ids),np.array(movie_titles)

In [8]:
Y,user_ids,gender_ids,age_ids,job_ids,movie_ids,category_ids,movie_titles = get_data(train_df)

In [9]:
def global_max_pooling(x):
    batch_size = tf.shape(x)[0]
    num_units = x.get_shape().as_list()[-1]
    x = tf.layers.max_pooling1d(x, x.get_shape().as_list()[1], 1)
    x = tf.reshape(x, [batch_size, num_units])
    return x

def cos_sim(x1, x2, scale=1):
    x1_norm = tf.nn.l2_normalize(x1, -1)
    x2_norm = tf.nn.l2_normalize(x2, -1)
    cos_sim = tf.reduce_sum(tf.multiply(x1_norm, x2_norm), -1)
    return scale * cos_sim

class Model:
    def __init__(self,movie_id_size,job_id_size,user_id_size,
                age_id_size,movie_title_vocab_size,learning_rate):
        self.user_id = tf.placeholder(tf.int32, [None])
        self.gender_id = tf.placeholder(tf.int32, [None])
        self.age_id = tf.placeholder(tf.int32, [None])
        self.job_id = tf.placeholder(tf.int32, [None])
        self.movie_id = tf.placeholder(tf.int32, [None])
        self.category_ids = tf.placeholder(tf.int32, [None, 18])
        self.movie_title = tf.placeholder(tf.int32, [None, 10])
        self.Y = tf.placeholder(tf.float32, [None])
        
        with tf.variable_scope('user_id'):
            user_id_embed = tf.contrib.layers.embed_sequence(
                 ids = self.user_id,
                 vocab_size = user_id_size,
                 embed_dim = 32)
            user_id_fc = tf.layers.dense(user_id_embed, 32)
            
        with tf.variable_scope('gender_id'):
            gender_id_embed = tf.contrib.layers.embed_sequence(
                ids = self.gender_id,
                vocab_size = 2,
                embed_dim = 16)
            gender_id_fc = tf.layers.dense(gender_id_embed, 16)
            
        with tf.variable_scope('age_id'):
            age_id_embed = tf.contrib.layers.embed_sequence(
                ids = self.age_id,
                vocab_size = age_id_size,
                embed_dim = 16)
            age_id_fc = tf.layers.dense(age_id_embed, 16)
            
        with tf.variable_scope('job_id'):
            job_id_embed = tf.contrib.layers.embed_sequence(
                ids = self.job_id,
                vocab_size = job_id_size,
                embed_dim = 16)
            job_id_fc = tf.layers.dense(job_id_embed, 16)
            
        user_feats = tf.concat([user_id_fc, gender_id_fc, age_id_fc, job_id_fc], -1)
        self.user_feats = tf.layers.dense(user_feats, 200, tf.tanh)
        
        with tf.variable_scope('movie_id'):
            movie_id_embed = tf.contrib.layers.embed_sequence(
                ids = self.movie_id,
                vocab_size = movie_id_size,
                embed_dim = 32)
            movie_id_fc = tf.layers.dense(movie_id_embed, 32)
            
        with tf.variable_scope('category_ids'):
            category_fc = tf.layers.dense(tf.to_float(self.category_ids), 32)

        with tf.variable_scope('movie_title'):
            movie_title_embed = tf.contrib.layers.embed_sequence(
                ids = self.movie_title,
                vocab_size = movie_title_vocab_size,
                embed_dim = 32)
            movie_title_conv = tf.layers.conv1d(movie_title_embed, 32, 3)
            movie_title_fc = global_max_pooling(movie_title_conv)
            
        movie_feats = tf.concat([movie_id_fc, category_fc, movie_title_fc], -1)
        self.movie_feats = tf.layers.dense(movie_feats, 200, tf.tanh)
        self.global_step = tf.Variable(0, trainable=False)
        predicted_score = cos_sim(self.user_feats, self.movie_feats, scale=5)
        self.cost = tf.reduce_mean(tf.squared_difference(predicted_score, self.Y))
        self.optimizer = tf.train.AdamOptimizer(learning_rate).minimize(self.cost,global_step=self.global_step)

In [10]:
size_user_ids = np.unique(user_ids).shape[0]
size_movie_ids = np.unique(movie_ids).shape[0]
size_job_ids = np.unique(job_ids).shape[0]
size_age_ids = np.unique(age_ids).shape[0]
learning_rate = 1e-4
size_movie_titles = 5175
epoch = 1
batch_size = 128

In [11]:
tf.reset_default_graph()
sess = tf.InteractiveSession()
model = Model(size_movie_ids,size_job_ids,size_user_ids,size_age_ids,size_movie_titles,learning_rate)
sess.run(tf.global_variables_initializer())

In [12]:
for i in range(epoch):
    total_cost = 0
    for k in range(0, (Y.shape[0] // batch_size)*batch_size,batch_size):
        batch_user = user_ids[k:k+batch_size]
        batch_movie = movie_ids[k:k+batch_size]
        batch_job = job_ids[k:k+batch_size]
        batch_age = age_ids[k:k+batch_size]
        batch_gender = gender_ids[k:k+batch_size]
        batch_category = category_ids[k:k+batch_size]
        batch_title = movie_titles[k:k+batch_size]
        batch_y = Y[k:k+batch_size]
        step, loss, _ = sess.run([model.global_step, model.cost, model.optimizer],
                                feed_dict={model.user_id:batch_user, model.gender_id:batch_gender,
                                           model.age_id:batch_age, model.job_id: batch_job,
                                           model.movie_id:batch_movie, model.category_ids:batch_category,
                                           model.movie_title: batch_title, model.Y:batch_y})
        if step % 500 == 0 or step == 1:
            print('epoch %d, step %d, loss %f'%(i+1,step,loss))
        total_cost += loss
    total_cost /= (Y.shape[0] // batch_size)
    print('epoch %d, avg loss %f'%(i+1,total_cost))

epoch 1, step 1, loss 9.271559
epoch 1, step 500, loss 2.213113
epoch 1, step 1000, loss 4.583295
epoch 1, step 1500, loss 1.837167
epoch 1, step 2000, loss 2.927567
epoch 1, step 2500, loss 4.564452
epoch 1, step 3000, loss 2.903585
epoch 1, step 3500, loss 2.202424
epoch 1, step 4000, loss 4.401066
epoch 1, step 4500, loss 3.354321
epoch 1, step 5000, loss 4.106241
epoch 1, step 5500, loss 4.069396
epoch 1, step 6000, loss 4.347765
epoch 1, step 6500, loss 3.043517
epoch 1, step 7000, loss 4.156744
epoch 1, avg loss 4.032794
