# 1. collect data
https://grouplens.org/datasets/movielens/

# 2. preparations

In [1]:
import pandas as pd
import numpy as np
%tensorflow_version 1.x
import tensorflow as tf

TensorFlow 1.x selected.


In [2]:
from google.colab import files
uploaded = files.upload() # upload local datasets to google colab

Saving movies.csv to movies (1).csv


In [3]:
uploaded = files.upload()

Saving ratings.csv to ratings (1).csv


In [4]:
uploaded = files.upload()

Saving tags.csv to tags (1).csv


In [5]:
movies_df = pd.read_csv('movies.csv')
ratings_df = pd.read_csv('ratings.csv')
tags_df = pd.read_csv('tags.csv')

In [6]:
# replace movieId with row number
movies_df['movieRow'] = movies_df.index

In [7]:
# select useful columns
movies_selected = movies_df[['movieRow', 'movieId', 'title']]

In [8]:
# replace the movieId in 'ratings'
ratings_df = pd.merge(ratings_df, movies_selected, on = 'movieId')

In [9]:
# select useful columns
ratings_selected = ratings_df[['userId', 'movieRow', 'rating']]

In [10]:
# create matrices
userNo = ratings_selected['userId'].max() + 1
movieNo = movies_selected['movieRow'].max() + 1
rating = np.zeros((movieNo, userNo))  # matrix 1, users' ratings of movies

flag = 0
ratings_selected_len = np.shape(ratings_selected)[0]
for index, row in ratings_selected.iterrows():
  rating[int(row['movieRow']), int(row['userId'])]
  flag += 1

In [11]:
record = rating > 0  # if the user rated the movie, yes = 1, no = 0
record = np.array(record, dtype = int)

# 3. create model

In [12]:
def normalizeRatings(rating, record):
  m, n = rating.shape
  rating_mean = np.zeros((m, 1))
  rating_norm = np.zeros((m, n))
  for i in range(m):
    idx = record[1, : ] != 0
    rating_mean[i] = np.mean(rating[i, idx])
    rating_norm[i, idx] -= rating_mean[i]
  return rating_norm, rating_mean

In [13]:
# normalize the matrices
rating_norm, rating_mean = normalizeRatings(rating, record)

  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)


In [14]:
# deal with na
rating_norm = np.nan_to_num(rating_norm) 
rating_mean = np.nan_to_num(rating_mean)

In [20]:
num_features = 10
X_parameters = tf.Variable(tf.random.normal([movieNo, num_features], stddev = 0.35))
theta_parameters = tf.Variable(tf.random.normal([userNo, num_features], stddev = 0.35))
loss = 1/2 * tf.reduce_sum(((tf.matmul(X_parameters, theta_parameters, transpose_b=True) - rating_norm) * record) ** 2) + 1/2 * (tf.reduce_sum(X_parameters ** 2) + tf.reduce_sum(theta_parameters ** 2))
optimizer = tf.train.AdamOptimizer(1e-4)
train = optimizer.minimize(loss)

# 4. train model

In [23]:
tf.summary.scalar('loss', loss)
summaryMerged = tf.summary.merge_all()

filename = './movie_tensorboard'
writer = tf.summary.FileWriter(filename)

In [24]:
sess = tf.Session()
init = tf.global_variables_initializer()
sess.run(init)

# use tensorboard to visualize the loss
for i in range(5000):
  _, movie_summary = sess.run([train, summaryMerged])
  writer.add_summary(movie_summary, i)

# 5. evaluate model

In [26]:
current_x_parameter, current_theta_parameter = sess.run([X_parameters, theta_parameters])
predicts = np.dot(current_x_parameter, current_theta_parameter.T) + rating_mean
errors = np.sqrt(np.sum(predicts - rating)**2)
errors

90.79283399759224

# 6. movie recommendation

In [32]:
user_id = input('type in user ID: ')
sortedResult = predicts[: , int(user_id)].argsort()[::-1]

idx = 0
print('Top 20 Movies Recommended For User %s is: ' % user_id)
for i in sortedResult:
  print('score: %.2f, movie name: %s' % (predicts[i, int(user_id)], movies_df.iloc[i]['title']))
  idx += 1
  if idx == 20: break

type in user ID: 166
Top 20 Movies Recommended For User 166 is: 
score: 0.18, movie name: Wagons East (1994)
score: 0.17, movie name: Creepshow 2 (1987)
score: 0.14, movie name: Sex and the City (2008)
score: 0.14, movie name: Dragon Ball Z: Dead Zone (Doragon bôru Z 1: Ora no Gohan wo kaese) (1989)
score: 0.14, movie name: Melancholia (2011)
score: 0.13, movie name: Peter Pan (2003)
score: 0.13, movie name: Balls Out: Gary the Tennis Coach (2009)
score: 0.13, movie name: Mad Love (1995)
score: 0.13, movie name: Nell (1994)
score: 0.12, movie name: Taxi Driver (1976)
score: 0.12, movie name: License to Drive (1988)
score: 0.12, movie name: Pope of Greenwich Village, The (1984)
score: 0.12, movie name: Super, The (1991)
score: 0.12, movie name: Heart and Souls (1993)
score: 0.12, movie name: First Snow (2006)
score: 0.12, movie name: Coco Before Chanel (Coco avant Chanel) (2009)
score: 0.11, movie name: The Lair of the White Worm (1988)
score: 0.11, movie name: Day for Night (La Nuit Am