# Tensorflow 2.0 Matrix Factorization for prediction of movie ratings using tf.feature_column and tf.Estimator API
Dataset: https://grouplens.org/datasets/movielens/1m/

### Import Tensorflow 2.0

In [None]:
# Use Tensorflow 2.0 
try:
  # %tensorflow_version only exists in Colab.
  %tensorflow_version 2.x
  # Load the TensorBoard extension
  %load_ext tensorboard
except Exception:
  pass

In [None]:
import tensorflow as tf
AUTOTUNE = tf.data.experimental.AUTOTUNE
keras = tf.keras
print(tf.__version__)

### Load Data from Google Drive

In [None]:
# Mount Google Drive at /content/drive so the dataset folder stored there can be copied below.
# google.colab is imported here, so this cell needs an environment that provides that package.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Create ./data, copy the ml-1m folder from the mounted Drive into it and list the copied files.
!mkdir -vp data
!cp -r '/content/drive/My Drive/collab_data/ml-1m' './data'
!ls data/ml-1m

# If the dataset is big, just make a symbolic link instead (this doesn't make data loading any faster, but prevents possible issues with the white space in the path)
# !ln -s '/content/drive/My Drive/collab_data/ml-1m' './data'

## Data Preprocessing / Pipeline

In [None]:
import pandas as pd
import os

dataset_path = 'data/ml-1m'

# | column names of the three '::'-separated .dat files (the files carry no header row)
user_columns = 'UserID::Gender::Age::Occupation::Zip-code'.split('::')
movie_columns = 'MovieID::Title::Genres'.split('::')
ratings_columns = 'UserID::MovieID::Rating::Timestamp'.split('::')

ratings_path = os.path.join(dataset_path, 'ratings.dat')

# | a multi-character separator is only handled by pandas' python parser engine, so it is
# | requested explicitly instead of relying on the fallback (which emits a ParserWarning)
# | NOTE(review): movies.dat of ml-1m contains Latin-1 encoded titles that fail to decode as UTF-8 - confirm on your copy
df_movies = pd.read_csv(os.path.join(dataset_path, 'movies.dat'), sep='::', names=movie_columns,
                        engine='python', encoding='latin-1')
df_ratings = pd.read_csv(ratings_path, sep='::', names=ratings_columns, engine='python')
df_users = pd.read_csv(os.path.join(dataset_path, 'users.dat'), sep='::', names=user_columns, engine='python')

# | shift both id columns by one so that the smallest id of a 1-based numbering becomes 0
df_ratings['UserID'] = df_ratings['UserID'] - 1
df_ratings['MovieID'] = df_ratings['MovieID'] - 1

df_ratings.head(3)

In [None]:
# | drop the Timestamp column, as we don't need that for our recommender system, and write the
# | remaining columns ';'-separated without index/header so the file can be parsed line by line
cleaned_ratings_path = os.path.join(dataset_path, 'ratings-cleaned.dat')
df_ratings.drop(columns='Timestamp').to_csv(cleaned_ratings_path, sep=';', index=False, header=False)

In [None]:
# | check if there are duplicates i.e. if a user rated the same item twice
# | (rows are compared on every column except Timestamp)
df_ratings_copy = df_ratings.drop('Timestamp', axis=1)
nr_duplicates = len(df_ratings_copy) - len(df_ratings_copy.drop_duplicates())

# | check number of unique users & movies
nr_users = df_ratings['UserID'].nunique()
nr_movies = df_ratings['MovieID'].nunique()

# | only the last expression of a cell is displayed, so every result is printed explicitly
print('#Duplicate ratings: {}'.format(nr_duplicates))
print('#Users: {}, #Movies: {}'.format(nr_users, nr_movies))

### A) Create dataset using CsvDataset()
`CsvDataset()` won't work out of the box on the original files because of the '::' separator: only single-character separators are supported!<br>
Another issue is that the column names are lost, i.e. the features won't be read as a dictionary {col_name: value}, which is needed e.g. when using tf.feature_column.

In [None]:
# dataset = tf.data.experimental.CsvDataset(cleaned_ratings_path, [tf.constant([], dtype=tf.int32)]*3, field_delim=';')

### B) Create dataset manually using TextLineDataset()

In [None]:
# | settings passed to input_fn() when the example dataset is created
NR_EPOCHS, BATCH_SIZE = 1, 32
# | names of the three columns parsed from the cleaned ratings file
ratings_columns = ['UserID', 'MovieID', 'Rating']


def input_fn(file_path, buffer_size=10000, nr_epochs=10, batch_size=32):
  """Builds the shuffled, batched (features, label) dataset used for training and evaluation.

  Args:
    file_path: path of a ';'-separated text file; the first three columns of every line
      are parsed as int32 values.
    buffer_size: size of the shuffle buffer.
    nr_epochs: number of times the data is repeated.
    batch_size: number of examples per batch.

  Returns:
    A tf.data.Dataset yielding (features, label) batches. `features` is a dict keyed by
    ratings_columns[:-1] and `label` is the int32 value of the third parsed column.
  """
  def preprocess(line):
    # | decode_csv() returns a list of scalar tensors (one per selected column)
    fields = tf.io.decode_csv(line, record_defaults=[tf.constant([], dtype=tf.int32)]*3, field_delim=";", select_cols=[0,1,2])

    # | the last parsed column is the label; it is already a scalar int32 tensor, so no stacking or casting is needed
    label = fields[-1]
    features = dict(zip(ratings_columns[:-1], fields[:-1]))

    return features, label

  dataset = tf.data.TextLineDataset(file_path)
  dataset = dataset.map(preprocess, num_parallel_calls=4)
  # | cache the parsed records so the text file is only decoded during the first pass
  dataset = dataset.cache()
  # | honour the nr_epochs argument (the module-level NR_EPOCHS was used here before, which silently
  # | ignored the parameter); shuffle().repeat() replaces the deprecated shuffle_and_repeat()
  dataset = dataset.shuffle(buffer_size).repeat(nr_epochs)
  dataset = dataset.batch(batch_size)
  dataset = dataset.prefetch(1)

  return dataset


def predict_input_fn(file_path, buffer_size=10000, batch_size=1):
  """Builds the features-only dataset used for prediction.

  Args:
    file_path: path of a ';'-separated text file; the first three columns of every line
      are parsed as int32 values.
    buffer_size: accepted but not used by this function.
    batch_size: number of examples per batch.

  Returns:
    A tf.data.Dataset yielding batched feature dicts keyed by ratings_columns[:-1];
    the third parsed column is discarded.
  """
  feature_names = ratings_columns[:-1]

  def preprocess(line):
    columns = tf.io.decode_csv(
        line,
        record_defaults=[tf.constant([], dtype=tf.int32)]*3,
        field_delim=";",
        select_cols=[0,1,2])
    # | keep every parsed column except the last one
    return dict(zip(feature_names, columns[:-1]))

  lines = tf.data.TextLineDataset(file_path)
  return lines.map(preprocess, num_parallel_calls=4).batch(batch_size)

# | example dataset used by the inspection cells of this notebook
dataset = input_fn(cleaned_ratings_path, buffer_size=10000, nr_epochs=NR_EPOCHS, batch_size=BATCH_SIZE)

In [None]:
# Take a single batch from the dataset and print its feature dict and its label tensor.
for features, label in dataset.take(1):
  print(features)  
  print(label)

### Define the feature columns
With $k$ categories and $N$ = `hash_bucket_size` buckets, the probability of at least one collision is approximately $1 - \exp\left(-\frac{k(k-1)}{2N}\right)$.

In [None]:
import math
import scipy.special


def calculate_collision_probability(nr_categories, hash_bucket_size):
  """Approximates the probability that at least one hash collision occurs.

  Evaluates 1 - exp(-k*(k-1)/(2*N)) with k = nr_categories and N = hash_bucket_size.

  Args:
    nr_categories: number of distinct categories that get hashed.
    hash_bucket_size: number of hash buckets; must not be 0 (division by zero).

  Returns:
    The approximated probability as a float.
  """
  # | 0.0 - expm1(x) equals 1 - exp(x) mathematically, but stays accurate when x is close to 0,
  # | where 1 - exp(x) cancels to exactly 0.0; subtracting from 0.0 also avoids returning -0.0
  return 0.0 - math.expm1(-nr_categories*(nr_categories-1)/(2*hash_bucket_size))


def estimate_number_of_collisions(nr_categories, hash_bucket_size):
  """Returns nr_categories minus hash_bucket_size * (1 - ((hash_bucket_size-1)/hash_bucket_size)**nr_categories).

  Args:
    nr_categories: number of distinct categories that get hashed.
    hash_bucket_size: number of hash buckets; must not be 0 (division by zero).
  """
  # | ((N-1)/N)**k, the term that is subtracted from 1 in the formula above
  untouched_share = ((hash_bucket_size-1)/hash_bucket_size)**nr_categories
  filled_buckets = hash_bucket_size * (1 - untouched_share)
  return nr_categories - filled_buckets

# | only the last expression of a cell is displayed, so both results are printed explicitly
print('P(at least one collision), 100000 buckets: {}'.format(calculate_collision_probability(nr_users, 100000)))
print('Estimated #collisions, 1000 buckets: {}'.format(estimate_number_of_collisions(nr_users, 1000)))

In [None]:
# Print the largest UserID and MovieID found in df_ratings.
print(df_ratings['UserID'].max(), df_ratings['MovieID'].max())

In [None]:
# cat_col = tf.feature_column.categorical_column_with_hash_bucket(key=key, hash_bucket_size=1000) # currently there seems to be a bug with categorical_column_with_hash_bucket()

# | alternative that was tried with identity columns instead of hashing:
# user_id = tf.feature_column.categorical_column_with_identity('UserID', num_buckets=6038)
# movie_id = tf.feature_column.categorical_column_with_identity('MovieID', num_buckets=3951)

# | both id features are hashed into 1000 buckets each
user_id = tf.feature_column.categorical_column_with_hash_bucket(
    'UserID', hash_bucket_size=1000, dtype=tf.dtypes.int32)
movie_id = tf.feature_column.categorical_column_with_hash_bucket(
    'MovieID', hash_bucket_size=1000, dtype=tf.dtypes.int32)

# | one embedding column of dimension 50 per id column, user first and movie second
feature_columns = [
    tf.feature_column.embedding_column(id_column, 50)
    for id_column in (user_id, movie_id)
]

In [None]:
# | check outputs of feature layer

feature_layer = tf.keras.layers.DenseFeatures(feature_columns)

# Print the features of one batch; `f` and `l` stay bound after the loop and are reused below.
for f, l in dataset.take(1):
  print(f)

# Run the batch through the feature layer, split the result into two equal parts along axis 1
# and sum their element-wise product per row to obtain one prediction per example.
out = feature_layer(f)
user_id_tensor, movie_id_tensor = tf.split(out, 2, axis=1)
p = tf.reduce_sum(tf.multiply(user_id_tensor, movie_id_tensor), axis=1)
# Last expression of the cell: the MSE between the labels `l` and the predictions `p`.
tf.keras.losses.MSE(l, p)

In [None]:
# Caution: tf.keras.losses.MSE() is not the same as tf.keras.losses.MeanSquaredError() (although the docs list it as an alias)
# The signatures printed below can be compared to see the difference.

import inspect
print(inspect.signature(tf.keras.losses.MeanSquaredError())) # note: signature of an instance (created with ())
print(inspect.signature(tf.keras.losses.MSE)) # note: MSE without ()
print(inspect.signature(tf.metrics.RootMeanSquaredError)) # note: the class itself, without ()

### Define the model function

In [None]:
class MyModel(tf.keras.Model):
  """Keras model that scores an example by the dot product of two halves of its feature vector.

  The input features are turned into one dense tensor by a DenseFeatures layer built from the
  given feature columns. That tensor is split into two equal parts along axis 1 and the
  prediction is the sum of their element-wise product.
  """

  def __init__(self, feature_columns):
    """Creates the DenseFeatures layer from `feature_columns`."""
    super().__init__()
    self.embedding_layer = tf.keras.layers.DenseFeatures(feature_columns)

  @tf.function
  def call(self, x):
    """Returns one predicted value per example of the feature dict `x`."""
    embedded = self.embedding_layer(x)

    # | Two equal halves along axis 1; this requires both embeddings to have the same dimension.
    # | NOTE(review): DenseFeatures may order its columns by name rather than by list position, so which
    # | half belongs to the user is not guaranteed - harmless here, as the dot product is symmetric.
    first_half, second_half = tf.split(embedded, num_or_size_splits=2, axis=1)

    # | dot product of the two vectors -> the more they correlate, the higher the predicted value (label)
    return tf.reduce_sum(first_half * second_half, axis=1)
                

def my_model_fn(
   features, # This is batch_features from input_fn
   labels,   # This is batch_labels from input_fn
   mode,     # An instance of tf.estimator.ModeKeys (tf.estimator.ModeKeys.TRAIN, EVAL, PREDICT)
   params):  # Additional configuration, Any params passed to the Estimator constructor are in turn passed on to the model_fn
  """Model function for tf.estimator.Estimator wrapping MyModel.

  Args:
    features: batch of features produced by the input function.
    labels: batch of labels produced by the input function (not used in PREDICT mode).
    mode: one of tf.estimator.ModeKeys.TRAIN, EVAL or PREDICT.
    params: dict with the key 'feature_columns' (required) and optionally 'learning_rate'
      (defaults to 0.0001) for the Adam optimizer.

  Returns:
    The tf.estimator.EstimatorSpec matching `mode`.

  Raises:
    ValueError: if `mode` is none of the three supported modes.
  """
  model = MyModel(feature_columns=params['feature_columns'])

  if mode == tf.estimator.ModeKeys.PREDICT:
    return tf.estimator.EstimatorSpec(mode=mode,
                                      predictions={'predicted_rankings': model(features)})

  with tf.GradientTape() as tape:
    predicted_rankings = model(features)
    loss_obj = tf.keras.losses.MeanSquaredError()
    # | the loss is the square root of the mean squared error
    loss = tf.sqrt(loss_obj(labels, predicted_rankings))

  # | Evaluation Metrics (the value returned by update_state() is not needed, the metric objects are passed on)
  rmse_obj = tf.metrics.RootMeanSquaredError(name='rmse_obj')
  rmse_obj.update_state(y_true=labels, y_pred=predicted_rankings)

  mae_obj = tf.metrics.MeanAbsoluteError(name='mae_obj')
  mae_obj.update_state(y_true=labels, y_pred=predicted_rankings)

  eval_metric_ops = {'rmse': rmse_obj, 'mae': mae_obj}

  # | Tensorboard
  tf.summary.scalar('loss', loss)
  tf.summary.scalar('mae', mae_obj.result())
  tf.summary.scalar('rmse', rmse_obj.result())

  if mode == tf.estimator.ModeKeys.TRAIN:
    trainable_variables = model.embedding_layer.trainable_variables
    gradients = tape.gradient(loss, trainable_variables)
    # | the learning rate can be overridden through params, the former hard-coded value stays the default
    optimizer_obj = tf.keras.optimizers.Adam(learning_rate=params.get('learning_rate', 0.0001))
    # | let the optimizer count its steps on the global step variable
    optimizer_obj.iterations = tf.compat.v1.train.get_or_create_global_step()
    train_op = optimizer_obj.apply_gradients(zip(gradients, trainable_variables))

    return tf.estimator.EstimatorSpec(mode=mode,
                                      predictions=predicted_rankings,
                                      loss=loss,
                                      train_op=train_op,
                                      eval_metric_ops=eval_metric_ops)

  if mode == tf.estimator.ModeKeys.EVAL:
    return tf.estimator.EstimatorSpec(mode=mode,
                                      loss=loss,
                                      eval_metric_ops=eval_metric_ops)

  # | fail loudly instead of implicitly returning None for an unknown mode
  raise ValueError('Unsupported mode: {}'.format(mode))


# Estimator driven by my_model_fn; it is given 'model_dir' as its model directory and
# hands the feature columns to the model function through `params`.
model = tf.estimator.Estimator(
    model_dir='model_dir',
    model_fn=my_model_fn,
    params={
        'feature_columns': feature_columns
    })

In [None]:
# Train on the cleaned ratings file, calling input_fn with its default arguments.
model.train(input_fn=lambda: input_fn(cleaned_ratings_path))

In [None]:
# Open TensorBoard on 'model_dir', the directory passed to the Estimator as model_dir.
%tensorboard --logdir model_dir

In [None]:
# NOTE(review): 1893 is a hard-coded process id, presumably of a TensorBoard instance from an earlier session - confirm the pid before running this cell.
!kill 1893

In [None]:
# Evaluate with the same input function and file that were used for training.
# NOTE(review): there is no held-out split here, so these metrics describe the fit on the training data.
model.evaluate(input_fn=lambda: input_fn(cleaned_ratings_path))

In [None]:
# Predict on the first 10 batches of predict_input_fn (called with its default batch_size of 1).
predictions = model.predict(input_fn=lambda: predict_input_fn(cleaned_ratings_path).take(10))

# Consume the result of model.predict() into a list so it is displayed as the cell output.
list(predictions)

In [None]:
rm -rf model_dir


In [None]:
ls model_dir

In [None]:
rm graph*