## Content Based Filtering

In [None]:
import numpy as np
import numpy.ma as ma
import pandas as pd
import tensorflow as tf
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split
import tabulate
from recsysNN_utils import *
from public_tests_cbf import *

# Show floating-point values in pandas output with one digit after the decimal point.
pd.set_option("display.precision", 1)

# NOTE(review): tensorflow is already imported as `tf` earlier in this cell; this repeat is redundant.
import tensorflow as tf
# Only surface TensorFlow log messages at ERROR level.
tf.get_logger().setLevel('ERROR')

In [None]:
# Load the training matrices, their column names, the item catalogue and the lookup tables.
item_train, user_train, y_train, item_features, user_features, item_vecs, movie_dict, user_to_genre=load_data()

# Column offsets into the raw matrices. They are declared before the feature
# counts so the counts are derived from the same offsets that are used to slice
# the model inputs (user_train[:, u_s:], item_train[:, i_s:]).
uvs=3  # offset handed to the printing helpers for user vectors
ivs=3  # offset handed to the printing helpers for item vectors
u_s=3  # first user column that is fed to the network
i_s=1  # first item column that is fed to the network
scale_data=True  # standardise the inputs before training

# Width of each network input: every column from the start offset onward
num_user_features=user_train.shape[1]-u_s
num_item_features=item_train.shape[1]-i_s

print(f"Number of training vectors: {len(item_train)}")

In [None]:
# Preview the user training data with the project helper (maxcount=5 presumably caps the rows shown — confirm in recsysNN_utils).
pprint_train(user_train, user_features, uvs, u_s, maxcount=5)

In [None]:
# Preview the item training data; user=False selects the helper's item layout (presumably — confirm in recsysNN_utils).
pprint_train(item_train, item_features, ivs, i_s, maxcount=5, user=False)

In [None]:
# Standardise the item and user matrices; the fitted scalers are kept for later cells

if scale_data:
  # Hold on to the unscaled arrays for the round-trip check at the end
  item_train_save = item_train
  user_train_save = user_train

  # fit() learns the per-column mean/std and returns the scaler itself
  scaler_item = StandardScaler().fit(item_train)
  scaler_user = StandardScaler().fit(user_train)

  # Replace each matrix with its standardised version
  item_train = scaler_item.transform(item_train)
  user_train = scaler_user.transform(user_train)

  # Undoing the scaling should reproduce the saved originals
  print(np.allclose(item_train_save, scaler_item.inverse_transform(item_train)))
  print(np.allclose(user_train_save, scaler_user.inverse_transform(user_train)))

In [None]:
# Split the three arrays in ONE call: train_test_split applies the same shuffled
# indices to every array it is given, so row i of the item, user and target
# arrays is guaranteed to land in the same partition. (Three separate calls only
# stay aligned as long as the seed and the array lengths happen to match.)
(item_train, item_test,
 user_train, user_test,
 y_train, y_test)=train_test_split(item_train, user_train, y_train,
                                   train_size=0.80, shuffle=True, random_state=1)

print(f"Item training data: {item_train.shape}")
print(f"Item testing data: {item_test.shape}")

In [None]:
# Preview user_train again; at this point it holds the training split produced by the previous cell.
pprint_train(user_train, user_features, uvs, u_s, maxcount=5)

In [None]:
# Rescale the ratings with a min-max scaler fitted on the training targets,
# which maps the training ratings onto the interval [-1, 1]

# The scaler works on 2-D input, hence the reshape of each target vector to one column
scaler = MinMaxScaler(feature_range=(-1, 1)).fit(y_train.reshape(-1, 1))

# Apply the training-set scaling to both partitions
y_norm_train, y_norm_test = (
  scaler.transform(targets.reshape(-1, 1)) for targets in (y_train, y_test)
)

print(y_norm_train.shape, y_norm_test.shape)

### Neural Network for Content Based Learning

In [None]:
# Size of the embedding each tower produces
num_outputs=32
tf.random.set_seed(1)

# User tower: maps the user features to a num_outputs-dimensional vector
user_NN=tf.keras.models.Sequential([
  tf.keras.layers.Dense(256, activation='relu'),
  tf.keras.layers.Dense(128, activation='relu'),
  tf.keras.layers.Dense(num_outputs, activation='linear')
])

# Item tower: same architecture, separate weights
item_NN=tf.keras.models.Sequential([
  tf.keras.layers.Dense(256, activation='relu'),
  tf.keras.layers.Dense(128, activation='relu'),
  tf.keras.layers.Dense(num_outputs, activation='linear')
])

# Create input and point to the base network
input_user=tf.keras.layers.Input(shape=(num_user_features,))
vu=user_NN(input_user)
# Scale every user vector to unit length
vu=tf.keras.layers.Lambda(lambda x: tf.linalg.l2_normalize(x, axis=1))(vu)

input_item=tf.keras.layers.Input(shape=(num_item_features,))
vm=item_NN(input_item)
# Scale every item vector to unit length
vm=tf.keras.layers.Lambda(lambda x: tf.linalg.l2_normalize(x, axis=1))(vm)

# Compute dot product of two vectors vu and vm
output=tf.keras.layers.Dot(axes=1)([vu, vm])

# Specify inputs and output to the model.
# tf.keras.Model is referenced through the tf import at the top of the notebook;
# a bare `Model` is never imported here and would only resolve if one of the
# star imports happened to re-export it.
model=tf.keras.Model([input_user, input_item], output)

model.summary()

In [None]:
tf.random.set_seed(1)

# Adam with a learning rate of 0.01, minimising the mean squared error
optimizer = tf.keras.optimizers.Adam(learning_rate=0.01)
cost_fn = tf.keras.losses.MeanSquaredError()

model.compile(loss=cost_fn, optimizer=optimizer)

In [None]:
tf.random.set_seed(1)

# Train for 30 epochs on the user/item columns from u_s / i_s onward,
# against the scaled training targets
model.fit(
  x=[user_train[:, u_s:], item_train[:, i_s:]],
  y=y_norm_train,
  epochs=30,
)

In [None]:
# Loss on the held-out split, using the same column slices as in training
model.evaluate(
  x=[user_test[:, u_s:], item_test[:, i_s:]],
  y=y_norm_test,
)

### Predictions for a New User

In [None]:
# Hand-written profile of a new user, declared in the same order the values
# appear in the vector below: id, rating count, rating average, then per-genre values.
new_user_id = 5000
new_rating_count = 3
new_rating_ave = 1.0

new_action = 1.0
new_adventure = 1
new_animation = 1
new_childrens = 1
new_comedy = 5
new_crime = 1
new_documentary = 1
new_drama = 1
new_fantasy = 1
new_horror = 1
new_mystery = 1
new_romance = 5
new_scifi = 5
new_thriller = 1

# Single-row 2-D array holding the profile
user_vec = np.array([[
  new_user_id, new_rating_count, new_rating_ave,
  new_action, new_adventure, new_animation, new_childrens,
  new_comedy, new_crime, new_documentary,
  new_drama, new_fantasy, new_horror, new_mystery,
  new_romance, new_scifi, new_thriller,
]])

In [None]:
# Generate and replicate the user vector to match the number of movies in the dataset
user_vecs=gen_user_vecs(user_vec, len(item_vecs))

# Scale the vector and make predictions for all movies
# (the three fitted scalers are handed to the helper; the sorted_* outputs are
# presumably ordered by predicted rating — confirm in recsysNN_utils)
sorted_idx, sorted_ypu, sorted_items, sorted_user=predict_uservec(user_vecs, item_vecs, model, u_s, i_s,
                                                                  scaler, scaler_user, scaler_item, scaledata=scale_data)

# Show the first 10 entries of the sorted predictions via the project helper
print_pred_movies(sorted_ypu, sorted_user, sorted_items, movie_dict, maxcount=10)

### Predictions for an Existing User

In [None]:
# Id of the existing user to generate predictions for
uid=36

# NOTE(review): user_train is the scaled training split at this point, so the helper
# receives values reconstructed by inverse_transform, which can differ from the raw
# data by floating-point round-off. If get_user_vecs looks the user up by exact
# equality on the id column, passing the saved unscaled array (user_train_save)
# would be more robust — confirm against the helper before changing.
# NOTE(review): scaler_user only exists when scale_data is True.
user_vecs, y_vecs=get_user_vecs(uid, scaler_user.inverse_transform(user_train), item_vecs, user_to_genre)

sorted_idx, sorted_ypu, sorted_items, sorted_user=predict_uservec(user_vecs, item_vecs, model, u_s, i_s,
                                                                  scaler, scaler_user, scaler_item, scaledata=scale_data)

# Reorder the user's y_vecs values with the same index order as the predictions
sorted_y=y_vecs[sorted_idx]

print_existing_user(sorted_ypu, sorted_y.reshape(-1, 1), sorted_user, sorted_items, item_features, ivs, uvs, movie_dict, maxcount=10)

### Finding Similar Items

One measure of similarity is the squared distance between two item vectors $\mathbf{v_m^{(k)}}$ and $\mathbf{v_m^{(i)}}$ (the smaller the value, the more alike the two items are):
$$\left\Vert \mathbf{v_m^{(k)}} - \mathbf{v_m^{(i)}}  \right\Vert^2 = \sum_{l=1}^{n}(v_{m_l}^{(k)} - v_{m_l}^{(i)})^2\tag{1}$$

In [None]:
def sq_dist(a, b):
  """
  Return the squared Euclidean distance between two vectors.

  Args:
    a (array_like): first vector
    b (array_like): second vector, same shape as a

  Returns:
    d (scalar): sum of the squared element-wise differences of a and b
  """
  # asarray lets plain lists/tuples be passed as well as ndarrays
  diff=np.asarray(a)-np.asarray(b)
  # np.sum reduces over every element in one vectorised call; the builtin sum()
  # loops in Python and would only collapse the first axis of an N-D input.
  d=np.sum(np.square(diff))
  return d

In [None]:
# Run the checks provided for sq_dist (test_sq_dist is not defined in this notebook; presumably it comes from public_tests_cbf).
test_sq_dist(sq_dist)

In [None]:
# Two 3-element vectors whose components differ by 0.1 each
a, b = np.array([1.1, 2.1, 3.1]), np.array([1.0, 2.0, 3.0])

dist = sq_dist(a, b)
print(f"Squared distance between a and b: {dist}")

In [None]:
# Stand-alone model that maps item features to their unit-length embedding.
# It calls the same item_NN object as the rating model, so it shares those weights.
input_item_m=tf.keras.layers.Input(shape=(num_item_features,))
vm_m=item_NN(input_item_m)
vm_m=tf.keras.layers.Lambda(lambda x: tf.linalg.l2_normalize(x, axis=1))(vm_m)
# tf.keras.Model is reached through the tf import at the top of the notebook;
# a bare `Model` is never imported here and would only resolve via a star import.
model_m=tf.keras.Model(input_item_m, vm_m)
model_m.summary()

In [None]:
# Scale the item catalogue with the scaler fitted on the training items,
# then embed every item using the columns from i_s onward
scaled_item_vecs=scaler_item.transform(item_vecs)
vms=model_m.predict(scaled_item_vecs[:, i_s:])
# Report the shape of the predicted matrix `vms` (not `vm`, which is the symbolic
# tensor created while building the rating model)
print(f"Size of all predicted movie feature vectors: {vms.shape}")

In [None]:
count=50  # number of movies to list in the table
dim=len(vms)
dist=np.zeros((dim, dim))

# The squared distance is symmetric, so evaluate each unordered pair once and
# mirror it. The diagonal stays at its initial zero and is masked out below.
for i in range(dim):
  for j in range(i+1, dim):
    dist[i, j]=dist[j, i]=sq_dist(vms[i, :], vms[j, :])

# Mask the diagonal so a movie is never picked as its own closest match
m_dist=ma.masked_array(dist, mask=np.identity(dist.shape[0]))

disp=[['movie1', 'genres', 'movie2', 'genres']]
# Bound the loop by the catalogue size so a catalogue with fewer than `count`
# movies does not index past the end of the distance matrix
for i in range(min(count, dim)):
  # Closest other movie to movie i
  min_idx=np.argmin(m_dist[i])
  # Column 0 of item_vecs is used as the key into movie_dict
  movie1_id=int(item_vecs[i, 0])
  movie2_id=int(item_vecs[min_idx, 0])
  genre1, _=get_item_genre(item_vecs[i, :], ivs, item_features)
  genre2, _=get_item_genre(item_vecs[min_idx, :], ivs, item_features)

  disp.append([
    movie_dict[movie1_id]['title'], genre1,
    movie_dict[movie2_id]['title'], genre2
  ])

# Render as an HTML table; the first row of disp supplies the column headers
table=tabulate.tabulate(disp, tablefmt='html', headers='firstrow', floatfmt=[".1f", ".1f", ".0f", ".2f", ".2f"])
table