<a href="https://colab.research.google.com/github/nick-ching23/Movie_Recommender/blob/main/Movie_Rec_Neural_Net.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>



# Implementation of content-based filtering using deep learning
By Nicholas Ching


In [96]:
import numpy as np
import numpy.ma as ma
import pandas as pd
import tensorflow as tf
import tabulate
import csv
from google.colab import files
from numpy import genfromtxt
from tensorflow import keras
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split
from collections import defaultdict
from IPython.display import HTML

# Show floating-point values in pandas output with one digit after the
# decimal point.
pd.set_option("display.precision", 1)

In [97]:
# Pandas display settings for the outputs below: no cap on the number of
# columns or rows shown, no truncation of cell contents, and wide frames are
# kept on one line instead of being wrapped across several blocks.
display_options = {
    'display.max_columns': None,
    'display.expand_frame_repr': False,
    'display.max_colwidth': None,
    'display.max_rows': None,
}
for option_name, option_value in display_options.items():
    pd.set_option(option_name, option_value)

In [None]:
"""
Upload the following files from local computer:
  - content_item_train.csv
  - user_item_train.csv
  - y_train.csv
  - item_vecs.csv
  - movies.csv
"""
# Prompt for the files listed above via Colab's upload helper. The loading
# cell reads them by bare filename, so they are expected to end up in the
# working directory (presumably files.upload() saves them there - confirm).
# `uploaded` keeps the helper's return value; no later visible cell uses it.
uploaded = files.upload()


In [None]:
# Names of the uploaded CSV files, relative to the working directory.
movies_file_path = 'movies.csv'
item_vecs_filepath = 'item_vecs.csv'
y_filepath = 'y_train.csv'
user_filepath = 'user_item_train.csv'
content_filepath = 'content_item_train.csv'

# Feature tables, one DataFrame each for users and items.
user_train = pd.read_csv(user_filepath)
item_train = pd.read_csv(content_filepath)

# genfromtxt parses every line of the file, so the first parsed entry is
# discarded (presumably the header line, which would parse as NaN - confirm).
y_train = np.genfromtxt(y_filepath, delimiter=',')[1:]

# Item vectors used at prediction time, as a DataFrame and as a plain array.
item_vecs = pd.read_csv(item_vecs_filepath)
item_vecs_array = item_vecs.to_numpy()


In [None]:
# Remove the "title" column before converting the item vectors to an array.
# `item_vecs` is reassigned in place, so re-running this cell used to raise
# KeyError (the column was already gone); errors="ignore" makes the cell safe
# to run more than once.
item_vecs = item_vecs.drop(columns="title", errors="ignore")
item_vecs_array = item_vecs.to_numpy()


In [None]:
# Index of the first column that is fed to each network.
u_s = 3  # user rows: skip user id, rating count and average rating
i_s = 1  # item rows: skip movie id

# Input width of each network once those leading columns are skipped.
num_user_features = user_train.shape[1] - u_s
num_item_features = item_train.shape[1] - i_s

<h2>Formatting data into usable data frames </h2>


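- User training data: each user's average rating per genre (preceded by the user id, rating count and overall rating average, which are skipped at training time).

- Content (item) training data: release year, average rating and one-hot encoded genres (preceded by the movie id, which is skipped at training time).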


In [None]:
# Keep handles on the raw inputs so the scaling can be verified further down.
user_train_unscaled = user_train
item_train_unscaled = item_train
y_train_unscaled = y_train

# Standardise the item and user features. The fitted scalers are kept because
# the prediction cells reuse them on new inputs.
# NOTE(review): the scalers are fitted on the full data set, before the
# train/test split performed in the next cell.
scalerItem = StandardScaler().fit(item_train)
item_train = scalerItem.transform(item_train)

scalerUser = StandardScaler().fit(user_train)
user_train = scalerUser.transform(user_train)

# Targets are rescaled into [-1, 1] as a single column.
y_column = y_train.reshape(-1, 1)
scalerTarget = MinMaxScaler((-1, 1)).fit(y_column)
y_train = scalerTarget.transform(y_column)

# Round-trip check: undoing the scaling should reproduce the raw inputs.
for raw_values, fitted_scaler, scaled_values in (
    (item_train_unscaled, scalerItem, item_train),
    (user_train_unscaled, scalerUser, user_train),
):
    print(np.allclose(raw_values, fitted_scaler.inverse_transform(scaled_values)))


In [None]:
# Split users, items and targets in ONE call so the three arrays are shuffled
# with the same permutation and row i still describes the same (user, item,
# rating) triple afterwards. A single call also raises if the arrays do not
# have the same number of rows, instead of silently misaligning them.
user_train, user_test, item_train, item_test, y_train, y_test = train_test_split(
    user_train, item_train, y_train,
    train_size=0.80, shuffle=True, random_state=1,
)

# Report the shape of each array (the user and y lines previously printed the
# item shapes by mistake).
print(f"movie training data shape: {item_train.shape}")
print(f"movie test data shape: {item_test.shape}")
print(f"user training data shape: {user_train.shape}")
print(f"user test data shape: {user_test.shape}")
print(f"y training data shape: {y_train.shape}")
print(f"y test data shape: {y_test.shape}")



In [None]:
num_outputs = 32  # length of the embedding vector produced by each tower
tf.random.set_seed(1)

# Two towers with identical layer layouts: one embeds a user feature vector,
# the other embeds an item feature vector, both into `num_outputs` dimensions.
user_neural_net = tf.keras.models.Sequential([
    tf.keras.layers.Dense(256, activation='relu'),
    tf.keras.layers.Dense(128, activation='relu'),
    tf.keras.layers.Dense(num_outputs),
])

item_neural_net = tf.keras.models.Sequential([
    tf.keras.layers.Dense(256, activation='relu'),
    tf.keras.layers.Dense(128, activation='relu'),
    tf.keras.layers.Dense(num_outputs),
])

# create the user input and point to the base network
# `shape` must be a tuple: `(n)` is just the integer n, `(n,)` is a 1-tuple.
input_user = tf.keras.layers.Input(shape=(num_user_features,))
vu = user_neural_net(input_user)
# L2-normalise inside a Lambda layer so the op is part of the Keras graph;
# calling tf.linalg.l2_normalize directly on a symbolic Keras tensor is
# rejected by Keras 3.
vu = tf.keras.layers.Lambda(lambda x: tf.linalg.l2_normalize(x, axis=1))(vu)

# create the item input and point to the base network
input_item = tf.keras.layers.Input(shape=(num_item_features,))
vm = item_neural_net(input_item)
vm = tf.keras.layers.Lambda(lambda x: tf.linalg.l2_normalize(x, axis=1))(vm)

# compute the dot product of the two (unit-length) vectors vu and vm
output = tf.keras.layers.Dot(axes=1)([vu, vm])

# specify the inputs and output of the model
model = tf.keras.Model([input_user, input_item], output)



In [None]:
# Re-seed TensorFlow so this cell behaves the same when run on its own.
tf.random.set_seed(1)

# Train with Adam, minimising the mean squared error of the predictions.
opt = keras.optimizers.Adam(learning_rate=0.01)
cost_fn = tf.keras.losses.MeanSquaredError()
model.compile(loss=cost_fn, optimizer=opt)

In [None]:
# Fix the TensorFlow seed immediately before training.
tf.random.set_seed(1)

# Skip the leading id/statistics columns of each array (offsets u_s and i_s)
# and train for 30 epochs against the scaled targets.
train_inputs = [user_train[:, u_s:], item_train[:, i_s:]]
model.fit(train_inputs, y_train, epochs=30)

In [None]:
# Loss on the held-out split, using the same column offsets as in training.
test_inputs = [user_test[:, u_s:], item_test[:, i_s:]]
model.evaluate(test_inputs, y_test)

<h1> Predictions </h1>
<p> for a new user </p>


In [None]:
# Lookup table: movie id -> {"title": ..., "genres": ...}, read from the
# movies CSV (first column id, second title, third genres).
movie_dict = defaultdict(dict)

with open(movies_file_path, newline='') as csvfile:
    reader = csv.reader(csvfile, delimiter=',', quotechar='"')
    # Skip the header row; the default keeps an empty file from raising
    # StopIteration.
    next(reader, None)
    for row in reader:
        if not row:
            # csv.reader yields [] for a blank line; indexing it would fail.
            continue
        movie_id = int(row[0])
        movie_dict[movie_id]["title"] = row[1]
        movie_dict[movie_id]["genres"] = row[2]

In [None]:
# Profile of the hypothetical new user. The first three values are the id and
# rating statistics; the rest are this user's rating for each genre.
new_user_id = 5000
new_rating_count = 2
new_rating_ave = 0.0

new_action = 2.5
new_adventure = 4.0
new_animation = 5.0
new_childrens = 0.0
new_comedy = 0.0
new_crime = 0.0
new_documentary = 0.5
new_drama = 5
new_fantasy = 0.0
new_film_noir = 2.5
new_horror = 0.5
new_imax = 2.5
new_musical = 0.0
new_mystery = 0.0
new_romance = 2.5
new_scifi = 0.0
new_thriller = 1.5
new_war = 5.0
new_western = 0.5

# Genre ratings in the order they are placed in the vector.
# NOTE(review): this order presumably has to match the user training columns -
# confirm against the CSV header.
genre_ratings = [
    new_action, new_adventure, new_animation, new_childrens,
    new_comedy, new_crime, new_documentary,
    new_drama, new_fantasy, new_film_noir, new_horror, new_imax,
    new_musical, new_mystery,
    new_romance, new_scifi, new_thriller, new_war, new_western,
]

# Single-row 2-D array: id, rating count, rating average, then the genres.
user_vec = np.array([[new_user_id, new_rating_count, new_rating_ave, *genre_ratings]])


In [None]:
# Repeat the single new-user row once per candidate movie so both model inputs
# have the same number of rows.
user_vecs = np.tile(user_vec, (len(item_vecs_array), 1))

# scale our user and item vectors with the scalers fitted on the training data
suser_vecs = scalerUser.transform(user_vecs)
sitem_vecs = scalerItem.transform(item_vecs_array)

# make a prediction, skipping the same leading columns as during training
y_p = model.predict([suser_vecs[:, u_s:], sitem_vecs[:, i_s:]])

# unscale y prediction
y_pu = scalerTarget.inverse_transform(y_p)

# sort the results, highest prediction first
# (argsort is ascending, so the predictions are negated)
sorted_index = np.argsort(-y_pu, axis=0).reshape(-1).tolist()

# Show only the leading indices; printing the full list dumps one entry per
# movie into the cell output.
print(sorted_index[:10])

sorted_ypu = y_pu[sorted_index]
sorted_items = item_vecs_array[sorted_index]  # using unscaled vectors for display


In [None]:
# Build a table of the best-scoring movies (at most 100 rows) and print the
# first ten of them.
max_rows = 100
column_names = ["pred_y", "movie id", "rating ave", "title", "genres"]

table_rows = []
for rank in range(min(max_rows, sorted_ypu.shape[0])):
    movie_id = sorted_items[rank, 0].astype(int)
    movie_info = movie_dict[movie_id]
    table_rows.append([
        np.around(sorted_ypu[rank, 0], 1),                     # predicted rating
        movie_id,
        np.around(sorted_items[rank, 2].astype(float), 1),     # value shown as "rating ave"
        movie_info['title'],
        movie_info['genres'],
    ])

df = pd.DataFrame(table_rows, columns=column_names)

# Print as plain text without the index column.
print(df.head(10).to_string(index=False))