<a href="https://colab.research.google.com/github/nickgreenquist/recsys/blob/main/RecSys_Content_Based_Filtering.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
import numpy as np
import numpy.ma as ma
from numpy import genfromtxt
from collections import defaultdict
import pandas as pd
import tensorflow as tf
from tensorflow import keras
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split
import tabulate
pd.set_option("display.precision", 1)

In [3]:
import argparse
import pathlib
import tempfile
from zipfile import ZipFile

import requests
import os

MOVIELENS_URLS = {
    'latest': "http://files.grouplens.org/datasets/movielens/ml-latest.zip",
    'latest-small': "http://files.grouplens.org/datasets/movielens/ml-latest-small.zip"
}

In [4]:
# @title Download Movielens
def download_movielens(
        dest='movielens',
        package='latest-small',
        mkdir=True,
        verbose=False,
    ):
    url = MOVIELENS_URLS.get(package)
    if not url:
        raise Exception(f"Movie lens package: {package} was not found.")
    if verbose is True:
        print(f"Downloading from {url}")
    output_dir = pathlib.Path(dest).resolve()
    if not output_dir.exists():
        if mkdir:
            output_dir.mkdir(exist_ok=True)
        else:
            raise Exception(f"{output_dir} does not exist. Pass `mkdir=True`")
    with requests.get(url, stream=True) as r:
        r.raise_for_status()
        total_size_in_bytes= int(r.headers.get('content-length', 0))
        with tempfile.NamedTemporaryFile(mode='rb+') as temp_f:
            downloaded = 0
            dl_iteration = 0
            chunk_size = 8192
            total_chunks = total_size_in_bytes / chunk_size if total_size_in_bytes else 100
            for chunk in r.iter_content(chunk_size=chunk_size):
                if verbose is True:
                    downloaded += chunk_size
                    dl_iteration += 1
                    percent = (100 * dl_iteration * 1.0/total_chunks)
                    if dl_iteration % 10 == 0 and percent < 100:
                        print(f'Completed {percent:2f}%')
                    elif percent >= 99.9:
                        print(f'Download completed. Now unzipping...')
                temp_f.write(chunk)
            with ZipFile(temp_f, 'r') as zipf:
                zipf.extractall(output_dir)
                if verbose is True:
                    print(f"\n\nUnzipped.\n\nFiles downloaded and unziped to:\n\n{pathlib.Path(dest).resolve()}")

In [5]:
download_movielens(
    "movielens",
    mkdir=True,
    package='latest-small',
    verbose=True
    )

Downloading from http://files.grouplens.org/datasets/movielens/ml-latest-small.zip
Completed 8.374548%
Completed 16.749097%
Completed 25.123645%
Completed 33.498194%
Completed 41.872742%
Completed 50.247290%
Completed 58.621839%
Completed 66.996387%
Completed 75.370936%
Completed 83.745484%
Completed 92.120032%
Download completed. Now unzipping...


Unzipped.

Files downloaded and unziped to:

/content/movielens


In [6]:
os.listdir('movielens/ml-latest-small')

['movies.csv', 'README.txt', 'tags.csv', 'ratings.csv', 'links.csv']

In [7]:
ratings_df = pd.read_csv('movielens/ml-latest-small/ratings.csv')
movies_df = pd.read_csv('movielens/ml-latest-small/movies.csv')

# CREATE DATASET

In [8]:
def munge_title(title):
    i = title.rfind(' (')
    if i != -1:
        title = title[:i]
    for suff_word in ['The', 'A', 'An']:
        suffix = ', {}'.format(suff_word)
        if title.endswith(suffix):
            title = suff_word + ' ' + title[:-len(suffix)]
    return title

def get_year(title):
    l = title.rfind('(') + 1
    try:
        return int(title[l:l+4])
    except ValueError:
        print(title, end='\t')
        return 0

def get_genre_set(genres):
  return set(genres.split('|'))

In [9]:
# create lookups for MOVIE metadata

all_genres = set()
all_genres_sorted = []

movie_to_title = {}
movie_to_year = {}
movie_to_genres = {}

movie_ids = movies_df.movieId.tolist()
titles = movies_df.title.tolist()
genres = movies_df.genres.tolist()

for i in range(len(movie_ids)):
  id = movie_ids[i]
  title = titles[i]

  movie_to_title[id] = munge_title(title)
  movie_to_year[id] = get_year(title)
  movie_to_genres[id] = set(genres[i].split('|'))

  all_genres.update(movie_to_genres[id])

all_genres_sorted = sorted(list(all_genres))

Babylon 5	Ready Player One	Hyena Road	The Adventures of Sherlock Holmes and Doctor Watson	Nocturnal Animals	Paterson	Moonlight	The OA	Cosmos	Maria Bamford: Old Baby	Generation Iron 2	Black Mirror	

In [10]:
movie_to_num_users = {}
movie_to_avg_rating = {}

movie_ids = ratings_df.movieId.tolist()
ratings = ratings_df.rating.tolist()

for i in range(len(movie_ids)):
  id = movie_ids[i]

  if id not in movie_to_num_users: movie_to_num_users[id] = 0
  if id not in movie_to_avg_rating: movie_to_avg_rating[id] = 0

  movie_to_num_users[id] += 1
  movie_to_avg_rating[id] += ratings[i]

for id in movie_to_avg_rating.keys():
  movie_to_avg_rating[id] /= movie_to_num_users[id]

In [11]:
# create lookup for USER metadata

user_to_num_ratings = {}
user_to_avg_rating = {}
user_to_genre_num_ratings = {}
user_to_genre_to_avg_rating = {}

user_ids = ratings_df.userId.tolist()
ratings = ratings_df.rating.tolist()
movie_ids = ratings_df.movieId.tolist()

for i in range(len(user_ids)):
  user_id = user_ids[i]
  movie_id = movie_ids[i]
  rating = ratings[i]

  if user_id not in user_to_num_ratings: user_to_num_ratings[user_id] = 0
  if user_id not in user_to_avg_rating: user_to_avg_rating[user_id] = 0
  if user_id not in user_to_genre_num_ratings: user_to_genre_num_ratings[user_id] = {}
  if user_id not in user_to_genre_to_avg_rating: user_to_genre_to_avg_rating[user_id] = {}

  user_to_num_ratings[user_id] += 1
  user_to_avg_rating[user_id] += rating

  if movie_id in movie_to_genres:
    for genre in movie_to_genres[movie_id]:
      if genre not in user_to_genre_num_ratings[user_id]: user_to_genre_num_ratings[user_id][genre] = 0
      if genre not in user_to_genre_to_avg_rating[user_id]: user_to_genre_to_avg_rating[user_id][genre] = 0

      user_to_genre_num_ratings[user_id][genre] += 1
      user_to_genre_to_avg_rating[user_id][genre] += rating

for id in user_to_avg_rating.keys():
  user_to_avg_rating[id] /= user_to_num_ratings[id]

for id in user_to_genre_to_avg_rating.keys():
  for genre in user_to_genre_to_avg_rating[id].keys():
    user_to_genre_to_avg_rating[id][genre] /= user_to_genre_num_ratings[id][genre]

In [44]:
# Construct USER training df, ITEM training df, and label df

user_d = {}
movie_d = {}
y_train = []
i = 0

user_ids = ratings_df.userId.tolist()
movie_ids = ratings_df.movieId.tolist()
ratings = ratings_df.rating.tolist()

for j in range(len(user_ids)):
  user_id = user_ids[j]
  movie_id = movie_ids[j]
  user_d[i] = {
      'user_id': user_id,
      'rating_count': user_to_num_ratings[user_id],
      'rating_avg': user_to_avg_rating[user_id],
      **{genre: user_to_genre_to_avg_rating[user_id][genre] if genre in user_to_genre_to_avg_rating[user_id] else 0 for genre in all_genres_sorted},
  }

  movie_d[i] = {
      'movie_id': movie_id,
      # 'movie_title': movie_to_title[movie_id],
      'movie_year': movie_to_year[movie_id],
      'rating_avg': movie_to_avg_rating[movie_id],
      **{genre: 1 if movie_id in movie_to_genres and genre in movie_to_genres[movie_id] else 0 for genre in all_genres_sorted},
  }

  y_train.append(ratings[j])
  i += 1

# converat label vector to numpy
y_train = np.array(y_train)

In [45]:
user_train = pd.DataFrame.from_dict(user_d, orient ='index')

In [46]:
item_train = pd.DataFrame.from_dict(movie_d, orient ='index')

In [47]:
len(user_train), len(item_train)

(100836, 100836)

In [48]:
user_train.head(3)

Unnamed: 0,user_id,rating_count,rating_avg,(no genres listed),Action,Adventure,Animation,Children,Comedy,Crime,...,Film-Noir,Horror,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,232,4.4,0.0,4.3,4.4,4.7,4.5,4.3,4.4,...,5.0,3.5,0.0,4.7,4.2,4.3,4.2,4.1,4.5,4.3
1,1,232,4.4,0.0,4.3,4.4,4.7,4.5,4.3,4.4,...,5.0,3.5,0.0,4.7,4.2,4.3,4.2,4.1,4.5,4.3
2,1,232,4.4,0.0,4.3,4.4,4.7,4.5,4.3,4.4,...,5.0,3.5,0.0,4.7,4.2,4.3,4.2,4.1,4.5,4.3


In [49]:
item_train.head(3)

Unnamed: 0,movie_id,movie_year,rating_avg,(no genres listed),Action,Adventure,Animation,Children,Comedy,Crime,...,Film-Noir,Horror,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,1995,3.9,0,0,1,1,1,1,0,...,0,0,0,0,0,0,0,0,0,0
1,3,1995,3.3,0,0,0,0,0,1,0,...,0,0,0,0,0,1,0,0,0,0
2,6,1995,3.9,0,1,0,0,0,0,1,...,0,0,0,0,0,0,0,1,0,0


In [50]:
print(f"y_train[:5]: {y_train[:5]}")

y_train[:5]: [4. 4. 4. 5. 5.]


# DATA PREPROCESSING

In [51]:
num_user_features = user_train.shape[1] - 3  # remove userid, rating count and ave rating during training
num_item_features = item_train.shape[1] - 1  # remove movie id at train time

uvs = 3  # user genre vector start
ivs = 3  # item genre vector start
u_s = 3  # start of columns to use in training, user
i_s = 1  # start of columns to use in training, items

scaledata = True  # applies the standard scalar to data if true

print(f"Number of training vectors: {len(item_train)}")

Number of training vectors: 100836


In [66]:
# scale user training data
user_train_save = user_train

scalerUser = StandardScaler()
scalerUser.fit(user_train)
user_train_scaled = scalerUser.transform(user_train)

# scale item training data
item_train_save = item_train

scalerItem = StandardScaler()
scalerItem.fit(item_train)
item_train_scaled = scalerItem.transform(item_train)

# verify scaling worked
print(np.allclose(item_train_save, scalerItem.inverse_transform(item_train_scaled)))
print(np.allclose(user_train_save, scalerUser.inverse_transform(user_train_scaled)))

True
True


In [53]:
item_train_train, item_test = train_test_split(item_train_scaled, train_size=0.90, shuffle=True, random_state=1)
user_train_train, user_test = train_test_split(user_train_scaled, train_size=0.90, shuffle=True, random_state=1)
y_train_train, y_test = train_test_split(y_train, train_size=0.90, shuffle=True, random_state=1)
print(f"movie/item training data shape: {item_train_train.shape}")
print(f"movie/item test  data shape: {item_test.shape}")

movie/item training data shape: (90752, 23)
movie/item test  data shape: (10084, 23)


In [54]:
scaler = MinMaxScaler((-1, 1))
scaler.fit(y_train_train.reshape(-1, 1))
ynorm_train = scaler.transform(y_train_train.reshape(-1, 1))
ynorm_test = scaler.transform(y_test.reshape(-1, 1))
print(ynorm_train.shape, ynorm_test.shape)

(90752, 1) (10084, 1)


# MODEL TRAINING

In [55]:
num_outputs = 32
tf.random.set_seed(1)

user_NN = tf.keras.models.Sequential([
    tf.keras.layers.Dense(256, activation='relu'),
    tf.keras.layers.Dense(128, activation='relu'),
    tf.keras.layers.Dense(num_outputs, activation='linear'),
])

item_NN = tf.keras.models.Sequential([
    tf.keras.layers.Dense(256, activation='relu'),
    tf.keras.layers.Dense(128, activation='relu'),
    tf.keras.layers.Dense(num_outputs, activation='linear'),
])

# create the user input and point to the base network
input_user = tf.keras.layers.Input(shape=(num_user_features))
vu = user_NN(input_user)
vu = tf.linalg.l2_normalize(vu, axis=1)

# create the item input and point to the base network
input_item = tf.keras.layers.Input(shape=(num_item_features))
vm = item_NN(input_item)
vm = tf.linalg.l2_normalize(vm, axis=1)

# compute the dot product of the two vectors vu and vm
output = tf.keras.layers.Dot(axes=1)([vu, vm])

# specify the inputs and output of the model
model = keras.Model([input_user, input_item], output)

model.summary()

Model: "model_1"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_3 (InputLayer)           [(None, 20)]         0           []                               
                                                                                                  
 input_4 (InputLayer)           [(None, 22)]         0           []                               
                                                                                                  
 sequential_2 (Sequential)      (None, 32)           42400       ['input_3[0][0]']                
                                                                                                  
 sequential_3 (Sequential)      (None, 32)           42912       ['input_4[0][0]']                
                                                                                            

In [56]:
tf.random.set_seed(1)
cost_fn = tf.keras.losses.MeanSquaredError()
opt = keras.optimizers.Adam(learning_rate=0.1)
model.compile(optimizer=opt,
              loss=cost_fn)

In [57]:
tf.random.set_seed(1)

model.fit([user_train_train[:, u_s:], item_train_train[:, i_s:]], ynorm_train, epochs=25)

Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25
Epoch 19/25
Epoch 20/25
Epoch 21/25
Epoch 22/25
Epoch 23/25
Epoch 24/25
Epoch 25/25


<keras.callbacks.History at 0x786982f5b580>

In [65]:
model.evaluate([user_test[:, u_s:], item_test[:, i_s:]], ynorm_test)



0.11771972477436066

# INFERENCE

In [67]:
# get item embedding for every movie
movie_vectors = []
movie_id_to_vector_list_index = {}
movie_vector_index_to_movie_id = {}

cols = item_train.columns

col_to_data = {}
for col in cols:
  col_to_data[col] = item_train[col].tolist()

for i in range(len(item_train)):
  item_vector = []
  for col in col_to_data.keys():
    item_vector.append(col_to_data[col][i])

  if col_to_data['movie_id'][i] not in movie_id_to_vector_list_index:
    movie_vectors.append(item_vector)
    movie_id_to_vector_list_index[col_to_data['movie_id'][i]] = len(movie_vectors) - 1
    movie_vector_index_to_movie_id[len(movie_vectors) - 1] = col_to_data['movie_id'][i]

movie_vectors = np.array(movie_vectors)

In [68]:
item_train.head(3)

Unnamed: 0,movie_id,movie_year,rating_avg,(no genres listed),Action,Adventure,Animation,Children,Comedy,Crime,...,Film-Noir,Horror,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,1995,3.9,0,0,1,1,1,1,0,...,0,0,0,0,0,0,0,0,0,0
1,3,1995,3.3,0,0,0,0,0,1,0,...,0,0,0,0,0,1,0,0,0,0
2,6,1995,3.9,0,1,0,0,0,0,1,...,0,0,0,0,0,0,0,1,0,0


In [103]:
movie_vectors[movie_id_to_vector_list_index[1]]

array([1.00000000e+00, 1.99500000e+03, 3.92093023e+00, 0.00000000e+00,
       0.00000000e+00, 1.00000000e+00, 1.00000000e+00, 1.00000000e+00,
       1.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
       1.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
       0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
       0.00000000e+00, 0.00000000e+00, 0.00000000e+00])

In [105]:
# scale movie_vectors using all training movie vectors
scalerItemInference = StandardScaler()
scalerItemInference.fit(movie_vectors)
movie_vectors_scaled = scalerItemInference.transform(movie_vectors)

movie_vectors_scaled[movie_id_to_vector_list_index[1]]

array([-0.80945154,  0.03866622,  0.75702469, -0.05923489, -0.48115447,
        2.58944585,  3.86535685,  3.69385683,  1.26134606, -0.37449153,
       -0.21718144, -0.89950891,  3.39097547, -0.09390603, -0.33420867,
       -0.12851782, -0.18830688, -0.25023211, -0.44229263, -0.33477881,
       -0.49101694, -0.20193859, -0.13218965])

In [106]:
all_genres_sorted

['(no genres listed)',
 'Action',
 'Adventure',
 'Animation',
 'Children',
 'Comedy',
 'Crime',
 'Documentary',
 'Drama',
 'Fantasy',
 'Film-Noir',
 'Horror',
 'IMAX',
 'Musical',
 'Mystery',
 'Romance',
 'Sci-Fi',
 'Thriller',
 'War',
 'Western']

In [107]:
new_user_id = 5000
new_rating_count = 5
new_rating_ave = 4.0

user_genre_to_user_vec_index = {
    **{all_genres_sorted[i]: i for i in range(len(all_genres_sorted))},
}

user_vec = np.array([0 for i in range(len(user_train_train[0]))])
user_vec[0] = 5000 # new user_id
user_vec[1] = 5 # new rating_count
user_vec[2] = 4.0 # new rating_vg

# Set genre avg_rating score to '5' to tell model this is a fan of that genre.
user_vec[user_genre_to_user_vec_index['Horror'] + u_s] = 5.0

user_vec = np.array([user_vec for i in range(len(movie_vectors))])

In [113]:
predictions = model.predict([user_vec[:, u_s:], movie_vectors_scaled[:, i_s:]])



In [114]:
predictions_with_titles = []

for i in range(len(predictions)):
  pred_value = predictions[i]
  movie_id = movie_vector_index_to_movie_id[i]
  predictions_with_titles.append((pred_value[0], movie_to_title[movie_id], '|'.join(movie_to_genres[movie_id])))

predictions_with_titles.sort(key=lambda x:x[0], reverse=True)

In [115]:
user_recs_d = {}
i = 0

for tup in predictions_with_titles:
  user_recs_d[i] = {
      'movie_title': tup[1],
      'movie_genres': tup[2],
      'score': tup[0]
  }
  i += 1

user_recs_df = pd.DataFrame.from_dict(user_recs_d, orient ='index')

In [116]:
user_recs_df.head(10)

Unnamed: 0,movie_title,movie_genres,score
0,"Cabinet of Dr. Caligari, The (Cabinet des Dr. ...",Fantasy|Crime|Horror,0.9
1,Rosemary's Baby,Horror|Thriller|Drama,0.9
2,"Black Sabbath (Tre volti della paura, I)",Horror,0.9
3,Blood Feast,Horror,0.9
4,What Ever Happened to Baby Jane?,Horror|Thriller|Drama,0.9
5,The Body Snatcher,Horror|Thriller|Drama,0.9
6,Misery,Horror|Thriller|Drama,0.9
7,'Salem's Lot,Mystery|Horror|Thriller|Drama,0.9
8,Eraserhead,Horror|Drama,0.9
9,Thesis (Tesis),Horror|Thriller|Drama,0.9
