# Dataset

In [34]:
import numpy as np
import pandas as pd
from IPython.display import display

# load data
ratings = pd.read_csv("./Data/ratings.csv")
items = pd.read_csv("./Data/movies.csv")

# drop timestamp column
ratings = ratings.drop(['timestamp'], axis=1)

In [35]:
print("Rating Dataframe")
display(ratings.head())
print('Numbers of ratings: {}'.format(len(ratings.index)))

print("\nItem Dataframe")
display(items.head())
print('Numbers of items: {}'.format(len(items.index)))

Rating Dataframe


Unnamed: 0,user_id,item_id,rating
0,1,1,4.0
1,1,3,4.0
2,1,6,4.0
3,1,47,5.0
4,1,50,5.0


Numbers of ratings: 100836

Item Dataframe


Unnamed: 0,item_id,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


Numbers of items: 9742


# Popular-based


In [36]:
def weighted_rating(v, m, R, C):
    '''
    v -> average rating for each item (float)
    m -> minimum votes required to be classified as popular (float)
    R -> average rating for the item (pd.Series)
    C -> average rating for the whole dataset (pd.Series)

    Source: IMDB
    '''
    return ((v / (v + m)) * R) + ((m / (v + m)) * C)

def calculate_popular_based_score(rating_df, item_df, user_col, item_col, rating_col):
    # group ratings by item_id
    vote_count = (
        rating_df
        .groupby(item_col, as_index=False)
        .agg({user_col: 'count', rating_col: 'mean'}))
    
    vote_count.columns = [item_col, 'vote_count', 'avg_rating']
    
    # calculate input parameters
    C = np.mean(vote_count['avg_rating'])
    m = np.percentile(vote_count['vote_count'], 70)
    vote_count = vote_count[vote_count['vote_count'] >= m]
    R = vote_count['avg_rating']
    v = vote_count['vote_count']
    vote_count['weighted_rating'] = weighted_rating(v, m, R, C)
    
    # merge DataFrame (left join)
    vote_count = vote_count.merge(item_df, on = [item_col], how = 'left')
    popular_items = vote_count.loc[:,[item_col, 'genres', 'vote_count', 'avg_rating', 'weighted_rating']]

    return popular_items

# init constant
USER_COL = 'user_id'
ITEM_COL = 'item_id'
RATING_COL = 'rating'

# calculate popular_based score and sort descending
popular_items = calculate_popular_based_score(ratings, items, USER_COL, ITEM_COL, RATING_COL)
popular_items = pd.merge(items, popular_items)
popular_items = popular_items.sort_values('weighted_rating', ascending = False)

display(popular_items.head(10))

Unnamed: 0,item_id,title,genres,vote_count,avg_rating,weighted_rating
183,318,"Shawshank Redemption, The (1994)",Crime|Drama,317,4.429022,4.403818
408,858,"Godfather, The (1972)",Crime|Drama,192,4.289062,4.25295
1276,2959,Fight Club (1999),Action|Crime|Drama|Thriller,218,4.272936,4.241498
569,1221,"Godfather: Part II, The (1974)",Crime|Drama,129,4.25969,4.208361
42,50,"Usual Suspects, The (1995)",Crime|Mystery|Thriller,204,4.237745,4.205389
152,260,Star Wars: Episode IV - A New Hope (1977),Action|Adventure|Sci-Fi,251,4.231076,4.204795
374,750,Dr. Strangelove or: How I Learned to Stop Worr...,Comedy|War,97,4.268041,4.200357
561,1213,Goodfellas (1990),Crime|Drama,126,4.25,4.198024
294,527,Schindler's List (1993),Drama|War,220,4.225,4.195318
2480,58559,"Dark Knight, The (2008)",Action|Crime|Drama|IMAX,149,4.238255,4.194469


# Content-based

In [37]:
from sklearn.metrics.pairwise import cosine_similarity

# get items that have been rated
rated_items = items.loc[items[ITEM_COL].isin(ratings[ITEM_COL])].copy()

# split and extract genres
genre = rated_items['genres'].str.split("|", expand=True)

# get all distinct genre
all_genre = set()
for column in genre.columns:
    distinct_genre = genre[column].str.lower().str.strip().unique()
    all_genre.update(distinct_genre)
all_genre.remove('(no genres listed)')
all_genre.remove(None)

# create item-genre matrix
item_genre_mat = rated_items[[ITEM_COL, 'genres']].copy()
item_genre_mat['genres'] = item_genre_mat['genres'].str.lower().str.strip()

# create genres column 
for genre in all_genre:
    item_genre_mat[genre] = np.where(item_genre_mat['genres'].str.contains(genre), 1, 0)

item_genre_mat = item_genre_mat.drop(['genres'], axis=1)
item_genre_mat = item_genre_mat.set_index(ITEM_COL)

# compute similarity matix
corr_mat = cosine_similarity(item_genre_mat)

# get dictionary with key is name and value is index
ind2name = { index: name for index, name in enumerate(item_genre_mat.index)} 
name2ind = { name: index for index, name in ind2name.items() }

# get top-k similar items
def top_k_items(item_id, top_k, corr_mat, dict_name):
    # sort and get top k correlation value
    top_items = corr_mat[item_id, :].argsort()[-top_k:][::-1]
    top_items = [dict_name[index] for index in top_items]
    return top_items

RECOMMEND_ITEM_ID = 1

similar_items = top_k_items(name2ind[RECOMMEND_ITEM_ID],
                            top_k = 25,
                            corr_mat = corr_mat,
                            dict_name = ind2name)

# display result
print('Item ID {}'.format(RECOMMEND_ITEM_ID))
display(items.loc[items['item_id'] == RECOMMEND_ITEM_ID])
print("Top 25 similar movie to item ID {}".format(RECOMMEND_ITEM_ID))
display(items.loc[items[ITEM_COL].isin(similar_items)])

del corr_mat

Item ID 1


Unnamed: 0,item_id,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy


Top 25 similar movie to item ID 1


Unnamed: 0,item_id,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
559,673,Space Jam (1996),Adventure|Animation|Children|Comedy|Fantasy|Sc...
1706,2294,Antz (1998),Adventure|Animation|Children|Comedy|Fantasy
2355,3114,Toy Story 2 (1999),Adventure|Animation|Children|Comedy|Fantasy
2809,3754,"Adventures of Rocky and Bullwinkle, The (2000)",Adventure|Animation|Children|Comedy|Fantasy
3000,4016,"Emperor's New Groove, The (2000)",Adventure|Animation|Children|Comedy|Fantasy
3194,4306,Shrek (2001),Adventure|Animation|Children|Comedy|Fantasy|Ro...
3568,4886,"Monsters, Inc. (2001)",Adventure|Animation|Children|Comedy|Fantasy
5490,26340,"Twelve Tasks of Asterix, The (Les douze travau...",Action|Adventure|Animation|Children|Comedy|Fan...
5977,36397,Valiant (2005),Adventure|Animation|Children|Comedy|Fantasy|War


# Collaborative Filtering

In [38]:
user_ratings = pd.merge(items, ratings)

user_ratings = user_ratings.pivot_table(index=['user_id'],columns=['item_id'],values='rating')
user_ratings = user_ratings.dropna(axis=1,thresh=10).fillna(0)

item_similarity_df = user_ratings.corr(method='pearson')

similar_items = top_k_items(name2ind[RECOMMEND_ITEM_ID],
                            top_k = 25,
                            corr_mat = item_similarity_df.to_numpy(),
                            dict_name = ind2name)

# display result
print('Item ID {}'.format(RECOMMEND_ITEM_ID))
display(items.loc[items['item_id'] == RECOMMEND_ITEM_ID])
print("Top 25 similar movie to item ID {}".format(RECOMMEND_ITEM_ID))
display(items.loc[items[ITEM_COL].isin(similar_items)])

Item ID 1


Unnamed: 0,item_id,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy


Top 25 similar movie to item ID 1


Unnamed: 0,item_id,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
27,28,Persuasion (1995),Drama|Romance
131,158,Casper (1995),Adventure|Children
182,214,Before the Rain (Pred dozhdot) (1994),Drama|War
184,216,Billy Madison (1995),Comedy
229,266,Legends of the Fall (1994),Drama|Romance|War|Western
274,315,"Specialist, The (1994)",Action|Drama|Thriller
280,321,Strawberry and Chocolate (Fresa y chocolate) (...,Drama
294,336,"Walking Dead, The (1995)",Drama|War
323,365,Little Buddha (1993),Drama


In [39]:
from scipy.sparse import csr_matrix

user_ratings = pd.merge(items, ratings)
aver = user_ratings.drop(USER_COL, axis=1).groupby(ITEM_COL).mean()
user_ratings = user_ratings.pivot_table(index=['user_id'],columns=['item_id'],values='rating')
user_ratings = user_ratings.dropna(axis=1,thresh=10).fillna(0)

# compute similarity
item_corr_mat = cosine_similarity(user_ratings.T)

similar_items = top_k_items(name2ind[RECOMMEND_ITEM_ID],
                            top_k = 25,
                            corr_mat = item_corr_mat,
                            dict_name = ind2name)

# display result
print('Item ID {}'.format(RECOMMEND_ITEM_ID))
display(items.loc[items['item_id'] == RECOMMEND_ITEM_ID])
print("Top 25 similar movie to item ID {}".format(RECOMMEND_ITEM_ID))
display(items.loc[items[ITEM_COL].isin(similar_items)])

Item ID 1


Unnamed: 0,item_id,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy


Top 25 similar movie to item ID 1


Unnamed: 0,item_id,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
26,27,Now and Then (1995),Children|Drama
27,28,Persuasion (1995),Drama|Romance
70,78,"Crossing Guard, The (1995)",Action|Crime|Drama|Thriller
131,158,Casper (1995),Adventure|Children
148,176,Living in Oblivion (1995),Comedy
158,187,Party Girl (1995),Comedy
178,210,Wild Bill (1995),Western
182,214,Before the Rain (Pred dozhdot) (1994),Drama|War
229,266,Legends of the Fall (1994),Drama|Romance|War|Western


In [40]:
import tensorflow as tf
import tensorflow_recommenders as tfrs
import tensorflow.keras as keras
from sklearn.model_selection import train_test_split

from typing import Dict, Text

def df_to_ds(df):
    # convert pd.DataFrame to tf.data.Dataset
    ds = tf.data.Dataset.from_tensor_slices(
        (dict(df[['user_id','item_id']]), df['rating']))
    
    # convert Tuple[Dict[Text, tf.Tensor], tf.Tensor] to Dict[Text, tf.Tensor]
    ds = ds.map(lambda x, y: {
    'user_id' : x['user_id'],
    'item_id' : x['item_id'],
    'rating' : y
    })

    return ds.batch(256)

class RankingModel(keras.Model):
    def __init__(self, user_id, item_id, embedding_size):
        super().__init__()
        
        # user model
        input = keras.Input(shape=(), dtype=tf.string)
        x = keras.layers.StringLookup(
            vocabulary = user_id, mask_token = None
            )(input)
        output = keras.layers.Embedding(
            input_dim = len(user_id) + 1,
            output_dim = embedding_size,
            name = 'embedding'
        )(x)
        self.user_model = keras.Model(inputs = input,
                                        outputs = output,
                                        name = 'user_model')

        # item model
        input = keras.Input(shape=(), dtype=tf.string)
        x = keras.layers.StringLookup(
            vocabulary = item_id, mask_token = None
            )(input)
        output = keras.layers.Embedding(
            input_dim = len(item_id) + 1,
            output_dim = embedding_size,
            name = 'embedding'
        )(x)
        self.item_model = keras.Model(inputs = input,
                                    outputs = output,
                                    name = 'item_model')

        # rating model
        user_input = keras.Input(shape=(embedding_size,), name='user_emb')
        item_input = keras.Input(shape=(embedding_size,), name='item_emb')
        x = keras.layers.Concatenate(axis=1)([user_input, item_input])
        x = keras.layers.Dense(256, activation = 'relu')(x)
        x = keras.layers.Dense(64, activation = 'relu')(x)
        output = keras.layers.Dense(1)(x)
        
        self.rating_model = keras.Model(
            inputs = {
                'user_id' : user_input,
                'item_id' : item_input
            },
            outputs = output,
            name = 'rating_model'
        )

    def call(self, inputs: Dict[Text, tf.Tensor]) -> tf.Tensor:

        user_emb = self.user_model(inputs['user_id'])
        item_emb = self.item_model(inputs['item_id'])

        prediction = self.rating_model({
            'user_id' : user_emb,
            'item_id' : item_emb
        })
        
        return prediction

class GMFModel(tfrs.models.Model):
    def __init__(self, user_id, item_id, embedding_size):
        super().__init__()
        self.ranking_model = RankingModel(user_id, item_id, embedding_size)
        self.task = tfrs.tasks.Ranking(
            loss = keras.losses.MeanSquaredError(),
            metrics = [keras.metrics.RootMeanSquaredError()]
        )
    
    def call(self, features: Dict[Text, tf.Tensor]) -> tf.Tensor:
        
        return self.ranking_model(
            {
                'user_id' : features['user_id'], 
                'item_id' : features['item_id']
            })

    def compute_loss(self, features: Dict[Text, tf.Tensor], training=False) -> tf.Tensor:
        return self.task(labels = features.pop('rating'),
                            predictions = self.ranking_model(features))

# preprocessing
train, test = train_test_split(ratings, train_size = .8, random_state=42)
train, test = df_to_ds(train), df_to_ds(test)

# init model
embedding_size = 64
model = GMFModel(user_id.astype(str),
                    item_id.astype(str),
                    embedding_size)
model.compile(
    optimizer = keras.optimizers.Adagrad(learning_rate = .01)
)

# fitting the model
model.fit(train, epochs=3, verbose=0)

# evaluate with the test data
result = model.evaluate(test, return_dict=True, verbose=0)
print("\nEvaluation on the test set:")
display(result)

# extract item embedding
item_emb = model.ranking_model.item_model.layers[-1].get_weights()[0]

item_corr_mat = cosine_similarity(item_emb)

print("\nThe top-k similar movie to item_id 99")
similar_items = top_k_items(name2ind['99'],
                            top_k = 10,
                            corr_mat = item_corr_mat,
                            dict_name=ind2name)

display(items.loc[items[ITEM_COL].isin(similar_items)])

del item_corr_mat

NameError: name 'user_id' is not defined