In [45]:
%matplotlib inline
%load_ext autoreload
%autoreload 2

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from IPython.display import Markdown, display, HTML
from collections import defaultdict
from sklearn.model_selection import KFold
from sklearn.feature_extraction.text import TfidfVectorizer
import time

pd.set_option('display.max_columns', 50)

from evaluation_and_testing.testing import evaluate_train_test_split_explicit
from evaluation_and_testing.testing import evaluate_leave_one_out_explicit
from evaluation_and_testing.testing import evaluate_train_test_split_implicit
from evaluation_and_testing.testing import evaluate_leave_one_out_implicit

from recommenders.recommender import Recommender

# Fix the dying kernel problem (only a problem in some installations - you can remove it, if it works without it)
import os
os.environ['KMP_DUPLICATE_LIB_OK'] = 'True'

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# Load the data

In [46]:
ml_ratings_df = pd.read_csv(os.path.join("data", "movielens_small", "ratings.csv")).rename(columns={'userId': 'user_id', 'movieId': 'item_id'})
ml_movies_df = pd.read_csv(os.path.join("data", "movielens_small", "movies.csv")).rename(columns={'movieId': 'item_id'})
ml_df = pd.merge(ml_ratings_df, ml_movies_df, on='item_id')

display(ml_movies_df.head(10))

# Filter the data to reduce the number of movies
seed = 6789
rng = np.random.RandomState(seed=seed)
left_ids = rng.choice(ml_movies_df['item_id'], size=1000, replace=False)

ml_ratings_df = ml_ratings_df.loc[ml_ratings_df['item_id'].isin(left_ids)]
ml_movies_df = ml_movies_df.loc[ml_movies_df['item_id'].isin(left_ids)]
ml_df = ml_df.loc[ml_df['item_id'].isin(left_ids)]

print("Number of chosen interactions: {}".format(len(ml_ratings_df)))

display(ml_df.head(10))

Unnamed: 0,item_id,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy
5,6,Heat (1995),Action|Crime|Thriller
6,7,Sabrina (1995),Comedy|Romance
7,8,Tom and Huck (1995),Adventure|Children
8,9,Sudden Death (1995),Action
9,10,GoldenEye (1995),Action|Adventure|Thriller


Number of chosen interactions: 9692


Unnamed: 0,user_id,item_id,rating,timestamp,title,genres
572,1,50,5.0,964982931,"Usual Suspects, The (1995)",Crime|Mystery|Thriller
573,5,50,4.0,847434881,"Usual Suspects, The (1995)",Crime|Mystery|Thriller
574,6,50,1.0,845553381,"Usual Suspects, The (1995)",Crime|Mystery|Thriller
575,7,50,4.5,1106635993,"Usual Suspects, The (1995)",Crime|Mystery|Thriller
576,8,50,5.0,839463644,"Usual Suspects, The (1995)",Crime|Mystery|Thriller
577,16,50,4.0,1377476781,"Usual Suspects, The (1995)",Crime|Mystery|Thriller
578,17,50,4.5,1305697013,"Usual Suspects, The (1995)",Crime|Mystery|Thriller
579,18,50,5.0,1455049343,"Usual Suspects, The (1995)",Crime|Mystery|Thriller
580,23,50,4.0,1107163741,"Usual Suspects, The (1995)",Crime|Mystery|Thriller
581,24,50,4.0,1458942023,"Usual Suspects, The (1995)",Crime|Mystery|Thriller


# Baseline recommenders

**Task 1.** Implement the MostPopularRecommender (check the slides for class 1), evaluate it with leave-one-out procedure for implicit feedback, print HR@1, HR@3, HR@5, HR@10, NDCG@1, NDCG@3, NDCG@5, NDCG@10.

In [3]:
class MostPopularRecommender(Recommender):
    """
    Base recommender class.
    """
    
    def __init__(self):
        """
        Initialize base recommender params and variables.
        """
        self.most_popular_items=None
    
    def fit(self, interactions_df, users_df, items_df):
        """
        Training of the recommender.
        
        :param pd.DataFrame interactions_df: DataFrame with recorded interactions between users and items 
            defined by user_id, item_id and features of the interaction.
        :param pd.DataFrame users_df: DataFrame with users and their features defined by user_id and the user feature columns.
        :param pd.DataFrame items_df: DataFrame with items and their features defined by item_id and the item feature columns.
        """
        
        items_count = interactions_df.loc[["item_id", "user_id"]].groupby(by="item_id").count()
        
        self.most_popular_items = items_count.sort_values("user_id",ascending=False)
        self.most_popular_items = self.most_popular_items.rename(columns={"user_id":"popularity"})
    
    def recommend(self, users_df, items_df, n_recommendations=1):
        """
        Serving of recommendations. Scores items in items_df for each user in users_df and returns 
        top n_recommendations for each user.
        
        :param pd.DataFrame users_df: DataFrame with users and their features for which recommendations should be generated.
        :param pd.DataFrame items_df: DataFrame with items and their features which should be scored.
        :param int n_recommendations: Number of recommendations to be returned for each user.
        :return: DataFrame with user_id, item_id and score as columns returning n_recommendations top recommendations 
            for each user.
        :rtype: pd.DataFrame
        """
        
        recommendations=pd.DataFrame(columns=["user_id", "item_id", "score"])

        for index,user in users_df.iterrows():
            
            user_id=[user["user_id"]]*n_recommendations
            item_id=self.most_popular_items.index[:n_recommendations]
            score=self.most_popular_items["popularity"][:n_recommendations]
            
            user_recommendations=pd.DataFrame({"user_id":user_id,"item_id":item_id,"score":score})
            
            recommendations=pd.concat([recommendations, user_recommendations])

        return recommendations
    
# Quick test of the recommender

most_popular_recommender = MostPopularRecommender()
most_popular_recommender.fit(ml_ratings_df, None, ml_movies_df)
recommendations = most_popular_recommender.recommend(pd.DataFrame([[1], [2], [6]], columns=['user_id']), ml_movies_df, 10)

recommendations = pd.merge(recommendations, ml_movies_df, on='item_id', how='left')
print("Recommendations")
display(HTML(recommendations.to_html()))

most_popular_recommender = MostPopularRecommender()

most_popular_results = [['MostPopularRecommender'] + list(evaluate_leave_one_out_implicit(
    most_popular_recommender, ml_ratings_df.loc[:, ['user_id', 'item_id']], ml_movies_df, max_evals=300, seed=6789))]

most_popular_results = pd.DataFrame(
    most_popular_results, 
    columns=['Recommender', 'HR@1', 'HR@3', 'HR@5', 'HR@10', 'NDCG@1', 'NDCG@3', 'NDCG@5', 'NDCG@10'])

display(HTML(most_popular_results.to_html()))

KeyError: "None of [Index(['item_id', 'user_id'], dtype='object')] are in the [index]"

**Task 2.** Implement the HighestRatedRecommender (check the slides for class 1), but recommend only those movies which got at least 50 ratings. Evaluate it with leave-one-out procedure for implicit feedback, print HR@1, HR@3, HR@5, HR@10, NDCG@1, NDCG@3, NDCG@5, NDCG@10.

In [None]:
class HighestRatedRecommender(Recommender):
    """
    Base recommender class.
    """
    
    def __init__(self):
        """
        Initialize base recommender params and variables.
        """
        self.highest_rated_items=None
    
    def fit(self, interactions_df, users_df, items_df):
        """
        Training of the recommender.
        
        :param pd.DataFrame interactions_df: DataFrame with recorded interactions between users and items 
            defined by user_id, item_id and features of the interaction.
        :param pd.DataFrame users_df: DataFrame with users and their features defined by user_id and the user feature columns.
        :param pd.DataFrame items_df: DataFrame with items and their features defined by item_id and the item feature columns.
        """
        
        items_mean = interactions_df[["item_id", "rating"]].groupby(by="item_id").mean()
        items_count = interactions_df[["item_id", "rating"]].groupby(by="item_id").count()
        items_mean = items_mean[items_count["rating"]>=50]
        
        self.highest_rated_items = items_mean.sort_values("rating",ascending=False)
    
    def recommend(self, users_df, items_df, n_recommendations=1):
        """
        Serving of recommendations. Scores items in items_df for each user in users_df and returns 
        top n_recommendations for each user.
        
        :param pd.DataFrame users_df: DataFrame with users and their features for which recommendations should be generated.
        :param pd.DataFrame items_df: DataFrame with items and their features which should be scored.
        :param int n_recommendations: Number of recommendations to be returned for each user.
        :return: DataFrame with user_id, item_id and score as columns returning n_recommendations top recommendations 
            for each user.
        :rtype: pd.DataFrame
        """
        
        recommendations=pd.DataFrame(columns=["user_id", "item_id", "score"])

        for index,user in users_df.iterrows():
            
            user_id=[user["user_id"]]*n_recommendations
            item_id=self.highest_rated_items.index[:n_recommendations]
            score=self.highest_rated_items["rating"][:n_recommendations]
            
            user_recommendations=pd.DataFrame({"user_id":user_id,"item_id":item_id,"score":score})
            
            recommendations=pd.concat([recommendations, user_recommendations])

        return recommendations
    
# Quick test of the recommender

highest_rated_recommender = HighestRatedRecommender()
highest_rated_recommender.fit(ml_ratings_df, None, ml_movies_df)
recommendations = highest_rated_recommender.recommend(pd.DataFrame([[1], [2], [6]], columns=['user_id']), ml_movies_df, 10)

recommendations = pd.merge(recommendations, ml_movies_df, on='item_id', how='left')
print("Recommendations")
display(HTML(recommendations.to_html()))

highest_rated_recommender = HighestRatedRecommender()

highest_rated_results = [['HighestRatedRecommender'] + list(evaluate_leave_one_out_implicit(
    highest_rated_recommender, ml_ratings_df.loc[:, ['user_id', 'item_id', 'rating']], ml_movies_df, max_evals=300, seed=6789))]

highest_rated_results = pd.DataFrame(
    highest_rated_results, 
    columns=['Recommender', 'HR@1', 'HR@3', 'HR@5', 'HR@10', 'NDCG@1', 'NDCG@3', 'NDCG@5', 'NDCG@10'])

display(HTML(highest_rated_results.to_html()))

**Task 3.** Implement the RandomRecommender (check the slides for class 1), evaluate it with leave-one-out procedure for implicit feedback, print HR@1, HR@3, HR@5, HR@10, NDCG@1, NDCG@3, NDCG@5, NDCG@10.

In [None]:
class RandomRecommender(Recommender):
    """
    Base recommender class.
    """
    
    def __init__(self, seed=0):
        """
        Initialize base recommender params and variables.
        
        :param int seed: Seed for the random number generator.
        """
        self.seed=seed
        self.rng=np.random.RandomState(seed=seed)
        self.all_items=None
    
    def fit(self, interactions_df, users_df, items_df):
        """
        Training of the recommender.
        
        :param pd.DataFrame interactions_df: DataFrame with recorded interactions between users and items 
            defined by user_id, item_id and features of the interaction.
        :param pd.DataFrame users_df: DataFrame with users and their features defined by user_id and the user feature columns.
        :param pd.DataFrame items_df: DataFrame with items and their features defined by item_id and the item feature columns.
        """
        
        self.all_items = items_df["item_id"].unique().tolist()
    
    def recommend(self, users_df, items_df, n_recommendations=1):
        """
        Serving of recommendations. Scores items in items_df for each user in users_df and returns 
        top n_recommendations for each user.
        
        :param pd.DataFrame users_df: DataFrame with users and their features for which recommendations should be generated.
        :param pd.DataFrame items_df: DataFrame with items and their features which should be scored.
        :param int n_recommendations: Number of recommendations to be returned for each user.
        :return: DataFrame with user_id, item_id and score as columns returning n_recommendations top recommendations 
            for each user.
        :rtype: pd.DataFrame
        """
        
        recommendations=pd.DataFrame(columns=["user_id", "item_id", "score"])

        for index,user in users_df.iterrows():
            
            user_id=[user["user_id"]]*n_recommendations
            item_id=self.rng.choice(self.all_items,n_recommendations)
            score=[1]*n_recommendations
            
            user_recommendations=pd.DataFrame({"user_id":user_id,"item_id":item_id,"score":score})
            
            recommendations=pd.concat([recommendations, user_recommendations])

        return recommendations
    
# Quick test of the recommender

random_recommender = RandomRecommender(seed=6789)
random_recommender.fit(ml_ratings_df, None, ml_movies_df)
recommendations = random_recommender.recommend(pd.DataFrame([[1], [2], [6]], columns=['user_id']), ml_movies_df, 10)

recommendations = recommendations.merge(ml_movies_df, on='item_id', how='left')
print("Recommendations")
display(HTML(recommendations.to_html()))

random_recommender = RandomRecommender(seed=seed)

random_results = [['RandomRecommender'] + list(evaluate_leave_one_out_implicit(
    random_recommender, ml_ratings_df.loc[:, ['user_id', 'item_id', 'rating']], ml_movies_df, max_evals=300, seed=seed))]

random_results = pd.DataFrame(
    random_results, 
    columns=['Recommender', 'HR@1', 'HR@3', 'HR@5', 'HR@10', 'NDCG@1', 'NDCG@3', 'NDCG@5', 'NDCG@10'])

display(HTML(random_results.to_html()))

# Linear Regression Recommender

For every movie we transform its genres into one-hot encoded features and we normalize them, for every user we count percentages for all genres how often do they appear among films watched by the user, we multiply both vectors (for the item and the user) to obtain explanatory variables, and then we fit a linear regression model to those features and actual ratings.

**Task 4.** Implement the calculate_user_genres method for calculating a DataFrame with one row per user and columns corresponding to genres (e.g. 'user_action', 'user_drama') with values calculated as follows:

- count the number of times a given user watched a given genre,
- apply a natural logarithm to this value plus one,
- normalize those values so that the sum of them over all columns within a row is equal to 1.

Implement the calculate_item_genres method for replacing the 'genres' column with one column per genre (e.g. 'action', 'drama') with values calculated as follows:

- place 1 in every column for which the genre appears in genres,
- normalize those values so that the sum of them over all genre columns within a row is equal to 1.

If item_features is None, then first find all genres and prepare a list of them. If item_features is not None, then create columns based on this list. Return both the transformed DataFrame and the list of genres.  Do not use MultiLabelBinarizer.

Note that in this second method you have to preserve the remaining structure of the DataFrame.

Evaluate the LinearRegressionRecommender with it using leave-one-out procedure for implicit feedback, print HR@1, HR@3, HR@5, HR@10, NDCG@1, NDCG@3, NDCG@5, NDCG@10.

In [47]:
# Prepare data for tests
interactions_df = pd.merge(ml_ratings_df, ml_movies_df, on='item_id')
interactions_df.loc[:, 'genres'] = interactions_df['genres'].str.replace("-", "_", regex=False)
interactions_df.loc[:, 'genres'] = interactions_df['genres'].str.replace(" ", "_", regex=False)
interactions_df.loc[:, 'genres'] = interactions_df['genres'].str.replace("(", "", regex=False)
interactions_df.loc[:, 'genres'] = interactions_df['genres'].str.replace(")", "", regex=False)
interactions_df.loc[:, 'genres'] = interactions_df['genres'].str.lower()
interactions_df.loc[:, 'genres'] = interactions_df['genres'].str.split("|")

In [40]:
def calculate_user_genres(interactions_df):
    
    users_df = interactions_df[["user_id", "genres"]].copy()
    users_df = users_df.explode("genres")
    users_df["val"] = 1
    users_df = users_df.pivot_table(index="user_id", columns="genres", values="val", aggfunc="count")
    users_df = np.log(users_df) + 1
    users_df = users_df / users_df.sum(axis=1).values.reshape(-1, 1)
    users_df.columns.name = None
    users_df = users_df.fillna(0)
    users_df = users_df.add_prefix("user_")
    
    return users_df

In [41]:
# Test the calculate_user_genres method
display(calculate_user_genres(interactions_df))

Unnamed: 0_level_0,user_action,user_adventure,user_animation,user_children,user_comedy,user_crime,user_documentary,user_drama,user_fantasy,user_film_noir,user_horror,user_imax,user_musical,user_mystery,user_no_genres_listed,user_romance,user_sci_fi,user_thriller,user_war,user_western
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
1,0.097344,0.090767,0.061857,0.076914,0.090767,0.082288,0.000000,0.070337,0.061857,0.000000,0.000000,0.000000,0.029475,0.029475,0.0,0.076914,0.070337,0.082288,0.049906,0.029475
2,0.150146,0.000000,0.000000,0.000000,0.000000,0.150146,0.000000,0.247570,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.088679,0.088679,0.186102,0.000000,0.088679
3,0.265122,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.156585,0.000000,0.156585,0.000000,0.000000,0.000000,0.0,0.000000,0.265122,0.156585,0.000000,0.000000
4,0.035794,0.035794,0.060604,0.075117,0.121623,0.093401,0.000000,0.132724,0.060604,0.035794,0.000000,0.035794,0.075117,0.000000,0.0,0.105445,0.000000,0.060604,0.035794,0.035794
5,0.000000,0.000000,0.065801,0.065801,0.065801,0.065801,0.000000,0.138092,0.065801,0.000000,0.000000,0.065801,0.065801,0.065801,0.0,0.138092,0.000000,0.065801,0.065801,0.065801
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
606,0.065697,0.061817,0.032736,0.050452,0.089150,0.071694,0.019335,0.100631,0.063854,0.019335,0.040576,0.032736,0.019335,0.050452,0.0,0.083761,0.056958,0.071694,0.050452,0.019335
607,0.109915,0.078301,0.000000,0.037311,0.104163,0.089035,0.000000,0.123222,0.037311,0.000000,0.078301,0.000000,0.037311,0.037311,0.0,0.063173,0.089035,0.078301,0.037311,0.000000
608,0.081849,0.067353,0.049301,0.070057,0.089954,0.073502,0.000000,0.079708,0.067353,0.000000,0.065841,0.031989,0.031989,0.045085,0.0,0.062396,0.073502,0.078133,0.031989,0.000000
609,0.213077,0.360770,0.000000,0.213077,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.213077,0.000000,0.000000,0.000000


In [121]:
from sklearn.preprocessing import MultiLabelBinarizer

def calculate_item_genres(item_genres, item_features=None):
    
#     return_item_genres = item_genres.copy()
#     return_item_genres = return_item_genres.explode("genres")
#     return_item_genres["val"] = 1
#     return_item_genres = return_item_genres.pivot_table(index="item_id", columns="genres", values="val", aggfunc=lambda x:len(x.unique()))
#     return_item_genres = return_item_genres / return_item_genres.sum(axis=1).values.reshape(-1, 1)
#     return_item_genres.columns.name = None
#     return_item_genres = return_item_genres.fillna(0)
#     return_item_features = item_features
#     if item_features is not None:
#         return_item_genres = return_item_genres[return_item_genres.columns.intersection(return_item_features)]
#     return_item_genres = return_item_genres.add_prefix("user_")
#     return_item_genres["rating"] = 0
#     return_item_genres[None] = 0

#     for index,row in item_genres.iterrows():
#         for genre in row["genres"]:
#             item_genres[index][genre] = 1

    
    display(item_genres)
    ret_item_genres = item_genres[['user_id', 'genres']].copy()
    display(ret_item_genres)
    ret_item_genres = ret_item_genres.explode('genres')
    ret_item_genres['val'] = 1
    ret_item_genres = ret_item_genres.pivot_table(index='user_id', columns='genres', values='val', aggfunc='count')
    ret_item_genres = ret_item_genres / ret_item_genres.sum(axis=1).values.reshape(-1, 1)
    ret_item_genres = ret_item_genres.rename_axis(None, axis=1).fillna(0)
    
    if item_features is not None:
        ret_item_genres = ret_item_genres[ret_item_genres.columns.intersection(item_features)]
    
    ret_item_features = ret_item_genres.columns.tolist()
    
    ret_item_genres = ret_item_genres.add_prefix('user_')
    
    merge_item_genres = item_genres[["user_id","rating"]].copy()
    ret_item_genres = ret_item_genres.merge(merge_item_genres, on='user_id')
    
    return ret_item_genres, ret_item_features


In [122]:
# Test the method

display(interactions_df)
item_genres, item_features = calculate_item_genres(interactions_df)
display(item_genres)
print(item_features)

item_genres, item_features = calculate_item_genres(interactions_df, ['comedy', 'crime', 'drama', 'horror', 'test_category'])
display(item_genres)
print(item_features)


Unnamed: 0,user_id,item_id,rating,timestamp,title,genres,crime,mystery,thriller,action,comedy,romance,adventure,sci_fi,drama,children,animation,musical,fantasy,war,western,horror,imax,film_noir,documentary,no_genres_listed
0,1,50,5.0,964982931,"Usual Suspects, The (1995)","[crime, mystery, thriller]",1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1
1,5,50,4.0,847434881,"Usual Suspects, The (1995)","[crime, mystery, thriller]",1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1
2,6,50,1.0,845553381,"Usual Suspects, The (1995)","[crime, mystery, thriller]",1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1
3,7,50,4.5,1106635993,"Usual Suspects, The (1995)","[crime, mystery, thriller]",1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1
4,8,50,5.0,839463644,"Usual Suspects, The (1995)","[crime, mystery, thriller]",1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9687,610,110746,4.0,1493850133,Hatchet III (2013),"[comedy, horror]",1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1
9688,610,113159,3.5,1493848692,Life After Beth (2014),"[comedy, horror, romance]",1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1
9689,610,114707,3.0,1493848404,Horns (2014),"[horror, mystery]",1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1
9690,610,130840,4.5,1493848621,Spring (2015),"[horror, romance, sci_fi]",1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1


Unnamed: 0,user_id,item_id,rating,timestamp,title,genres,crime,mystery,thriller,action,comedy,romance,adventure,sci_fi,drama,children,animation,musical,fantasy,war,western,horror,imax,film_noir,documentary,no_genres_listed
0,1,50,5.0,964982931,"Usual Suspects, The (1995)","[crime, mystery, thriller]",1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1
1,5,50,4.0,847434881,"Usual Suspects, The (1995)","[crime, mystery, thriller]",1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1
2,6,50,1.0,845553381,"Usual Suspects, The (1995)","[crime, mystery, thriller]",1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1
3,7,50,4.5,1106635993,"Usual Suspects, The (1995)","[crime, mystery, thriller]",1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1
4,8,50,5.0,839463644,"Usual Suspects, The (1995)","[crime, mystery, thriller]",1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9687,610,110746,4.0,1493850133,Hatchet III (2013),"[comedy, horror]",1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1
9688,610,113159,3.5,1493848692,Life After Beth (2014),"[comedy, horror, romance]",1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1
9689,610,114707,3.0,1493848404,Horns (2014),"[horror, mystery]",1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1
9690,610,130840,4.5,1493848621,Spring (2015),"[horror, romance, sci_fi]",1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1


Unnamed: 0,user_id,genres
0,1,"[crime, mystery, thriller]"
1,5,"[crime, mystery, thriller]"
2,6,"[crime, mystery, thriller]"
3,7,"[crime, mystery, thriller]"
4,8,"[crime, mystery, thriller]"
...,...,...
9687,610,"[comedy, horror]"
9688,610,"[comedy, horror, romance]"
9689,610,"[horror, mystery]"
9690,610,"[horror, romance, sci_fi]"


Unnamed: 0,user_id,user_action,user_adventure,user_animation,user_children,user_comedy,user_crime,user_documentary,user_drama,user_fantasy,user_film_noir,user_horror,user_imax,user_musical,user_mystery,user_no_genres_listed,user_romance,user_sci_fi,user_thriller,user_war,user_western,rating
0,1,0.149254,0.119403,0.044776,0.074627,0.119403,0.089552,0.0,0.059701,0.044776,0.000000,0.000000,0.000000,0.014925,0.014925,0.0,0.074627,0.059701,0.089552,0.029851,0.014925,5.0
1,1,0.149254,0.119403,0.044776,0.074627,0.119403,0.089552,0.0,0.059701,0.044776,0.000000,0.000000,0.000000,0.014925,0.014925,0.0,0.074627,0.059701,0.089552,0.029851,0.014925,3.0
2,1,0.149254,0.119403,0.044776,0.074627,0.119403,0.089552,0.0,0.059701,0.044776,0.000000,0.000000,0.000000,0.014925,0.014925,0.0,0.074627,0.059701,0.089552,0.029851,0.014925,4.0
3,1,0.149254,0.119403,0.044776,0.074627,0.119403,0.089552,0.0,0.059701,0.044776,0.000000,0.000000,0.000000,0.014925,0.014925,0.0,0.074627,0.059701,0.089552,0.029851,0.014925,3.0
4,1,0.149254,0.119403,0.044776,0.074627,0.119403,0.089552,0.0,0.059701,0.044776,0.000000,0.000000,0.000000,0.014925,0.014925,0.0,0.074627,0.059701,0.089552,0.029851,0.014925,5.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9687,610,0.151899,0.063291,0.015823,0.015823,0.107595,0.069620,0.0,0.110759,0.034810,0.003165,0.110759,0.031646,0.003165,0.034810,0.0,0.022152,0.085443,0.123418,0.009494,0.006329,4.0
9688,610,0.151899,0.063291,0.015823,0.015823,0.107595,0.069620,0.0,0.110759,0.034810,0.003165,0.110759,0.031646,0.003165,0.034810,0.0,0.022152,0.085443,0.123418,0.009494,0.006329,3.5
9689,610,0.151899,0.063291,0.015823,0.015823,0.107595,0.069620,0.0,0.110759,0.034810,0.003165,0.110759,0.031646,0.003165,0.034810,0.0,0.022152,0.085443,0.123418,0.009494,0.006329,3.0
9690,610,0.151899,0.063291,0.015823,0.015823,0.107595,0.069620,0.0,0.110759,0.034810,0.003165,0.110759,0.031646,0.003165,0.034810,0.0,0.022152,0.085443,0.123418,0.009494,0.006329,4.5


['action', 'adventure', 'animation', 'children', 'comedy', 'crime', 'documentary', 'drama', 'fantasy', 'film_noir', 'horror', 'imax', 'musical', 'mystery', 'no_genres_listed', 'romance', 'sci_fi', 'thriller', 'war', 'western']


Unnamed: 0,user_id,item_id,rating,timestamp,title,genres,crime,mystery,thriller,action,comedy,romance,adventure,sci_fi,drama,children,animation,musical,fantasy,war,western,horror,imax,film_noir,documentary,no_genres_listed
0,1,50,5.0,964982931,"Usual Suspects, The (1995)","[crime, mystery, thriller]",1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1
1,5,50,4.0,847434881,"Usual Suspects, The (1995)","[crime, mystery, thriller]",1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1
2,6,50,1.0,845553381,"Usual Suspects, The (1995)","[crime, mystery, thriller]",1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1
3,7,50,4.5,1106635993,"Usual Suspects, The (1995)","[crime, mystery, thriller]",1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1
4,8,50,5.0,839463644,"Usual Suspects, The (1995)","[crime, mystery, thriller]",1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9687,610,110746,4.0,1493850133,Hatchet III (2013),"[comedy, horror]",1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1
9688,610,113159,3.5,1493848692,Life After Beth (2014),"[comedy, horror, romance]",1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1
9689,610,114707,3.0,1493848404,Horns (2014),"[horror, mystery]",1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1
9690,610,130840,4.5,1493848621,Spring (2015),"[horror, romance, sci_fi]",1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1


Unnamed: 0,user_id,genres
0,1,"[crime, mystery, thriller]"
1,5,"[crime, mystery, thriller]"
2,6,"[crime, mystery, thriller]"
3,7,"[crime, mystery, thriller]"
4,8,"[crime, mystery, thriller]"
...,...,...
9687,610,"[comedy, horror]"
9688,610,"[comedy, horror, romance]"
9689,610,"[horror, mystery]"
9690,610,"[horror, romance, sci_fi]"


Unnamed: 0,user_id,user_comedy,user_crime,user_drama,user_horror,rating
0,1,0.119403,0.089552,0.059701,0.000000,5.0
1,1,0.119403,0.089552,0.059701,0.000000,3.0
2,1,0.119403,0.089552,0.059701,0.000000,4.0
3,1,0.119403,0.089552,0.059701,0.000000,3.0
4,1,0.119403,0.089552,0.059701,0.000000,5.0
...,...,...,...,...,...,...
9687,610,0.107595,0.069620,0.110759,0.110759,4.0
9688,610,0.107595,0.069620,0.110759,0.110759,3.5
9689,610,0.107595,0.069620,0.110759,0.110759,3.0
9690,610,0.107595,0.069620,0.110759,0.110759,4.5


['comedy', 'crime', 'drama', 'horror']


In [109]:
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import MultiLabelBinarizer

from recommenders.recommender import Recommender

class LinearRegressionRecommender(Recommender):
    """
    Linear regression recommender class.
    """
    
    def __init__(self):
        """
        Initialize recommender params and variables.
        """
        self.model = None
        self.mlb = None
        self.users_dict = None
        self.user_features = None
        
        self.uses_dot_product = True
    
    def fit(self, interactions_df, users_df, items_df):
        """
        Training of the recommender.
        
        :param pd.DataFrame interactions_df: DataFrame with recorded interactions between users and items 
            defined by user_id, item_id and features of the interaction.
        :param pd.DataFrame users_df: DataFrame with users and their features defined by user_id and the user feature columns.
        :param pd.DataFrame items_df: DataFrame with items and their features defined by item_id and the item feature columns.
        """
        
        # Transform genres to a more code-friendly form
        
        interactions_df = pd.merge(interactions_df, items_df, on='item_id')
        interactions_df = self._transform_genres(interactions_df)
        
        # Prepare user features
        
        users_df = interactions_df[['user_id', 'genres']].copy()
        users_df = users_df.explode('genres')
        users_df['val'] = 1
        users_df = users_df.pivot_table(index='user_id', columns='genres', values='val', aggfunc='count')
        users_df = users_df / users_df.sum(axis=1).values.reshape(-1, 1)
        users_df = users_df.rename_axis(None, axis=1).fillna(0)
        users_df = users_df.add_prefix('user_')
#         display(users_df.head(10))
        
        self.users_dict = users_df.to_dict('index')
        
        self.user_features = users_df.columns.tolist()
        
        interactions_df = interactions_df.merge(users_df, on='user_id')
#         display(interactions_df.head(10))
                
        # Prepare item features
        
        # Transform genres into binary values
        
        self.mlb = MultiLabelBinarizer()
        interactions_df = interactions_df.join(
            pd.DataFrame(self.mlb.fit_transform(interactions_df.pop('genres')),
                         columns=self.mlb.classes_,
                         index=interactions_df.index))
        
        # Normalize the values so that each movie's genres sum up to 1
        
        interactions_df[self.mlb.classes_] = interactions_df[self.mlb.classes_] \
            / interactions_df[self.mlb.classes_].sum(axis=1).values.reshape(-1, 1)
        
#         display(interactions_df.loc[:, self.mlb.classes_].head(10))

        # Prepare input data and fit the model
    
        # Dot product for personalization
        if self.uses_dot_product:
            interactions_df[self.mlb.classes_] = interactions_df[self.mlb.classes_] \
                * interactions_df[self.user_features].values
    
#         display(interactions_df.head(10))
        
        if self.uses_dot_product:  # Personalized
            x = interactions_df.loc[:, self.mlb.classes_].values
        else:  # Non-personalized
            x = interactions_df.loc[:, list(self.mlb.classes_) + self.user_features].values
        
        y = interactions_df['rating'].values
    
        self.model = LinearRegression().fit(x, y)
    
    def recommend(self, users_df, items_df, n_recommendations=1):
        """
        Serving of recommendations. Scores items in items_df for each user in users_df and returns 
        top n_recommendations for each user.
        
        :param pd.DataFrame users_df: DataFrame with users and their features for which recommendations should be generated.
        :param pd.DataFrame items_df: DataFrame with items and their features which should be scored.
        :param int n_recommendations: Number of recommendations to be returned for each user.
        :return: DataFrame with user_id, item_id and score as columns returning n_recommendations top recommendations 
            for each user.
        :rtype: pd.DataFrame
        """
        
        # Transform the item to be scored into proper features
        
        items_df = items_df.copy()
        items_df = self._transform_genres(items_df)
        
        items_df = items_df.join(
            pd.DataFrame(self.mlb.transform(items_df.pop('genres')),
                         columns=self.mlb.classes_,
                         index=items_df.index))
        
        items_df[self.mlb.classes_] = items_df[self.mlb.classes_] \
            / items_df[self.mlb.classes_].sum(axis=1).values.reshape(-1, 1)

        # Score the item
    
        recommendations = pd.DataFrame(columns=['user_id', 'item_id', 'score'])
        
        for ix, user in users_df.iterrows():
            if user['user_id'] in self.users_dict:
                user_df = pd.DataFrame.from_dict({user['user_id']: self.users_dict[user['user_id']]}, orient='index')
            else:
                user_df = pd.DataFrame.from_dict(
                    {user['user_id']: [1 / len(self.user_features)]*len(self.user_features)}, orient='index')
                user_df.columns = self.user_features
#             display(user_df)
#             display(items_df)
            input_df = items_df.copy()
            
            if self.uses_dot_product:
                input_df[self.mlb.classes_] = items_df[self.mlb.classes_] * user_df.values
#                 display(input_df)
                scores = self.model.predict(input_df.loc[:, self.mlb.classes_].values)
            else:
                input_df = input_df.merge(user_df, how='cross')
#                 display(input_df)
                scores = self.model.predict(input_df.loc[:, list(self.mlb.classes_) + self.user_features].values)
    
            chosen_pos = np.argsort(-scores)[:n_recommendations]
        
            user_recommendations = []
            for item_pos in chosen_pos:
                user_recommendations.append(
                    {
                        'user_id': user['user_id'],
                        'item_id': input_df.iloc[item_pos]['item_id'],
                        'score': scores[item_pos]
                    }
                )
                
            user_recommendations = pd.DataFrame(user_recommendations)

            recommendations = pd.concat([recommendations, user_recommendations])

        return recommendations
    
    def _transform_genres(self, df):
        """
        Transforms a string with genres into a list of cleaned genre names.
        
        :param pd.DataFrame df: A DataFrame with 'genres' column.
        """
        df.loc[:, 'genres'] = df['genres'].str.replace("-", "_", regex=False)
        df.loc[:, 'genres'] = df['genres'].str.replace(" ", "_", regex=False)
        df.loc[:, 'genres'] = df['genres'].str.replace("(", "", regex=False)
        df.loc[:, 'genres'] = df['genres'].str.replace(")", "", regex=False)
        df.loc[:, 'genres'] = df['genres'].str.lower()
        df.loc[:, 'genres'] = df['genres'].str.split("|")
        return df

In [9]:
# Quick test of the recommender

lr_recommender = LinearRegressionRecommender()
lr_recommender.fit(ml_ratings_df, None, ml_movies_df)
recommendations = lr_recommender.recommend(pd.DataFrame([[3], [5], [39]], columns=['user_id']), ml_movies_df, 10)

recommendations = pd.merge(recommendations, ml_movies_df, on='item_id', how='left')
display(recommendations)

Unnamed: 0,user_id,item_id,score,title,genres
0,3,51562,3.501376,Babylon 5: The Gathering (1993),Sci-Fi
1,3,5468,3.501376,20 Million Miles to Earth (1957),Sci-Fi
2,3,176371,3.501376,Blade Runner 2049 (2017),Sci-Fi
3,3,147384,3.501376,Doctor Who: The Runaway Bride (2007),Sci-Fi
4,3,2698,3.501376,Zone 39 (1997),Sci-Fi
5,3,2661,3.501376,It Came from Outer Space (1953),Sci-Fi
6,3,2311,3.501376,2010: The Year We Make Contact (1984),Sci-Fi
7,3,3926,3.425357,Voyage to the Bottom of the Sea (1961),Adventure|Sci-Fi
8,3,4953,3.425357,"People That Time Forgot, The (1977)",Adventure|Sci-Fi
9,3,3994,3.425357,Unbreakable (2000),Drama|Sci-Fi


### Train-test split test

In [10]:
lr_recommender = LinearRegressionRecommender()

t0 = time.time()

results = [['LinearRegressionRecommender'] + list(evaluate_train_test_split_explicit(
    lr_recommender, ml_ratings_df.loc[:, ['user_id', 'item_id', 'rating']], ml_movies_df, seed=seed))]

results = pd.DataFrame(results, 
                       columns=['Recommender', 'RMSE', 'MRE', 'TRE'])

display(results)

print('Total evaluation time: {}'.format(time.time() - t0))

Unnamed: 0,Recommender,RMSE,MRE,TRE
0,LinearRegressionRecommender,1.016594,0.348461,0.230529


Total evaluation time: 30.466177701950073


### Leave-one-out test

In [11]:
lr_recommender = LinearRegressionRecommender()

t0 = time.time()

results = [['LinearRegressionRecommender'] + list(evaluate_leave_one_out_explicit(
    lr_recommender, ml_ratings_df.loc[:, ['user_id', 'item_id', 'rating']], ml_movies_df, seed=seed))]

results = pd.DataFrame(results, 
                       columns=['Recommender', 'RMSE', 'MRE', 'TRE'])

display(results)

print('Total evaluation time: {}'.format(time.time() - t0))

Unnamed: 0,Recommender,RMSE,MRE,TRE
0,LinearRegressionRecommender,1.045415,0.401399,0.242301


Total evaluation time: 29.6688334941864


# TF-IDF Recommender
TF-IDF stands for term frequency–inverse document frequency. Typically Tf-IDF method is used to assign keywords (words describing the gist of a document) to documents in a corpus of documents.

In our case we will treat users as documents and genres as words.

Term-frequency is given by the following formula:
<center>
$$
    \text{tf}(g, u) = f_{g, u}
$$
</center>
where $f_{g, i}$ is the number of times genre $g$ appear for movies watched by user $u$.

Inverse document frequency is defined as follows:
<center>
$$
    \text{idf}(g) = \log \frac{N}{n_g}
$$
</center>
where $N$ is the number of users and $n_g$ is the number of users with $g$ in their genres list.

Finally, tf-idf is defined as follows:
<center>
$$
    \text{tfidf}(g, u) = \text{tf}(g, u) \cdot \text{idf}(g)
$$
</center>

In our case we will measure how often a given genre appears for movies watched by a given user vs how often it appears for all users. To obtain a movie score we will take the average of its genres' scores for this user.

**Task 5.** Implement the following method for calculating a TF-IDF scores in a form of a dict (use defaultdict):

`{(1, 'action'): 0.45306430692185395, (1, 'adventure'): 0.39370003643934415, (1, 'animation'): 0.20886242957049514, ...}`

without using TfidfVectorizer (you can use loops).

Evaluate the TFIDFRecommender with it using leave-one-out procedure for implicit feedback, print HR@1, HR@3, HR@5, HR@10, NDCG@1, NDCG@3, NDCG@5, NDCG@10.

In [131]:
def calculate_tf_idf_scores(interactions_df):
    
    users = interactions_df["user_id"].unique().tolist()
    n_users = len(users)
    genres = list(set("|".join(interactions_df['genres'].tolist()).split("|")))
    
    interactions_df["genres"] = interactions_df["genres"].str.split("|")
    
    users_genres = defaultdict(lambda: [])
    tfidf_scores = defaultdict(lambda: 0.0)
    
    for index,row in interactions_df.iterrows():
        users_genres[row["user_id"]] += row["genres"]
    
    for user in users:
        for genre in genres:
            tf = users_genres[user].count(genre)
            users_with_genre = 0
            for _user,_genres in users_genres.items():
                users_with_genre += 1 if genre in _genres else 0
            idf = np.log(n_users/users_with_genre)
            tfidf = tf*idf
            tfidf_scores[(user,genre)] = tfidf
    
    return tfidf_scores

In [132]:
# Test the method
interactions_df = pd.merge(ml_ratings_df, ml_movies_df, on='item_id')
interactions_df.loc[:, 'genres'] = interactions_df['genres'].str.replace("-", "_", regex=False)
interactions_df.loc[:, 'genres'] = interactions_df['genres'].str.replace(" ", "_", regex=False)
interactions_df.loc[:, 'genres'] = interactions_df['genres'].str.replace("(", "", regex=False)
interactions_df.loc[:, 'genres'] = interactions_df['genres'].str.replace(")", "", regex=False)
interactions_df.loc[:, 'genres'] = interactions_df['genres'].str.lower()
        
tfidf_scores = calculate_tf_idf_scores(interactions_df)
print(tfidf_scores)

defaultdict(<function calculate_tf_idf_scores.<locals>.<lambda> at 0x7f1a0458faf0>, {(1, 'action'): 2.198046500486951, (1, 'romance'): 1.5340801351916244, (1, 'film_noir'): 0.0, (1, 'thriller'): 1.7060590271175973, (1, 'adventure'): 2.6013213256521714, (1, 'horror'): 0.0, (1, 'imax'): 0.0, (1, 'sci_fi'): 1.2729788914477906, (1, 'crime'): 2.3086654110482208, (1, 'children'): 3.1107481541030797, (1, 'mystery'): 0.6473875634104785, (1, 'animation'): 2.628413572457773, (1, 'war'): 2.121743921370525, (1, 'western'): 1.1786549963416462, (1, 'fantasy'): 1.6501390107578158, (1, 'comedy'): 1.791840171673406, (1, 'documentary'): 0.0, (1, 'musical'): 0.9297589489250216, (1, 'drama'): 0.6840579435170525, (1, 'no_genres_listed'): 0.0, (5, 'action'): 0.0, (5, 'romance'): 0.9204480811149747, (5, 'film_noir'): 0.0, (5, 'thriller'): 0.28434317118626623, (5, 'adventure'): 0.0, (5, 'horror'): 0.0, (5, 'imax'): 0.8641616664392084, (5, 'sci_fi'): 0.0, (5, 'crime'): 0.3847775685080368, (5, 'children'): 0.62

In [55]:
class TFIDFRecommender(Recommender):
    """
    Recommender based on the TF-IDF method.
    """
    
    def __init__(self):
        """
        Initialize base recommender params and variables.
        """
        self.tfidf_scores = None
    
    def fit(self, interactions_df, users_df, items_df):
        """
        Training of the recommender.
        
        :param pd.DataFrame interactions_df: DataFrame with recorded interactions between users and items 
            defined by user_id, item_id and features of the interaction.
        :param pd.DataFrame users_df: DataFrame with users and their features defined by user_id and the user feature columns.
        :param pd.DataFrame items_df: DataFrame with items and their features defined by item_id and the item feature columns.
        """
        
        self.tfidf_scores = defaultdict(lambda: 0.0)

        # Prepare the corpus for tfidf calculation
        
        interactions_df = pd.merge(interactions_df, items_df, on='item_id')
        interactions_df.loc[:, 'genres'] = interactions_df['genres'].str.replace("-", "_", regex=False)
        interactions_df.loc[:, 'genres'] = interactions_df['genres'].str.replace(" ", "_", regex=False)
        interactions_df.loc[:, 'genres'] = interactions_df['genres'].str.lower()
                
        self.tfidf_scores = calculate_tf_idf_scores(interactions_df)
    
    def recommend(self, users_df, items_df, n_recommendations=1):
        """
        Serving of recommendations. Scores items in items_df for each user in users_df and returns 
        top n_recommendations for each user.
        
        :param pd.DataFrame users_df: DataFrame with users and their features for which recommendations should be generated.
        :param pd.DataFrame items_df: DataFrame with items and their features which should be scored.
        :param int n_recommendations: Number of recommendations to be returned for each user.
        :return: DataFrame with user_id, item_id and score as columns returning n_recommendations top recommendations 
            for each user.
        :rtype: pd.DataFrame
        """
        
        recommendations = pd.DataFrame(columns=['user_id', 'item_id', 'score'])
        
        # Transform genres to a unified form used by the vectorizer
        
        items_df = items_df.copy()
        items_df.loc[:, 'genres'] = items_df['genres'].str.replace("-", "_", regex=False)
        items_df.loc[:, 'genres'] = items_df['genres'].str.replace(" ", "_", regex=False)
        items_df.loc[:, 'genres'] = items_df['genres'].str.lower()
        items_df.loc[:, 'genres'] = items_df['genres'].str.split("|")
                
        # Score items    
        
        for uix, user in users_df.iterrows():
            items = []
            for iix, item in items_df.iterrows():
                score = 0.0
                for genre in item['genres']:
                    score += self.tfidf_scores[(user['user_id'], genre)]
                score /= len(item['genres'])
                items.append((item['item_id'], score))
                
            items = sorted(items, key=lambda x: x[1], reverse=True)
            user_recommendations = pd.DataFrame({'user_id': user['user_id'],
                                                 'item_id': [item[0] for item in items][:n_recommendations],
                                                 'score': [item[1] for item in items][:n_recommendations]})

            recommendations = pd.concat([recommendations, user_recommendations])

        return recommendations

In [56]:
# Print movies watched by user 3, 5, 39

active_user_movies = ml_df.loc[(ml_df['user_id'] == 3) | (ml_df['user_id'] == 5) | (ml_df['user_id'] == 39)]
print("Active users history")
display(active_user_movies)

Active users history


Unnamed: 0,user_id,item_id,rating,timestamp,title,genres
573,5,50,4.0,847434881,"Usual Suspects, The (1995)",Crime|Mystery|Thriller
588,39,50,5.0,974788030,"Usual Suspects, The (1995)",Crime|Mystery|Thriller
7762,39,1213,4.0,974789540,Goodfellas (1990),Crime|Drama
9403,39,1500,4.0,974789727,Grosse Pointe Blank (1997),Comedy|Crime|Romance
11108,39,2078,3.0,974788976,"Jungle Book, The (1967)",Animation|Children|Comedy|Musical
13425,39,2692,5.0,974789111,Run Lola Run (Lola rennt) (1998),Action|Crime
14203,39,2947,3.0,974789327,Goldfinger (1964),Action|Adventure|Thriller
18810,3,5181,5.0,1306463718,Hangar 18 (1980),Action|Sci-Fi|Thriller
18827,3,7991,5.0,1306463684,Death Race 2000 (1975),Action|Sci-Fi
18832,3,70946,5.0,1306463815,Troll 2 (1990),Fantasy|Horror


In [57]:
# Quick test of the recommender

tfidf_recommender = TFIDFRecommender()
tfidf_recommender.fit(ml_ratings_df, None, ml_movies_df)
recommendations = tfidf_recommender.recommend(pd.DataFrame([[3], [5], [39]], columns=['user_id']), ml_movies_df, 10)

recommendations = pd.merge(recommendations, ml_movies_df, on='item_id', how='left')
print("Recommendations")
display(recommendations)

Recommendations


Unnamed: 0,user_id,item_id,score,title,genres
0,3,1322,2.931373,Amityville 1992: It's About Time (1992),Horror
1,3,1976,2.931373,Friday the 13th Part 3: 3D (1982),Horror
2,3,1980,2.931373,Friday the 13th Part VII: The New Blood (1988),Horror
3,3,2517,2.931373,Christine (1983),Horror
4,3,2878,2.931373,Hell Night (1981),Horror
5,3,3017,2.931373,Creepshow 2 (1987),Horror
6,3,3021,2.931373,"Funhouse, The (1981)",Horror
7,3,3652,2.931373,"City of the Living Dead (a.k.a. Gates of Hell,...",Horror
8,3,3839,2.931373,Phantasm IV: Oblivion (1998),Horror
9,3,3908,2.931373,Urban Legends: Final Cut (2000),Horror


### Train-test split test

In [58]:
tfidf_recommender = TFIDFRecommender()

t0 = time.time()

results = [['TFIDFRecommender'] + list(evaluate_train_test_split_implicit(
    tfidf_recommender, ml_ratings_df.loc[:, ['user_id', 'item_id']], ml_movies_df))]

results = pd.DataFrame(results, 
                       columns=['Recommender', 'HR@1', 'HR@3', 'HR@5', 'HR@10', 'NDCG@1', 'NDCG@3', 'NDCG@5', 'NDCG@10'])

display(results)

print('Total evaluation time: {}'.format(time.time() - t0))

Unnamed: 0,Recommender,HR@1,HR@3,HR@5,HR@10,NDCG@1,NDCG@3,NDCG@5,NDCG@10
0,TFIDFRecommender,0.017897,0.040268,0.051454,0.085011,0.017897,0.030547,0.034973,0.045844


Total evaluation time: 27.81628179550171


### Leave-one-out test

In [59]:
tfidf_recommender = TFIDFRecommender()

t0 = time.time()

results = [['TFIDFRecommender'] + list(evaluate_leave_one_out_implicit(
    tfidf_recommender, ml_ratings_df.loc[:, ['user_id', 'item_id']], ml_movies_df, max_evals=300, seed=6789))]

results = pd.DataFrame(results, 
                       columns=['Recommender', 'HR@1', 'HR@3', 'HR@5', 'HR@10', 'NDCG@1', 'NDCG@3', 'NDCG@5', 'NDCG@10'])

display(results)

print('Total evaluation time: {}'.format(time.time() - t0))

Unnamed: 0,Recommender,HR@1,HR@3,HR@5,HR@10,NDCG@1,NDCG@3,NDCG@5,NDCG@10
0,TFIDFRecommender,0.003333,0.006667,0.006667,0.023333,0.003333,0.005436,0.005436,0.010932


Total evaluation time: 703.2800462245941


**Task 6\*.** Implement an SVRRecommender by replacing the LinearRegression model with the SVR model (`from sklearn.svm import SVR`). Use separate features for users and items, not the dot product.

Tune params of the SVR model to obtain as good results as you can. 

To do tuning properly:

- divide the set into training, validation and test sets (randomly divide the dataset in proportions 60%-20%-20%),
- train the model with different sets of tunable parameters on the training set, 
- choose the best tunable params based on results on the validation set, 
- provide the final evaluation metrics on the test set for the best model obtained during tuning.

Recommended method of tuning: use hyperopt. Install the package using the following command: `pip install hyperopt`
    
Print the RMSE, MAPE and TRE on the test set. Use seed 6789.

In [60]:
from sklearn.svm import SVR
from sklearn.preprocessing import MultiLabelBinarizer

    
class SVRRecommender(Recommender):
    """
    SVR recommender class.
    """

    def __init__(self, kernel='rbf', c=1.0, epsilon=0.1):
        """
        Initialize base recommender params and variables.
        """
        super().__init__()
        self.model = None
        self.mlb = None
        self.kernel = kernel
        self.c = c
        self.epsilon = epsilon
        self.mlb = None
        self.users_dict = None
        self.user_features = None
        self.uses_dot_product = False

    def fit(self, interactions_df, users_df, items_df):
        """
        Training of the recommender.

        :param pd.DataFrame interactions_df: DataFrame with recorded interactions between users and items
            defined by user_id, item_id and features of the interaction.
        :param pd.DataFrame users_df: DataFrame with users and their features defined by user_id
            and the user feature columns.
        :param pd.DataFrame items_df: DataFrame with items and their features defined by item_id
            and the item feature columns.
        """

        # Transform genres to a more code-friendly form

        interactions_df = pd.merge(interactions_df, items_df, on='item_id')
        interactions_df = self._transform_genres(interactions_df)

        # Prepare user features

        users_df = interactions_df[['user_id', 'genres']].copy()
        users_df = users_df.explode('genres')
        users_df['val'] = 1
        users_df = users_df.pivot_table(index='user_id', columns='genres', values='val', aggfunc='count')
        users_df = users_df / users_df.sum(axis=1).values.reshape(-1, 1)
        users_df = users_df.rename_axis(None, axis=1).fillna(0)
        users_df = users_df.add_prefix('user_')
#         display(users_df.head(10))

        self.users_dict = users_df.to_dict('index')

        self.user_features = users_df.columns.tolist()

        interactions_df = interactions_df.merge(users_df, on='user_id')
#         display(interactions_df.head(10))

        # Prepare item features

        # Transform genres into binary values

        self.mlb = MultiLabelBinarizer()
        interactions_df = interactions_df.join(
            pd.DataFrame(self.mlb.fit_transform(interactions_df.pop('genres')),
                         columns=self.mlb.classes_,
                         index=interactions_df.index))

        # Normalize the values so that each movie's genres sum up to 1

        interactions_df[self.mlb.classes_] = interactions_df[self.mlb.classes_] \
            / interactions_df[self.mlb.classes_].sum(axis=1).values.reshape(-1, 1)

#         display(interactions_df.loc[:, self.mlb.classes_].head(10))

        # Prepare input data and fit the model

        interactions_df[self.mlb.classes_] = interactions_df[self.mlb.classes_] \
            * interactions_df[self.user_features].values

#         display(interactions_df.head(10))

        x = interactions_df.loc[:, list(self.mlb.classes_)].values
        y = interactions_df['rating'].values

        self.model = SVR(kernel=self.kernel, C=self.c, epsilon=self.epsilon).fit(x, y)

    def recommend(self, users_df, items_df, n_recommendations=1):
        """
        Serving of recommendations. Scores items in items_df for each user in users_df and returns
        top n_recommendations for each user.

        :param pd.DataFrame users_df: DataFrame with users and their features for which recommendations
            should be generated.
        :param pd.DataFrame items_df: DataFrame with items and their features which should be scored.
        :param int n_recommendations: Number of recommendations to be returned for each user.
        :return: DataFrame with user_id, item_id and score as columns returning n_recommendations top recommendations
            for each user.
        :rtype: pd.DataFrame
        """

        # Transform the item to be scored into proper features

        items_df = items_df.copy()
        items_df = self._transform_genres(items_df)

        items_df = items_df.join(
            pd.DataFrame(self.mlb.transform(items_df.pop('genres')),
                         columns=self.mlb.classes_,
                         index=items_df.index))

        items_df[self.mlb.classes_] = items_df[self.mlb.classes_] \
            / items_df[self.mlb.classes_].sum(axis=1).values.reshape(-1, 1)

        # Score the item

        recommendations = pd.DataFrame(columns=['user_id', 'item_id', 'score'])

        for ix, user in users_df.iterrows():
            if user['user_id'] in self.users_dict:
                user_df = pd.DataFrame.from_dict({user['user_id']: self.users_dict[user['user_id']]}, orient='index')
            else:
                user_df = pd.DataFrame.from_dict(
                    {user['user_id']: [1 / len(self.user_features)]*len(self.user_features)}, orient='index')
#             display(user_df)
#             display(items_df)
            input_df = items_df.copy()
            input_df[self.mlb.classes_] = items_df[self.mlb.classes_] * user_df.values
#             display(input_df)
            scores = self.model.predict(input_df.loc[:, self.mlb.classes_].values)

            chosen_pos = np.argsort(-scores)[:n_recommendations]

            user_recommendations = []
            for item_pos in chosen_pos:
                user_recommendations.append(
                    {
                        'user_id': user['user_id'],
                        'item_id': input_df.iloc[item_pos]['item_id'],
                        'score': scores[item_pos]
                    }
                )

            user_recommendations = pd.DataFrame(user_recommendations)

            recommendations = pd.concat([recommendations, user_recommendations])

        return recommendations

    @staticmethod
    def _transform_genres(df):
        """
        Transforms a string with genres into a list of cleaned genre names.

        :param pd.DataFrame df: A DataFrame with 'genres' column.
        """
        df.loc[:, 'genres'] = df['genres'].str.replace("-", "_", regex=False)
        df.loc[:, 'genres'] = df['genres'].str.replace(" ", "_", regex=False)
        df.loc[:, 'genres'] = df['genres'].str.replace("(", "", regex=False)
        df.loc[:, 'genres'] = df['genres'].str.replace(")", "", regex=False)
        df.loc[:, 'genres'] = df['genres'].str.lower()
        df.loc[:, 'genres'] = df['genres'].str.split("|")
        return df
    
    
# Quick test of the recommender

svr_recommender = SVRRecommender()
svr_recommender.fit(ml_ratings_df, None, ml_movies_df)
recommendations = svr_recommender.recommend(pd.DataFrame([[1], [2], [6]], columns=['user_id']), ml_movies_df, 10)

recommendations = pd.merge(recommendations, ml_movies_df, on='item_id', how='left')
print("Recommendations")
display(HTML(recommendations.to_html()))

Recommendations


Unnamed: 0,user_id,item_id,score,title,genres
0,1,147286,4.351657,The Adventures of Sherlock Holmes and Doctor Watson: The Treasures of Agra (1983),Crime|Mystery
1,1,146028,4.351657,The Adventures of Sherlock Holmes and Dr. Watson: The Hound of the Baskervilles (1981),Crime|Mystery
2,1,66335,4.286956,Afro Samurai: Resurrection (2009),Animation
3,1,160848,4.286956,The Red Turtle (2016),Animation
4,1,79677,4.223352,"Two Escobars, The (2010)",Crime|Documentary
5,1,1213,4.136901,Goodfellas (1990),Crime|Drama
6,1,5888,4.136901,Brother (Brat) (1997),Crime|Drama
7,1,31309,4.136901,Rocco and His Brothers (Rocco e i suoi fratelli) (1960),Crime|Drama
8,1,43912,4.136901,Freedomland (2006),Crime|Drama
9,1,5954,4.136901,25th Hour (2002),Crime|Drama


In [61]:
svr_recommender = SVRRecommender(kernel='rbf', c=5.9672721141155, epsilon=0.8583733904374324)

results = [['SVRRecommender'] + list(evaluate_train_test_split_explicit(
    svr_recommender, ml_ratings_df.loc[:, ['user_id', 'item_id', 'rating']], ml_movies_df, seed=seed))]

results = pd.DataFrame(results, 
                       columns=['Recommender', 'RMSE', 'MRE', 'TRE'])

display(results)

Unnamed: 0,Recommender,RMSE,MRE,TRE
0,SVRRecommender,0.994731,0.336384,0.223827


In [62]:
from hyperopt import hp, fmin, tpe, Trials
import traceback

# Split into train_validation and test sets

shuffle = np.arange(len(ml_ratings_df))
rng.shuffle(shuffle)
shuffle = list(shuffle)

train_test_split = 0.8
split_index = int(len(ml_ratings_df) * train_test_split)

train_validation = ml_ratings_df.iloc[shuffle[:split_index]].loc[:, ['user_id', 'item_id', 'rating']]
test = ml_ratings_df.iloc[shuffle[split_index:]].loc[:, ['user_id', 'item_id', 'rating']]

# Tune

def loss(tuned_params):
    svr_recommender = SVRRecommender(kernel='rbf', c=tuned_params['C'], epsilon=tuned_params['epsilon'])
    rmse, mre, tre = evaluate_train_test_split_explicit(
        svr_recommender, train_validation, ml_movies_df, seed=seed)
    return rmse

# Define the search space for hyperopt
# Write your code here


succeded = False
n_tries = 3
t = 0
while not succeded and t < n_tries:
    try:
        trials = Trials()
        best_param_set = fmin(loss, space=space, algo=tpe.suggest, max_evals=100, show_progressbar=True, trials=trials)
        succeded = True
    except:
        t += 1
        traceback.print_exc()

# Best params

print("C = {}".format(best_param_set['C']))
print("epsilon = {}".format(best_param_set['epsilon']))
    
# Test

C = 0.8078294657404328
epsilon = 0.3898720771476633

svr_recommender = SVRRecommender(C=best_param_set['C'], epsilon=best_param_set['epsilon'])

results = [['SVRRecommender'] + list(evaluate_train_test_split_explicit(
    svr_recommender, {'train': train_validation, 'test': test}, ml_movies_df, seed=seed))]

results = pd.DataFrame(results, 
                       columns=['Recommender', 'RMSE', 'MRE', 'TRE'])

display(HTML(results.to_html()))

ModuleNotFoundError: No module named 'hyperopt'

**Task 7.** Gather the results for LinearRegressionRecommender, SVRRecommender, TFIDFRecommender, MostPopularRecommender, HighestRatedRecommender, RandomRecommender from the evaluate_train_test_split_implicit method and print them as a single DataFrame.

In [63]:
linear_recommender = LinearRegressionRecommender()
svr_recommender = SVRRecommender()
tfidf_recommender = TFIDFRecommender()
most_popular_recommender = MostPopularRecommender()
highest_rated_recommender = HighestRatedRecommender()
random_recommender = RandomRecommender()

recommenders = [linear_recommender,svr_recommender,tfidf_recommender,most_popular_recommender,highest_rated_recommender,random_recommender]

all_results = []

for recommender in recommenders:
    results = [[type(recommender).__name__] + list(evaluate_train_test_split_implicit(
        recommender, ml_ratings_df, ml_movies_df))]

    results = pd.DataFrame(results, 
                           columns=['Recommender', 'HR@1', 'HR@3', 'HR@5', 'HR@10', 'NDCG@1', 'NDCG@3', 'NDCG@5', 'NDCG@10'])
    all_results.append(results)
    
all_results = pd.concat(all_results).reset_index(drop=True)
print(all_results)


Unnamed: 0,Recommender,HR@1,HR@3,HR@5,HR@10,NDCG@1,NDCG@3,NDCG@5,NDCG@10
0,LinearRegressionRecommender,0.0,0.006711,0.01566,0.080537,0.0,0.003942,0.007501,0.028281


Unnamed: 0,Recommender,HR@1,HR@3,HR@5,HR@10,NDCG@1,NDCG@3,NDCG@5,NDCG@10
0,SVRRecommender,0.006711,0.011186,0.013423,0.040268,0.006711,0.009241,0.010107,0.018459


Unnamed: 0,Recommender,HR@1,HR@3,HR@5,HR@10,NDCG@1,NDCG@3,NDCG@5,NDCG@10
0,TFIDFRecommender,0.017897,0.040268,0.051454,0.085011,0.017897,0.030547,0.034973,0.045844


KeyError: "None of [Index(['item_id', 'user_id'], dtype='object')] are in the [index]"