## Recommendation Engine 

Necessary imports

In [1]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
from sklearn.metrics.pairwise import cosine_similarity
from datetime import datetime, timedelta

Load data (we will use the ml-latest-small dataset)

In [2]:
movies = pd.read_csv('ml-latest-small/movies.csv')
ratings = pd.read_csv('ml-latest-small/ratings.csv')
tags = pd.read_csv('ml-latest-small/tags.csv')

In [3]:
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [4]:
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [5]:
tags.head()

Unnamed: 0,userId,movieId,tag,timestamp
0,2,60756,funny,1445714994
1,2,60756,Highly quotable,1445714996
2,2,60756,will ferrell,1445714992
3,2,89774,Boxing story,1445715207
4,2,89774,MMA,1445715200


Create dummy variables for different genres

In [6]:
movies = movies.join(movies.genres.str.get_dummies().astype(bool))
movies.drop(['genres', '(no genres listed)'], inplace=True, axis=1)

Join movies and ratings

In [7]:
df = pd.merge(movies, ratings, on='movieId', how='right')

# Shuffle rows
df = df.sample(frac=1).reset_index(drop=True)

In [8]:
df.head()

Unnamed: 0,movieId,title,Action,Adventure,Animation,Children,Comedy,Crime,Documentary,Drama,...,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western,userId,rating,timestamp
0,1234,"Sting, The (1973)",False,False,False,False,True,True,False,False,...,False,False,False,False,False,False,False,590,4.0,1258419911
1,34162,Wedding Crashers (2005),False,False,False,False,True,False,False,False,...,False,False,True,False,False,False,False,288,4.0,1174395858
2,2078,"Jungle Book, The (1967)",False,False,True,True,True,False,False,False,...,True,False,False,False,False,False,False,132,3.0,1157968835
3,2997,Being John Malkovich (1999),False,False,False,False,True,False,False,True,...,False,False,False,False,False,False,False,430,4.0,962936332
4,3361,Bull Durham (1988),False,False,False,False,True,False,False,True,...,False,False,True,False,False,False,False,51,5.0,1230928856


Create pivot table with votes

In [9]:
user_ratings_pivot = df.pivot_table(index='userId', columns='title', values='rating')

In [10]:
user_ratings_pivot.head()

title,'71 (2014),'Hellboy': The Seeds of Creation (2004),'Round Midnight (1986),'Salem's Lot (2004),'Til There Was You (1997),'Tis the Season for Love (2015),"'burbs, The (1989)",'night Mother (1986),(500) Days of Summer (2009),*batteries not included (1987),...,Zulu (2013),[REC] (2007),[REC]² (2009),[REC]³ 3 Génesis (2012),anohana: The Flower We Saw That Day - The Movie (2013),eXistenZ (1999),xXx (2002),xXx: State of the Union (2005),¡Three Amigos! (1986),À nous la liberté (Freedom for Us) (1931)
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,,,,,,,,,,,...,,,,,,,,,4.0,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,,,,,,,,,,,...,,,,,,,,,,


In [11]:
# Columns match the number of unique movies of the dataset
df.title.nunique()

9719

Select a random user

In [12]:
user = 182

In [13]:
ratings_user = df.query(f'userId == {user}')

In [14]:
ratings_user

Unnamed: 0,movieId,title,Action,Adventure,Animation,Children,Comedy,Crime,Documentary,Drama,...,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western,userId,rating,timestamp
59,7022,Battle Royale (Batoru rowaiaru) (2000),True,False,False,False,False,False,False,True,...,False,False,False,False,True,False,False,182,4.5,1075742316
139,3947,Get Carter (1971),True,False,False,False,False,True,False,True,...,False,False,False,False,True,False,False,182,2.5,1055153078
160,10,GoldenEye (1995),True,True,False,False,False,False,False,False,...,False,False,False,False,True,False,False,182,3.5,1054782216
183,5298,Human Nature (2001),False,False,False,False,True,False,False,False,...,False,False,True,False,False,False,False,182,4.0,1055155191
195,1245,Miller's Crossing (1990),False,False,False,False,False,True,False,True,...,False,False,False,False,True,False,False,182,4.5,1054780337
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
100410,16,Casino (1995),False,False,False,False,False,True,False,True,...,False,False,False,False,False,False,False,182,5.0,1054783631
100471,3858,Cecil B. DeMented (2000),False,False,False,False,True,False,False,False,...,False,False,False,False,False,False,False,182,4.0,1055155678
100555,1672,"Rainmaker, The (1997)",False,False,False,False,False,False,False,True,...,False,False,False,False,False,False,False,182,2.5,1075765162
100651,4011,Snatch (2000),False,False,False,False,True,True,False,False,...,False,False,False,False,True,False,False,182,4.5,1054781325


As we can see, there are a lot of NaN values in the user_rating_pivot dataframe, which means that most users have not seen all the movies of the dataset (since no rating was assigned to them)

In [15]:
user_ratings_pivot.head()

title,'71 (2014),'Hellboy': The Seeds of Creation (2004),'Round Midnight (1986),'Salem's Lot (2004),'Til There Was You (1997),'Tis the Season for Love (2015),"'burbs, The (1989)",'night Mother (1986),(500) Days of Summer (2009),*batteries not included (1987),...,Zulu (2013),[REC] (2007),[REC]² (2009),[REC]³ 3 Génesis (2012),anohana: The Flower We Saw That Day - The Movie (2013),eXistenZ (1999),xXx (2002),xXx: State of the Union (2005),¡Three Amigos! (1986),À nous la liberté (Freedom for Us) (1931)
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,,,,,,,,,,,...,,,,,,,,,4.0,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,,,,,,,,,,,...,,,,,,,,,,


In order to fix this, we can't just substitute the NaN values with 0, as this would give the unwatched movies a rating of 'dislike'. For that reason we will substract the means for each user to each rating, changing the average rating of each user to 0, and then substitute the missing values by 0 (a completely neutral score).

In [16]:
avg_ratings = user_ratings_pivot.mean(axis=1)
#save for later
user_ratings_table = user_ratings_pivot
user_ratings_pivot = user_ratings_pivot.sub(avg_ratings, axis=0)

In [17]:
user_ratings_pivot = user_ratings_pivot.fillna(0)
user_ratings_pivot.loc[182].sort_values(ascending=False).head(20)

title
Seventh Seal, The (Sjunde inseglet, Det) (1957)    1.488741
Apocalypse Now (1979)                              1.488741
Fight Club (1999)                                  1.488741
Pollock (2000)                                     1.488741
Chocolat (2000)                                    1.488741
Singin' in the Rain (1952)                         1.488741
8 1/2 (8½) (1963)                                  1.488741
Full Metal Jacket (1987)                           1.488741
Reservoir Dogs (1992)                              1.488741
Unbearable Lightness of Being, The (1988)          1.488741
Platoon (1986)                                     1.488741
Matrix, The (1999)                                 1.488741
Believer, The (2001)                               1.488741
Being John Malkovich (1999)                        1.488741
Brazil (1985)                                      1.488741
Rosencrantz and Guildenstern Are Dead (1990)       1.488741
Cinema Paradiso (Nuovo cinema Para

The algorithm we will use to estimate how similar a pair of users are will be cosine similarity. Example with users 182 and 326.

In [18]:
cosine_similarity(user_ratings_pivot.loc[182, :].values.reshape(1,-1), user_ratings_pivot.loc[326, :].values.reshape(1,-1))

array([[0.00977254]])

Apply it over the whole dataframe

In [19]:
user_similarities = cosine_similarity(user_ratings_pivot)
cosine_similarity_df = pd.DataFrame(user_similarities, index=user_ratings_pivot.index,columns=user_ratings_pivot.index).fillna(0)

In [20]:
cosine_similarity_df

userId,1,2,3,4,5,6,7,8,9,10,...,601,602,603,604,605,606,607,608,609,610
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1.000000,0.001265,0.000553,0.048419,0.021847,-0.045497,-0.006200,0.047013,0.019510,-0.008754,...,0.018127,-0.017172,-0.015221,-0.037059,-0.029121,0.012016,0.055261,0.075224,-0.025713,0.010932
2,0.001265,1.000000,0.000000,-0.017164,0.021796,-0.021051,-0.011114,-0.048085,0.000000,0.003012,...,-0.050551,-0.031581,-0.001688,0.000000,0.000000,0.006226,-0.020504,-0.006001,-0.060091,0.024999
3,0.000553,0.000000,1.000000,-0.011260,-0.031539,0.004800,0.000000,-0.032471,0.000000,0.000000,...,-0.004904,-0.016117,0.017749,0.000000,-0.001431,-0.037289,-0.007789,-0.013001,0.000000,0.019550
4,0.048419,-0.017164,-0.011260,1.000000,-0.029620,0.013956,0.058091,0.002065,-0.005874,0.051590,...,-0.037687,0.063122,0.027640,-0.013782,0.040037,0.020590,0.014628,-0.037569,-0.017884,-0.000995
5,0.021847,0.021796,-0.031539,-0.029620,1.000000,0.009111,0.010117,-0.012284,0.000000,-0.033165,...,0.015964,0.012427,0.027076,0.012461,-0.036272,0.026319,0.031896,-0.001751,0.093829,-0.000278
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
606,0.012016,0.006226,-0.037289,0.020590,0.026319,-0.009137,0.028326,0.022277,0.031633,-0.039946,...,0.053683,0.016384,0.098011,0.061078,0.019678,1.000000,0.017927,0.056676,0.038422,0.075464
607,0.055261,-0.020504,-0.007789,0.014628,0.031896,0.045501,0.030981,0.048822,-0.012161,-0.017656,...,0.049059,0.038197,0.049317,0.002355,-0.029381,0.017927,1.000000,0.044514,0.019049,0.021860
608,0.075224,-0.006001,-0.013001,-0.037569,-0.001751,0.021727,0.028414,0.071759,0.032783,-0.052000,...,0.069198,0.051388,0.012801,0.006319,-0.007978,0.056676,0.044514,1.000000,0.050714,0.054454
609,-0.025713,-0.060091,0.000000,-0.017884,0.093829,0.053017,0.008754,0.077180,0.000000,-0.040090,...,0.043465,0.062400,0.015334,0.094038,-0.054722,0.038422,0.019049,0.050714,1.000000,-0.012471


Find the 30 most similar users to our sample user 182.

In [21]:
cosine_similarity_series = cosine_similarity_df.loc[182]

In [22]:
ordered_similarities = cosine_similarity_series.sort_values(ascending=False)

In [23]:
ordered_similarities

userId
182    1.000000
434    0.171599
480    0.168925
606    0.154940
590    0.145742
         ...   
3     -0.045030
327   -0.045946
155   -0.056471
82    -0.057706
361   -0.090362
Name: 182, Length: 610, dtype: float64

In [24]:
ordered_similarities.index[1:36].values

array([434, 480, 606, 590, 489, 414, 387, 219, 490, 599, 610,  64, 287,
       368, 415, 160, 391, 132,  72, 274, 331, 483, 449, 187,  18, 552,
       103,  42, 444, 375, 131, 373, 211, 298, 254], dtype=int64)

In [25]:
top_similar_users_ratings = user_ratings_table[user_ratings_table.index.isin(ordered_similarities.index[1:31])]

In [26]:
top_similar_users_ratings

title,'71 (2014),'Hellboy': The Seeds of Creation (2004),'Round Midnight (1986),'Salem's Lot (2004),'Til There Was You (1997),'Tis the Season for Love (2015),"'burbs, The (1989)",'night Mother (1986),(500) Days of Summer (2009),*batteries not included (1987),...,Zulu (2013),[REC] (2007),[REC]² (2009),[REC]³ 3 Génesis (2012),anohana: The Flower We Saw That Day - The Movie (2013),eXistenZ (1999),xXx (2002),xXx: State of the Union (2005),¡Three Amigos! (1986),À nous la liberté (Freedom for Us) (1931)
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
18,,,,,,,,,4.0,,...,,,,,,,,,,
42,,,,,,,,,,,...,,,,,,,,,5.0,
64,,,,,,,,,,,...,,,,,,,,,,
72,,,,,,,,,,,...,,,,,,,,,,
103,,,,,,,,,,,...,,,,,,,,,,
132,,,,,,,,,,,...,,,,,,,,,,
160,,,,,,,,,,,...,,,,,,,,,,
187,,,,,,,,,,,...,,,,,,,,,,
219,,,,,,,,,,,...,,,,,,,2.0,,,
274,,,,,,,,,,,...,,3.5,,,,,2.5,2.0,,


First, we want to know what genre is the most and best rated by the user, in order to know which one should be recommended first. In order to measure both quality and quantity of ratings, we can implement a formula that gives a weighted score: that is, a metric that takes into account number of ratings and average ratings.

$$
Weighted Score = \frac{1}{2} * (s + 5(1 - e^{\frac{-q}{Q}}))
$$

Where:

* s is the mean score of the genre.
* q is the number of ratings of the genre.
* Q is a number that represents a moderate number of votes for a genre.

Let's implement the function

In [27]:
def weighted_score(s: float, q: int, Q: int) -> float:
    return 0.5 * (s + 5 * (1 - np.exp(-q/Q)))

Additionally, we will prioritise the ratings of the most recently watched movies by the user, creating a new metric that is an average between the weighted score of the last month and the historical weighted score. As for the number, we will select the 100 most recently watched movies (in the case of this user, approximately a 10% of the movies he has watched)

In [28]:
def timestamp_to_datetime(timestamp: str):
    return datetime.fromtimestamp(int(timestamp))

In [29]:
ratings_user['datetime'] = ratings_user.timestamp.apply(timestamp_to_datetime)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ratings_user['datetime'] = ratings_user.timestamp.apply(timestamp_to_datetime)


In [30]:
ratings_user = ratings_user.sort_values(by=['datetime'], ascending=False)

We could have selected a higher Q, but for the sake of making recommendations that can surprise the user more I will set the number to 20, that is: 20 is considered a good number of ratings for a genre to be considered liked. Considering some genres are more niche than others, this will favour productions of those genres to be more likely selected by the user. Likewise, we will use Q = 2 for the sample of most recently watched 100 movies.

In [31]:
genres_user_rating = pd.DataFrame(index = ratings_user.columns[2:-4])

for genre in genres_user_rating.index:
    df_genre = ratings_user.loc[ratings_user[genre] == True]
    df_genre_recent = ratings_user.head(100).loc[ratings_user[genre] == True]
    genres_user_rating.loc[genre, 'Average Rating (global)'] = df_genre.rating.mean()
    genres_user_rating.loc[genre, 'Number of Ratings (global)'] = df_genre.shape[0]
    genres_user_rating.loc[genre, 'Weighted Score (global)'] = weighted_score(s = df_genre.rating.mean(), q = df_genre.shape[0], Q=20)
    if not np.isnan(df_genre_recent.rating.mean()):
        genres_user_rating.loc[genre, 'Average Rating (recent)'] = df_genre_recent.rating.mean() 
        genres_user_rating.loc[genre, 'Number of Ratings (recent)'] = df_genre_recent.shape[0]
        genres_user_rating.loc[genre, 'Weighted Score (recent)'] = weighted_score(s = df_genre_recent.rating.mean(), q = df_genre_recent.shape[0], Q=2)

    else:
        genres_user_rating.loc[genre, 'Average Rating (recent)'] = 0
        genres_user_rating.loc[genre, 'Number of Ratings (recent)'] = 0
        genres_user_rating.loc[genre, 'Weighted Score (recent)'] = 0
    genres_user_rating.loc[genre, 'Weighted Score (avg. global & recent)'] = (genres_user_rating.loc[genre, 'Weighted Score (recent)'] + genres_user_rating.loc[genre, 'Weighted Score (global)']) / 2

Now, we can know what genres have the highest score for the user, according to the metric we have defined.

In [32]:
genres_user_rating.sort_values(by=['Weighted Score (avg. global & recent)'], ascending=False)

Unnamed: 0,Average Rating (global),Number of Ratings (global),Weighted Score (global),Average Rating (recent),Number of Ratings (recent),Weighted Score (recent),Weighted Score (avg. global & recent)
Drama,3.867076,489.0,4.433538,3.801724,58.0,4.400862,4.4172
War,3.959016,61.0,4.361111,4.166667,6.0,4.458866,4.409988
Mystery,3.765432,81.0,4.33916,3.807692,13.0,4.400088,4.369624
Romance,3.651235,162.0,4.324858,3.65625,16.0,4.327286,4.326072
Crime,3.675393,191.0,4.337518,3.595238,21.0,4.29755,4.317534
Comedy,3.448328,329.0,4.224164,3.515625,32.0,4.257812,4.240988
Thriller,3.286458,288.0,4.143228,3.483333,30.0,4.241666,4.192447
Fantasy,3.331395,86.0,4.131776,3.5,7.0,4.174507,4.153141
Action,3.092453,265.0,4.046222,3.416667,24.0,4.208318,4.12727
Adventure,3.119186,172.0,4.059133,3.1875,16.0,4.092911,4.076022


In the frontend, it will be represented in the following way: the higher the score of the genre, the higher the recommendations of the genre will be featured on the display. As the user scrolls down, the second, third, and so on... best scored genres will be visible.

In [33]:
def get_recommendations(df: pd.DataFrame, genre: str, n: int = None) -> pd.DataFrame:
    genre_movies = df.loc[df[genre] == True].title.values
    genre_movies_similarusers = [m for m in top_similar_users_ratings.columns if m in df.loc[df[genre] == True].title.values]
    scores = top_similar_users_ratings[genre_movies_similarusers].mean(skipna=True)
    scores = scores[~scores.isna()]
    if n:
        return pd.DataFrame(scores).sort_values(by=[0], ascending=False).head(n)
    else:
        return pd.DataFrame(scores).sort_values(by=[0], ascending=False)

get_recommendations(df, 'Drama', 10)

Unnamed: 0_level_0,0
title,Unnamed: 1_level_1
Stranger Than Paradise (1984),5.0
Satin Rouge (2002),5.0
All the Real Girls (2003),5.0
Manon of the Spring (Manon des sources) (1986),5.0
Sophie's Choice (1982),5.0
Sonatine (Sonachine) (1993),5.0
Some Kind of Wonderful (1987),5.0
Eva (2011),5.0
Enter the Void (2009),5.0
Branded to Kill (Koroshi no rakuin) (1967),5.0


We can estimate the accuracy of this algorithm using the metric mean absolute error. For this, we will consider movies rated 4 or more as 'liked', and movies rated less as 'disliked' (as this is the implementation that is present in the frontend for the users)

In [34]:
def mean_absolute_error(movies: pd.DataFrame, ratings: pd.DataFrame, genre: str):
    rm = get_recommendations(movies, 'Drama')
    ur = ratings.copy()
    rm['liked'] = rm[0] >= 4
    ur['liked'] = ratings_user.rating >= 4
    # If it is equal to 1, the prediction has been a failure: the recommended movies that the user has seen were disliked; if it is equal to 0, it has been a total success, as liking the movie has been correctly predicted
    return 1- ur[ur.title.isin(rm.query('liked == True').index)].query('Drama == True').liked.mean()

mean_absolute_error(df, ratings_user, 'Drama')

0.21052631578947367

Our recomendations are almost 80% accurate (for the drama genre). Not bad!

When clicking on each movie, recommendations will also be made below the media player. In this case, it won't be collaborative filtering based, but purely content based. We will look for movies that are of similar genres, and have similar user tags.

In [35]:
movie_tags = pd.merge(movies, tags, on='movieId', how='left')

In [36]:
movie_tags.head()

Unnamed: 0,movieId,title,Action,Adventure,Animation,Children,Comedy,Crime,Documentary,Drama,...,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western,userId,tag,timestamp
0,1,Toy Story (1995),False,True,True,True,True,False,False,False,...,False,False,False,False,False,False,False,336.0,pixar,1139046000.0
1,1,Toy Story (1995),False,True,True,True,True,False,False,False,...,False,False,False,False,False,False,False,474.0,pixar,1137207000.0
2,1,Toy Story (1995),False,True,True,True,True,False,False,False,...,False,False,False,False,False,False,False,567.0,fun,1525286000.0
3,2,Jumanji (1995),False,True,False,True,False,False,False,False,...,False,False,False,False,False,False,False,62.0,fantasy,1528844000.0
4,2,Jumanji (1995),False,True,False,True,False,False,False,False,...,False,False,False,False,False,False,False,62.0,magic board game,1528844000.0


In [37]:
arr_similarity = lambda a1, a2: np.mean([int(x == y) for x, y in zip(a1, a2)])
arr_similarity_tag = lambda a_m, a_t: np.mean([1 if t in a_t.values else 0 for t in a_m])

def get_similar_movies(df_total: pd.DataFrame, movie_id: int, n: int = 10):
    df_movie = df_total.query(f'movieId == {movie_id}')
    
    score_df = pd.DataFrame()
    
    # We will first calculate the most similar movies regarding genre
    genres = df_movie.iloc[0][2:-3]
    
    for _, m in df_total.iterrows():
        score_df.loc[m.title, 'movieId'] = m.movieId
        score_df.loc[m.title, 'genre_similarity'] = arr_similarity(genres, m[2:-3])
        
    # Now we will learn how many tags do the movies share with the target movie
    # There are many tags that are irrelevant to some extent. I couldn't spot them all, but a very obvious one is 'fun', which is very subjective and of little use for recommendations
    tags_movie = [t for t in df_movie.tag.values if t != 'fun']
    tag_related_movies = df_total[df_total.tag.isin(tags_movie)]
    
    # Now we will study how many tags are shared between our candidate recommended movies and our target movie
    for m in tag_related_movies.movieId.unique():
        tags = df_total[df_total.movieId == m].tag
        score_df.loc[df_total.query(f'movieId == {m}').iloc[0].title, 'tag_similarity'] = arr_similarity_tag(tags_movie, tags)
    
    score_df.tag_similarity = score_df.tag_similarity.fillna(0)
    
    # Since tags are not as reliable as genre, we will use them but still give them less importance than we do to genre for the recommendation
    score_df['genre_tag_similarity'] = 0.8 * score_df.genre_similarity + 0.2 * score_df.tag_similarity
    return score_df.sort_values(by='genre_tag_similarity', ascending=False).head(n)

In [38]:
similar_movies = get_similar_movies(df_total = movie_tags, movie_id = 89745)

In [39]:
similar_movies

Unnamed: 0,movieId,genre_similarity,tag_similarity,genre_tag_similarity
"Avengers, The (2012)",89745.0,1.0,1.0,1.0
"Amazing Spider-Man, The (2012)",95510.0,1.0,0.0,0.8
Captain America: The Winter Soldier (2014),110102.0,1.0,0.0,0.8
Godzilla (2014),111364.0,1.0,0.0,0.8
The Hunger Games: Catching Fire (2013),106487.0,1.0,0.0,0.8
Ender's Game (2013),106002.0,1.0,0.0,0.8
Avatar (2009),72998.0,1.0,0.0,0.8
Pacific Rim (2013),103228.0,1.0,0.0,0.8
After Earth (2013),102880.0,1.0,0.0,0.8
Tron: Legacy (2010),82461.0,1.0,0.0,0.8
