Followed this tutorial: https://towardsdatascience.com/build-a-user-based-collaborative-filtering-recommendation-engine-for-anime-92d35921f304

In [1]:
import numpy as np
import pandas as pd

In [2]:
animes = pd.read_csv('data/anime.csv')
rating_matrix = pd.read_csv('data/rating_formatted.csv')
mal = pd.read_csv('data/mal.csv')

In [3]:
# Format the imported data
rating_matrix = rating_matrix.set_index('user_id')
rating_matrix.columns = rating_matrix.columns.astype(int)

rating_matrix

Unnamed: 0_level_0,1,5,6,7,15,16,18,19,20,22,...,41783,41930,42091,42203,42603,42897,42938,43608,43609,47778
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
16,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,7.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
81,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,8.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
124,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,8.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
155,0.0,0.0,0.0,0.0,0.0,0.0,0.0,9.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
160,8.0,0.0,9.0,0.0,0.0,0.0,0.0,0.0,9.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
353329,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
353344,9.0,8.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
353362,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,9.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
353372,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,8.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [4]:
# Remove ratings of 0 (not rated)
mal = mal[mal.my_score != 0]

In [5]:
# Create a dataframe for the user data
USER = -1 # unique id for user

user_df = mal[['my_score', 'series_animedb_id']].set_index('series_animedb_id')
user_df = user_df.sort_index().transpose() # ascending anime id order, rotate it to match rating_matrix
user_df = user_df.rename(index={'my_score': USER}) # give user unique id

user_df

series_animedb_id,431,849,918,1575,2167,2904,4181,4224,4382,5081,...,46095,47398,47778,48561,48569,48583,48736,49310,50265,50360
-1,3,7,7,8,7,8,9,6,7,6,...,7,4,8,8,9,8,6,9,8,8


In [6]:
# Matrix combining current user and other user data
final_matrix = pd.concat([user_df, rating_matrix], ignore_index=False)
final_matrix = final_matrix.reindex(sorted(final_matrix.columns), axis=1) # list anime ids in acsending order
final_matrix = final_matrix.fillna(0) # replace NaNs with 0

final_matrix

Unnamed: 0,1,5,6,7,15,16,18,19,20,22,...,46095,47398,47778,48561,48569,48583,48736,49310,50265,50360
-1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,7.0,4.0,8.0,8.0,9.0,8.0,6.0,9.0,8.0,8.0
16,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,7.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
81,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,8.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
124,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,8.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
155,0.0,0.0,0.0,0.0,0.0,0.0,0.0,9.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
353329,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
353344,9.0,8.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
353362,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,9.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
353372,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,8.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [7]:
# Cosine similarity

from sklearn.metrics.pairwise import cosine_similarity
import operator
def similar_users(user_id, matrix, k=20):
    # create a df of just the current user
    user = matrix[matrix.index == user_id]
    
    # and a df of all other users
    other_users = matrix[matrix.index != user_id]
    
    # calc cosine similarity between user and each other user
    similarities = cosine_similarity(user, other_users)[0].tolist()
    
    # create list of indices of these users
    other_user_ids = other_users.index.tolist()
    
    # create dataframe of user index and their similarity
    similarity_series = pd.Series(similarities, index=other_user_ids)
    
    # sort by similarity
    similarity_series = similarity_series.sort_values(ascending=False)
    
    # grab k users off the top
    return similarity_series.head(k)
    

# Run the function
similarity_series = similar_users(USER, final_matrix)
display(similarity_series)

218002    0.486004
180585    0.451228
81301     0.450311
311774    0.438344
338347    0.433555
3747      0.431432
53967     0.428823
62660     0.424184
247044    0.420969
80223     0.420387
329906    0.418658
71658     0.418230
16106     0.415667
104915    0.413608
212427    0.410772
232078    0.410352
200232    0.409586
179102    0.409477
50191     0.409025
35145     0.404858
dtype: float64

In [8]:
# Predicted score

def predicted_score(similarity_series, matrix, animes):
    avg_ratings = matrix[matrix != 0].mean(axis=1)
    
    # Equation from Recommendation Systems: An Introduction by Dietmar Jannack et al.
    return [avg_ratings[USER] + (sum([similarity * (matrix[anime][similar_user] - avg_ratings[similar_user])
                                      for similar_user, similarity in similarity_series.iteritems()])) 
                                            / similarity_series.sum() for anime in animes]
            

In [9]:
# Recommend anime

def recommend_item(user_index, similarity_series, matrix, items=10):
    
    # load vectors for similar users
    similar_users = matrix[matrix.index.isin(similarity_series.index)]
    # calc avg ratings for each anime across the similar users
    similar_users = similar_users.mean(axis=0)
    # convert to dataframe so its easy to sort and filter
    similar_users_df = pd.DataFrame(similar_users, columns=['score'])
    
    
    # load vector for the current user
    user_df = matrix[matrix.index == user_index]
    # transpose it so its easier to filter
    user_df_transposed = user_df.transpose()
    # rename the column as 'rating'
    user_df_transposed.columns = ['rating']
    # Animes without a rating (0)
    user_df_transposed = user_df_transposed[user_df_transposed['rating']==0]
    # generate a list of animes the user has not seen
    animes_unseen = user_df_transposed.index.tolist()
    
    
    # filter avg ratings of similar users for only anime the current user has not seen
    similar_users_df_filtered = similar_users_df[similar_users_df.index.isin(animes_unseen)]
    # order the dataframe
    similar_users_df_ordered = similar_users_df_filtered.sort_values(by=['score'], ascending=False)
    # grab the top n anime   
    top_anime = similar_users_df_ordered.head(items)
    # sort the list
    top_anime = top_anime.sort_index()
    # make a list for just the anime ids
    top_anime_id = top_anime.index.tolist()
    
    # Pull out anime data of top animes
    anime_information = animes[animes['MAL_ID'].isin(top_anime_id)]
    # Add the average ratings of similar users to the dataframe
    anime_information.insert(0, 'Score', predicted_score(similarity_series, matrix, top_anime_id))
    # Sort based on score
    anime_information = anime_information.sort_values('Score', ascending=False)
    # Remove index numbers
    anime_information.reset_index(drop=True, inplace=True)
    
    
    return anime_information

# Run the function
recommend_item(USER, similarity_series, final_matrix)

Unnamed: 0,Score,MAL_ID,Name,Average Rating,Genres,English name,Japanese name,Type,Episodes,Premiered,Studios,Members,Favorites
0,8.366586,9253,Steins;Gate,9.11,"Thriller, Sci-Fi",Steins;Gate,STEINS;GATE,TV,24,Spring 2011,White Fox,1771162,148452
1,8.003709,5114,Fullmetal Alchemist: Brotherhood,9.19,"Action, Military, Adventure, Comedy, Drama, Ma...",Fullmetal Alchemist:Brotherhood,鋼の錬金術師 FULLMETAL ALCHEMIST,TV,64,Spring 2009,Bones,2248456,183914
2,7.693839,1535,Death Note,8.63,"Mystery, Police, Psychological, Supernatural, ...",Death Note,デスノート,TV,37,Fall 2006,Madhouse,2589552,145201
3,7.601886,11741,Fate/Zero 2nd Season,8.59,"Action, Supernatural, Magic, Fantasy",Fate/Zero Season 2,フェイト/ゼロ 2ndシーズン,TV,12,Spring 2012,ufotable,808294,17197
4,7.037294,37779,Yakusoku no Neverland,8.65,"Sci-Fi, Mystery, Horror, Psychological, Thrill...",The Promised Neverland,約束のネバーランド,TV,12,Winter 2019,CloverWorks,1133952,32542
5,6.923593,37510,Mob Psycho 100 II,8.84,"Action, Slice of Life, Comedy, Supernatural",Mob Psycho 100 II,モブサイコ100 II,TV,13,Winter 2019,Bones,835688,20654
6,6.909301,30276,One Punch Man,8.57,"Action, Sci-Fi, Comedy, Parody, Super Power, S...",One Punch Man,ワンパンマン,TV,12,Fall 2015,Madhouse,2123866,54435
7,6.336864,33486,Boku no Hero Academia 2nd Season,8.33,"Action, Comedy, Super Power, School, Shounen",My Hero Academia 2,僕のヒーローアカデミア,TV,25,Spring 2017,Bones,1611771,18930
8,6.286908,36456,Boku no Hero Academia 3rd Season,8.25,"Action, Comedy, Super Power, School, Shounen",My Hero Academia 3,僕のヒーローアカデミア,TV,25,Spring 2018,Bones,1333355,13575
9,5.978966,14813,Yahari Ore no Seishun Love Comedy wa Machigatt...,8.07,"Slice of Life, Comedy, Drama, Romance, School",My Teen Romantic Comedy SNAFU,やはり俺の青春ラブコメはまちがっている。,TV,13,Spring 2013,Brain's Base,971934,29425
