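This notebook is adapted from the following tutorial: https://towardsdatascience.com/build-a-user-based-collaborative-filtering-recommendation-engine-for-anime-92d35921f304
Note: the tutorial is user-based, whereas this notebook transposes the rating matrix and computes similarities between anime (item-based).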

In [1]:
import numpy as np
import pandas as pd

In [2]:
# Load the three source tables in one pass:
#   animes        - anime catalogue (its 'MAL_ID' and 'Name' columns are used later)
#   rating_matrix - ratings table with a 'user_id' column plus one column per anime id
#   mal           - the current user's own list ('my_score', 'series_animedb_id')
animes, rating_matrix, mal = (
    pd.read_csv(csv_path)
    for csv_path in ('data/anime.csv', 'data/rating_formatted.csv', 'data/mal.csv')
)

In [3]:
# Format the imported data

# Index the rows by user id. The guard keeps this cell re-runnable: once
# 'user_id' has been moved into the index, calling set_index('user_id')
# a second time would raise a KeyError.
if rating_matrix.index.name != 'user_id':
    rating_matrix = rating_matrix.set_index('user_id')

# Column labels read from a CSV header are strings; cast the anime ids to int
# so they can be aligned with integer anime ids from other frames.
rating_matrix.columns = rating_matrix.columns.astype(int)

rating_matrix

Unnamed: 0_level_0,1,5,6,7,15,16,18,19,20,22,...,41783,41930,42091,42203,42603,42897,42938,43608,43609,47778
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
16,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,7.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
81,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,8.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
124,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,8.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
155,0.0,0.0,0.0,0.0,0.0,0.0,0.0,9.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
160,8.0,0.0,9.0,0.0,0.0,0.0,0.0,0.0,9.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
353329,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
353344,9.0,8.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
353362,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,9.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
353372,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,8.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [4]:
# Keep only the entries that carry a real score (a score of 0 means "not rated")
mal = mal.loc[mal['my_score'].ne(0)]

In [5]:
# Build a single-row frame holding the current user's scores
USER = -1 # id used to label the current user's row

user_df = (
    mal.set_index('series_animedb_id')[['my_score']]  # scores keyed by anime id
    .sort_index()                                      # ascending anime id order
    .T                                                 # one row, one column per anime (same layout as rating_matrix)
    .rename(index={'my_score': USER})                  # label the row with the user's id
)

user_df

series_animedb_id,431,849,918,1575,2167,2904,4181,4224,4382,5081,...,46095,47398,47778,48561,48569,48583,48736,49310,50265,50360
-1,3,7,7,8,7,8,9,6,7,6,...,7,4,8,8,9,8,6,9,8,8


In [6]:
# Stack the current user's row on top of every other user's ratings,
# then flip the result so that each row is an anime and each column a user.
final_matrix = (
    pd.concat([user_df, rating_matrix])
    .sort_index(axis=1)  # list anime ids in ascending order
    .fillna(0)           # anime missing from either source -> 0
    .T                   # rows: anime ids, columns: user ids
)

final_matrix

Unnamed: 0,-1,16,81,124,155,160,211,269,324,368,...,353185,353207,353213,353241,353258,353329,353344,353362,353372,353387
1,0.0,0.0,0.0,0.0,0.0,8.0,10.0,7.0,0.0,9.0,...,0.0,10.0,0.0,0.0,10.0,0.0,9.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,10.0,0.0,0.0,0.0,0.0,8.0,0.0,0.0,0.0
6,0.0,0.0,0.0,0.0,0.0,9.0,10.0,0.0,9.0,0.0,...,0.0,9.0,0.0,0.0,7.0,0.0,0.0,0.0,0.0,0.0
7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,8.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
15,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
48583,8.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
48736,6.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
49310,9.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50265,8.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [7]:
# Cosine similarity

from sklearn.metrics.pairwise import cosine_similarity
import operator
def recommend_anime(anime_id, matrix):
    """Rank every other anime in ``matrix`` by cosine similarity to ``anime_id``.

    Parameters
    ----------
    anime_id
        Index label of the target anime's row in ``matrix``.
    matrix : pandas.DataFrame
        Numeric frame whose index holds anime ids; each row is the vector of
        scores that anime received (one column per rater).

    Returns
    -------
    pandas.Series
        Cosine similarity of each remaining row to the target row, sorted in
        descending order. The index is translated from anime id to title via
        the module-level ``animes`` frame ('MAL_ID' -> 'Name').
        NOTE(review): ids without an entry in ``animes`` are expected to come
        back as NaN labels -- confirm against pandas ``Index.map``.

    Raises
    ------
    ValueError
        If ``anime_id`` is not present in ``matrix.index``.
    """
    is_target = matrix.index == anime_id

    # Fail early with a readable message; otherwise the empty selection would
    # only surface as a "0 sample(s)" ValueError from inside scikit-learn.
    if not is_target.any():
        raise ValueError(f"anime_id {anime_id!r} is not present in the matrix index")

    # row(s) of the anime we want recommendations for
    anime = matrix[is_target]

    # rows of every other anime
    other_anime = matrix[~is_target]

    # cosine similarity between the target row and each other anime's row;
    # [0] selects the similarities for the first (normally only) target row
    similarities = cosine_similarity(anime, other_anime)[0]

    # pair each similarity with the id of the anime it belongs to
    # (a plain array of ids keeps the resulting index unnamed)
    similarity_series = pd.Series(similarities, index=other_anime.index.to_numpy())

    # most similar first
    similarity_series = similarity_series.sort_values(ascending=False)

    # map anime id to name
    similarity_series.index = similarity_series.index.map(dict(zip(animes['MAL_ID'], animes['Name'])))

    return similarity_series

In [8]:
# Rank all anime against the one with id 23273 and show the ten closest matches
similarity_series = recommend_anime(anime_id=23273, matrix=final_matrix)
similarity_series.iloc[:10]

Kimi no Na wa.             0.648607
Koe no Katachi             0.616177
Shingeki no Kyojin         0.610146
Boku dake ga Inai Machi    0.606155
No Game No Life            0.595911
One Punch Man              0.585825
Tokyo Ghoul                0.582321
Sword Art Online           0.582139
Boku no Hero Academia      0.578416
Noragami                   0.574414
dtype: float64

In [9]:
# Disabled export step: uncomment to turn the ranked similarities into a
# two-column frame ('Name', 'Score') and write it to 'item-based-recs2.csv'.
# s = similarity_series.to_frame().reset_index()
# s = s.rename(columns={'index': 'Name', 0: 'Score'})

# s.to_csv('item-based-recs2.csv', index=False)