## Simple but effective collaborative filtering recommendation system by anime-similarity 

In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the read-only Kaggle input directory and print the full path of every file found
for folder, _subdirs, files in os.walk('/kaggle/input'):
    for file_name in files:
        print(os.path.join(folder, file_name))

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/anime-recommendations-database/rating.csv
/kaggle/input/anime-recommendations-database/anime.csv


# Data import

In [2]:
# Directory of the attached Kaggle dataset
dirname = '/kaggle/input/anime-recommendations-database'

# Resolve both CSV files relative to the dataset directory
rating_path, anime_path = (
    os.path.join(dirname, csv_name) for csv_name in ('rating.csv', 'anime.csv')
)

# rating_df: one row per (user, anime, rating); item_df: the anime catalogue
rating_df, item_df = pd.read_csv(rating_path), pd.read_csv(anime_path)

In [3]:
# Shape first, then the first rows, each on its own line
print(rating_df.shape, rating_df.head(), sep='\n')

(7813737, 3)
   user_id  anime_id  rating
0        1        20      -1
1        1        24      -1
2        1        79      -1
3        1       226      -1
4        1       241      -1


In [4]:
# Shape first, then the first rows, each on its own line
print(item_df.shape, item_df.head(), sep='\n')

(12294, 7)
   anime_id                              name  \
0     32281                    Kimi no Na wa.   
1      5114  Fullmetal Alchemist: Brotherhood   
2     28977                          Gintama°   
3      9253                       Steins;Gate   
4      9969                     Gintama&#039;   

                                               genre   type episodes  rating  \
0               Drama, Romance, School, Supernatural  Movie        1    9.37   
1  Action, Adventure, Drama, Fantasy, Magic, Mili...     TV       64    9.26   
2  Action, Comedy, Historical, Parody, Samurai, S...     TV       51    9.25   
3                                   Sci-Fi, Thriller     TV       24    9.17   
4  Action, Comedy, Historical, Parody, Samurai, S...     TV       51    9.16   

   members  
0   200630  
1   793665  
2   114262  
3   673572  
4   151266  


In [5]:
# Number of missing values in each column of the ratings table
rating_df.isna().sum()

user_id     0
anime_id    0
rating      0
dtype: int64

There are no missing values in rating_df, which is good news.

# Data preparation

Let's rename the `anime_id` column to `item_id`. From now on, the "animes" will be called "items".

In [6]:
# One mapping applied to both frames so they share the same key column name
colname_mapping = {'anime_id': 'item_id'}
rating_df, item_df = (
    frame.rename(columns=colname_mapping) for frame in (rating_df, item_df)
)

In [7]:
# Check the renamed column on both frames
print(rating_df.head(), item_df.head(), sep='\n')

   user_id  item_id  rating
0        1       20      -1
1        1       24      -1
2        1       79      -1
3        1      226      -1
4        1      241      -1
   item_id                              name  \
0    32281                    Kimi no Na wa.   
1     5114  Fullmetal Alchemist: Brotherhood   
2    28977                          Gintama°   
3     9253                       Steins;Gate   
4     9969                     Gintama&#039;   

                                               genre   type episodes  rating  \
0               Drama, Romance, School, Supernatural  Movie        1    9.37   
1  Action, Adventure, Drama, Fantasy, Magic, Mili...     TV       64    9.26   
2  Action, Comedy, Historical, Parody, Samurai, S...     TV       51    9.25   
3                                   Sci-Fi, Thriller     TV       24    9.17   
4  Action, Comedy, Historical, Parody, Samurai, S...     TV       51    9.16   

   members  
0   200630  
1   793665  
2   114262  
3   673572

In [8]:
# The whole ratings table is used for training.
# NOTE: this binds a second name to the same DataFrame object; no copy is made.
train = rating_df

# Model definition and fitting

In [9]:
from implicit import nearest_neighbours as nn
import scipy.sparse as sparse

In [10]:
# Sparse matrix with item_id as row index, user_id as column index and the raw
# rating as the stored value.
# NOTE(review): the head of rating_df shows ratings of -1, which enter this matrix
# as negative weights. Presumably -1 marks "watched but not rated"; confirm against
# the dataset description and decide how such rows should be weighted.
sparse_item_user = sparse.csr_matrix(
    (train['rating'].astype(float),
     (train['item_id'], train['user_id'])))

# NOTE(review): the model is built with its default hyper-parameters. The similarity
# lists previewed further down hold 19 neighbours although 50 are requested, which
# suggests the default neighbourhood size caps the result; confirm in the implicit docs.
model = nn.CosineRecommender()

# Calculate the confidence by multiplying it by our alpha value.
# NOTE(review): cosine similarity is unchanged by a uniform scaling of the matrix,
# so this multiplication presumably does not affect the similarity scores; confirm.
alpha_val = 15
data_conf = (sparse_item_user * alpha_val).astype('double')
model.fit(data_conf)

HBox(children=(FloatProgress(value=0.0, max=34520.0), HTML(value='')))




In [11]:
# Distinct item ids found in the training ratings, as plain Python ints
items_id = train['item_id'].unique().tolist()
items_id[:5]

[20, 24, 79, 226, 241]

# Item-similarity dict creation

In [12]:
def create_item_id_to_similar(model, nb_reco, factor_similar, items_id):
    """Map every item id to the items the model considers most similar to it.

    For each id in ``items_id`` the model is asked for
    ``nb_reco * factor_similar`` similar items; the queried item itself is
    removed from the answer.

    Parameters
    ----------
    model : object
        Must expose ``similar_items(item_id, n)`` yielding
        ``(item_id, score)`` pairs.
    nb_reco : int
        Base number of neighbours wanted per item.
    factor_similar : int
        Multiplier applied to ``nb_reco`` to over-fetch neighbours.
    items_id : iterable
        Item ids to query.

    Returns
    -------
    dict
        ``{item_id: [(similar_item_id, score), ...]}``, each list kept in the
        order returned by the model.
    """
    nb_candidates = nb_reco * factor_similar
    item_id_to_neighbours = {}
    for item_id in items_id:
        neighbours = []
        for sim_item_id, sim_score in model.similar_items(item_id, nb_candidates):
            # The model may return the queried item itself; skip it
            if sim_item_id == item_id:
                continue
            neighbours.append((sim_item_id, sim_score))
        item_id_to_neighbours[item_id] = neighbours
    return item_id_to_neighbours

# Request nb_reco * factor_similar = 50 candidate neighbours for every known item.
# NOTE(review): the preview in the next cell lists only 19 neighbours for item 20,
# so the model appears to return fewer items than requested; confirm.
item_id_to_similar = create_item_id_to_similar(
    model=model,
    nb_reco=10,
    factor_similar=5,
    items_id=items_id)

In [13]:
# Preview the neighbour lists of the first two items
[(key, item_id_to_similar[key]) for key in list(item_id_to_similar)[:2]]

[(20,
  [(1535, 0.5463407077066675),
   (121, 0.47272149158167476),
   (269, 0.466974610783123),
   (5114, 0.4644807583399425),
   (1575, 0.46391151405222386),
   (11757, 0.46328649033916763),
   (16498, 0.45781777902012366),
   (813, 0.452006430364823),
   (442, 0.4444997099847257),
   (9919, 0.4439122675141474),
   (2904, 0.44365494125697846),
   (226, 0.4357369321904798),
   (2472, 0.4343306763730486),
   (3588, 0.432832107642108),
   (8074, 0.4324508320250583),
   (223, 0.41795363534614516),
   (936, 0.41118621378998754),
   (6702, 0.4084388192008567),
   (4437, 0.4065098938344902)]),
 (24,
  [(846, 0.8247128890929168),
   (517, 0.5924375328243505),
   (4015, 0.5562273803416666),
   (72, 0.38335837369811243),
   (849, 0.37472112119618595),
   (71, 0.37233760843976127),
   (189, 0.35264547278487807),
   (73, 0.3453509503193189),
   (245, 0.33648511961634203),
   (2104, 0.3334990346845112),
   (66, 0.32871673370836213),
   (4224, 0.32718830479054095),
   (1887, 0.3259372703781951),
 

# Computation and visualization functions definition

In [14]:
# visualize similar anime
# Number of items whose similar items are printed by the loop below
max_item = 3

def print_item_features(item_id):
    """Print the id, name and genre of one item looked up in ``item_df``.

    Parameters
    ----------
    item_id : int
        Value matched against the ``item_id`` column of the notebook-level
        ``item_df`` frame.

    Notes
    -----
    When no row of ``item_df`` matches ``item_id``, the name and genre are
    printed as ``<unknown>`` instead of raising ``IndexError``, so a rating
    that references an id absent from the catalogue does not abort a listing.
    """
    current_item = item_df[item_df['item_id'] == item_id]
    if current_item.empty:
        # Nothing to read from: `.values[0]` would raise IndexError here
        print(f"id: {item_id}\nname: <unknown>\ngenre: <unknown>")
        return
    print(f"id: {item_id}\nname: {current_item['name'].values[0]}\ngenre: {current_item['genre'].values[0]}")

# Show each of the first `max_item` items followed by its neighbours in rank order
separator = "-----------------------------------------------"
for item_id, similar_pairs in list(item_id_to_similar.items())[:max_item]:
    print(separator)
    print_item_features(item_id=item_id)
    print(separator)
    for counter, similar_pair in enumerate(similar_pairs):
        print(f"similarity rank {counter+1}")
        # Only the neighbour id (first element of the pair) is needed for display
        print_item_features(item_id=similar_pair[0])
    print(separator)

-----------------------------------------------
id: 20
name: Naruto
genre: Action, Comedy, Martial Arts, Shounen, Super Power
-----------------------------------------------
similarity rank 1
id: 1535
name: Death Note
genre: Mystery, Police, Psychological, Supernatural, Thriller
similarity rank 2
id: 121
name: Fullmetal Alchemist
genre: Action, Adventure, Comedy, Drama, Fantasy, Magic, Military, Shounen
similarity rank 3
id: 269
name: Bleach
genre: Action, Comedy, Shounen, Super Power, Supernatural
similarity rank 4
id: 5114
name: Fullmetal Alchemist: Brotherhood
genre: Action, Adventure, Drama, Fantasy, Magic, Military, Shounen
similarity rank 5
id: 1575
name: Code Geass: Hangyaku no Lelouch
genre: Action, Mecha, Military, School, Sci-Fi, Super Power
similarity rank 6
id: 11757
name: Sword Art Online
genre: Action, Adventure, Fantasy, Game, Romance
similarity rank 7
id: 16498
name: Shingeki no Kyojin
genre: Action, Drama, Fantasy, Shounen, Super Power
similarity rank 8
id: 813
name: D

In [15]:
def get_sorted_by_values(items_id,
                         item_id_to_similar,
                         nb_reco):
    """Sum similarity scores over several items and rank the candidates.

    Parameters
    ----------
    items_id : iterable
        Item ids whose neighbour lists are combined; each must be a key of
        ``item_id_to_similar``.
    item_id_to_similar : dict
        ``{item_id: [(similar_item_id, score), ...]}``.
    nb_reco : int
        Accepted for interface compatibility; not used by this function, so
        the complete ranking is always returned.

    Returns
    -------
    list of tuple
        ``(candidate_item_id, cumulative_score)`` pairs, highest score first.
    """
    totals = {}
    for source_id in items_id:
        for entry in item_id_to_similar[source_id]:
            candidate_id, similarity = entry[0], entry[1]
            totals[candidate_id] = totals.get(candidate_id, 0) + similarity

    # Stable descending sort: candidates with equal totals keep first-seen order
    ranking = list(totals.items())
    ranking.sort(key=lambda pair: pair[1], reverse=True)
    return ranking

In [16]:
# Combine the neighbour lists of the first 10 items as a quick check.
# NOTE: nb_reco is passed through, but get_sorted_by_values does not use it,
# so the full ranking is returned rather than the top 5.
sorted_by_value = get_sorted_by_values(items_id=items_id[:10],
                     item_id_to_similar=item_id_to_similar,
                     nb_reco=5
                    )

In [17]:
# (item_id, cumulative similarity) pairs, highest score first
sorted_by_value

[(2167, 2.3156978461146482),
 (849, 2.0325746135802),
 (1575, 1.8885540696973904),
 (2904, 1.8177595300839928),
 (71, 1.5576582947150237),
 (2993, 1.5442268392669776),
 (1535, 1.5390210307366161),
 (4224, 1.5233347128868662),
 (355, 1.4863803928657444),
 (1195, 1.4105008357372606),
 (121, 1.36489436779453),
 (1840, 1.3551769941647944),
 (6547, 1.31992002418816),
 (189, 1.2961200724776685),
 (2104, 1.2778705067819276),
 (2787, 1.274857830821524),
 (241, 1.2681767908338972),
 (936, 1.2480835800795482),
 (517, 1.2416125115452403),
 (4015, 1.2071200439596856),
 (63, 1.1594826395314328),
 (4063, 1.1269144964601503),
 (3503, 1.1159614524091648),
 (193, 1.1111223068468812),
 (2472, 1.1108649983300316),
 (4214, 1.0756935378681154),
 (4437, 1.0318120453990582),
 (3455, 0.9937313515791661),
 (8074, 0.9477430864983402),
 (11757, 0.9247509018331403),
 (4654, 0.9207929490478863),
 (16498, 0.9069332750558332),
 (5114, 0.9026534379363684),
 (1818, 0.893809551117522),
 (3712, 0.8824795308643508),
 (48

In [18]:
def user_to_visited_item_id_dict(train,
                                 user_list=None):
    """Build a mapping from each user id to the list of item ids they rated.

    Parameters
    ----------
    train : pandas.DataFrame
        Must contain the columns ``user_id`` and ``item_id``.
    user_list : list-like, optional
        User ids to keep in the result. ``None`` keeps every user found in
        ``train``. Ids that never occur in ``train`` are simply absent from
        the result.

    Returns
    -------
    dict
        ``{user_id: [item_id, ...]}`` with the item ids in the row order of
        ``train``.
    """
    visited = train.groupby('user_id')['item_id'].apply(lambda ids: ids.tolist())
    if user_list is not None:
        # Filter on the grouped index (one entry per user) rather than on the
        # full ratings table, so the large frame is never copied
        visited = visited[visited.index.isin(user_list)]
    return visited.to_dict()

In [19]:
# NOTE(review): this rebinds the name of the function defined in the previous cell
# to the dict it returns. After this cell runs the function is no longer callable,
# and running this cell a second time raises TypeError ('dict' object is not callable).
# Renaming either the function or the variable would make the cell re-runnable.
user_to_visited_item_id_dict = user_to_visited_item_id_dict(train=train,
                                                            user_list=train.user_id.unique().tolist())

In [20]:
def print_visited_item(user_id):
    """Print the features of every item visited by ``user_id``.

    The visited items are read from the notebook-level
    ``user_to_visited_item_id_dict`` mapping; each one is rendered with
    ``print_item_features``.

    Parameters
    ----------
    user_id : int
        Key looked up in ``user_to_visited_item_id_dict``.
    """
    visited_item_list = user_to_visited_item_id_dict[user_id]
    header = f"For user_id: {user_id}, visited animes (count:{len(visited_item_list)}) are:"
    print(header)
    for counter, visited_id in enumerate(visited_item_list):
        print(f"visited anime {counter+1}")
        print_item_features(item_id=visited_id)

In [21]:
# Show the history of the user found in the first row of the training data
print_visited_item(user_id=train['user_id'].iloc[0])

For user_id: 1, visited animes (count:153) are:
visited anime 1
id: 20
name: Naruto
genre: Action, Comedy, Martial Arts, Shounen, Super Power
visited anime 2
id: 24
name: School Rumble
genre: Comedy, Romance, School, Shounen
visited anime 3
id: 79
name: Shuffle!
genre: Comedy, Drama, Ecchi, Fantasy, Harem, Magic, Romance, School, Seinen
visited anime 4
id: 226
name: Elfen Lied
genre: Action, Drama, Horror, Psychological, Romance, Seinen, Supernatural
visited anime 5
id: 241
name: Girls Bravo: First Season
genre: Comedy, Ecchi, Fantasy, Harem, Romance, School
visited anime 6
id: 355
name: Shakugan no Shana
genre: Action, Drama, Fantasy, Romance, School, Supernatural
visited anime 7
id: 356
name: Fate/stay night
genre: Action, Fantasy, Magic, Romance, Supernatural
visited anime 8
id: 442
name: Naruto Movie 1: Dai Katsugeki!! Yuki Hime Shinobu Houjou Dattebayo!
genre: Adventure, Comedy, Drama, Historical, Shounen, Supernatural
visited anime 9
id: 487
name: Girls Bravo: Second Season
genre

In [22]:
def compute_recommendation(user_id, user_to_visited_item_id_dict, item_id_to_similar, nb_reco=5,
                           exclude_visited=True):
    """Return the ids of the best-scoring items to recommend to a user.

    The neighbour lists of every item the user visited are combined by
    ``get_sorted_by_values``; the highest cumulative scores come first.

    Parameters
    ----------
    user_id : hashable
        Key looked up in ``user_to_visited_item_id_dict``.
    user_to_visited_item_id_dict : dict
        ``{user_id: [item_id, ...]}``.
    item_id_to_similar : dict
        ``{item_id: [(similar_item_id, score), ...]}``.
    nb_reco : int, default 5
        Maximum number of item ids returned.
    exclude_visited : bool, default True
        Drop items the user has already visited before truncating, so the
        user is not recommended something they already know. Pass ``False``
        to get the unfiltered ranking.

    Returns
    -------
    list
        At most ``nb_reco`` item ids, best first.

    Raises
    ------
    KeyError
        If ``user_id`` is not a key of ``user_to_visited_item_id_dict``.
    """
    visited_item_list = user_to_visited_item_id_dict[user_id]
    sorted_by_values = get_sorted_by_values(
        items_id=visited_item_list,
        item_id_to_similar=item_id_to_similar,
        nb_reco=nb_reco
    )
    # Filter before truncating, otherwise visited items would use up result slots
    already_seen = set(visited_item_list) if exclude_visited else set()
    candidates = [item_id for item_id, _score in sorted_by_values
                  if item_id not in already_seen]
    return candidates[:nb_reco]

def print_user_recommendation(user_id, user_to_visited_item_id_dict, item_id_to_similar, nb_reco=5):
    """Print the features of the items recommended to ``user_id``.

    The recommendation ids come from ``compute_recommendation`` (at most
    ``nb_reco`` of them); each is rendered with ``print_item_features``
    under a "recommended item N" heading.

    Parameters
    ----------
    user_id : hashable
        User to recommend for.
    user_to_visited_item_id_dict : dict
        ``{user_id: [item_id, ...]}``.
    item_id_to_similar : dict
        ``{item_id: [(similar_item_id, score), ...]}``.
    nb_reco : int, default 5
        Number of recommendations requested.
    """
    recommended_ids = compute_recommendation(
        user_id=user_id,
        user_to_visited_item_id_dict=user_to_visited_item_id_dict,
        item_id_to_similar=item_id_to_similar,
        nb_reco=nb_reco,
    )

    for counter, recommended_id in enumerate(recommended_ids):
        print(f"recommended item {counter+1}")
        print_item_features(item_id=recommended_id)

In [23]:
# Top-5 recommendation ids for the user found in the first row of the training data
compute_recommendation(
    user_id=train['user_id'].iloc[0],
    user_to_visited_item_id_dict=user_to_visited_item_id_dict,
    item_id_to_similar=item_id_to_similar,
    nb_reco=5,
)

[9041, 10719, 8841, 11617, 19815]

In [24]:
# Same user and settings as the previous cell, rendered with names and genres
print_user_recommendation(
    user_id=train['user_id'].iloc[0],
    user_to_visited_item_id_dict=user_to_visited_item_id_dict,
    item_id_to_similar=item_id_to_similar,
    nb_reco=5,
)

recommended item 1
id: 9041
name: IS: Infinite Stratos
genre: Action, Comedy, Harem, Mecha, Sci-Fi
recommended item 2
id: 10719
name: Boku wa Tomodachi ga Sukunai
genre: Comedy, Ecchi, Harem, Romance, School, Seinen, Slice of Life
recommended item 3
id: 8841
name: Kore wa Zombie Desu ka?
genre: Action, Comedy, Ecchi, Harem, Magic, Supernatural
recommended item 4
id: 11617
name: High School DxD
genre: Comedy, Demons, Ecchi, Harem, Romance, School
recommended item 5
id: 19815
name: No Game No Life
genre: Adventure, Comedy, Ecchi, Fantasy, Game, Supernatural


In [25]:
def print_visited_anime_and_recommendation(
    user_id,
    user_to_visited_item_id_dict=user_to_visited_item_id_dict,
    item_id_to_similar=item_id_to_similar,
    nb_reco=5
):
    """Print a user's visited items, then the items recommended to them.

    Parameters
    ----------
    user_id : hashable
        User to display.
    user_to_visited_item_id_dict : dict, optional
        ``{user_id: [item_id, ...]}``; defaults to the notebook-level mapping
        as it exists when this function is defined. It is only forwarded to
        ``print_user_recommendation``: ``print_visited_item`` is called with
        ``user_id`` alone and reads the notebook-level mapping.
    item_id_to_similar : dict, optional
        ``{item_id: [(similar_item_id, score), ...]}``; defaults to the
        notebook-level mapping as it exists when this function is defined.
    nb_reco : int, default 5
        Number of recommendations requested.
    """
    # History first, so the recommendations can be read against it
    print_visited_item(user_id=user_id)

    recommendation_kwargs = dict(
        user_id=user_id,
        user_to_visited_item_id_dict=user_to_visited_item_id_dict,
        item_id_to_similar=item_id_to_similar,
        nb_reco=nb_reco,
    )
    print_user_recommendation(**recommendation_kwargs)


import random
def get_random_user(users, seed=None):
    """Pick one user id at random from ``users``.

    Parameters
    ----------
    users : sequence
        Non-empty sequence of user ids.
    seed : int, optional
        When given, the pick is drawn from a dedicated generator seeded with
        this value, so the same ``users`` and ``seed`` always give the same
        user and the notebook output can be reproduced. ``None`` (default)
        uses the shared module-level generator, as before.

    Returns
    -------
    One element of ``users``.

    Raises
    ------
    IndexError
        If ``users`` is empty.
    """
    if seed is None:
        return random.choice(users)
    # A private generator leaves the global random state untouched
    return random.Random(seed).choice(users)

# Recommendation visualization for a random user

In [26]:
# Pick one user at random among all users of the training data and show their
# history followed by their recommendations.
# NOTE: the pick is not seeded, so a different user is shown on every run.
print_visited_anime_and_recommendation(
    user_id=get_random_user(users=train.user_id.unique().tolist())
)

For user_id: 4968, visited animes (count:36) are:
visited anime 1
id: 27
name: Trinity Blood
genre: Action, Supernatural, Vampire
visited anime 2
id: 60
name: Chrno Crusade
genre: Action, Demons, Historical, Romance, Supernatural
visited anime 3
id: 64
name: Rozen Maiden
genre: Action, Comedy, Drama, Magic, Seinen
visited anime 4
id: 112
name: Chou Henshin Cosprayers
genre: Action, Adventure, Comedy, Ecchi, Fantasy, Magic, Sci-Fi, Super Power
visited anime 5
id: 125
name: Futakoi
genre: Comedy, Romance, School
visited anime 6
id: 133
name: Green Green
genre: Comedy, Ecchi, Romance, School, Slice of Life
visited anime 7
id: 143
name: Kannazuki no Miko
genre: Drama, Magic, Mecha, Romance, Shoujo Ai, Shounen, Supernatural
visited anime 8
id: 180
name: Vandread
genre: Action, Ecchi, Mecha, Sci-Fi, Shounen, Space
visited anime 9
id: 181
name: Vandread: The Second Stage
genre: Action, Ecchi, Mecha, Sci-Fi, Shounen, Space
visited anime 10
id: 196
name: Onegai☆Twins
genre: Comedy, Drama, Harem