In [1]:
import pandas as pd

In [2]:
# Load the two input tables from the local ./Datasets directory:
#   df_rating - the user ratings file (user-ratings.csv)
#   df_anime  - the anime metadata file (anime.csv)
df_rating = pd.read_csv('./Datasets/user-ratings.csv')
# Unused alternative metadata source (semicolon-separated), kept for reference:
# df_anime = pd.read_csv('./Datasets/anime-info-main-clean.csv', sep=';')
df_anime = pd.read_csv('./Datasets/anime.csv')

In [3]:
# Preview the first rows of the anime metadata to check which columns were loaded.
df_anime.head()

Unnamed: 0,anime_id,name,genre,type,episodes,rating,members
0,32281,Kimi no Na wa.,"Drama, Romance, School, Supernatural",Movie,1,9.37,200630
1,5114,Fullmetal Alchemist: Brotherhood,"Action, Adventure, Drama, Fantasy, Magic, Mili...",TV,64,9.26,793665
2,28977,Gintama°,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.25,114262
3,9253,Steins;Gate,"Sci-Fi, Thriller",TV,24,9.17,673572
4,9969,Gintama&#039;,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.16,151266


In [4]:
# Preview the first rows of the ratings table; the rows shown here carry the -1 placeholder rating.
df_rating.head()

Unnamed: 0,user_id,anime_id,rating
0,1,20,-1
1,1,24,-1
2,1,79,-1
3,1,226,-1
4,1,241,-1


In [5]:
# Summary statistics of the ratings table; a minimum rating of -1 is a sentinel value, not a real score.
df_rating.describe()

Unnamed: 0,user_id,anime_id,rating
count,7813737.0,7813737.0,7813737.0
mean,36727.96,8909.072,6.14403
std,20997.95,8883.95,3.7278
min,1.0,1.0,-1.0
25%,18974.0,1240.0,6.0
50%,36791.0,6213.0,7.0
75%,54757.0,14093.0,9.0
max,73516.0,34519.0,10.0


In [6]:
# Frequency of each rating value, including how many rows carry the -1 sentinel.
df_rating.rating.value_counts()

 8     1646019
-1     1476496
 7     1375287
 9     1254096
 10     955715
 6      637775
 5      282806
 4      104291
 3       41453
 2       23150
 1       16649
Name: rating, dtype: int64

In [7]:
# A rating of -1 means the user watched the anime without scoring it;
# keep only the rows that carry an actual score.
df_rating = df_rating.loc[df_rating["rating"].ne(-1)]

In [8]:
# Attach the anime title to every rating. The join key and join type are
# spelled out instead of relying on "all shared columns", and the columns are
# renamed/selected by name rather than by position, so a change in column
# order or an extra column in either CSV cannot silently mislabel the data.
df = (
    df_rating
    .merge(df_anime[["anime_id", "name"]], on="anime_id", how="inner")
    .drop(columns="anime_id")
    .rename(columns={"name": "title"})
)
df = df[["user_id", "rating", "title"]]
df.head()

Unnamed: 0,user_id,rating,title
0,1,10,Highschool of the Dead
1,3,6,Highschool of the Dead
2,5,2,Highschool of the Dead
3,12,6,Highschool of the Dead
4,14,6,Highschool of the Dead


In [9]:
# Number of (non-null) ratings received by each title, most-rated first
count_rating = (
    df.groupby("title")["rating"]
    .agg("count")
    .sort_values(ascending=False)
)
count_rating

title
Death Note                         34226
Sword Art Online                   26310
Shingeki no Kyojin                 25290
Code Geass: Hangyaku no Lelouch    24126
Angel Beats!                       23565
                                   ...  
La Primavera                           1
Chou Zenmairobo: Patrasche             1
Ushi Atama                             1
Gun-dou Musashi Recap                  1
Futago no Ookami Daibouken             1
Name: rating, Length: 9926, dtype: int64

In [10]:
# Discard titles that have too few ratings to be useful.
# r is the minimum number of ratings a title must have to be kept.
r = 5000
# Vectorized comparison instead of a per-element lambda.
more_than_r_ratings = count_rating[count_rating >= r].index

# Keep only the ratings of titles with at least r ratings. Series.isin performs
# the membership test in one vectorized pass instead of calling a Python lambda
# once per row of the ratings table.
df_r = df[df['title'].isin(more_than_r_ratings)].reset_index(drop=True)

## Collaborative Recommendation System

In [11]:
# User x title rating matrix: one row per user_id, one column per anime title,
# each cell holding that user's rating (NaN where the user did not rate the title).
# Repeated (user_id, title) pairs, if any, are averaged.
df_recom = pd.pivot_table(
    df_r,
    values='rating',
    index='user_id',
    columns='title',
    aggfunc='mean',
)

def find_corr(anime, ratings=None):
    '''
    Get the correlation of one anime with the others.

    The correlation is computed column-wise between the ratings of `anime`
    and the ratings of every column of the ratings matrix (including
    `anime` itself).

    Parameters
    ----------
    anime : str
        Name of the anime; must be a column of the ratings matrix.
    ratings : DataFrame, optional
        Ratings matrix with one row per user and one column per title.
        Defaults to the notebook-level ``df_recom`` when omitted, so
        existing calls ``find_corr(name)`` keep working.

    Returns
    -------
    similar_to_anime : DataFrame
        Single-column DataFrame ('Correlation') indexed by title and sorted
        from the most to the least correlated title. Titles whose
        correlation is NaN are kept and placed last.

    Raises
    ------
    KeyError
        If `anime` is not a column of the ratings matrix.
    '''
    # Fall back to the global matrix only when the caller did not supply one.
    if ratings is None:
        ratings = df_recom

    if anime not in ratings.columns:
        raise KeyError(f"Anime {anime!r} is not a column of the ratings matrix")

    # to_frame names the column explicitly; building a DataFrame from the Series
    # with `columns=[...]` only works as long as the Series happens to be unnamed.
    similar_to_anime = (
        ratings.corrwith(ratings[anime])
        .to_frame(name='Correlation')
        .sort_values(by='Correlation', ascending=False)
    )
    return similar_to_anime

In [13]:
# Example query: the ten titles whose ratings correlate most strongly with 'Naruto'
# (the top row is 'Naruto' itself, with correlation 1.0).
find_corr('Naruto').head(10)

Unnamed: 0_level_0,Correlation
title,Unnamed: 1_level_1
Naruto,1.0
Naruto Movie 1: Dai Katsugeki!! Yuki Hime Shinobu Houjou Dattebayo!,0.582344
Naruto: Shippuuden Movie 1,0.578797
Bleach,0.547983
Fairy Tail,0.444271
Bleach Movie 2: The DiamondDust Rebellion - Mou Hitotsu no Hyourinmaru,0.439225
Dragon Ball Z,0.437442
InuYasha,0.4331
Dragon Ball GT,0.429704
Katekyo Hitman Reborn!,0.419789
