### $ Anime$ $ Recommendation $

In [1]:
import pandas as pd
import numpy as np

In [2]:
animeDF = pd.read_csv('anime.csv')

In [3]:
ratingDF = pd.read_csv('rating.csv')

### Basic info on Dataset

In [4]:
ratingDF.shape

(7813737, 3)

In [5]:
animeDF.shape

(12294, 7)

In [6]:
ratingDF.head()

Unnamed: 0,user_id,anime_id,rating
0,1,20,-1
1,1,24,-1
2,1,79,-1
3,1,226,-1
4,1,241,-1


In [7]:
animeDF.head()

Unnamed: 0,anime_id,name,genre,type,episodes,rating,members
0,32281,Kimi no Na wa.,"Drama, Romance, School, Supernatural",Movie,1,9.37,200630
1,5114,Fullmetal Alchemist: Brotherhood,"Action, Adventure, Drama, Fantasy, Magic, Mili...",TV,64,9.26,793665
2,28977,Gintama°,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.25,114262
3,9253,Steins;Gate,"Sci-Fi, Thriller",TV,24,9.17,673572
4,9969,Gintama&#039;,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.16,151266


In [8]:
ratingDF.head()

Unnamed: 0,user_id,anime_id,rating
0,1,20,-1
1,1,24,-1
2,1,79,-1
3,1,226,-1
4,1,241,-1


In [9]:
ratingDF['rating'].unique() # here we see -1 which means that the user has watched the anime but hasn't reviewed it

array([-1, 10,  8,  6,  9,  7,  3,  5,  4,  1,  2], dtype=int64)

In [10]:
ratingDF['user_id'].unique() 

array([    1,     2,     3, ..., 73514, 73515, 73516], dtype=int64)

### Finding how many times a users have rated an Anime

In [11]:
users_rated = ratingDF.groupby('user_id')['rating'].count().sort_values(ascending=False)[:20]
users_rated

user_id
48766    10227
42635     3747
53698     2905
57620     2702
59643     2633
51693     2622
27364     2499
45659     2469
7345      2429
66021     2362
12431     2351
65840     2218
53492     2203
28521     2155
40604     2130
22434     1993
58343     1968
65836     1858
30597     1846
51270     1835
Name: rating, dtype: int64

### No of ratings for each rating level

In [12]:
rating_level = ratingDF['rating'].value_counts().reset_index()
rating_level.columns = ['Rating','No of Ratings']
ratingDF

Unnamed: 0,user_id,anime_id,rating
0,1,20,-1
1,1,24,-1
2,1,79,-1
3,1,226,-1
4,1,241,-1
...,...,...,...
7813732,73515,16512,7
7813733,73515,17187,9
7813734,73515,22145,10
7813735,73516,790,9


### Null Value treatment

In [13]:
animeDF.isnull().sum()

anime_id      0
name          0
genre        62
type         25
episodes      0
rating      230
members       0
dtype: int64

In [14]:
filtered_animeDF = animeDF.fillna(0)

### Droping indices with ratings as -1

In [15]:
indice = ratingDF[ratingDF['rating']==-1].index

In [16]:
filtered_rating = ratingDF.drop(index=indice,axis=0)
filtered_rating.rating.unique()

array([10,  8,  6,  9,  7,  3,  5,  4,  1,  2], dtype=int64)

### the main objective here is to recommend users an anime that they would like

In [17]:
from sklearn.neighbors import NearestNeighbors

In [18]:
filtered_rating.index=range(0,len(filtered_rating))
filtered_rating.rating.value_counts()

8     1646019
7     1375287
9     1254096
10     955715
6      637775
5      282806
4      104291
3       41453
2       23150
1       16649
Name: rating, dtype: int64

### Filter out anime series with were low ratings

In [19]:
filter_rating = filtered_rating[filtered_rating['rating']>9]['anime_id']

### The dataset is still to large so to filter it in a better way is by using the universal ratings

In [20]:
filtered_anime_ratings = animeDF[animeDF['rating']>8]['anime_id']
filtered_rating_anime = filtered_rating[filtered_rating['anime_id'].isin(filter_rating)]
filtered_rating_anime = filtered_rating[filtered_rating['anime_id'].isin(filtered_anime_ratings)]
len(filtered_rating_anime)

2043222

In [21]:
filtered_rating_anime.shape

(2043222, 3)

In [22]:
filtered_rating_anime.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2043222 entries, 4 to 6337239
Data columns (total 3 columns):
 #   Column    Dtype
---  ------    -----
 0   user_id   int64
 1   anime_id  int64
 2   rating    int64
dtypes: int64(3)
memory usage: 62.4 MB


### Create a pivot table for rating Vs user_id

In [23]:
rating_anime_matrix = filtered_rating_anime.pivot_table(index='anime_id',columns='user_id',values='rating')
rating_anime_matrix = rating_anime_matrix.fillna(0)
rating_anime_matrix

user_id,2,3,5,7,8,10,11,12,14,16,...,73507,73508,73509,73510,73511,73512,73513,73514,73515,73516
anime_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,9.0,0.0,0.0,0.0,0.0,0.0,9.0,0.0,10.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,8.0,0.0,0.0,0.0,0.0,0.0,8.0,0.0,10.0,0.0
6,0.0,0.0,8.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,9.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,10.0,0.0
15,0.0,0.0,6.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
16,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
33032,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
33161,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
34036,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
34103,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Compressing the table into a compressed sparse matrix

In [24]:
from scipy.sparse import csr_matrix
csr_rating_matrix =  csr_matrix(rating_anime_matrix.values)
csr_rating_matrix

<512x66105 sparse matrix of type '<class 'numpy.float64'>'
	with 2043216 stored elements in Compressed Sparse Row format>

### Fitting the model with Nearest Neighbors

In [25]:
recom = NearestNeighbors(metric='cosine',n_neighbors=10)
recom.fit(csr_rating_matrix)

NearestNeighbors(metric='cosine', n_neighbors=10)

### Recommendation like Naruto

In [26]:
recommendation = animeDF[animeDF['name']=='Naruto']['anime_id']

In [27]:
recommendation = recommendation.reset_index()

In [28]:
reco = np.array(rating_anime_matrix.iloc[int(recommendation['anime_id'])]).reshape(1,-1)

### Getting the recommendation list

In [29]:
dist,ind = recom.kneighbors(reco)
ind

array([[ 20,  19,  92, 119,  87, 124,  25, 164, 137,  44]], dtype=int64)

In [30]:
var = []
for _ in ind[0]:
    var.extend(list((animeDF[animeDF['anime_id']== _]['name'])))

In [31]:
print(var[1:])

['Monster', 'After War Gundam X', 'Final Approach', 'Mobile Suit Gundam: Char&#039;s Counterattack', 'Fushigi Yuugi: Eikouden', 'Sunabouzu', 'Mononoke Hime', 'Hunter x Hunter OVA', 'Rurouni Kenshin: Meiji Kenkaku Romantan - Tsuioku-hen']
