In [17]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import re
import seaborn as sns
%matplotlib inline

In [18]:
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)

In [19]:
anime= pd.read_csv('anime.csv')

In [20]:
anime.head()

Unnamed: 0,anime_id,name,genre,type,episodes,rating,members
0,32281,Kimi no Na wa.,"Drama, Romance, School, Supernatural",Movie,1,9.37,200630
1,5114,Fullmetal Alchemist: Brotherhood,"Action, Adventure, Drama, Fantasy, Magic, Mili...",TV,64,9.26,793665
2,28977,Gintama°,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.25,114262
3,9253,Steins;Gate,"Sci-Fi, Thriller",TV,24,9.17,673572
4,9969,Gintama&#039;,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.16,151266


In [21]:
# Let's deal with missing values

In [22]:
anime.isnull().sum()

anime_id      0
name          0
genre        62
type         25
episodes      0
rating      230
members       0
dtype: int64

In [23]:
# but it's there even in the anime column

In [24]:
anime[anime['episodes']=='Unknown'].head(3)

Unnamed: 0,anime_id,name,genre,type,episodes,rating,members
74,21,One Piece,"Action, Adventure, Comedy, Drama, Fantasy, Sho...",TV,Unknown,8.58,504862
252,235,Detective Conan,"Adventure, Comedy, Mystery, Police, Shounen",TV,Unknown,8.25,114702
615,1735,Naruto: Shippuuden,"Action, Comedy, Martial Arts, Shounen, Super P...",TV,Unknown,7.94,533578


In [25]:
#Animes that are grouped under Hentai Categories generally have 1 episode. 
#So I’ve filled the unknown values with 1.
anime.loc[(anime["genre"]=="Hentai") & (anime["episodes"]=="Unknown"),"episodes"] = "1"

In [26]:
#Animes that are grouped under “Movies” are considered as ‘1’ episode as
#per the dataset overview goes.
anime.loc[(anime["type"] == "Movie") & (anime["episodes"] == "Unknown")] = "1"

In [27]:
# some known animes are being filled manually
known_animes = {"Naruto Shippuuden":500, "One Piece":784,"Detective Conan":854, "Dragon Ball Super":86,
                "Crayon Shin chan":942, "Yu Gi Oh Arc V":148,"Shingeki no Kyojin Season 2":25,
                "Boku no Hero Academia 2nd Season":25,"Little Witch Academia TV":25}


In [28]:
for k,v in known_animes.items():    
    anime.loc[anime["name"]==k,"episodes"] = v

In [29]:
#filled remaining episodes with 2 after mapping to function
anime["episodes"] = anime["episodes"].map(lambda x:np.nan if x=="Unknown" else x)
anime["episodes"].fillna(anime["episodes"].median(),inplace = True)

In [30]:
anime.head()

Unnamed: 0,anime_id,name,genre,type,episodes,rating,members
0,32281,Kimi no Na wa.,"Drama, Romance, School, Supernatural",Movie,1,9.37,200630
1,5114,Fullmetal Alchemist: Brotherhood,"Action, Adventure, Drama, Fantasy, Magic, Mili...",TV,64,9.26,793665
2,28977,Gintama°,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.25,114262
3,9253,Steins;Gate,"Sci-Fi, Thriller",TV,24,9.17,673572
4,9969,Gintama&#039;,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.16,151266


In [31]:
#next we will convert type to categorical variable
pd.get_dummies(anime[['type']]).head()

Unnamed: 0,type_1,type_Movie,type_Music,type_ONA,type_OVA,type_Special,type_TV
0,0,1,0,0,0,0,0
1,0,0,0,0,0,0,1
2,0,0,0,0,0,0,1
3,0,0,0,0,0,0,1
4,0,0,0,0,0,0,1


In [None]:
# we also need to transform genre
anime_features = pd.concat([anime["genre"].str.get_dummies(sep=","),
                            pd.get_dummies(anime[["type"]]),
                            anime[["rating"]],anime[["members"]],anime["episodes"]],axis=1)

In [32]:
anime.head()

Unnamed: 0,anime_id,name,genre,type,episodes,rating,members
0,32281,Kimi no Na wa.,"Drama, Romance, School, Supernatural",Movie,1,9.37,200630
1,5114,Fullmetal Alchemist: Brotherhood,"Action, Adventure, Drama, Fantasy, Magic, Mili...",TV,64,9.26,793665
2,28977,Gintama°,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.25,114262
3,9253,Steins;Gate,"Sci-Fi, Thriller",TV,24,9.17,673572
4,9969,Gintama&#039;,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.16,151266


In [33]:
# let's look at Ratings, genre, and members

In [34]:
anime.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12294 entries, 0 to 12293
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   anime_id  12294 non-null  object
 1   name      12294 non-null  object
 2   genre     12232 non-null  object
 3   type      12269 non-null  object
 4   episodes  12294 non-null  object
 5   rating    12067 non-null  object
 6   members   12294 non-null  object
dtypes: object(7)
memory usage: 672.5+ KB


In [35]:
anime.isnull().sum()

anime_id      0
name          0
genre        62
type         25
episodes      0
rating      227
members       0
dtype: int64

In [36]:
#changing type of rating col
anime["rating"] = anime["rating"].astype(float)

In [None]:
#filling na vals in rating with median
anime["rating"].fillna(anime["rating"].median(),inplace = True)

In [None]:
#changing type of members col
anime["members"] = anime["members"].astype(float)

In [38]:
#In KNN distance metric is used, so we need to scale the features

In [39]:
# before that we need to do get dummies
# we also need to transform genre
anime_features = pd.concat([anime["genre"].str.get_dummies(sep=","),
                            pd.get_dummies(anime[["type"]]),
                            anime[["rating"]],anime[["members"]],anime["episodes"]],axis=1)

In [42]:
anime['name']

0                                           Kimi no Na wa.
1                         Fullmetal Alchemist: Brotherhood
2                                                 Gintama°
3                                              Steins;Gate
4                                            Gintama&#039;
                               ...                        
12289         Toushindai My Lover: Minami tai Mecha-Minami
12290                                          Under World
12291                       Violence Gekiga David no Hoshi
12292    Violence Gekiga Shin David no Hoshi: Inma Dens...
12293                     Yasuji no Pornorama: Yacchimae!!
Name: name, Length: 12294, dtype: object

In [43]:
anime_features.head()

Unnamed: 0,Adventure,Cars,Comedy,Dementia,Demons,Drama,Ecchi,Fantasy,Game,Harem,...,type_1,type_Movie,type_Music,type_ONA,type_OVA,type_Special,type_TV,rating,members,episodes
0,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,9.37,200630.0,1
1,1,0,0,0,0,1,0,1,0,0,...,0,0,0,0,0,0,1,9.26,793665.0,64
2,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,9.25,114262.0,51
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,9.17,673572.0,24
4,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,9.16,151266.0,51


In [44]:
#scaling using min_max_scaler

In [45]:
from sklearn.preprocessing import MinMaxScaler

In [46]:
min_max_scaler = MinMaxScaler()
anime_features = min_max_scaler.fit_transform(anime_features)

In [47]:
np.round(anime_features,2)

array([[0.  , 0.  , 0.  , ..., 0.93, 0.2 , 0.  ],
       [1.  , 0.  , 0.  , ..., 0.92, 0.78, 0.03],
       [0.  , 0.  , 1.  , ..., 0.92, 0.11, 0.03],
       ...,
       [0.  , 0.  , 0.  , ..., 0.43, 0.  , 0.  ],
       [0.  , 0.  , 0.  , ..., 0.44, 0.  , 0.  ],
       [0.  , 0.  , 0.  , ..., 0.5 , 0.  , 0.  ]])

In [49]:
anime_features.shape

(12294, 93)

In [50]:
# Fit nearest neighbour algo to data

In [51]:
from sklearn.neighbors import NearestNeighbors

In [53]:
nbrs = NearestNeighbors(n_neighbors=6, algorithm='ball_tree').fit(anime_features)

In [60]:
# distances are distances of 6 nearest points from the current feature point. 
# I've taken six because the first distance is 0, i.e with itself

In [54]:
distances, indices = nbrs.kneighbors(anime_features)

In [56]:
distances.shape

(12294, 6)

In [57]:
indices.shape

(12294, 6)

In [58]:
distances

array([[0.00000000e+00, 1.01633857e+00, 1.03484164e+00, 1.03547556e+00,
        1.41673596e+00, 1.43504703e+00],
       [0.00000000e+00, 1.02326345e+00, 1.49460533e+00, 1.51670381e+00,
        1.56629658e+00, 1.58703108e+00],
       [0.00000000e+00, 3.78413369e-02, 4.18152040e-02, 2.35264187e-01,
        3.15115844e-01, 1.41517800e+00],
       ...,
       [0.00000000e+00, 1.66526972e-03, 1.68342172e-03, 5.66357548e-03,
        6.86823383e-03, 6.86911414e-03],
       [0.00000000e+00, 1.11268583e-03, 1.20660436e-03, 2.22255509e-03,
        2.24564527e-03, 3.33391240e-03],
       [0.00000000e+00, 1.00000248e+00, 1.00002222e+00, 1.00002225e+00,
        1.00003025e+00, 1.00005000e+00]])

In [59]:
indices

array([[    0,   208,  1494,  1959,    60,   894],
       [    1,   200,   268,   101,   795,   290],
       [    2,     4,     9,    12, 10896,     8],
       ...,
       [12291, 12238, 12237, 12236, 12256, 12235],
       [12292, 12231, 12232, 12230, 12229, 12283],
       [12293,  7426,  8279,  7349,  7335,  7498]], dtype=int64)

In [61]:
#helper function for printing similar names

In [65]:
anime[anime["name"]=='Gintama'].index.tolist()[0]

12

In [66]:
# get index from name
def get_index_from_name(name):
    return anime[anime["name"]==name].index.tolist()[0]
    

In [71]:
# can find similar animes on the basis of name(query) or id
def print_similar_animes(query=None,id=None):
    if id:
        # find the element in the indices then take indices of 5 nearest neighbours
        #pass those indices to name columns of anime
        for id in indices[id][1:]:
            print(anime.iloc[id]["name"])
    if query:
        found_id = get_index_from_name(query)
        for id in indices[found_id][1:]:
            print(anime.iloc[id]["name"])

In [72]:
print_similar_animes(query="Naruto")


Naruto: Shippuuden
Katekyo Hitman Reborn!
Bleach
Dragon Ball Z
Boku no Hero Academia


In [77]:
print_similar_animes('Steins;Gate')

Steins;Gate 0
Fireball Charming
Escha Chron
Hoshi no Ko Poron
Yuusei Kamen
