In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import re
import seaborn as sns
%matplotlib inline

In [2]:
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)

In [3]:
anime = pd.read_csv("./anime-recommendations-database/anime.csv")

In [4]:
anime.head()

Unnamed: 0,anime_id,name,genre,type,episodes,rating,members
0,32281,Kimi no Na wa.,"Drama, Romance, School, Supernatural",Movie,1,9.37,200630
1,5114,Fullmetal Alchemist: Brotherhood,"Action, Adventure, Drama, Fantasy, Magic, Mili...",TV,64,9.26,793665
2,28977,Gintama°,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.25,114262
3,9253,Steins;Gate,"Sci-Fi, Thriller",TV,24,9.17,673572
4,9969,Gintama&#039;,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.16,151266


In [7]:
anime.describe()

Unnamed: 0,anime_id,rating,members
count,12294.0,12064.0,12294.0
mean,14058.221653,6.473902,18071.34
std,11455.294701,1.026746,54820.68
min,1.0,1.67,5.0
25%,3484.25,5.88,225.0
50%,10260.5,6.57,1550.0
75%,24794.5,7.18,9437.0
max,34527.0,10.0,1013917.0


In [8]:
# Sample the ones where episode number is unknown

anime[anime['episodes'] == 'Unknown'].head()

Unnamed: 0,anime_id,name,genre,type,episodes,rating,members
74,21,One Piece,"Action, Adventure, Comedy, Drama, Fantasy, Sho...",TV,Unknown,8.58,504862
252,235,Detective Conan,"Adventure, Comedy, Mystery, Police, Shounen",TV,Unknown,8.25,114702
615,1735,Naruto: Shippuuden,"Action, Comedy, Martial Arts, Shounen, Super P...",TV,Unknown,7.94,533578
991,966,Crayon Shin-chan,"Comedy, Ecchi, Kids, School, Shounen, Slice of...",TV,Unknown,7.73,26267
1021,33157,Tanaka-kun wa Itsumo Kedaruge Specials,"Comedy, School, Slice of Life",Special,Unknown,7.72,5400


In [11]:
# This subsets the columns using the .loc function (?). Row subsetting is similar to R. 
# The interesting thing is that the () around the matching conditions are needed, 
# otherwise the & operator fails. 

anime.loc[(anime['genre'] == 'Hentai') & (anime['episodes'] == 'Unknown'), 'episodes'] = 1
anime.loc[(anime['genre'] == 'OVA') & (anime['episodes'] == 'Unknown'), 'episodes'] = 1
anime.loc[(anime['genre'] == 'Movie') & (anime['episodes'] == 'Unknown'), 'episodes'] = 1

In [13]:
anime[anime['episodes'] == 'Unknown'].shape

(304, 7)

In [17]:
anime['rating'].describe()

count    12064.000000
mean         6.473902
std          1.026746
min          1.670000
25%          5.880000
50%          6.570000
75%          7.180000
max         10.000000
Name: rating, dtype: float64

In [19]:
# This is how to find the nulls in the dataset. Why sum()? 

anime.isnull().sum()

anime_id      0
name          0
genre        62
type         25
episodes      0
rating      230
members       0
dtype: int64

In [20]:
# This returns the entire dataframe evaluated to boolean. So summing it up returns counts of nulls. 

anime.isnull()

Unnamed: 0,anime_id,name,genre,type,episodes,rating,members
0,False,False,False,False,False,False,False
1,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False
5,False,False,False,False,False,False,False
6,False,False,False,False,False,False,False
7,False,False,False,False,False,False,False
8,False,False,False,False,False,False,False
9,False,False,False,False,False,False,False


In [21]:
known_animes = {"Naruto Shippuuden":500, "One Piece":784,"Detective Conan":854, "Dragon Ball Super":86,
                "Crayon Shin chan":942, "Yu Gi Oh Arc V":148,"Shingeki no Kyojin Season 2":25,
                "Boku no Hero Academia 2nd Season":25,"Little Witch Academia TV":25}

In [22]:
# Scroll through above dict and substitute known episode counts. 

for k,v in known_animes.items():    
    anime.loc[anime["name"]==k,"episodes"] = v

In [23]:
# Write a lambda function that replaces all Unknowns with np.nan. 

anime["episodes"] = anime["episodes"].map(lambda x:np.nan if x=="Unknown" else x)

In [24]:
anime["episodes"].fillna(anime["episodes"].median(),inplace = True)

In [32]:
# Default argument to get_dummies is a column, which anime[['type']] generates. 

pd.get_dummies(anime[["type"]]).head()

Unnamed: 0,type_Movie,type_Music,type_ONA,type_OVA,type_Special,type_TV
0,1,0,0,0,0,0
1,0,0,0,0,0,1
2,0,0,0,0,0,1
3,0,0,0,0,0,1
4,0,0,0,0,0,1


In [33]:
# Convert the ratings to a float
anime["rating"] = anime["rating"].astype(float)

# Replace all 'na's with the median rating 
anime["rating"].fillna(anime["rating"].median(),inplace = True)

# Convert the members to a float
anime["members"] = anime["members"].astype(float)

In [34]:
# Scaling
# Pandas returns dataframes, so .get_dummies can operate directly on that. 
# Some lines have multiple types, so comma separator breaks them out into 
# each component genre. 

anime_features = pd.concat([anime["genre"].str.get_dummies(sep=","),
                            pd.get_dummies(anime[["type"]]),
                            anime[["rating"]],anime[["members"]],anime["episodes"]],axis=1)

In [36]:
anime["name"] = anime["name"].map(lambda name:re.sub('[^A-Za-z0-9]+', " ", name))
anime_features.head()

Unnamed: 0,Adventure,Cars,Comedy,Dementia,Demons,Drama,Ecchi,Fantasy,Game,Harem,...,Yaoi,type_Movie,type_Music,type_ONA,type_OVA,type_Special,type_TV,rating,members,episodes
0,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,9.37,200630.0,1
1,1,0,0,0,0,1,0,1,0,0,...,0,0,0,0,0,0,1,9.26,793665.0,64
2,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,9.25,114262.0,51
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,9.17,673572.0,24
4,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,9.16,151266.0,51


In [37]:
anime_features.columns

Index([u' Adventure', u' Cars', u' Comedy', u' Dementia', u' Demons',
       u' Drama', u' Ecchi', u' Fantasy', u' Game', u' Harem', u' Hentai',
       u' Historical', u' Horror', u' Josei', u' Kids', u' Magic',
       u' Martial Arts', u' Mecha', u' Military', u' Music', u' Mystery',
       u' Parody', u' Police', u' Psychological', u' Romance', u' Samurai',
       u' School', u' Sci-Fi', u' Seinen', u' Shoujo', u' Shoujo Ai',
       u' Shounen', u' Shounen Ai', u' Slice of Life', u' Space', u' Sports',
       u' Super Power', u' Supernatural', u' Thriller', u' Vampire', u' Yaoi',
       u' Yuri', u'Action', u'Adventure', u'Cars', u'Comedy', u'Dementia',
       u'Demons', u'Drama', u'Ecchi', u'Fantasy', u'Game', u'Harem', u'Hentai',
       u'Historical', u'Horror', u'Josei', u'Kids', u'Magic', u'Martial Arts',
       u'Mecha', u'Military', u'Music', u'Mystery', u'Parody', u'Police',
       u'Psychological', u'Romance', u'Samurai', u'School', u'Sci-Fi',
       u'Seinen', u'Shoujo', u'S

In [38]:
from sklearn.preprocessing import MinMaxScaler

In [39]:
# Read up on MinMaxScaler() and .fit_transform method 

min_max_scaler = MinMaxScaler()
anime_features = min_max_scaler.fit_transform(anime_features)

In [40]:
np.round(anime_features,2)

array([[0.  , 0.  , 0.  , ..., 0.92, 0.2 , 0.  ],
       [1.  , 0.  , 0.  , ..., 0.91, 0.78, 0.03],
       [0.  , 0.  , 1.  , ..., 0.91, 0.11, 0.03],
       ...,
       [0.  , 0.  , 0.  , ..., 0.39, 0.  , 0.  ],
       [0.  , 0.  , 0.  , ..., 0.4 , 0.  , 0.  ],
       [0.  , 0.  , 0.  , ..., 0.45, 0.  , 0.  ]])

In [41]:
from sklearn.neighbors import NearestNeighbors

In [42]:
nbrs = NearestNeighbors(n_neighbors=6, algorithm='ball_tree').fit(anime_features)

In [43]:
# What does this do? 

# Documentation here: 
# http://scikit-learn.org/stable/modules/generated/sklearn.neighbors.NearestNeighbors.html#sklearn.neighbors.NearestNeighbors.fit



distances, indices = nbrs.kneighbors(anime_features)

In [44]:
def get_index_from_name(name):
    return anime[anime["name"]==name].index.tolist()[0]

In [45]:
all_anime_names = list(anime.name.values)

In [46]:
def get_id_from_partial_name(partial):
    for name in all_anime_names:
        if partial in name:
            print(name,all_anime_names.index(name))

In [47]:
""" print_similar_query can search for similar animes both by id and by name. """

def print_similar_animes(query=None,id=None):
    if id:
        for id in indices[id][1:]:
            print(anime.ix[id]["name"])
    if query:
        found_id = get_index_from_name(query)
        for id in indices[found_id][1:]:
            print(anime.ix[id]["name"])

In [48]:
print_similar_animes("Naruto")

Naruto Shippuuden
Katekyo Hitman Reborn 
Bleach
Dragon Ball Z
Boku no Hero Academia


In [52]:
distances[1:6]

array([[0.        , 1.02413628, 1.49532476, 1.51709788, 1.56764563,
        1.58773614],
       [0.        , 0.03806193, 0.04229669, 0.23545853, 0.33783993,
        1.41519443],
       [0.        , 1.20937475, 1.22706071, 1.24010438, 1.24167403,
        1.24445771],
       [0.        , 0.03806193, 0.0725346 , 0.20088422, 0.34043324,
        1.41662942],
       [0.        , 0.08739552, 0.08921095, 0.19695951, 1.0012002 ,
        1.0019679 ]])