In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.neighbors import NearestNeighbors
import warnings

np.random.seed(8)
warnings.filterwarnings(action="ignore")

In [2]:
#We are only interested in Animes (Type == "TV") and we will only use Genres for recommending similar animes
anime_df = pd.read_csv('../data/raw/anime.csv',
                       usecols=["MAL_ID", "Name", "Genres", "Type"], index_col= "MAL_ID")
anime_df = anime_df.loc[anime_df['Type'] == 'TV']
anime_df.head()


Unnamed: 0_level_0,Name,Genres,Type
MAL_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,Cowboy Bebop,"Action, Adventure, Comedy, Drama, Sci-Fi, Space",TV
6,Trigun,"Action, Sci-Fi, Adventure, Comedy, Drama, Shounen",TV
7,Witch Hunter Robin,"Action, Mystery, Police, Supernatural, Drama, ...",TV
8,Bouken Ou Beet,"Adventure, Fantasy, Shounen, Supernatural",TV
15,Eyeshield 21,"Action, Sports, Comedy, Shounen",TV


In [3]:
# Drop the animes with null values and remove duplicates if any
clean_anime_df = anime_df[anime_df.Genres.notna()]
clean_anime_df = clean_anime_df.drop_duplicates()

In [4]:
# First, split the genre column by comma and expand the list so there is
# a column for each genre. 
genres = clean_anime_df.Genres.str.split(", ", expand=True)

In [5]:
# Now we can get the list of unique genres. We "convert" the dataframe into
# a single dimension array and take the unique values
unique_genres = pd.Series(genres.values.ravel('K')).dropna().unique()
unique_genres


array(['Action', 'Adventure', 'Comedy', 'Slice of Life', 'Drama', 'Game',
       'Sci-Fi', 'Harem', 'Military', 'Space', 'Music', 'Mecha',
       'Supernatural', 'Historical', 'Mystery', 'School', 'Fantasy',
       'Ecchi', 'Kids', 'Sports', 'Horror', 'Dementia', 'Magic',
       'Psychological', 'Cars', 'Shounen', 'Demons', 'Romance', 'Parody',
       'Thriller', 'Seinen', 'Super Power', 'Unknown', 'Martial Arts',
       'Shounen Ai', 'Josei', 'Shoujo', 'Samurai', 'Police', 'Shoujo Ai',
       'Vampire'], dtype=object)

In [6]:
# Getting the dummy variables will result in having a lot more columns
# than unique genres
dummies = pd.get_dummies(genres)


In [7]:
# So we sum up the columns with the same genre to have a single column for
# each genre
for genre in unique_genres:
    clean_anime_df[genre] = dummies.loc[:,dummies.columns.str.endswith(genre)].sum(axis=1)

In [8]:
#Since we have separate column for each genre, we can remove the other unnecessary cols
genre_df = clean_anime_df.drop(
    columns=["Name", "Genres", "Type"])

In [9]:
from sklearn.neighbors import NearestNeighbors
from scipy.sparse import csr_matrix

anime_matrix = csr_matrix(genre_df.values)


model_knn = NearestNeighbors(metric='cosine', algorithm='brute')
model_knn.fit(anime_matrix)


NearestNeighbors(algorithm='brute', metric='cosine')

In [10]:
def get_index_from_title(title):
    return clean_anime_df[clean_anime_df.Name == title].index

In [11]:
#Enter the exact title of the anime here. 
query_title = "Tokyo Ghoul"
idx = get_index_from_title(query_title).values[0]

In [12]:
#Calculate the nearest animes and their respective distances
distances, indices = model_knn.kneighbors(
    genre_df[genre_df.index == idx].values.reshape(1, -1), n_neighbors=11)


In [13]:
for i in range(0, len(distances.flatten())):
    if i == 0:
        print('Recommendations for {0}:\n'.format(
            anime_df.Name[idx]))
    else:
        print('{0}: {1}, with distance of {2}:'.format(
            i, np.array(anime_df)[indices.flatten()[i]][0], distances.flatten()[i]))

Recommendations for Tokyo Ghoul:

1: Tokyo Ghoul, with distance of 3.3306690738754696e-16:
2: Tokyo Ghoul √A, with distance of 3.3306690738754696e-16:
3: Tokyo Ghoul:re, with distance of 3.3306690738754696e-16:
4: Elfen Lied, with distance of 0.14285714285714313:
5: Ajin 2nd Season, with distance of 0.15484574527148354:
6: Ajin, with distance of 0.15484574527148354:
7: Kokkoku, with distance of 0.15484574527148354:
8: Shinsekai yori, with distance of 0.2284832501895404:
9: Kiseijuu: Sei no Kakuritsu, with distance of 0.2284832501895404:
10: Boogiepop wa Warawanai, with distance of 0.2284832501895404:
