# Importing Libraries

In [1]:
import os
import numpy as np
import pandas as pd

import warnings

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

warnings.filterwarnings("always")
warnings.filterwarnings("ignore")

# Importing Data

In [2]:
anime_complete_df = pd.read_csv(r'C:\Users\tsubo\Data Analytics\Anime Recommendations\Data\anime2.csv')
anime_synop_df = pd.read_csv(r'C:\Users\tsubo\Data Analytics\Anime Recommendations\Data\anime_with_synopsis.csv')

In [3]:
anime_synop_df.head(2)

Unnamed: 0,MAL_ID,Name,Score,Genres,sypnopsis
0,1,Cowboy Bebop,8.78,"Action, Adventure, Comedy, Drama, Sci-Fi, Space","In the year 2071, humanity has colonized sever..."
1,5,Cowboy Bebop: Tengoku no Tobira,8.39,"Action, Drama, Mystery, Sci-Fi, Space","other day, another bounty—such is the life of ..."


In [4]:
anime_synop_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16214 entries, 0 to 16213
Data columns (total 5 columns):
MAL_ID       16214 non-null int64
Name         16214 non-null object
Score        16214 non-null object
Genres       16214 non-null object
sypnopsis    16206 non-null object
dtypes: int64(1), object(4)
memory usage: 633.4+ KB


In [5]:
anime_synop_df = anime_synop_df[['MAL_ID', 'Name', 'Genres', 'sypnopsis']]

In [6]:
anime_df = anime_synop_df.join(anime_complete_df, on='MAL_ID', rsuffix='r')

In [7]:
anime_df = anime_df[['Name', 'Genres', 'sypnopsis', 'Type']]

In [8]:
anime_df.columns = ['Name', 'Genres', 'Synopsis', 'Type']

In [9]:
anime_df

Unnamed: 0,Name,Genres,Synopsis,Type
0,Cowboy Bebop,"Action, Adventure, Comedy, Drama, Sci-Fi, Space","In the year 2071, humanity has colonized sever...",Movie
1,Cowboy Bebop: Tengoku no Tobira,"Action, Drama, Mystery, Sci-Fi, Space","other day, another bounty—such is the life of ...",TV
2,Trigun,"Action, Sci-Fi, Adventure, Comedy, Drama, Shounen","Vash the Stampede is the man with a $$60,000,0...",TV
3,Witch Hunter Robin,"Action, Mystery, Police, Supernatural, Drama, ...",ches are individuals with special powers like ...,TV
4,Bouken Ou Beet,"Adventure, Fantasy, Shounen, Supernatural",It is the dark century and the people are suff...,TV
5,Eyeshield 21,"Action, Sports, Comedy, Shounen",Sena is like any other shy kid starting high s...,TV
6,Hachimitsu to Clover,"Comedy, Drama, Josei, Romance, Slice of Life","Yuuta Takemoto, a sophomore at an arts college...",TV
7,Hungry Heart: Wild Striker,"Slice of Life, Comedy, Sports, Shounen",Kyosuke Kano has lived under the shadow of his...,TV
8,Initial D Fourth Stage,"Action, Cars, Sports, Drama, Seinen",Takumi Fujiwara finally joins Ryousuke and Kei...,TV
9,Monster,"Drama, Horror, Mystery, Police, Psychological,...","Dr. Kenzou Tenma, an elite neurosurgeon recent...",TV


# Anime Missing Values

In [10]:
anime_df.isnull().sum().sort_values(ascending=False)

Type        9436
Synopsis       8
Genres         0
Name           0
dtype: int64

In [11]:
print(anime_df['Synopsis'].mode()[0])
print(anime_df['Type'].mode()[0])

No synopsis information has been added to this title. Help improve our database by adding a synopsis here .
TV


Seems like the default message for no synopsis is 'No synopsis information has been added to this title.
Help improve our database by adding a synopsis here .'.
We will replace null values in Synopsis with this. For type, we fill in with TV.

In [13]:
anime_df['Synopsis'] = anime_df['Synopsis'].fillna(
    anime_df['Synopsis'].dropna().mode().values[0]
)
anime_df['Type'] = anime_df['Type'].fillna(
    anime_df['Type'].dropna().mode().values[0]
)

Verify that we removed all nan values

In [14]:
anime_df.isnull().sum()

Name        0
Genres      0
Synopsis    0
Type        0
dtype: int64

Only care about animes, so type being TV. Others can be dropped

In [15]:
anime_df = anime_df[anime_df['Type']=='TV']

In [16]:
anime_df.drop('Type', axis=1, inplace=True)

# Building the recommender
We will be using name and genre of each anime. We will use Tf-idf on the names of anime and find the count frequency on the genres of anime. Then, we will use cosine similarity to compute the similarities of the two frequency matrices. Finally, we use the average of the two similarity scores per anime to find recommendations.

In [17]:
indices = pd.Series(anime_df.index, index = anime_df['Name'])

In [18]:
tfidf = TfidfVectorizer(stop_words='english')
tfidf2 = TfidfVectorizer(stop_words='english')
count = CountVectorizer(stop_words='english')

In [19]:
tfidf_matrix = tfidf.fit_transform(anime_df['Name'])
count_matrix = count.fit_transform(anime_df['Genres'])
tfidf2_matrix = tfidf2.fit_transform(anime_df['Synopsis'])

In [20]:
name_similarity = cosine_similarity(tfidf_matrix)
genre_similarity = cosine_similarity(count_matrix)
synopsis_similarity = cosine_similarity(tfidf2_matrix)

# Recommendations

In [21]:
def get_recommendations(anime):
    i = indices[anime]
    
    name_score = list(enumerate(name_similarity[i]))
    genre_score = list(enumerate(genre_similarity[i]))
    synopsis_score = list(enumerate(synopsis_similarity[i]))
    
    name_score = sorted(name_score, key = lambda x: x[0])
    genre_score = sorted(genre_score, key = lambda x: x[0])
    synopsis_score = sorted(synopsis_score, key = lambda x: x[0])
    
    combined_score = [(i, (sc_1 + sc_2 + sc_3) / 3) for (i, sc_1), (_, sc_2), (_, sc_3) in zip(name_score, genre_score, synopsis_score)]
    
    combined_score = sorted(combined_score, key = lambda x: x[1], reverse = True)
    
    anime_ids = [i[0] for i in combined_score[1:11]]
    
    anime_recs = []
    
    index = 0
    while len(anime_recs) != 10:
        anime_id = combined_score[1:][index][0]
        index += 1
        if anime in indices.iloc[[anime_id]].index[0]:
            continue
        else:
            anime_recs.append(indices.iloc[[anime_id]].index[0])
    
    
    print(f'If you liked {anime}, you should try:')
    for i, v in list(enumerate(anime_recs)):
        print(f'{i + 1}. {v}')

In [32]:
get_recommendations('Samurai Champloo')

If you liked Samurai Champloo, you should try:
1. Hellsing Ultimate
2. Hellsing: The Dawn
3. Hellsing: Digest for Freaks
4. Dance in the Vampire Bund: Special Edition
5. Kizumonogatari II: Nekketsu-hen
6. Ajin Part 1: Shoudou
7. Ajin Part 2: Shoutotsu
8. Ajin Part 3: Shougeki
9. Ajin
10. Bloodivores


In [33]:
path = r'C:\Users\tsubo\Data Analytics\Anime Recommendations\Data'

In [34]:
anime_df.to_csv(os.path.join(path, 'anime_df.csv'))