# Machine Learning: Anime Recommendation

#### Data Collection

In [1]:
import kaggle

In [2]:
!kaggle datasets download -d CooperUnion/anime-recommendations-database --unzip

Downloading anime-recommendations-database.zip to C:\Users\nclee\Documents\Education\Ironhack\Project-Anime




  0%|          | 0.00/25.0M [00:00<?, ?B/s]
  4%|3         | 1.00M/25.0M [00:00<00:02, 8.98MB/s]
  8%|7         | 2.00M/25.0M [00:00<00:02, 9.33MB/s]
 12%|#1        | 3.00M/25.0M [00:00<00:02, 9.64MB/s]
 20%|#9        | 5.00M/25.0M [00:00<00:02, 10.3MB/s]
 24%|##3       | 6.00M/25.0M [00:00<00:02, 9.88MB/s]
 28%|##7       | 7.00M/25.0M [00:00<00:02, 9.38MB/s]
 36%|###5      | 9.00M/25.0M [00:01<00:02, 8.02MB/s]
 44%|####3     | 11.0M/25.0M [00:01<00:01, 8.56MB/s]
 52%|#####1    | 13.0M/25.0M [00:01<00:01, 9.19MB/s]
 56%|#####5    | 14.0M/25.0M [00:01<00:01, 9.45MB/s]
 60%|#####9    | 15.0M/25.0M [00:01<00:01, 9.61MB/s]
 68%|######7   | 17.0M/25.0M [00:01<00:00, 10.2MB/s]
 72%|#######1  | 18.0M/25.0M [00:01<00:00, 10.2MB/s]
 76%|#######5  | 19.0M/25.0M [00:02<00:00, 9.95MB/s]
 80%|#######9  | 20.0M/25.0M [00:02<00:00, 9.64MB/s]
 88%|########7 | 22.0M/25.0M [00:02<00:00, 10.5MB/s]
 96%|#########5| 24.0M/25.0M [00:02<00:00, 10.9MB/s]
100%|##########| 25.0M/25.0M [00:02<00:00, 10.0MB/s]


In [85]:
!kaggle datasets download -d azathoth42/myanimelist -f AnimeList.csv --unzip

Downloading 28524%2F45582%2Fcompressed%2FAnimeList.csv.zip to C:\Users\nclee\Documents\Education\Ironhack\Project-Anime




  0%|          | 0.00/3.02M [00:00<?, ?B/s]
 33%|###3      | 1.00M/3.02M [00:00<00:00, 3.55MB/s]
 66%|######6   | 2.00M/3.02M [00:00<00:00, 3.37MB/s]
 99%|#########9| 3.00M/3.02M [00:00<00:00, 3.39MB/s]
100%|##########| 3.02M/3.02M [00:00<00:00, 3.33MB/s]


In [86]:
import zipfile

In [88]:
with zipfile.ZipFile("28524%2F45582%2Fcompressed%2FAnimeList.csv.zip", 'r') as zip_ref:
    zip_ref.extractall("")

##### Libraries

In [29]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline
sns.set()
plt.style.use('seaborn-whitegrid')
sns.set_style("white")

import scipy.stats as st
from sklearn.preprocessing import StandardScaler

## Data PreparationStandardScaler

In [30]:
anime=pd.read_csv("anime.csv", index_col="anime_id")

In [31]:
anime.isna().sum()

name          0
genre        62
type         25
episodes      0
rating      230
members       0
dtype: int64

In [32]:
# dropping those with NA in "genre", "type", "rating"
anime[anime.loc[:, ["genre", "type", "rating"]].isna().all(axis=1)]
anime.drop(anime[anime.loc[:, ["genre", "type", "rating"]].isna().all(axis=1)].index, axis=0, inplace=True)

In [33]:
# updated list to fill in the gaps in data
animelist=pd.read_csv("AnimeList.csv")

In [34]:
# trying to fill in the missing type information from new animelist, there are still "Unknown"
anime.type.fillna(animelist.set_index("anime_id").type, inplace=True)

In [35]:
# fill in the rest by research
anime_type={34437: "Movie", 32455: "TV", 28613: "TV", 30448: "TV", 24023: "TV", 34348: "TV"}
anime.loc[anime.type=="Unknown", "type"]=anime[anime.type=="Unknown"].index.map(anime_type)

In [36]:
# creating a dummy column for airing using episode as an indicator
anime["airing"]=np.where(anime.episodes=="Unknown", 1, 0)

In [37]:
anime["episodes"]=np.where(anime.episodes=="Unknown", 0, anime.episodes)

In [38]:
# creating a dummy column for whether it was NA for rating
anime["rate_na"]=np.where(anime.rating.isna(), 1, 0)

In [39]:
# fill all the NA in rating as 0
anime.rating.fillna(0, inplace=True)

In [40]:
anime.describe(include="all")

Unnamed: 0,name,genre,type,episodes,rating,members,airing,rate_na
count,12291,12232,12291,12291.0,12291.0,12291.0,12291.0,12291.0
unique,12289,3264,6,187.0,,,,
top,Saru Kani Gassen,Hentai,TV,1.0,,,,
freq,2,823,3805,5677.0,,,,
mean,,,,,6.354337,18075.6,0.027418,0.018469
std,,,,,1.33961,54826.69,0.163306,0.134645
min,,,,,0.0,5.0,0.0,0.0
25%,,,,,5.82,225.5,0.0,0.0
50%,,,,,6.55,1551.0,0.0,0.0
75%,,,,,7.17,9443.0,0.0,0.0


In [41]:
# should we group them into 5 instead of 6?
anime.type.value_counts(normalize=True)

TV         0.309576
OVA        0.269384
Movie      0.191360
Special    0.136360
ONA        0.053616
Music      0.039704
Name: type, dtype: float64

In [42]:
anime["type"]=np.where(anime.type.isin(["ONA", "Music"]), "Other", anime.type)

In [43]:
# standardize the data for members
scaler=StandardScaler()
anime[["episode", "rating", "members"]]=scaler.fit_transform(anime[["episodes", "rating", "members"]])


In [44]:
# dummies
dummy_col=["type"]
df=pd.get_dummies(data=anime, columns=dummy_col,drop_first=True)

In [18]:
anime

Unnamed: 0_level_0,name,genre,type,episodes,rating,members,airing,rate_na
anime_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
32281,Kimi no Na wa.,"Drama, Romance, School, Supernatural",Movie,1,0.937,0.197872,0,0.000550
5114,Fullmetal Alchemist: Brotherhood,"Action, Adventure, Drama, Fantasy, Magic, Mili...",TV,64,0.926,0.782770,0,0.035204
28977,Gintama°,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,0.925,0.112689,0,0.028053
9253,Steins;Gate,"Sci-Fi, Thriller",TV,24,0.917,0.664325,0,0.013201
9969,Gintama&#039;,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,0.916,0.149186,0,0.028053
...,...,...,...,...,...,...,...,...
9316,Toushindai My Lover: Minami tai Mecha-Minami,Hentai,OVA,1,0.415,0.000203,0,0.000550
5543,Under World,Hentai,OVA,1,0.428,0.000176,0,0.000550
5621,Violence Gekiga David no Hoshi,Hentai,OVA,4,0.488,0.000211,0,0.002200
6133,Violence Gekiga Shin David no Hoshi: Inma Dens...,Hentai,OVA,1,0.498,0.000168,0,0.000550


In [45]:
# Missing values in genre
anime = anime.reset_index()
missingvalues_genre = anime[anime.genre.isnull()]
missingvalues_genre_titles = missingvalues_genre['anime_id'].tolist()
len(missingvalues_genre_titles)

59

In [46]:
# Cross-referencing our missing values with the newest version of the dataset
animelist[animelist['anime_id'].isin(missingvalues_genre_titles)]

# Making `anime_id` the index so we can easly merge both dataframes
anime = anime.set_index('anime_id')
animelist = animelist.set_index('anime_id')

# Filling the missing values that are available in the newest version of the dataset
anime['genre'] = anime['genre'].fillna(animelist['genre'])
anime['genre'].isnull().sum()

40

In [47]:
# Research and replace the remaining missing values 
anime.at[29765, 'genre'] = 'Short' #Metropolis (2009)
anime.at[32695, 'genre'] = 'Fantasy' #Match Shoujo
anime.at[33187, 'genre'] = 'Short' #Katsudou Shashin
anime.at[30862, 'genre'] = 'Short' #Yubi wo Nusunda Onna
anime.at[28987, 'genre'] = 'Short' #Kamakura
anime.at[29629, 'genre'] = 'Short' #Coffee Break
anime.at[28653, 'genre'] = 'Short' #Maze
anime.at[31834, 'genre'] = 'Short' #Mormorando
anime.at[31760, 'genre'] = 'Short' #Tsuru Shitae Waka Kan
anime.at[31831, 'genre'] = 'Short' #Fantasy
anime.at[31833, 'genre'] = 'Short' #Metamorphose
anime.at[30399, 'genre'] = 'Short' #Arigatou Gomennasai
anime.at[28655, 'genre'] = 'Short' #PiKA PiKA
anime.at[31832, 'genre'] = 'Short' #Zawazawa
anime.at[28647, 'genre'] = 'Short' #Kappo
anime.at[29764, 'genre'] = 'Short' #Blend
anime.at[29921, 'genre'] = 'Short' #Bunbuku Chagama (1958)
anime.at[29655, 'genre'] = 'Short' #Chanda Gou
anime.at[29923, 'genre'] = 'Short' #Fukusuke
anime.at[30861, 'genre'] = 'Short' #Happy Bogeys
anime.at[32636, 'genre'] = 'Short' #Hokori Inu no Hanashi
anime.at[31511, 'genre'] = 'Short' #Holiday
anime.at[31509, 'genre'] = 'Short' #Ichi-gan Kuni
anime.at[29920, 'genre'] = 'Short' #Kobutori (1957)
anime.at[30055, 'genre'] = 'Adventure' #Mabeob Chunjamun: Daemawangui Buhwaleul Magala
anime.at[29767, 'genre'] = 'Short' #Minamo
anime.at[29922, 'genre'] = 'Short' #Ou-sama Ninatta Kitsune
anime.at[33318, 'genre'] = 'Short' #Scripta Volant
anime.at[31508, 'genre'] = 'Short' #Shinya Doubutsuen
anime.at[33320, 'genre'] = 'Short' #Suijun Genten
anime.at[30408, 'genre'] = 'Short' #Tokyo SOS
anime.at[33319, 'genre'] = 'Short' #Wareware no Heya
anime.at[32644, 'genre'] = 'Short' #Yaseruyagi
anime.at[33388, 'genre'] = 'Adventure' #Charanpo Shima no Monogatari
anime.at[33389, 'genre'] = 'Fantasy' #Genba no Joukitsune
anime.at[34310, 'genre'] = 'Kids' #Tamagotchi Movie: Tanpen Himitsu no Otodoke Daisakusen!
anime.at[33390, 'genre'] = 'Action' #Zunda Horizon

In [48]:
# Reset index
anime = anime.reset_index()
# Check again for missing values to be sure
anime[anime['genre'].isnull()]

Unnamed: 0,anime_id,name,genre,type,episodes,rating,members,airing,rate_na,episode
8955,28487,Ikite Iru,,OVA,1,-0.451147,-0.328934,0,0,-0.238713
9137,30435,Kankou Taisen Saitama: Sakuya no Tatakai,,Other,4,-1.578386,-0.327821,0,0,-0.173863
9978,31078,PikkaPika Summer,,TV,31,-0.510868,-0.327949,0,0,0.409786


In [49]:
# Drop the three titles where we couldn't find the genre 
anime.drop([8955, 9137, 9978], inplace=True)
anime.reset_index(inplace=True)
anime.shape

(12288, 11)

In [50]:
# Genre has way too many unique values!
anime.genre.nunique()

3272

In [51]:
anime.genre.value_counts()

Hentai                                  823
Comedy                                  524
Music                                   303
Kids                                    200
Comedy, Slice of Life                   179
                                       ... 
Demons, Historical                        1
Comedy, Ecchi, Magic, Parody              1
Music, School, Shoujo                     1
Hentai, Romance, School, Sports           1
Fantasy, Kids, School, Slice of Life      1
Name: genre, Length: 3272, dtype: int64

In [52]:
anime.genre.value_counts(normalize=True)

Hentai                                  0.066976
Comedy                                  0.042643
Music                                   0.024658
Kids                                    0.016276
Comedy, Slice of Life                   0.014567
                                          ...   
Demons, Historical                      0.000081
Comedy, Ecchi, Magic, Parody            0.000081
Music, School, Shoujo                   0.000081
Hentai, Romance, School, Sports         0.000081
Fantasy, Kids, School, Slice of Life    0.000081
Name: genre, Length: 3272, dtype: float64

In [62]:
# number of anime with multiple genres 
anime[anime.genre.str.contains(",")].shape

(9436, 11)

In [55]:
# explore the variety of genre within anime
log=[j.strip(r" ") for i in anime.genre.str.lower().dropna().values for j in i.split(",")]
print(len(log), 'values')
print(len(set(log)), 'unique values')

36356 values
44 unique values


In [57]:
# frequency of each genre mentioned in the dataset
pd.Series(log).value_counts()

comedy           4649
action           2849
adventure        2350
fantasy          2315
sci-fi           2070
drama            2018
shounen          1712
kids             1610
romance          1466
slice of life    1224
school           1222
hentai           1141
supernatural     1038
mecha             944
music             862
historical        808
magic             779
ecchi             637
shoujo            604
seinen            547
sports            543
mystery           495
super power       465
military          426
parody            408
space             381
horror            369
harem             319
demons            294
martial arts      266
dementia          243
psychological     229
police            197
game              181
samurai           148
vampire           102
thriller           87
cars               72
shounen ai         65
shoujo ai          55
josei              54
yuri               42
yaoi               39
short              31
dtype: int64

In [60]:
# splitting the multiple genre
dummies=anime.genre.str.split(", ", expand=True)

In [61]:
# making dummies columns
dummies=dummies.stack().str.get_dummies().sum(level=0)

Unnamed: 0,Action,Adventure,Cars,Comedy,Dementia,Demons,Drama,Ecchi,Fantasy,Game,...,Shounen Ai,Slice of Life,Space,Sports,Super Power,Supernatural,Thriller,Vampire,Yaoi,Yuri
0,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,1,0,0,0,0
1,1,1,0,0,0,0,1,0,1,0,...,0,0,0,0,0,0,0,0,0,0
2,1,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
4,1,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12283,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
12284,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
12285,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
12286,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


## Exploratory Data Analysis

In [51]:
from pandas_profiling import ProfileReport
prof = ProfileReport(anime)
prof.to_file(output_file='output.html')

HBox(children=(FloatProgress(value=0.0, description='Summarize dataset', max=24.0, style=ProgressStyle(descrip…




HBox(children=(FloatProgress(value=0.0, description='Generate report structure', max=1.0, style=ProgressStyle(…




HBox(children=(FloatProgress(value=0.0, description='Render HTML', max=1.0, style=ProgressStyle(description_wi…




HBox(children=(FloatProgress(value=0.0, description='Export report to file', max=1.0, style=ProgressStyle(desc…


