# Machine Learning: Anime Recommendation

#### Data Collection

In [1]:
import kaggle

In [2]:
!kaggle datasets download -d CooperUnion/anime-recommendations-database --unzip

Downloading anime-recommendations-database.zip to C:\Users\nclee\Documents\Education\Ironhack\Project-Anime




  0%|          | 0.00/25.0M [00:00<?, ?B/s]
  4%|3         | 1.00M/25.0M [00:00<00:02, 8.98MB/s]
  8%|7         | 2.00M/25.0M [00:00<00:02, 9.33MB/s]
 12%|#1        | 3.00M/25.0M [00:00<00:02, 9.64MB/s]
 20%|#9        | 5.00M/25.0M [00:00<00:02, 10.3MB/s]
 24%|##3       | 6.00M/25.0M [00:00<00:02, 9.88MB/s]
 28%|##7       | 7.00M/25.0M [00:00<00:02, 9.38MB/s]
 36%|###5      | 9.00M/25.0M [00:01<00:02, 8.02MB/s]
 44%|####3     | 11.0M/25.0M [00:01<00:01, 8.56MB/s]
 52%|#####1    | 13.0M/25.0M [00:01<00:01, 9.19MB/s]
 56%|#####5    | 14.0M/25.0M [00:01<00:01, 9.45MB/s]
 60%|#####9    | 15.0M/25.0M [00:01<00:01, 9.61MB/s]
 68%|######7   | 17.0M/25.0M [00:01<00:00, 10.2MB/s]
 72%|#######1  | 18.0M/25.0M [00:01<00:00, 10.2MB/s]
 76%|#######5  | 19.0M/25.0M [00:02<00:00, 9.95MB/s]
 80%|#######9  | 20.0M/25.0M [00:02<00:00, 9.64MB/s]
 88%|########7 | 22.0M/25.0M [00:02<00:00, 10.5MB/s]
 96%|#########5| 24.0M/25.0M [00:02<00:00, 10.9MB/s]
100%|##########| 25.0M/25.0M [00:02<00:00, 10.0MB/s]


In [85]:
!kaggle datasets download -d azathoth42/myanimelist -f AnimeList.csv --unzip

Downloading 28524%2F45582%2Fcompressed%2FAnimeList.csv.zip to C:\Users\nclee\Documents\Education\Ironhack\Project-Anime




  0%|          | 0.00/3.02M [00:00<?, ?B/s]
 33%|###3      | 1.00M/3.02M [00:00<00:00, 3.55MB/s]
 66%|######6   | 2.00M/3.02M [00:00<00:00, 3.37MB/s]
 99%|#########9| 3.00M/3.02M [00:00<00:00, 3.39MB/s]
100%|##########| 3.02M/3.02M [00:00<00:00, 3.33MB/s]


In [86]:
import zipfile

In [88]:
with zipfile.ZipFile("28524%2F45582%2Fcompressed%2FAnimeList.csv.zip", 'r') as zip_ref:
    zip_ref.extractall("")

##### Libraries

In [39]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline
sns.set()
plt.style.use('seaborn-whitegrid')
sns.set_style("white")

import scipy.stats as st
from sklearn.preprocessing import MinMaxScaler

## Data Preparation

In [2]:
anime=pd.read_csv("anime.csv", index_col="anime_id")

In [3]:
anime.isna().sum()

name          0
genre        62
type         25
episodes      0
rating      230
members       0
dtype: int64

In [4]:
# dropping those with NA in "genre", "type", "rating"
anime[anime.loc[:, ["genre", "type", "rating"]].isna().all(axis=1)]
anime.drop(anime[anime.loc[:, ["genre", "type", "rating"]].isna().all(axis=1)].index, axis=0, inplace=True)

In [6]:
# updated list to fill in the gaps in data
animelist=pd.read_csv("AnimeList.csv")

In [7]:
# trying to fill in the missing type information from new animelist, there are still "Unknown"
anime.type.fillna(animelist.set_index("anime_id").type, inplace=True)

In [9]:
# fill in the rest by research
anime_type={34437: "Movie", 32455: "TV", 28613: "TV", 30448: "TV", 24023: "TV", 34348: "TV"}
anime.loc[anime.type=="Unknown", "type"]=anime[anime.type=="Unknown"].index.map(anime_type)

In [10]:
# creating a dummy column for airing using episode as an indicator
anime["airing"]=np.where(anime.episodes=="Unknown", 1, 0)

In [22]:
anime["episodes"]=np.where(anime.episodes=="Unknown", 0, anime.episodes)

In [11]:
# creating a dummy column for whether it was NA for rating
anime["rate_na"]=np.where(anime.rating.isna(), 1, 0)

In [16]:
# fill all the NA in rating as 0
anime.rating.fillna(0, inplace=True)

In [23]:
anime.describe(include="all")

Unnamed: 0,name,genre,type,episodes,rating,members,airing,rate_na
count,12291,12232,12291,12291.0,12291.0,12291.0,12291.0,12291.0
unique,12289,3264,6,187.0,,,,
top,Shi Wan Ge Leng Xiaohua,Hentai,TV,1.0,,,,
freq,2,823,3805,5677.0,,,,
mean,,,,,6.354337,18075.6,0.027418,0.018469
std,,,,,1.33961,54826.69,0.163306,0.134645
min,,,,,0.0,5.0,0.0,0.0
25%,,,,,5.82,225.5,0.0,0.0
50%,,,,,6.55,1551.0,0.0,0.0
75%,,,,,7.17,9443.0,0.0,0.0


In [29]:
# should we group them into 5 instead of 6?
anime.type.value_counts(normalize=True)

TV         0.309576
OVA        0.269384
Movie      0.191360
Special    0.136360
Other      0.093320
Name: type, dtype: float64

In [28]:
anime["type"]=np.where(anime.type.isin(["ONA", "Music"]), "Other", anime.type)

In [48]:
# standardize the data for members
scaler=MinMaxScaler()
anime[["episode", "rating", "members"]]=scaler.fit_transform(anime[["episodes", "rating", "members"]])


In [None]:
# dummies
dummy_col=["type"]
df=pd.get_dummies(data=anime, columns=dummy_col,drop_first=True)

## Exploratory Data Analysis

In [51]:
from pandas_profiling import ProfileReport
prof = ProfileReport(anime)
prof.to_file(output_file='output.html')

HBox(children=(FloatProgress(value=0.0, description='Summarize dataset', max=24.0, style=ProgressStyle(descrip…




HBox(children=(FloatProgress(value=0.0, description='Generate report structure', max=1.0, style=ProgressStyle(…




HBox(children=(FloatProgress(value=0.0, description='Render HTML', max=1.0, style=ProgressStyle(description_wi…




HBox(children=(FloatProgress(value=0.0, description='Export report to file', max=1.0, style=ProgressStyle(desc…


