# Machine Learning: Anime Recommendation

#### Data Collection

In [1]:
import kaggle

In [2]:
!kaggle datasets download -d CooperUnion/anime-recommendations-database --unzip

Downloading anime-recommendations-database.zip to C:\Users\nclee\Documents\Education\Ironhack\Project-Anime




  0%|          | 0.00/25.0M [00:00<?, ?B/s]
  4%|3         | 1.00M/25.0M [00:00<00:02, 8.98MB/s]
  8%|7         | 2.00M/25.0M [00:00<00:02, 9.33MB/s]
 12%|#1        | 3.00M/25.0M [00:00<00:02, 9.64MB/s]
 20%|#9        | 5.00M/25.0M [00:00<00:02, 10.3MB/s]
 24%|##3       | 6.00M/25.0M [00:00<00:02, 9.88MB/s]
 28%|##7       | 7.00M/25.0M [00:00<00:02, 9.38MB/s]
 36%|###5      | 9.00M/25.0M [00:01<00:02, 8.02MB/s]
 44%|####3     | 11.0M/25.0M [00:01<00:01, 8.56MB/s]
 52%|#####1    | 13.0M/25.0M [00:01<00:01, 9.19MB/s]
 56%|#####5    | 14.0M/25.0M [00:01<00:01, 9.45MB/s]
 60%|#####9    | 15.0M/25.0M [00:01<00:01, 9.61MB/s]
 68%|######7   | 17.0M/25.0M [00:01<00:00, 10.2MB/s]
 72%|#######1  | 18.0M/25.0M [00:01<00:00, 10.2MB/s]
 76%|#######5  | 19.0M/25.0M [00:02<00:00, 9.95MB/s]
 80%|#######9  | 20.0M/25.0M [00:02<00:00, 9.64MB/s]
 88%|########7 | 22.0M/25.0M [00:02<00:00, 10.5MB/s]
 96%|#########5| 24.0M/25.0M [00:02<00:00, 10.9MB/s]
100%|##########| 25.0M/25.0M [00:02<00:00, 10.0MB/s]


In [85]:
!kaggle datasets download -d azathoth42/myanimelist -f AnimeList.csv --unzip

Downloading 28524%2F45582%2Fcompressed%2FAnimeList.csv.zip to C:\Users\nclee\Documents\Education\Ironhack\Project-Anime




  0%|          | 0.00/3.02M [00:00<?, ?B/s]
 33%|###3      | 1.00M/3.02M [00:00<00:00, 3.55MB/s]
 66%|######6   | 2.00M/3.02M [00:00<00:00, 3.37MB/s]
 99%|#########9| 3.00M/3.02M [00:00<00:00, 3.39MB/s]
100%|##########| 3.02M/3.02M [00:00<00:00, 3.33MB/s]


In [86]:
import zipfile

In [88]:
with zipfile.ZipFile("28524%2F45582%2Fcompressed%2FAnimeList.csv.zip", 'r') as zip_ref:
    zip_ref.extractall("")

##### Libraries

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline
sns.set()
plt.style.use('seaborn-whitegrid')
sns.set_style("white")

import scipy.stats as st
from sklearn.preprocessing import StandardScaler

## Data PreparationStandardScaler

In [2]:
anime=pd.read_csv("anime.csv", index_col="anime_id")

In [3]:
anime.isna().sum()

name          0
genre        62
type         25
episodes      0
rating      230
members       0
dtype: int64

In [4]:
# dropping those with NA in "genre", "type", "rating"
anime[anime.loc[:, ["genre", "type", "rating"]].isna().all(axis=1)]
anime.drop(anime[anime.loc[:, ["genre", "type", "rating"]].isna().all(axis=1)].index, axis=0, inplace=True)

In [5]:
# updated list to fill in the gaps in data
animelist=pd.read_csv("AnimeList.csv", index_col="anime_id")

In [6]:
# trying to fill in the missing type information from new animelist, there are still "Unknown"
anime.type.fillna(animelist.type, inplace=True)

In [7]:
# fill in the rest by research
anime_type={34437: "Movie", 32455: "TV", 28613: "TV", 30448: "TV", 24023: "TV", 34348: "TV"}
anime.loc[anime.type=="Unknown", "type"]=anime[anime.type=="Unknown"].index.map(anime_type)

In [8]:
# creating a dummy column for airing using episode as an indicator
anime["airing"]=np.where(anime.episodes=="Unknown", 1, 0)

In [14]:
anime["episodes"]=np.where(anime.episodes=="Unknown", 0, anime.episodes)
anime["episodes"]=anime.episodes.astype(int)

In [15]:
# creating a dummy column for whether it was NA for rating
anime["rate_na"]=np.where(anime.rating.isna(), 1, 0)

In [16]:
# fill all the NA in rating as 0
anime.rating.fillna(0, inplace=True)

In [17]:
anime.describe(include="all")

Unnamed: 0,name,genre,type,episodes,rating,members,airing,rate_na
count,12291,12232,12291,12291.0,12291.0,12291.0,12291.0,12291.0
unique,12289,3264,6,,,,,
top,Shi Wan Ge Leng Xiaohua,Hentai,TV,,,,,
freq,2,823,3805,,,,,
mean,,,,12.04304,6.354337,18075.6,0.027418,0.018469
std,,,,46.262561,1.33961,54826.69,0.163306,0.134645
min,,,,0.0,0.0,5.0,0.0,0.0
25%,,,,1.0,5.82,225.5,0.0,0.0
50%,,,,2.0,6.55,1551.0,0.0,0.0
75%,,,,12.0,7.17,9443.0,0.0,0.0


In [18]:
# should we group them into 5 instead of 6?
anime.type.value_counts(normalize=True)

TV         0.309576
OVA        0.269384
Movie      0.191360
Special    0.136360
ONA        0.053616
Music      0.039704
Name: type, dtype: float64

In [19]:
anime["type"]=np.where(anime.type.isin(["ONA", "Music"]), "Other", anime.type)

In [20]:
# standardize the data for members
scaler=StandardScaler()
anime[["episode", "rating", "members"]]=scaler.fit_transform(anime[["episodes", "rating", "members"]])


In [21]:
# dummies
dummy_col=["type"]
anime=pd.get_dummies(data=anime, columns=dummy_col,drop_first=True)

In [22]:
# Missing values in genre
missingvalues_genre = anime[anime.genre.isnull()]
missingvalues_genre_titles = missingvalues_genre.index.tolist()
len(missingvalues_genre_titles)

59

In [23]:
# Cross-referencing our missing values with the newest version of the dataset
animelist[animelist.index.isin(missingvalues_genre_titles)]

# Filling the missing values that are available in the newest version of the dataset
anime['genre'] = anime['genre'].fillna(animelist['genre'])
anime['genre'].isnull().sum()

40

In [24]:
# Research and replace the remaining missing values 
anime.at[29765, 'genre'] = 'Short' #Metropolis (2009)
anime.at[32695, 'genre'] = 'Fantasy' #Match Shoujo
anime.at[33187, 'genre'] = 'Short' #Katsudou Shashin
anime.at[30862, 'genre'] = 'Short' #Yubi wo Nusunda Onna
anime.at[28987, 'genre'] = 'Short' #Kamakura
anime.at[29629, 'genre'] = 'Short' #Coffee Break
anime.at[28653, 'genre'] = 'Short' #Maze
anime.at[31834, 'genre'] = 'Short' #Mormorando
anime.at[31760, 'genre'] = 'Short' #Tsuru Shitae Waka Kan
anime.at[31831, 'genre'] = 'Short' #Fantasy
anime.at[31833, 'genre'] = 'Short' #Metamorphose
anime.at[30399, 'genre'] = 'Short' #Arigatou Gomennasai
anime.at[28655, 'genre'] = 'Short' #PiKA PiKA
anime.at[31832, 'genre'] = 'Short' #Zawazawa
anime.at[28647, 'genre'] = 'Short' #Kappo
anime.at[29764, 'genre'] = 'Short' #Blend
anime.at[29921, 'genre'] = 'Short' #Bunbuku Chagama (1958)
anime.at[29655, 'genre'] = 'Short' #Chanda Gou
anime.at[29923, 'genre'] = 'Short' #Fukusuke
anime.at[30861, 'genre'] = 'Short' #Happy Bogeys
anime.at[32636, 'genre'] = 'Short' #Hokori Inu no Hanashi
anime.at[31511, 'genre'] = 'Short' #Holiday
anime.at[31509, 'genre'] = 'Short' #Ichi-gan Kuni
anime.at[29920, 'genre'] = 'Short' #Kobutori (1957)
anime.at[30055, 'genre'] = 'Adventure' #Mabeob Chunjamun: Daemawangui Buhwaleul Magala
anime.at[29767, 'genre'] = 'Short' #Minamo
anime.at[29922, 'genre'] = 'Short' #Ou-sama Ninatta Kitsune
anime.at[33318, 'genre'] = 'Short' #Scripta Volant
anime.at[31508, 'genre'] = 'Short' #Shinya Doubutsuen
anime.at[33320, 'genre'] = 'Short' #Suijun Genten
anime.at[30408, 'genre'] = 'Short' #Tokyo SOS
anime.at[33319, 'genre'] = 'Short' #Wareware no Heya
anime.at[32644, 'genre'] = 'Short' #Yaseruyagi
anime.at[33388, 'genre'] = 'Adventure' #Charanpo Shima no Monogatari
anime.at[33389, 'genre'] = 'Fantasy' #Genba no Joukitsune
anime.at[34310, 'genre'] = 'Kids' #Tamagotchi Movie: Tanpen Himitsu no Otodoke Daisakusen!
anime.at[33390, 'genre'] = 'Action' #Zunda Horizon

In [25]:
# Check again for missing values to be sure
anime[anime['genre'].isnull()]

Unnamed: 0_level_0,name,genre,episodes,rating,members,airing,rate_na,episode,type_OVA,type_Other,type_Special,type_TV
anime_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
28487,Ikite Iru,,1,-0.451147,-0.328934,0,0,-0.238713,1,0,0,0
30435,Kankou Taisen Saitama: Sakuya no Tatakai,,4,-1.578386,-0.327821,0,0,-0.173863,0,1,0,0
31078,PikkaPika Summer,,31,-0.510868,-0.327949,0,0,0.409786,0,0,0,1


In [26]:
# Drop the three titles where we couldn't find the genre 
anime.drop(anime[anime['genre'].isnull()].index, inplace=True)
anime.shape

(12288, 12)

In [27]:
# Genre has way too many unique values!
anime.genre.nunique()

3272

In [28]:
anime.genre.value_counts()

Hentai                                                              823
Comedy                                                              524
Music                                                               303
Kids                                                                200
Comedy, Slice of Life                                               179
                                                                   ... 
Adventure, Comedy, Demons, Fantasy, Magic, Martial Arts, Shounen      1
Kids, Music, School, Slice of Life                                    1
Comedy, Ecchi, Magic, Sci-Fi, Seinen                                  1
Comedy, Mystery, School, Shounen, Super Power                         1
Action, Drama, Police, School, Shoujo                                 1
Name: genre, Length: 3272, dtype: int64

In [29]:
anime.genre.value_counts(normalize=True)

Hentai                                                              0.066976
Comedy                                                              0.042643
Music                                                               0.024658
Kids                                                                0.016276
Comedy, Slice of Life                                               0.014567
                                                                      ...   
Adventure, Comedy, Demons, Fantasy, Magic, Martial Arts, Shounen    0.000081
Kids, Music, School, Slice of Life                                  0.000081
Comedy, Ecchi, Magic, Sci-Fi, Seinen                                0.000081
Comedy, Mystery, School, Shounen, Super Power                       0.000081
Action, Drama, Police, School, Shoujo                               0.000081
Name: genre, Length: 3272, dtype: float64

In [30]:
# number of anime with multiple genres 
anime[anime.genre.str.contains(",")].shape

(9436, 12)

In [31]:
# explore the variety of genre within anime
log=[j.strip(r" ") for i in anime.genre.str.lower().dropna().values for j in i.split(",")]
print(len(log), 'values')
print(len(set(log)), 'unique values')

36356 values
44 unique values


In [32]:
# frequency of each genre mentioned in the dataset
pd.Series(log).value_counts()

comedy           4649
action           2849
adventure        2350
fantasy          2315
sci-fi           2070
drama            2018
shounen          1712
kids             1610
romance          1466
slice of life    1224
school           1222
hentai           1141
supernatural     1038
mecha             944
music             862
historical        808
magic             779
ecchi             637
shoujo            604
seinen            547
sports            543
mystery           495
super power       465
military          426
parody            408
space             381
horror            369
harem             319
demons            294
martial arts      266
dementia          243
psychological     229
police            197
game              181
samurai           148
vampire           102
thriller           87
cars               72
shounen ai         65
shoujo ai          55
josei              54
yuri               42
yaoi               39
short              31
dtype: int64

In [33]:
# splitting the multiple genre
dummies=anime.genre.str.split(", ", expand=True)

In [34]:
# making dummies columns
dummies=dummies.stack().str.get_dummies().sum(level=0)

In [41]:
# prepare dataframe for analysis
df=pd.concat([anime, dummies], axis=1).drop(["genre", "name"], axis=1)

## Exploratory Data Analysis

In [None]:
from pandas_profiling import ProfileReport
prof = ProfileReport(anime)
prof.to_file(output_file='output.html')

## Unstructured Learning

### scikit-cmeans

In [45]:
from skcmeans.algorithms import Probabilistic, GustafsonKesselMixin

In [87]:
clusterer = Probabilistic(n_clusters=7, n_init=1000)
clusterer.fit(df.values)

<skcmeans.algorithms.Probabilistic at 0x17c10219a60>

In [88]:
clusterer.u

AttributeError: 'Probabilistic' object has no attribute 'u'

In [90]:
from yellowbrick.cluster import KElbowVisualizer



In [94]:
model = FCM()
visualizer = KElbowVisualizer(model, k=(2,5), metric='distortion', timings=False, locate_elbow=False)

visualizer.fit(df)        # Fit the data to the visualizer
visualizer.show() 

YellowbrickTypeError: The supplied model is not a clustering estimator; try a classifier or regression score visualizer instead!

In [48]:
from fcmeans import FCM

In [49]:
fcm = FCM(n_clusters=3)
fcm.fit(df)

<fcmeans.fcm.FCM at 0x17c14d8eaf0>

In [50]:
fcm_centers = fcm.centers
fcm_labels  = fcm.u.argmax(axis=1)

In [62]:
fcm_labels

array([2, 0, 0, ..., 2, 2, 2], dtype=int64)

In [66]:
fcm.u

array([[1.00159163e-02, 1.71153030e-05, 9.89966968e-01],
       [9.43029051e-01, 1.12473712e-04, 5.68584748e-02],
       [9.58560273e-01, 4.64083525e-05, 4.13933187e-02],
       ...,
       [1.18483601e-03, 1.82518497e-06, 9.98813339e-01],
       [5.11652901e-03, 8.70975920e-06, 9.94874761e-01],
       [4.85095129e-03, 8.25319687e-06, 9.95140796e-01]])

In [36]:
import skfuzzy as fuzz

In [69]:
cluster_centers,fuzzy_labels,_,_,_,_,_=fuzz.cluster.cmeans(data=df, c=3, m=2, error=0.05, maxiter=1000)


In [60]:
labels_list = np.argmax(fuzzy_labels,axis=0).T

In [61]:
labels_list

array([1, 2, 2, 0, 0, 2, 2, 0, 2, 2, 2, 2, 0, 2, 0, 0, 2, 0, 2, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 2, 2, 0, 0, 0, 0, 2,
       0, 2, 0, 0, 0, 2, 0, 0, 0, 0], dtype=int64)

In [70]:
fuzzy_labels.shape

(3, 54)

In [None]:
fuzzy_l