# Building an Anime Recommender System

In [1]:
import html

import datasets
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from scipy.sparse import csr_matrix, vstack

### 1. Load Datasets
From `AnimeList.csv` get
 - `anime_ids`: Sorted list of all anime IDs. Used as columns of `X` and `prediction`.
 - `anime_dict`: Dictionary of type {'anime_id': 'anime_title'}. Used to reconstruct anime title from anime id.
   
From `UserAnimeList.csv` get
 - `userlist`: List of all usernames. Used as index of `X`.
 - `X`: Sparse user-item matrix. Load in chunks (see https://datascienceplus.com/processing-huge-dataset-with-python/).

In [59]:
# Anime catalogue: one row per anime, including its id and its titles.
animes = pd.read_csv('data/AnimeList.csv')


def _clean_title(title):
    """Decode HTML entities in a title (e.g. ``&#039;`` -> ``'``).

    Non-string values (such as NaN for a missing title) are returned unchanged.
    """
    return html.unescape(title) if isinstance(title, str) else title


# Prefer the English title where one exists, otherwise keep the original title.
# Done column-wise instead of with two `iterrows()` passes over the frame.
display_titles = animes.title_english.fillna(animes.title).map(_clean_title)

# anime_id -> display title; used to turn predicted ids back into readable names.
anime_dict = dict(zip(animes.anime_id, display_titles))

# Sorted array of the distinct anime ids; these label the columns of `X`
# and the index of `prediction`.
anime_ids = np.sort(animes.anime_id.unique())

In [3]:
# Number of CSV rows parsed per chunk (an integer row count).
size = 3_000_000
reader = pd.read_csv(
    'data/UserAnimeList.csv',
    usecols=['username', 'anime_id', 'my_score'],
    dtype={'username': str},  # keep purely numeric usernames as text
    chunksize=size,
)

anime_index = pd.Index(anime_ids)  # anime_id -> column position in X
user_rows = {}                     # username -> row position in X
userlist = []                      # row position -> username
rating_parts = []                  # per-chunk (row, col, score) triplets

# Each chunk is in dataframe format
for chunk in reader:
    # Rows without a user, an anime or a score cannot be placed in the matrix;
    # a score of 0 is filtered out exactly as before.
    chunk = chunk.dropna(subset=['username', 'anime_id', 'my_score'])
    chunk = chunk[chunk.my_score != 0]

    # A fixed-size chunk can end in the middle of a user's ratings, so row
    # numbers are assigned globally: a user seen in an earlier chunk keeps the
    # row they already have instead of getting a second, partial row.
    codes, names = pd.factorize(chunk.username)
    rows_of_names = np.empty(len(names), dtype=np.int32)
    for k, name in enumerate(names):
        row = user_rows.get(name)
        if row is None:
            row = user_rows[name] = len(userlist)
            userlist.append(name)
        rows_of_names[k] = row

    # Column position of every rating; -1 marks an anime_id that is not in
    # `anime_ids`, which is dropped (the same effect the column reindex had).
    cols = anime_index.get_indexer(chunk.anime_id)
    known = cols >= 0

    rating_parts.append(pd.DataFrame({
        'row': rows_of_names[codes][known],
        'col': cols[known].astype(np.int32),
        'score': chunk.my_score.to_numpy()[known],
    }))

# Building a CSR matrix from triplets sums repeated (row, col) pairs, so keep
# only the last score recorded for each user/anime pair.
ratings = pd.concat(rating_parts, ignore_index=True)
ratings = ratings.drop_duplicates(subset=['row', 'col'], keep='last')

# Build the sparse (N=#users x K=#animes) matrix directly from the triplets,
# without materialising a dense pivot table per chunk. It is kept in a
# one-element list so that stacking `chunk_list` still yields a matrix whose
# rows line up with `userlist`.
chunk_list = [
    csr_matrix(
        (
            ratings['score'].to_numpy(dtype=np.float64),
            (ratings['row'].to_numpy(), ratings['col'].to_numpy()),
        ),
        shape=(len(userlist), len(anime_ids)),
    )
]

In [4]:
# Stack the per-chunk matrices row-wise into one CSR user-item matrix.
X = vstack(chunk_list).tocsr()

### 2. Build Prediction
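Factorize `X` into user-matrix `W` and item-matrix `H` with `sklearn.decomposition.NMF`. Note that NMF fits every entry of `X`, so unrated items (stored as 0) count as ratings of 0 rather than as missing values. Essentially solves: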

$$ \underset{W,H}{\mathrm{argmin}} \sum_{i,j \, \in \, X} (x_{ij} - w_i^T \cdot h_j)^2 $$

Get row index of USER from `userlist`. Predict ratings via:

$$ \hat{x}_i = \, w_i^T \cdot H \qquad (i = \text{USER_ID})$$


Sources: 
* https://datajobs.com/data-science-repo/Recommender-Systems-[Netflix].pdf
* https://medium.com/datadriveninvestor/how-to-built-a-recommender-system-rs-616c988d64b2

In [5]:
from sklearn.decomposition import NMF

# Non-negative factorisation of X into W (users x 7) and H (7 x animes).
# `shuffle=True` randomises the update order of the coordinate-descent solver,
# so `random_state` is pinned to make the factors (and the recommendations
# derived from them) reproducible across runs.
model = NMF(n_components=7, init='nndsvda', shuffle=True, random_state=0)
W = model.fit_transform(X)  # user factors: one row per row of X
H = model.components_       # item factors: one column per column of X

In [61]:
# Row of `X` (and of `W`) that belongs to the user we want recommendations for.
index = userlist.index('Manuel')

# Predicted score for every anime: this user's factors times the item matrix.
scores = np.dot(W[index], H)
prediction = pd.Series(scores, index=anime_ids)

# Columns holding a non-zero rating in this user's row -> ids of the anime the
# user has already scored; those are removed from the candidates.
watched_cols = X[index].nonzero()[1]
watched = anime_ids[watched_cols]
prediction = prediction.drop(watched.tolist())

# Print the titles of the 200 best-scoring remaining anime, best first.
ranked = prediction.sort_values(ascending=False)
for anime_id in ranked.index[:200]:
    print(anime_dict[anime_id])

Cowboy Bebop
Death Note
Fullmetal Alchemist: Brotherhood
The Girl Who Leapt Through Time
Howl&#039;s Moving Castle
Samurai Champloo
5 Centimeters Per Second
Ghost in the Shell
Bakemonogatari
AKIRA
My Neighbor Totoro
Evangelion: 1.0 You Are (Not) Alone
When They Cry
Summer Wars
Mushi-Shi
Grave of the Fireflies
Paprika
Evangelion: 2.0 You Can (Not) Advance
Clannad
Serial Experiments Lain
Clannad ~After Story~
Trigun
Nausicaä of the Valley of the Wind
Fate/Zero
Toradora!
K-ON!
Angel Beats!
Eden of The East
Ergo Proxy
Cowboy Bebop: The Movie
Perfect Blue
Lucky☆Star
Great Teacher Onizuka
Soul Eater
Attack on Titan
Castle in the Sky
anohana: The Flower We Saw That Day
Fate/Zero Season 2
Ghost in the Shell: Stand Alone Complex
the Garden of sinners Chapter 2: Murder Speculation Part A
Hellsing Ultimate
When They Cry: Kai
Monster
the Garden of sinners Chapter 5: Paradox Paradigm
Kiki&#039;s Delivery Service
the Garden of sinners Chapter 3: Remaining Sense of Pain
Psycho-Pass
The Melancholy of 