# Building an Anime Recommender System

In [1]:
import datasets, malscraper
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import itertools
from tqdm import tqdm

### 1. Load Datasets
From `AnimeList.csv` get
 - `anime_ids`: Sorted list of all anime id's. Used as columns of `X` and `prediction`.
 - `anime_dict`: Dictionary of type {'anime_id': 'anime_title'}. Used to reconstruct anime title from anime id.
   
From `UserAnimeList.csv` get
 - `userlist`: List of all usernames. Used as index of `X`.
 - `X`: Sparse user-item matrix. Load in chunks (see https://datascienceplus.com/processing-huge-dataset-with-python/).

In [2]:
data = datasets.MyAnimeList(extension="cleaned",debug=True,chunksize=1e6)

0it [00:00, ?it/s]


### 2. Build Prediction
Factorize `X` into user-matrix `W` and item-matrix `H` with `sklearn.decomposition.NMF`. Essentially solves:

$$ \underset{W,H}{\mathrm{argmin}} \sum_{i,j \, \in \, X} (x_{ij} - w_i^T \cdot h_j)^2 $$

Get row index of USER from `userlist`. Predict ratings via:

$$ \hat{x}_i = \, w_i^T \cdot H \qquad (i = \text{USER_ID})$$

Get prediction for new user via linear regression.


Sources: 
* https://datajobs.com/data-science-repo/Recommender-Systems-[Netflix].pdf
* https://medium.com/datadriveninvestor/how-to-built-a-recommender-system-rs-616c988d64b2

SINGULAR VALUE PROBLEM:
* No user must have less than k ranked animes.
* No anime must have been ranked less than k times.

In [None]:
class MatrixFactorization():
    
    def __init__(self, k, reg):
        self.k = k
        self.reg = reg

        
    def fit(self, X, steps):
        self.U = np.random.randn(X.shape[0],self.k)
        self.V = np.random.randn(X.shape[1],self.k)
        self.bu = np.zeros(X.shape[0])
        self.bv = np.zeros(X.shape[1])
        self.mu = X.data.mean()
        self.error = [self.loss(X)]
        
        print(f"Iteration 0 :  Training Error = {self.error[-1]}")
        
        for t in range(1,steps+1):
            sv_errors = 0  # count singular value errors
            
            # Update V
            for j in range(self.V.shape[0]):
                ivec = X.getcol(j).nonzero()[0]
                matrix_ = self.U[ivec].T.dot(self.U[ivec]) + np.eye(self.k) * self.reg
                vector_ = (X[ivec,j].T - self.bu[ivec] - self.bv[j] - self.mu).dot(self.U[ivec]).T
                try:
                    self.bv[j] = (X[ivec,j].T - self.U[ivec].dot(self.V[j]) - self.bu[ivec] - self.mu).sum()
                    self.bv[j] = self.bv[j] / ( len(ivec) + self.reg)
                    self.V[j] = np.squeeze(np.linalg.solve(matrix_,vector_))
                except np.linalg.LinAlgError: 
                    sv_errors += 1
                    continue

            # Update U
            for i in range(self.U.shape[0]):
                jvec = X.getrow(i).nonzero()[1]
                matrix_ = self.V[jvec].T.dot(self.V[jvec]) + np.eye(self.k) * self.reg
                vector_ = (X[i,jvec] - self.bu[i] - self.bv[jvec] - self.mu).dot(self.V[jvec]).T
                try:
                    self.bu[i] = (X[i,jvec] - self.V[ivec].dot(self.U[i]) - self.bv[jvec] - self.mu).sum()
                    self.bu[i] = self.bu[i] / ( len(jvec) + self.reg)
                    self.U[i] = np.squeeze(np.linalg.solve(matrix_,vector_))
                except np.linalg.LinAlgError: 
                    sv_errors += 1
                    continue
                
            self.error.append(self.loss(X))
            
            print(f"Iteration {t} :  Training Error = {self.error[-1]}  SV Errors = {sv_errors}/{self.V.shape[0]+self.U.shape[0]}")
        
        self.plot_loss()
           
            
    def loss(self, X):
        N = 0.
        E = 0.
        for j in range(self.V.shape[0]):
            ivec = X.getcol(j).nonzero()[0]
            xtmp = X[ivec,j].T
            xhat = self.U[ivec].dot(self.V[j].T) + self.bu[ivec] + self.bv[j] + self.mu
            resd = xtmp - xhat
            E += resd.dot(resd.T)
            N += len(ivec)
        return E[0,0] / N
    
    
    def plot_loss(self):
        plt.plot(self.error,color="C0", label="Training Error")
        plt.legend()
        plt.show()


model = MatrixFactorization(k=10, reg=20.)
model.fit(data.X,steps=10)

Iteration 0 :  Training Error = 12.940036101686983


In [34]:
for k in range(10):
    jmax = model.V[:,k].argmax()
    jmin = model.V[:,k].argmin()
    idmax = data.cindex[jmax]
    idmin = data.cindex[jmin]
    amax = data.get_anime_by_id(idmax)
    amin = data.get_anime_by_id(idmin)
    print(f'Feature {k}:  min : {amin.title}  max : {amax.title}')

Feature 0:  min : Ace wo Nerae!  max : Elfen Lied
Feature 1:  min : Elfen Lied  max : Junjou Romantica
Feature 2:  min : Naruto: Shippuuden  max : Aria The Origination
Feature 3:  min : Boku no Pico  max : Steins;Gate
Feature 4:  min : Bleach Movie 2: The DiamondDust Rebellion - Mou Hitotsu no Hyourinmaru  max : High School DxD BorN
Feature 5:  min : School Days  max : Crayon Shin-chan
Feature 6:  min : FLCL  max : Isekai wa Smartphone to Tomo ni.
Feature 7:  min : Umineko no Naku Koro ni  max : Ghost in the Shell
Feature 8:  min : Pico to Chico  max : One Piece
Feature 9:  min : Loveless  max : Higurashi no Naku Koro ni Kira


### ToDo
* Normalize scores and switch algorithm **or** implement bias terms.
* Predict scores of users not in dataset.
* Move loading dataset and training to functions.
* REST Api: https://medium.com/@mahdi04/train-predict-simple-machine-learning-models-with-django-rest-76ce46bf2868

### 3. Webscrape current animelist

In [None]:
import malscraper
import importlib
importlib.reload(malscraper)

In [None]:
myanimelist = malscraper.get_user_anime_list('Manuel')

In [None]:
index = userlist.index('Manuel')

prediction = pd.Series(np.dot(W[index],H), index=anime_ids)
watched = [a['id_ref'] for a in myanimelist if a['id_ref'] in prediction.index]
prediction = prediction.drop(list(watched))  # Note: columns == anime_ids's

for i in prediction.sort_values(ascending=False).head(10).index:
    print(anime_dict[i])