# Building an Anime Recommender System

In [1]:
import datasets, utils
import pickle
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

### 1. Load Dataset

In [6]:
# Load the dataset object; later cells read data.X, data.cindex, data.users
# and call data.get_anime_by_id() on it.
data = datasets.MyAnimeList()

### 2. Build Prediction

Current best time on debug set (k=7):

* Iteration 0 : 24.73 seconds
* Iteration 1 : 48.98 seconds

In [None]:
import importlib

# Re-import utils so that edits to the module are picked up without
# restarting the kernel.
importlib.reload(utils)

nfeatures = 7  # passed to MatrixFactorization as k (number of features per item)
steps = 10     # passed to fit() as `steps` (presumably training iterations -- confirm in utils)

model = utils.MatrixFactorization(k=nfeatures, reg=20.)
# Use the `steps` variable instead of a second hard-coded 10, so changing the
# setting above actually changes the training run.
model.fit(data.X, steps=steps)

In [None]:
# For every feature column of model.V, show the anime with the largest and
# the smallest loading on that feature.
for feat in range(nfeatures):
    row_hi = model.V[:,feat].argmax()
    row_lo = model.V[:,feat].argmin()
    # data.cindex maps a row position of model.V to an anime id.
    anime_hi = data.get_anime_by_id(data.cindex[row_hi])
    anime_lo = data.get_anime_by_id(data.cindex[row_lo])
    print(
        f'Feature {feat+1}:\n'
        f'\tmax : ({model.V[row_hi,feat]:+2.1f}) {anime_hi.title}\n'
        f'\tmin : ({model.V[row_lo,feat]:+2.1f}) {anime_lo.title}'
    )

### ToDo
* Predict scores of users not in dataset.
* REST API: https://medium.com/@mahdi04/train-predict-simple-machine-learning-models-with-django-rest-76ce46bf2868

### 3. Webscrape current animelist

In [3]:
# Fetch the anime list of user 'Manuel'. The result is a sequence of dicts
# with 'name', 'id_ref', 'status' and 'score' keys (see the printed first
# entry in section 5).
myprofile = utils.get_user_anime_list('Manuel')

Scraped 300 additional anime from https://myanimelist.net/animelist/Manuel/load.json?offset=0&status=7
Scraped 148 additional anime from https://myanimelist.net/animelist/Manuel/load.json?offset=300&status=7


### 4. Get model from server

In [9]:
# Load the pre-trained model from disk.
# NOTE(review): pickle.load can execute arbitrary code contained in the file;
# only load model.pickle when it comes from a trusted source.
# The context manager closes the file even if unpickling raises.
with open("model.pickle", 'rb') as pickle_in:
    model = pickle.load(pickle_in)

In [5]:
# Per-feature summary for the loaded model: the anime with the highest and
# the lowest value in each column of model.V.
for col in range(model.k):
    pos_max = model.V[:,col].argmax()
    pos_min = model.V[:,col].argmin()
    # Positions in model.V are translated to anime ids through data.cindex.
    id_of_max = data.cindex[pos_max]
    id_of_min = data.cindex[pos_min]
    best = data.get_anime_by_id(id_of_max)
    worst = data.get_anime_by_id(id_of_min)
    lines = [
        f'Feature {col+1}:',
        f'\tmax : ({model.V[pos_max,col]:+2.1f}) {best.title}',
        f'\tmin : ({model.V[pos_min,col]:+2.1f}) {worst.title}',
    ]
    print('\n'.join(lines))

Feature 1:
	max : (+4.0) Suzumiya Haruhi no Yuuutsu
	min : (-6.0) Naruto: Shippuuden
Feature 2:
	max : (+5.0) .hack//Sign
	min : (-3.1) Tsubasa Chronicle 2nd Season
Feature 3:
	max : (+4.7) Sword Art Online
	min : (-3.3) Higashi no Eden Movie II: Paradise Lost
Feature 4:
	max : (+7.0) Pingu in the City
	min : (-4.1) Lucky☆Star
Feature 5:
	max : (+4.5) Bobobo-bo Bo-bobo
	min : (-4.4) Aki-Sora
Feature 6:
	max : (+5.9) Ore no Imouto ga Konnani Kawaii Wake ga Nai. Specials
	min : (-5.2) School Days
Feature 7:
	max : (+3.6) Boku no Pico
	min : (-7.6) School Days
Feature 8:
	max : (+3.6) Vampire Knight: Guilty
	min : (-3.9) Hachimitsu to Clover
Feature 9:
	max : (+5.4) Dragon Ball GT
	min : (-4.1) Aku no Hana
Feature 10:
	max : (+6.9) One Piece
	min : (-2.9) Princess Princess
Feature 11:
	max : (+4.2) School Days
	min : (-5.7) Sword Art Online II
Feature 12:
	max : (+4.6) Neon Genesis Evangelion
	min : (-4.2) Shinsekai yori
Feature 13:
	max : (+4.3) Bokusatsu Tenshi Dokuro-chan
	min : (-3.7)

In [62]:
# In-sample predictions for a user that is part of the training data.
TOP_N = 20     # number of best predictions to list
BOTTOM_N = 10  # number of worst predictions to list

# Look the user up in data.users; the resulting index is used to select the
# user's row in model.U / model.bu.
# NOTE(review): assumes data.users' index matches the row order of model.U -- confirm.
index = data.users[data.users.username == 'Manuel'].index
if len(index) != 1:
    raise ValueError(
        f"Expected exactly one user named 'Manuel' in data.users, found {len(index)}."
    )

# Predicted score = user factors . item factors + user bias + item bias + global mean.
prediction = pd.Series(np.dot(model.U[index],model.V.T).flatten(), index=data.cindex)
prediction = prediction + model.bu[index] + model.bv + model.mu

# Remove everything that already appears on the scraped list, so only new
# anime are recommended.  Note: prediction.index holds anime ids.
watched = [a['id_ref'] for a in myprofile if a['id_ref'] in prediction.index]
prediction = prediction.drop(watched)

# Sort once and reuse the result for both listings.
ranked = prediction.sort_values(ascending=False)

# The header now reports the number of rows that are actually printed
# (previously it said "Top 10" while 20 rows were shown).
print(f'Top {TOP_N} Anime Predictions')
for i, score in ranked.head(TOP_N).items():
    print(f"\t({score:2.4f}) {data.get_anime_by_id(i).title}")

print(f'\nBottom {BOTTOM_N} Anime Predictions')
for i, score in ranked.tail(BOTTOM_N).items():
    print(f"\t({score:2.4f}) {data.get_anime_by_id(i).title}")

Top 10 Anime Predictions
	(9.6769) Nana
	(9.6563) Love Live! School Idol Project 2nd Season
	(9.5861) Hunter x Hunter (2011)
	(9.5677) Kokoro Connect: Michi Random
	(9.5244) Clannad: After Story
	(9.4984) Kuroko no Basket 2nd Season
	(9.4973) Kuroko no Basket 3rd Season
	(9.4855) Shinsekai yori
	(9.4262) Snow Halation
	(9.4215) Hunter x Hunter: Greed Island Final
	(9.4130) Love Live! School Idol Project
	(9.3943) Kenpuu Denki Berserk
	(9.3720) High School DxD New
	(9.3068) Hunter x Hunter
	(9.2841) Hunter x Hunter: Greed Island
	(9.2792) Kiseijuu: Sei no Kakuritsu
	(9.2381) Crayon Shin-chan
	(9.2372) Hunter x Hunter: Yorkshin City Kanketsu-hen
	(9.2367) Kuroko no Basket
	(9.2338) Chihayafuru 2

Bottom 10 Anime Predictions
	(5.2245) Super Child
	(5.2195) Tsui no Sora
	(5.2085) Ningen Doubutsuen
	(5.0830) Utsu Musume Sayuri
	(5.0803) Kokuhaku
	(5.0419) Aki no Puzzle
	(4.9264) Uju Heukgisa
	(4.8966) Forest Fairy Five
	(4.6567) Ai (ONA)
	(4.6346) Abunai Sisters: Koko & Mika


### 5. Out of sample predictions using OLS

In [3]:
# Same call as in section 3 (repeated here, presumably so this section can be
# run on its own -- TODO confirm). Returns the scraped anime list of 'Manuel'.
myprofile = utils.get_user_anime_list('Manuel')

Scraped 300 additional anime from https://myanimelist.net/animelist/Manuel/load.json?offset=0&status=7
Scraped 148 additional anime from https://myanimelist.net/animelist/Manuel/load.json?offset=300&status=7


In [4]:
# Show the first scraped entry to inspect the record layout.
print(myprofile[0])

{'name': '3-gatsu no Lion', 'id_ref': 31646, 'status': 1, 'score': 0}


In [7]:
def jsondict_to_vec(profile, cindex=None):
    """Convert a scraped anime list into a score vector aligned with the model's items.

    Parameters
    ----------
    profile : iterable of dict
        Entries with at least the keys 'id_ref', 'status' and 'score'.
    cindex : sequence of anime ids, optional
        Ordering of the output vector. Defaults to ``data.cindex``.

    Returns
    -------
    numpy.ndarray
        Float array with one element per id in ``cindex``. Positions hold the
        entry's score when its status is 2 or 4, and NaN otherwise (including
        ids that never occur in ``profile``).
    """
    if cindex is None:
        cindex = data.cindex
    # Explicit float dtype: an empty Series without a dtype becomes object
    # dtype on recent pandas versions, which breaks the numeric code downstream.
    vec = pd.Series(np.nan, index=cindex, dtype=float)
    for anime in profile:
        # Only entries with status 2 or 4 contribute a score.
        if anime['status'] not in (2, 4):
            continue
        # Ids unknown to the model cannot be placed in the vector; skip them
        # instead of growing the Series and trimming it again afterwards.
        if anime['id_ref'] not in vec.index:
            continue
        # NOTE(review): the sample entry printed in this notebook has score 0;
        # if 0 means "not scored", such entries should probably be skipped -- confirm.
        vec.loc[anime['id_ref']] = anime['score']
    return np.array(vec, dtype=float)
    
# Score vector of the scraped profile, ordered like data.cindex; anime without
# a usable score are NaN.
u = jsondict_to_vec(myprofile)

In [78]:
# Estimate the feature vector of a user that is not in the training data by
# regressing the user's de-biased scores on the item factors model.V.

# Remove the global mean and the per-item bias from the observed scores.
# Force a float array so the NaN mask below works even if `u` is object dtype.
x = np.asarray(u, dtype=float) - model.mu - model.bv

# Only the anime the user actually scored take part in the fit.
rated = ~np.isnan(x)
n = int(rated.sum())
if n == 0:
    raise ValueError("The profile contains no scored anime known to the model.")

# The user's average residual is used as the user's bias estimate.
xbar = x[rated].mean()
x = x - xbar

# Least-squares solution of V_rated @ uhat ~= x_rated. lstsq avoids forming
# and inverting V'V explicitly and also copes with a rank-deficient system
# (e.g. fewer scored anime than features).
V_rated = np.asarray(model.V)[rated]
uhat = np.linalg.lstsq(V_rated, x[rated], rcond=None)[0]
print(uhat)

[ 0.03801727  0.05985485 -0.02586976 -0.31182302  0.26042233 -0.30728367
  0.23353339 -0.23892761 -0.08831289  0.37450211  0.27934204 -0.01900866
 -0.01106566  0.02314803 -0.42648209  0.11860496 -0.30235507 -0.14443665
 -0.10884845 -0.08504698  0.04568596  0.43096937  0.35011442 -0.29771794
  0.13459615 -0.23688228 -0.04275935  0.10430467  0.09664343  0.20579265
  0.16599957  0.10486619  0.05711688  0.08387709  0.18344094  0.01765343
  0.05207742  0.10602123  0.26168322  0.29706534]


In [79]:
# Out-of-sample predictions from the OLS estimates (uhat, xbar).
TOP_N = 20     # number of best predictions to list
BOTTOM_N = 10  # number of worst predictions to list

# Predicted score = estimated user factors . item factors
#                   + estimated user bias (xbar) + item bias + global mean.
prediction = pd.Series(np.dot(uhat,model.V.T).flatten(), index=data.cindex)
prediction = prediction + xbar + model.bv + model.mu

# Remove everything that already appears on the scraped list, so only new
# anime are recommended.  Note: prediction.index holds anime ids.
watched = [a['id_ref'] for a in myprofile if a['id_ref'] in prediction.index]
prediction = prediction.drop(watched)

# Sort once and reuse the result for both listings.
ranked = prediction.sort_values(ascending=False)

# The header now reports the number of rows that are actually printed
# (previously it said "Top 10" while 20 rows were shown).
print(f'Top {TOP_N} Anime Predictions')
for i, score in ranked.head(TOP_N).items():
    print(f"\t({score:2.4f}) {data.get_anime_by_id(i).title}")

print(f'\nBottom {BOTTOM_N} Anime Predictions')
for i, score in ranked.tail(BOTTOM_N).items():
    print(f"\t({score:2.4f}) {data.get_anime_by_id(i).title}")

Top 10 Anime Predictions
	(11.1159) Nana
	(10.8133) Kenpuu Denki Berserk
	(9.5002) Paradise Kiss
	(9.2069) Ginga Eiyuu Densetsu
	(9.1089) Devilman: Crybaby
	(8.9505) Nodame Cantabile Finale
	(8.8477) Koe no Katachi
	(8.7421) Yoru wa Mijikashi Arukeyo Otome
	(8.7103) Nodame Cantabile: Paris-hen
	(8.6801) Chihayafuru
	(8.6507) Kono Sekai no Katasumi ni
	(8.6478) Nekojiru-sou
	(8.6336) Steins;Gate: Oukoubakko no Poriomania
	(8.6069) Ginga Eiyuu Densetsu Gaiden
	(8.6067) Chihayafuru 2
	(8.5934) Aria The Origination
	(8.5927) Berserk: Ougon Jidai-hen III - Kourin
	(8.5388) Tokyo Ghoul
	(8.5292) Hunter x Hunter (2011)
	(8.5225) Tokyo Ghoul √A

Bottom 10 Anime Predictions
	(0.8278) Masou Gakuen HxH
	(0.8004) DRAMAtical Murder
	(0.7420) Suki na Mono wa Suki Dakara Shou ga Nai!!
	(0.6876) Angel&#039;s Feather
	(0.5967) Makura no Danshi
	(0.4612) 07-Ghost
	(0.0493) Zombie-Loan
	(-0.0565) Junjou Romantica 2
	(-0.6943) Gakuen Heaven
	(-0.7745) Junjou Romantica


## PLAYGROUND

In [None]:
import time

def loss(X, model):
    """Mean squared prediction error over the non-zero entries of X.

    X is a sparse matrix whose rows index model.U / model.bu and whose columns
    index model.V / model.bv. For every non-zero entry the prediction is
    U[row] . V[col] + bu[row] + bv[col] + mu; the squared residuals are summed
    and divided by the number of non-zero entries.
    """
    ratings = X.tocsc()  # column access is used below
    n_cols = model.V.shape[0]
    count = 0.
    sq_err = 0.
    for col in range(n_cols):
        # Rows that hold a non-zero value in this column.
        rows = ratings.getcol(col).nonzero()[0]
        observed = ratings[rows,col].todense().T
        predicted = (
            model.U[rows].dot(model.V[col].T)
            + model.bu[rows]
            + model.bv[col]
            + model.mu
        )
        residual = observed - predicted
        # residual is a 1 x n matrix, so this product is the 1 x 1 sum of squares.
        sq_err = sq_err + residual.dot(residual.T)
        count = count + len(rows)
    return sq_err[0,0] / count

# Time one evaluation of loss() on the full data set.
# Set expected_loss to a known-good value to print it next to the result.
# (The original message referenced an undefined name and raised NameError.)
expected_loss = None

tstart = time.perf_counter()  # monotonic clock intended for measuring durations
result = loss(data.X,model)
elapsed = time.perf_counter() - tstart

message = f"Took {elapsed:.2f} seconds. Result: {result}."
if expected_loss is not None:
    message += f" Should be {expected_loss}."
print(message)