# Benchmarks for cold-start

The idea is to compare three strategies — DPP (determinantal point process) sampling, random sampling and the mosaic — for selecting the first items that a newcomer is asked to rate.

## Preliminaries: DPP sampling

Load vectors from the saved SVD

In [1]:
from mangaki.utils.data import Dataset
# Dataset whose `encode_works` is used below to turn work ids into row
# indices of the SVD factors.
dataset = Dataset()

In [2]:
from mangaki.utils.svd import MangakiSVD
algo = MangakiSVD()
# Restore the saved SVD from its backup file.
algo.load(algo.get_backup_filename())
# The matching dataset is loaded from the same backup filename prefixed
# with 'ratings-'.
dataset.load('ratings-' + algo.get_backup_filename())

In [3]:
import numpy as np
# Ratings of user 1 ("JJ") as a structured array with one record per rating:
# (work id, work title, choice).
# work_id is stored as 'i4': it is a database primary key, and the previous
# 'i2' (int16) cannot represent ids above 32767.
# The queryset is materialized into a list of tuples before being handed to
# NumPy, so that it is evaluated exactly once.
rated_by_jj = np.array(
    list(User.objects.get(id=1).rating_set.values_list('work', 'work__title', 'choice')),
    dtype=[('work_id', 'i4'), ('work_title', 'U128'), ('choice', 'S8')])

In [4]:
# Ids of the works rated by JJ (the 'work_id' field of every record).
rated_works = rated_by_jj['work_id']
# Their encoded counterparts, used in the next cell to index algo.VT.T.
encoded_work_ids = dataset.encode_works(rated_works)

In [5]:
# One row of algo.VT.T per work rated by JJ.
vectors = algo.VT.T[encoded_work_ids]
# Gram matrix of those rows (pairwise dot products). Its eigendecomposition,
# computed in the next cell, is what dpplib.sample_k receives.
L = vectors.dot(vectors.T)

In [15]:
# L = vectors · vectorsᵀ is a real symmetric matrix, so use the dedicated
# symmetric solver. np.linalg.eigh returns real eigenvalues (in ascending
# order) and orthonormal eigenvectors directly. The general solver
# np.linalg.eig may return complex values on such a rank-deficient matrix,
# and dropping their imaginary parts can leave non-orthogonal eigenvectors.
# D[i] is the eigenvalue associated with the column V[:, i].
D, V = np.linalg.eigh(L)

If we want to summarize JJ's profile, which ratings should we keep?

In [16]:
from mangaki.utils import dpplib
# Draw 5 indices with dpplib.sample_k from the eigendecomposition (D, V);
# they are cast to int so that they can index the arrays below.
sampled_indices = list(map(int, dpplib.sample_k(5, D, V)))
# Show the sampled ratings (id, title, choice).
print(rated_by_jj[sampled_indices])
# Ids of the sampled works.
dpp_works = rated_works[sampled_indices]

def get_dpp_works(k):
    """Return the ids of `k` of JJ's rated works drawn with dpplib.sample_k.

    The sampler is fed the module-level eigendecomposition (D, V); the
    indices it returns are cast to int and used to pick entries of
    `rated_works`.
    """
    picked = [int(index) for index in dpplib.sample_k(k, D, V)]
    return rated_works[picked]

import random
def get_random_works(k):
    """Return an array of `k` distinct entries of `rated_works`, drawn uniformly.

    Sampling is done without replacement by random.sample, which raises
    ValueError when `k` exceeds the number of rated works.
    """
    candidates = list(rated_works)
    drawn = random.sample(candidates, k)
    return np.array(drawn)

[(7408, 'Soul Eater', b'willsee')
 (  23, 'Suzumiya Haruhi no Yuuutsu', b'favorite')
 ( 260, 'Pokemon 3: The Movie', b'willsee')
 (  18, 'Fairy Tail', b'willsee') (  26, 'Cowboy Bebop', b'favorite')]


## Simulation

In [17]:
# Number of ratings available for user 1 (JJ).
print(Rating.objects.filter(user_id=1).count(), 'ratings for JJ')
# Dataset used by the simulation; its encode_user / encode_works /
# decode_work mappings are read by the functions below.
ds = Dataset()
# Anonymized version of all the ratings; the simulation reads its
# X, y, nb_users and nb_works attributes.
anonymized = ds.make_anonymous_data(Rating.objects.all())

536 ratings for JJ


In [18]:
def select_works_to_rate():
    """Pick the NB_WORKS works that the simulated newcomer gets to rate.

    The global METHOD chooses the strategy: 'dpp' samples with
    get_dpp_works, any other value falls back to get_random_works.
    """
    sampler = get_dpp_works if METHOD == 'dpp' else get_random_works
    return sampler(NB_WORKS)

In [19]:
def split_train_test(kept_works, user_id=1):
    """Split the row indices of `anonymized.X` into train and test sets.

    The test set contains the ratings of `user_id` on works that are NOT in
    `kept_works`. Every other row (all ratings from other users, plus the
    user's ratings on the kept works) goes to the train set.

    Args:
        kept_works: ids of the works whose ratings by the user stay in the
            train set (the works the simulated newcomer was asked to rate).
        user_id: id of the user whose profile is held out. Defaults to 1,
            the user the notebook was originally written for.

    Returns:
        A pair of lists (i_train, i_test) of row indices into `anonymized.X`.
    """
    my_encoded_user_id = ds.encode_user[user_id]
    # A set makes the per-row membership test constant-time.
    encoded_kept_works = set(ds.encode_works(kept_works))
    i_train = []
    i_test = []
    for i, (encoded_user_id, encoded_work_id) in enumerate(anonymized.X):
        is_held_out = (encoded_user_id == my_encoded_user_id
                       and encoded_work_id not in encoded_kept_works)
        if is_held_out:
            i_test.append(i)
        else:
            i_train.append(i)
    return i_train, i_test

In [40]:
from mangaki.utils.knn import MangakiKNN
from sklearn.metrics import mean_squared_error
def compute_error(i_train, i_test, nb_neighbors=40, user_id=1, nb_top=20):
    """Fit a MangakiKNN on the train rows and score it on the test rows.

    Prints, in this order: the first five predictions and true values, the
    RMSE over the whole test set, the predictions / true values / RMSE
    restricted to the `nb_top` highest predictions, and the titles of the
    corresponding works.

    Args:
        i_train: row indices of `anonymized.X` / `anonymized.y` used to fit.
        i_test: row indices used for evaluation.
        nb_neighbors: number of neighbors given to MangakiKNN (default 40).
        user_id: id of the user whose neighbors are looked up (default 1).
        nb_top: how many of the highest predictions are evaluated
            separately (default 20); expected to be positive.

    Returns:
        The RMSE computed on the `nb_top` highest predictions only. The RMSE
        over the whole test set is printed but not returned.
    """
    knn = MangakiKNN(nb_neighbors)
    knn.set_parameters(anonymized.nb_users, anonymized.nb_works)
    knn.fit(anonymized.X[i_train], anonymized.y[i_train])
    my_encoded_user_id = ds.encode_user[user_id]
    # NOTE(review): the result is not used below, but the call is kept because
    # it cannot be told from this notebook whether get_neighbors has side
    # effects (caching, printing) that predict() relies on. Handy for debugging:
    # print([User.objects.get(id=u) for u in ds.decode_users(neighbors)])
    neighbors = knn.get_neighbors([my_encoded_user_id])[0]

    # Index the test rows once instead of on every use.
    X_test = anonymized.X[i_test]
    y_test = anonymized.y[i_test]
    y_pred = knn.predict(X_test)
    print('predicted:', y_pred[:5])
    print('was:', y_test[:5])
    overall_rmse = mean_squared_error(y_test, y_pred) ** 0.5
    print('rmse:', overall_rmse)

    # Positions of the nb_top highest predictions, in ascending order of
    # prediction. (Ranking by y_test instead would evaluate on the works the
    # user actually rated highest.)
    ranking = y_pred.argsort()
    top_pred_indices = ranking[max(len(ranking) - nb_top, 0):]
    print('predicted:', y_pred[top_pred_indices])
    print('was:', y_test[top_pred_indices])
    top_rmse = mean_squared_error(y_test[top_pred_indices], y_pred[top_pred_indices]) ** 0.5
    print('rmse:', top_rmse)
    top_work_ids = [ds.decode_work[encoded_work_id]
                    for _, encoded_work_id in X_test[top_pred_indices]]
    for work_id in top_work_ids:
        print(Work.objects.get(id=work_id).title)
    return top_rmse

In [41]:
def run_simulation():
    """Run one cold-start simulation with the current NB_WORKS and METHOD.

    Selects the works to ask, splits the ratings accordingly, prints the
    sizes of both sets, evaluates the model and prints a summary line made
    of NB_WORKS, METHOD and the error returned by compute_error.
    """
    i_train, i_test = split_train_test(select_works_to_rate())
    print('train size', len(i_train))
    print('test size', len(i_test))
    error = compute_error(i_train, i_test)
    print(NB_WORKS, METHOD, error)

## Results (number of works asked, strategy, RMSE)

- 10 DPP 1.09488816919 1.1020390692
- 20 DPP 1.0957652159900086
- 50 DPP 1.06436560838
- 100 DPP 1.0724727205
- 500 DPP 1.27818830679 0.848657862475 (possibly overfitting)

- 10 RND 1.09689215268 1.08273857127
- 20 RND 1.11930399192
- 50 RND 1.06455730002
- 100 RND 1.06357987938
- 500 RND 0.825381153284 0.754057572138

In [42]:
# Number of works the simulated newcomer is asked to rate.
NB_WORKS = 10
# Selection strategy read by select_works_to_rate: 'dpp' uses get_dpp_works,
# any other value falls back to get_random_works.
METHOD = 'dpp'

In [43]:
# Run one simulation with the current values of NB_WORKS and METHOD.
run_simulation()

train size 317355
test size 526
[array([ 855,  279, 1050,  873, 1155, 1799,  103,  893, 1892,  745, 1761,
       1235,  101, 1239,  406, 1567,  634, 1441,  438, 1244, 1134, 1664,
        610,  169,  912, 1222,  129, 1687,  159,  407,  758,  593,  962,
        551, 1323,  785,  570, 1485, 1814, 1401])]
predicted: [ 0.  0.  0.  0.  0.]
was: [-0.5 -0.5 -0.5  0.5  0.5]
rmse: 1.23206181617
predicted: [ 1.14453807  1.14509658  1.15560294  1.16562679  1.20279494  1.22436833
  1.22739906  1.26143443  1.26331597  1.2685274   1.31005521  1.31468137
  1.39853659  1.42103941  1.43096924  1.6688064   1.68547088  1.70220729
  1.72033654  1.72612439]
was: [ 0.5  2.   2.  -0.5  4.   0.5  4.   2.   2.   2.   2.   0.5  0.1  4.   2.
  0.5  0.5  2.   4.   2. ]
rmse: 1.41820288426
Angel Beats!
Mahou Shoujo Madoka★Magica
Toki wo Kakeru Shoujo
Kill la Kill
Suzumiya Haruhi no Yuuutsu
Fullmetal Alchemist
Les Enfants Loups : Ame & Yuki
Code Geass: Hangyaku no Lelouch R2
Durarara!!
Nausicaä of the Valley of the 

Still needs to be done:
- run simulation on several users (how many works should be kept?)
- try a different number of neighbors
- should we evaluate the strategy on the whole profile or just the top recommendations?
- try other models than MangakiKNN (but others are slow)

## SVD