In [1]:
from collections import defaultdict, Counter

god_rmse = defaultdict(list)

In [3]:
from mangaki.algo.als import MangakiALS
from mangaki.algo.svd import MangakiSVD

In [12]:
from mangaki.utils.algo import get_dataset_backup
from mangaki.utils.data import Dataset
from mangaki.utils.values import rating_values

dataset = Dataset()
dataset.load_csv('balse/ratings.csv', convert=lambda x: rating_values[x], title_filename='balse/works.csv')

In [13]:
from mangaki.utils.balse import MangakiBALSE
from mangaki.settings import DATA_DIR
from scipy.sparse import load_npz

In [14]:
from sklearn.model_selection import ShuffleSplit

NB_SPLIT = 5
k_fold = ShuffleSplit(n_splits=NB_SPLIT)
SETS = list(k_fold.split(dataset.anonymized.X))

In [178]:
GOD_I = 4

In [179]:
# START
i_train, i_test = SETS[GOD_I]

In [221]:
dataset.anonymized.X.shape

(334390, 2)

In [180]:
NOTVAL = round(0.7 * len(i_train))

X_train = dataset.anonymized.X[i_train]
y_train = dataset.anonymized.y[i_train]
X_subtrain = X_train[:NOTVAL]
y_subtrain = y_train[:NOTVAL]
X_val = X_train[NOTVAL:]
y_val = y_train[NOTVAL:]
X_test = dataset.anonymized.X[i_test]
y_test = dataset.anonymized.y[i_test]
nb_subtrain_rated = Counter(X_subtrain[:, 1])
nb_train_rated = Counter(X_train[:, 1])

In [181]:
def run_als(X_train, X_test, y_train, y_test):
    als = MangakiALS(10, LAMBDA=0.1)
    als.set_parameters(dataset.anonymized.nb_users, dataset.anonymized.nb_works)
    #als.fit(X_train, y_train)
    als.all_errors(X_train, X_test, y_train, y_test)
    return als

In [182]:
als = run_als(X_train, X_test, y_train, y_test)
sub_als = run_als(X_subtrain, X_test, y_subtrain, y_test)

Computing M: (2079 × 9979)
Chrono: fill and center matrix [0q, 1463ms]
Shapes (2079, 10) (10, 9979)
Chrono: factor matrix [0q, 12342ms]
Train minmax -2.98166002478 4.52700465152
Train error 0.980646617638
Test minmax -2.80634401035 5.18515675231
Test error 1.15149072211
Computing M: (2079 × 9979)
Chrono: fill and center matrix [0q, 988ms]
Shapes (2079, 10) (10, 9979)
Chrono: factor matrix [0q, 10569ms]
Train minmax -2.93223724413 4.56488695928
Train error 0.946329414823
Test minmax -4.42400264331 4.49266330151
Test error 1.18043112233


In [183]:
from mangaki.utils.zero import MangakiZero

zero = MangakiZero()
zero.all_errors(X_train, X_test, y_train, y_test)

Train minmax 0 0
Train error 1.57335141405
Test minmax 0 0
Test error 1.5646916998


In [184]:
from mangaki.utils.lasso import MangakiLASSO

def run_lasso(X_train, X_test, y_train, y_test):
    lasso = MangakiLASSO(with_bias=True, alpha=0.01)
    #lasso.load_tags()
    lasso.set_parameters(dataset.anonymized.nb_users, dataset.anonymized.nb_works)
    # lasso.fit(X_train, y_train)
    lasso.all_errors(X_train, X_test, y_train, y_test)
    return lasso

In [185]:
lasso = run_lasso(X_train, X_test, y_train, y_test)
sub_lasso = run_lasso(X_subtrain, X_test, y_subtrain, y_test)

Fitting user ID 0/2073…
Fitting user ID 500/2073…
Fitting user ID 1000/2073…
Fitting user ID 1500/2073…
Fitting user ID 2000/2073…
Sparsity: 1.00000 ± 0.00000


Train minmax -2.0 2.0
Train error 0.763738670439
Test minmax -2.0 2.0
Test error 1.44732800804


Fitting user ID 0/2045…
Fitting user ID 500/2045…
Fitting user ID 1000/2045…
Fitting user ID 1500/2045…
Fitting user ID 2000/2045…
Sparsity: 1.00000 ± 0.00000


Train minmax -2.0 2.0
Train error 0.694643658002
Test minmax -2.0 2.0
Test error 1.48473649764


In [186]:
i_test_ordered = sorted(i_test, key=lambda i: nb_train_rated[dataset.anonymized.X[i][1]], reverse=True)

In [187]:
X_tmp_test = dataset.anonymized.X[i_test_ordered]
y_tmp_test = dataset.anonymized.y[i_test_ordered]

In [188]:
# Attention
# X_val = X_train

In [189]:
nb_r = list(map(lambda x: nb_subtrain_rated[x[1]], X_val))

In [190]:
y_val_als = sub_als.predict(X_val)
y_val_lasso = sub_lasso.predict(X_val)

In [191]:
y_als = sub_als.predict(X_tmp_test)
y_lasso = sub_lasso.predict(X_tmp_test)

In [192]:
import numpy as np

X = np.column_stack((nb_r, y_val_als, y_val_lasso))
y = y_val

In [193]:
X.shape

(90285, 3)

In [194]:
y_val_als.shape

(90285,)

In [195]:
y.shape

(90285,)

In [196]:
import tensorflow as tf

In [197]:
beta = tf.Variable(tf.random_normal([1]), name='beta')
gamma = tf.Variable(tf.random_normal([1]), name='gamma')

In [198]:
e1 = np.array([1, 0, 0])
e2 = np.array([0, 1, 0])
e3 = np.array([0, 0, 1])
pred = tf.sigmoid(beta*(X.dot(e1) - gamma)) * X.dot(e2) + (1 - tf.sigmoid(beta*(X.dot(e1) - gamma))) * X.dot(e3)
loss = tf.reduce_mean(tf.square(y - pred)) ** 0.5
reg_loss = loss

global_step = tf.Variable(0, trainable=False)
starter_learning_rate = 0.9
learning_rate = tf.train.exponential_decay(starter_learning_rate, global_step,
                                           20, 0.9965402628278678, staircase=True)
optimizer = tf.train.GradientDescentOptimizer(learning_rate)
train_step = optimizer.minimize(reg_loss, var_list=[beta, gamma], global_step=global_step)

init_op = tf.global_variables_initializer()
sess = tf.InteractiveSession()
sess.run(init_op)

In [201]:
for i in range(5000):
    sess.run(train_step)
    if i % 500 == 0:
        print('loss', sess.run(reg_loss), 'beta', beta.eval(), 'gamma', gamma.eval(), learning_rate.eval())

loss 1.16952 beta [ 0.03688588] gamma [-0.69050276] 0.159098
loss 1.16952 beta [ 0.03687955] gamma [-0.69555074] 0.145893
loss 1.16952 beta [ 0.03687378] gamma [-0.70017523] 0.133785
loss 1.16952 beta [ 0.03686853] gamma [-0.70441359] 0.122681
loss 1.16952 beta [ 0.0368637] gamma [-0.70829797] 0.112499
loss 1.16952 beta [ 0.03685927] gamma [-0.71185929] 0.103162
loss 1.16952 beta [ 0.03685518] gamma [-0.71512151] 0.0946
loss 1.16952 beta [ 0.03685147] gamma [-0.71811312] 0.0867485
loss 1.16952 beta [ 0.03684811] gamma [-0.72085565] 0.0795487
loss 1.16952 beta [ 0.03684492] gamma [-0.72336936] 0.0729465


In [202]:
nb_rt = list(map(lambda x: nb_train_rated[x[1]], X_tmp_test))

In [203]:
Xt = np.column_stack((nb_rt, y_als, y_lasso))

In [205]:
loss.eval(), beta.eval(), gamma.eval()

(1.1695154,
 array([ 0.03684209], dtype=float32),
 array([-0.72566766], dtype=float32))

In [206]:
y_full_als = als.predict(X_tmp_test)
y_full_lasso = lasso.predict(X_tmp_test)

In [207]:
new_pred = tf.sigmoid(beta*(Xt.dot(e1) - gamma)) * y_full_als + (1 - tf.sigmoid(beta*(Xt.dot(e1) - gamma))) * y_full_lasso

In [208]:
als.compute_rmse(y_als, y_tmp_test)

1.1804311223303001

In [209]:
WINDOW = 1000000

rmse_als_full = als.compute_rmse(y_full_als, y_tmp_test)
print('Test error', rmse_als_full)
god_rmse['als'].append(rmse_als_full)

Test error 1.15149072211


In [210]:
y_new_pred = new_pred.eval()
rmse_balse_full = als.compute_rmse(y_new_pred, y_tmp_test)
print('Test error', rmse_balse_full)
god_rmse['balse'].append(rmse_balse_full)

Test error 1.14354738023


In [211]:
rmse_lasso_full = als.compute_rmse(y_full_lasso, y_tmp_test)
print('Test error', rmse_lasso_full)
god_rmse['lasso'].append(rmse_lasso_full)

Test error 1.44732800804


In [220]:
nb_rt[-1000]

3

In [217]:
1000 / len(y_tmp_test)

0.02990520051436945

In [219]:
(len(nb_rt) - nb_rt.index(0)) / len(y_tmp_test)

0.007386584527049254

In [212]:
for WINDOW, tag in [(1000, 'cold'), (len(nb_rt) - nb_rt.index(0), 'freeze')]:
    god_rmse['als-%s' % tag].append(als.compute_rmse(y_full_als[-WINDOW:], y_tmp_test[-WINDOW:]))
    y_new_pred = new_pred.eval()
    god_rmse['balse-%s' % tag].append(als.compute_rmse(y_new_pred[-WINDOW:], y_tmp_test[-WINDOW:]))
    god_rmse['lasso-%s' % tag].append(als.compute_rmse(y_full_lasso[-WINDOW:], y_tmp_test[-WINDOW:]))

In [213]:
import math

def avgstd(l):  # Displays mean and variance
    n = len(l)
    mean = float(sum(l)) / n
    var = float(sum(i * i for i in l)) / n - mean * mean
    return '%.5f ± %.5f' % (round(mean, 5), round(1.96 * math.sqrt(var / n), 3))  # '%.3f ± %.3f' % 

In [215]:
for key in god_rmse:
    # print(key, god_rmse[key])
    print(key, avgstd(god_rmse[key]))

als 1.15681 ± 0.00400
balse 1.14954 ± 0.00400
lasso 1.44444 ± 0.00200
als-cold 1.29269 ± 0.02900
balse-cold 1.22714 ± 0.03600
lasso-cold 1.31331 ± 0.03600
als-freeze 1.50047 ± 0.03500
balse-freeze 1.34533 ± 0.04500
lasso-freeze 1.37909 ± 0.05600


In [177]:
print(WINDOW)
for i in range(1, 4):
    encoded_user_id, encoded_work_id = X_tmp_test[-i]
    if True:#encoded_user_id == 2015:
        print(X_tmp_test[-i])
        print(dataset.titles[encoded_work_id],
              'als', als.predict(np.array([(encoded_user_id, encoded_work_id)])),
              'lasso', lasso.predict(np.array([(encoded_user_id, encoded_work_id)])),
              'sub_als', y_als[-i],
              'sub_lasso', y_lasso[-i],
              'balse', y_new_pred[-i],
              'truth', y_tmp_test[-i],
              'user bias', als.means[encoded_user_id],
              'nb ratings', nb_train_rated[encoded_work_id])

240
[1306 1483]
Enfants d'Agartha (les) als [ 2.45263219] lasso [ 0.97617097] sub_als 1.343685313 sub_lasso 1.0957989608 balse 1.70751 truth 0.1 user bias 0.833885152613 nb ratings 0
[1698 3047]
Argento Soma Special als [ 1.09988402] lasso [ 0.66086675] sub_als 2.03313304362 sub_lasso 0.586069092746 balse 0.878325 truth 2.0 user bias 1.29502762431 nb ratings 0
[2034 1180]
Hacka Doll als [ 1.64694415] lasso [ 0.37655969] sub_als 0.672194576379 sub_lasso 0.315973101292 balse 1.00582 truth -2.0 user bias 1.20058548009 nb ratings 0


In [43]:
with open(os.path.join(DATA_DIR, 'balse/balse.csv')) as f:
    tags = [line.split(',')[1] for line in f]

WORK_ID = 665
    
for tag, weight in zip(tags, T[WORK_ID]):
    if weight != 0:
        print(tag.strip(), weight)

NameError: name 'os' is not defined

In [138]:
for tag, weight in zip(tags, lasso.reg[2015].coef_):
    if weight != 0:
        print(tag, weight)

In [139]:
lasso.reg[2015].predict(np.array([T[WORK_ID]]))

array([ 0.73747573])

In [140]:
lasso.reg[2015].coef_.dot(T[WORK_ID])

0.0

In [141]:
lasso.reg[2015].intercept_

0.73747572817848728

In [142]:
for tag, weight in zip(tags, lasso.reg[2015].coef_ * T[WORK_ID]):
    if weight:
        print(tag.strip(), weight)

In [143]:
als.VT.T[3748]

array([-0.34922168, -0.02603758, -0.31870448,  0.37174495, -0.04622742,
        1.42612367, -0.84433803,  0.20316341,  0.80450289,  1.12107641])

In [144]:
from collections import Counter
Counter(User.objects.get(id=1).rating_set.values_list('choice', flat=True))

Counter({'dislike': 16,
         'favorite': 23,
         'like': 118,
         'neutral': 9,
         'willsee': 330,
         'wontsee': 68})

In [145]:
for work_id, title in enumerate(dataset.titles):
    if title == 'Millennium Actress':
        print(work_id)

77
9163


In [146]:
from collections import Counter, defaultdict

nb = defaultdict(Counter)
for (user_id, work_id), value in zip(dataset.anonymized.X, dataset.anonymized.y):
    nb[user_id][value] += 1

In [147]:
for (user_id, work_id), value in zip(dataset.anonymized.X, dataset.anonymized.y):
    if work_id == 77 and value == 4:
        print(user_id, nb[user_id])

981 Counter({-0.5: 126, 2.0: 92, -2.0: 51, 4.0: 46, 0.5: 42, 0.1: 28})
121 Counter({-0.5: 75, 0.5: 64, 4.0: 62, 2.0: 51, 0.1: 22, -2.0: 5})
1738 Counter({4.0: 21, -2.0: 8, 0.1: 7, 2.0: 7, 0.5: 4, -0.5: 1})
1120 Counter({0.5: 742, 2.0: 316, -0.5: 208, -2.0: 137, 0.1: 126, 4.0: 39})
326 Counter({2.0: 123, 4.0: 70, 0.1: 55, -2.0: 46, 0.5: 25, -0.5: 11})
1689 Counter({0.1: 71, -2.0: 43, 2.0: 42, 4.0: 7})
1979 Counter({2.0: 61, 0.5: 53, -0.5: 34, 4.0: 31, 0.1: 14})
278 Counter({2.0: 103, -0.5: 85, 0.5: 31, 4.0: 26, 0.1: 24, -2.0: 10})
407 Counter({0.1: 13, -2.0: 11, -0.5: 9, 2.0: 9, 0.5: 3, 4.0: 1})
512 Counter({4.0: 7, 2.0: 6, -2.0: 2, -0.5: 2, 0.5: 1})
453 Counter({-0.5: 518, 0.5: 437, 2.0: 278, 4.0: 77, 0.1: 17, -2.0: 11})
2015 Counter({0.5: 333, 2.0: 121, -0.5: 72, 4.0: 25, -2.0: 16, 0.1: 9})
1839 Counter({2.0: 197, -0.5: 73, 4.0: 62, 0.5: 57, 0.1: 46, -2.0: 16})
242 Counter({2.0: 199, 4.0: 37, 0.1: 34, -2.0: 26, -0.5: 4, 0.5: 4})
1678 Counter({2.0: 6, 0.5: 5, 4.0: 5, -2.0: 2})


In [26]:
# JJ est 2015 !

In [30]:
import os.path
import numpy as np

T = load_npz(os.path.join(DATA_DIR, 'balse/tag-matrix.npz')).toarray()

In [39]:
# Try various lassos on myself
USER_ID = 2015
X = []
y = []
for (user_id, work_id), value in zip(dataset.anonymized.X, dataset.anonymized.y):
    if user_id == USER_ID:
        X.append(T[work_id])
        y.append(value)
X = np.array(X)
y = np.array(y)

In [40]:
from sklearn.model_selection import train_test_split

i_train, i_test = train_test_split(range(len(X)))

In [41]:
X_train = X[i_train]
X_test = X[i_test]
y_train = y[i_train]
y_test = y[i_test]

In [42]:
X_train.shape

(432, 503)

In [35]:
import pandas as pd

pd.DataFrame(X).describe()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,493,494,495,496,497,498,499,500,501,502
count,576.0,576.0,576.0,576.0,576.0,576.0,576.0,576.0,576.0,576.0,...,576.0,576.0,576.0,576.0,576.0,576.0,576.0,576.0,576.0,576.0
mean,0.000508,0.0,0.0,0.000861,0.004601,0.009568,0.000916,0.0,0.0,0.001197,...,0.00101,0.037237,0.0,0.078183,0.0,0.001716,0.163196,0.000252,0.009215,0.0
std,0.008608,0.0,0.0,0.01203,0.02827,0.051571,0.016432,0.0,0.0,0.013173,...,0.014544,0.094976,0.0,0.122134,0.0,0.014617,0.262505,0.006056,0.046455,0.0
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.144202,0.0,0.0,0.226478,0.0,0.0,0.0
max,0.148561,0.0,0.0,0.187316,0.261796,0.67273,0.354625,0.0,0.0,0.201148,...,0.272276,0.613001,0.0,0.589839,0.0,0.160001,0.97752,0.145337,0.402184,0.0


In [36]:
from sklearn.preprocessing import scale

In [38]:
T = load_npz(os.path.join(DATA_DIR, 'balse/tag-matrix.npz')).tocsc()

T = scale(T, with_mean=False).toarray()

In [43]:
pd.DataFrame(T_scaled).describe()

NameError: name 'T_scaled' is not defined

In [44]:
np.std(T_scaled[:, 0].data)

NameError: name 'T_scaled' is not defined

In [45]:
np.std(T_scaled[:, 1].data)

NameError: name 'T_scaled' is not defined

In [46]:
T_scaled.toarray()

NameError: name 'T_scaled' is not defined

In [219]:
from sklearn.linear_model import LinearRegression, Lasso, LassoLarsCV, LassoCV

clf = LassoCV(cv=10, fit_intercept=False)
# clf = Lasso(alpha=0.1, fit_intercept=False)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
print(y_pred[:10])
print(y_test[:10])
print('alpha', clf.alpha_)
print(als.compute_rmse(y_pred, y_test))



[ 1.5708497   0.50914856  0.86323306  1.08109931  0.50466447  0.45824748
  1.06714965  0.78175205  0.69206829  0.60301172]
[ 0.5  0.5  0.5  0.5  2.   0.5  2.   0.1  0.5  0.5]
alpha 0.00675066364528
1.14569988748


In [49]:
from sklearn.linear_model import LinearRegression, Lasso, LassoLarsCV, LassoCV

# clf = LassoCV(cv=10, fit_intercept=False)
clf = Lasso(alpha=0.1, fit_intercept=True)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
print(y_pred[:10])
print(y_test[:10])
# print('alpha', clf.alpha_)
print(clf.intercept_)
print(lasso.compute_rmse(y_pred, y_test))

[ 0.81072104  0.76327843  0.48053424  0.60593274  0.68143621  0.84986515
  0.85026601  0.84986515  0.65577524  0.8695655 ]
[-0.5  0.5  0.5  0.5  0.5  2.   0.5  2.   0.5  2. ]
0.849865148083
1.0604782715


In [243]:
sum(x < 0 for x in y_pred)

0

In [244]:
clf.intercept_

0.0

In [52]:
for i, (tag, weight) in enumerate(zip(tags, clf.coef_)):
    if weight != 0:
        print(i, tag.strip(), weight)

20 petals 0.062048928418
22 butterfly -0.00575388224009
27 necktie -0.0441011232299
29 kimono 0.0108843921645
33 table 0.0609460759477
48 instrument -0.00326064812932
60 from behind -0.00583659841878
69 traditional media 0.0147549217731
73 magical girl 0.0312992390526
76 younger -0.00337236166619
92 child -0.010405530536
102 3boys -0.0148483643512
130 blonde hair -0.0562550649455
146 dutch angle 0.00639426176897
180 white hair -0.0311996357131
228 mole -0.00851666561641
229 blue hair -0.0126566933755
244 zettai ryouiki -0.00325367435046
282 backpack -0.0220788327809
284 twin drills 0.0149344607109
369 facial mark -0.0585771795163
416 short twintails 0.00372983158992
451 sky 0.0121431534467
476 frog 0.039292267754


In [54]:
def search_tag(dataset, tag_id):
    results = Counter()
    for (user_id, work_id), choice in zip(dataset.anonymized.X, dataset.anonymized.y):
        weight = T[work_id][tag_id]
        if user_id == USER_ID and weight != 0:
            results[dataset.titles[work_id], weight] = choice
    return results.most_common()

In [55]:
search_tag(dataset, 476)

[(('Princesse Mononoké', 64.162086538829627), 4.0)]

In [199]:
T.shape

(9979, 503)

In [57]:
T

array([[ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       ..., 
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.]])