In [1]:
import pandas as pd

df = pd.DataFrame(list(Rating.objects.values_list('user_id', 'work_id', 'choice')),
                       columns=['user_id', 'work_id', 'choice'])

In [4]:
choice_values = ['dislike', 'wontsee', 'neutral', 'willsee', 'like', 'favorite']
rating_values = dict(zip(choice_values, range(len(choice_values))))

In [5]:
df['rating'] = df['choice'].map(rating_values)

In [6]:
from scipy.sparse import coo_matrix

sparse = coo_matrix((df['rating'], (df['user_id'], df['work_id'])))

In [7]:
sparse_csr = sparse.tocsr()
sparse_csc = sparse.tocsc()

In [8]:
nb_users, nb_works = sparse_csr.shape

In [16]:
from collections import defaultdict
from mord import LogisticAT

U = defaultdict(lambda: LogisticAT())
V = defaultdict(lambda: LogisticAT())

In [44]:
users, works = sparse_csr.nonzero()

In [54]:
import numpy as np

users = np.unique(df['user_id'])
works = np.unique(df['work_id'])

In [56]:
from mord.threshold_based import threshold_fit

In [57]:
def sanitize(data):
    values = np.unique(data)
    mapping = dict(zip(values, range(len(values))))
    return np.vectorize(mapping.get)(data)

In [58]:
for user_id in users:
    U[user_id].coef_ = np.random.random(20)
    U[user_id].data = sanitize(sparse_csr[user_id, :].data)

In [59]:
for work_id in works:
    V[work_id].coef_ = np.random.random(20)
    V[work_id].data = sanitize(sparse_csc[:, work_id].data)

In [60]:
def fit_user(user_id):
    work_ids = sparse_csr[user_id, :].indices
    X = np.array([V[work_id].coef_ for work_id in work_ids])
    #print(X.shape, len(work_ids))
    y = U[user_id].data
    U[user_id].fit(X, y)
    #U[user_id].coef_, U[user_id].theta_ = threshold_fit(X, y, U[user_id].alpha, len(choice_values))

def fit_work(work_id):
    user_ids = sparse_csc[:, work_id].indices
    for user_id in user_ids:
        try:
            U[user_id].coef_
        except:
            print('error', work_id, 'with', user_id)        
    X = np.array([U[user_id].coef_ for user_id in user_ids])
    #print(all(len(U[user_id].coef_) == len(U[user_ids[0]].coef_) for user_id in user_ids))
    #print(X.shape, len(user_ids), user_ids[0], U[user_ids[0]].coef_)
    y = V[work_id].data
    V[work_id].fit(X, y)
    #V[work_id].coef_, V[work_id].theta_ = threshold_fit(X, y, V[work_id].alpha, len(choice_values))

In [67]:
from datetime import datetime

for nb_iter in range(2):
    bla = datetime.now()
    for user_id in users:
        fit_user(user_id)
    print('user ok', datetime.now() - bla)
    bla = datetime.now()
    for work_id in works:
        fit_work(work_id)
    print('work ok', datetime.now() - bla)

user ok 0:00:21.701368
work ok 0:00:50.570585
user ok 0:00:19.572183
work ok 0:00:58.875789


In [87]:
USER_ID = 1
jj_work_ids = sparse_csr[USER_ID, :].indices
X = np.array([V[work_id].coef_ for work_id in jj_work_ids])
sum(U[USER_ID].predict(X) == U[USER_ID].data) / len(U[USER_ID].data)

0.73148148148148151

In [65]:
for nb_iter in range(2):
    for user_id in [1]:
        fit_user(user_id)
    print('ok')
    for work_id in jj_work_ids:
        fit_work(work_id)

ok
ok


In [88]:
jj = df.query('user_id == 1')

In [100]:
jj['pred'] = X.dot(U[1].coef_)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [126]:
titles = dict(list(Work.objects.values_list('id', 'title')))

In [127]:
jj['title'] = jj['work_id'].map(titles)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [128]:
jj.sort_values('pred', ascending=False)

Unnamed: 0,user_id,work_id,choice,rating,pred,title
288648,1,4983,willsee,3,13.352409,Darwin's Game
37382,1,5417,willsee,3,11.019723,FullMetal Alchemist
10931,1,6,dislike,0,10.900932,Bleach
12399,1,330,like,4,10.451844,Tokyo Godfathers
256698,1,10378,willsee,3,10.370725,Sound of the Sky
37381,1,4356,willsee,3,9.010964,Assassination classroom
24834,1,6131,willsee,3,8.894626,Liar Game
37384,1,17,wontsee,1,8.801478,Naruto: Shippuuden
37533,1,108,wontsee,1,8.322807,Zero no Tsukaima: Futatsuki no Kishi
65278,1,1980,like,4,8.298896,Eden of The East the Movie II: Paradise Lost


In [86]:
X.shape

(273, 20)

In [88]:
X = np.array(X)
y = np.array(y)

In [89]:
from mord import LogisticAT

In [90]:
ordreg = LogisticAT(verbose=1, max_iter=10000)
ordreg.fit(X, y)

LogisticAT(alpha=1.0, max_iter=10000, verbose=1)

In [91]:
y_pred = ordreg.predict(X)

In [92]:
sum(y_pred == y)

398

In [93]:
ordreg.coef_

array([-0.66271501, -0.01857029, -0.74340387,  0.97398988, -0.02981268,
        1.17420778, -1.41939884, -0.51550002,  0.36530048,  0.2226114 ,
       -0.70468383,  0.8803067 , -0.19982504, -0.68572086, -0.52661859,
       -0.88023252,  0.75771539,  0.49302697,  1.21863566, -1.88408098])

In [94]:
ordreg.theta_

array([-3.917645  , -1.79567674, -1.64913484,  1.59998804,  4.18093472])

In [96]:
ordreg.predict(X[:5])

array([3, 3, 3, 3, 3])

In [97]:
y[:5]

array([4, 3, 3, 1, 3])

In [100]:
X[:5].dot(ordreg.coef_)

array([ 0.07766994,  0.49457707,  0.91142204, -0.56183871, -1.0200258 ])