In [23]:
import numpy as np
from scipy.sparse import lil_matrix, csr_matrix 
from tqdm import tqdm
import time

In [2]:
def h(x, w0, w, v):
    summ = 0
    n = x.shape[0]
    
    for f in range(v.shape[1]):
        summ += (x.dot(v[:,f]))**2 - (x.multiply(x)).dot(v[:,f]**2)

    return w0 + x.dot(w) + 0.5 * summ[0]

In [52]:
def MSE(y_true, y_pred):
    return np.sum((y_true - y_pred) ** 2) / float(y_true.shape[0])

def RMSE(y_true, y_pred):
    return np.sqrt(MSE(y_true, y_pred))

def R2(y_true, y_pred):
    return 1. - MSE(y_true, y_pred) / MSE(y_true, np.mean(y_true) * np.ones(shape=y_true.shape))

In [4]:
# Сбор данных

col_ids = []
row_ids = []
y = []

with open('./netflix-prize-data/combined_data_1.txt', 'r') as f:    
    counter = 0
    for line in tqdm(f): 
        if ':' in line:
            film_id = int(line[:-2])
        else:
            values = line.split(',')
            user_id = int(values[0])
            rating = int(values[1])
            
            row_ids.append(counter)
            col_ids.append(user_id)
            
            row_ids.append(counter)
            col_ids.append(film_id)

            y.append(rating)
            counter += 1


    values = np.ones(len(row_ids))
    x = csr_matrix((values, (row_ids, col_ids)), dtype=np.float32)
    y = np.array(y, dtype=np.float32)

24058263it [01:14, 324503.91it/s]


In [5]:
def H(X, w0, w, v):
    x_x = X.multiply(X)
    x_v = X.dot(v)
    x_v_2 = np.multiply(x_v, x_v)
    x_w = X.dot(w.transpose())
    
    v_v = np.multiply(v, v)
    xx_vv = x_x.dot(v_v)
    
    result = w0 + x_w + 0.5 * np.ravel(np.sum(x_v_2 - xx_vv, axis = 1))
    return result

In [6]:
def dL_dw0(X, w0, w, v, delta):
    return -2. * np.sum(delta) / float(X.shape[0])

In [7]:
def dL_dwi(X, w0, w, v, delta):
    return -2. * np.ravel(np.sum(delta.multiply(X), axis = 0)) / float(X.shape[0])

In [8]:
def dL_dv(X, w0, w, v, delta):
    x_x = X.multiply(X)
    x_v = X.dot(v)
    dx = (delta.multiply(X)).transpose()
    xx_v = dx.dot(x_v)
    
    dxx = np.sum(delta.multiply(x_x), axis = 0)
    vdxx = (v.T * np.ravel(dxx)).T

    return -2. * (xx_v - vdxx)

In [53]:
def gradDesc(X, y, X_test, y_test):
    w0 = 0
    
    k = 3
    n_iter = 1000
    stepsize = 0.01
    stepsize_v = 0.002
    w = np.random.normal(0, 0.1, X.shape[1])
    v = np.random.normal(0, 0.001, size = (X.shape[1], k))

    for i in range(n_iter):
        last_time = time.time()
        idxs = np.random.choice(range(X.shape[0]), size=200000, replace=False)
        X_s = X[idxs]
        y_s = y[idxs]
        
        y_pred = H(X_s, w0, w, v)

        delta = csr_matrix(y_s - y_pred, dtype = np.float32).transpose()

        grad_w0 = stepsize * dL_dw0(X_s, w0, w, v, delta)
        grad_w = stepsize * dL_dwi(X_s, w0, w, v, delta)
        grad_v = stepsize_v * dL_dv(X_s, w0, w, v, delta)

        if i % 10 == 0:
            iter_time = time.time() - last_time
            print('%d: RMSE(train) = %.5f RMSE(test) = %.5f iter_time = %.5f' % (
                i,
                RMSE(y, H(X, w0, w, v)), 
                RMSE(y_test, H(X_test, w0, w, v)),
                iter_time
            ))
            
        w0 = w0 - grad_w0
        w = w - grad_w
        v = v - grad_v
        
    return w0, w, v


def CV(X, y, n_folds):
    fold_indexes = np.random.randint(0, n_folds, X.shape[0])
    
    metr = []
    
    folds = []
    for i in range(n_folds):
        indexes = np.where(fold_indexes == i)
        x_slice = X[indexes]
        y_slice = y[indexes]
        
        indexes = np.where(fold_indexes != i)
        x_not_slice = X[indexes]
        y_not_slice = y[indexes]
        
        folds.append((x_slice, y_slice, x_not_slice, y_not_slice))
        
    for i in range(n_folds):
        x_val = folds[i][0]
        y_val = folds[i][1]
        x_learn = folds[i][2]
        y_learn = folds[i][3]
        
        w0, w, v = gradDesc(x_learn, y_learn, x_val, y_val)
        
        y_pr = H(x_val, w0, w, v)
        
        rm = R2(y_val, y_pr)
        metr.append(rm)
        print('fold index ' + str(i) + ': R2 = ' + str(rm))
    
    return metr

In [54]:
CV(x, y, 5)

0: RMSE(train) = 3.75992 RMSE(test) = 3.76010 iter_time = 3.50956
10: RMSE(train) = 3.13502 RMSE(test) = 3.13520 iter_time = 3.48985
20: RMSE(train) = 2.63713 RMSE(test) = 2.63731 iter_time = 3.51102
30: RMSE(train) = 2.24430 RMSE(test) = 2.24448 iter_time = 3.41701
40: RMSE(train) = 1.93736 RMSE(test) = 1.93756 iter_time = 3.49569
50: RMSE(train) = 1.69113 RMSE(test) = 1.69174 iter_time = 3.50101
60: RMSE(train) = 1.44694 RMSE(test) = 1.44992 iter_time = 3.48766
70: RMSE(train) = 1.22663 RMSE(test) = 1.23288 iter_time = 3.50187
80: RMSE(train) = 1.12916 RMSE(test) = 1.13804 iter_time = 3.50488
90: RMSE(train) = 1.08095 RMSE(test) = 1.09094 iter_time = 3.49745
100: RMSE(train) = 1.05351 RMSE(test) = 1.06433 iter_time = 3.45136
110: RMSE(train) = 1.03525 RMSE(test) = 1.04654 iter_time = 3.48526
120: RMSE(train) = 1.02284 RMSE(test) = 1.03477 iter_time = 3.46685
130: RMSE(train) = 1.01373 RMSE(test) = 1.02594 iter_time = 3.50586
140: RMSE(train) = 1.00676 RMSE(test) = 1.01925 iter_time =

210: RMSE(train) = 0.98655 RMSE(test) = 1.00146 iter_time = 3.50887
220: RMSE(train) = 0.98456 RMSE(test) = 0.99971 iter_time = 3.53314
230: RMSE(train) = 0.98275 RMSE(test) = 0.99822 iter_time = 3.44889
240: RMSE(train) = 0.98082 RMSE(test) = 0.99660 iter_time = 3.51441
250: RMSE(train) = 0.97881 RMSE(test) = 0.99486 iter_time = 3.53573
260: RMSE(train) = 0.97638 RMSE(test) = 0.99277 iter_time = 3.44319
270: RMSE(train) = 0.97347 RMSE(test) = 0.99053 iter_time = 3.51948
280: RMSE(train) = 0.97000 RMSE(test) = 0.98775 iter_time = 3.48777
290: RMSE(train) = 0.96588 RMSE(test) = 0.98459 iter_time = 3.52554
300: RMSE(train) = 0.96130 RMSE(test) = 0.98104 iter_time = 3.45890
310: RMSE(train) = 0.95664 RMSE(test) = 0.97763 iter_time = 3.52472
320: RMSE(train) = 0.95252 RMSE(test) = 0.97467 iter_time = 3.51424
330: RMSE(train) = 0.94881 RMSE(test) = 0.97196 iter_time = 3.52964
340: RMSE(train) = 0.94569 RMSE(test) = 0.96981 iter_time = 3.52128
350: RMSE(train) = 0.94284 RMSE(test) = 0.96785 

420: RMSE(train) = 0.92305 RMSE(test) = 0.95338 iter_time = 3.61581
430: RMSE(train) = 0.92075 RMSE(test) = 0.95172 iter_time = 3.50357
440: RMSE(train) = 0.91843 RMSE(test) = 0.95034 iter_time = 3.53094
450: RMSE(train) = 0.91611 RMSE(test) = 0.94882 iter_time = 3.52407
460: RMSE(train) = 0.91394 RMSE(test) = 0.94749 iter_time = 3.52276
470: RMSE(train) = 0.91180 RMSE(test) = 0.94624 iter_time = 3.52945
480: RMSE(train) = 0.90987 RMSE(test) = 0.94516 iter_time = 3.52464
490: RMSE(train) = 0.90801 RMSE(test) = 0.94393 iter_time = 3.56771
500: RMSE(train) = 0.90619 RMSE(test) = 0.94319 iter_time = 3.51023
510: RMSE(train) = 0.90457 RMSE(test) = 0.94201 iter_time = 3.52677
520: RMSE(train) = 0.90304 RMSE(test) = 0.94118 iter_time = 3.52817
530: RMSE(train) = 0.90160 RMSE(test) = 0.94027 iter_time = 3.52922
540: RMSE(train) = 0.90021 RMSE(test) = 0.93951 iter_time = 3.53619
550: RMSE(train) = 0.89888 RMSE(test) = 0.93873 iter_time = 3.52194
560: RMSE(train) = 0.89766 RMSE(test) = 0.93781 

630: RMSE(train) = 0.89198 RMSE(test) = 0.93346 iter_time = 3.53359
640: RMSE(train) = 0.89103 RMSE(test) = 0.93294 iter_time = 3.53152
650: RMSE(train) = 0.89011 RMSE(test) = 0.93243 iter_time = 3.49551
660: RMSE(train) = 0.88930 RMSE(test) = 0.93165 iter_time = 3.53964
670: RMSE(train) = 0.88843 RMSE(test) = 0.93111 iter_time = 3.47753
680: RMSE(train) = 0.88757 RMSE(test) = 0.93047 iter_time = 3.53200
690: RMSE(train) = 0.88681 RMSE(test) = 0.93000 iter_time = 3.49986
700: RMSE(train) = 0.88604 RMSE(test) = 0.92940 iter_time = 3.54672
710: RMSE(train) = 0.88534 RMSE(test) = 0.92905 iter_time = 3.54353
720: RMSE(train) = 0.88459 RMSE(test) = 0.92853 iter_time = 3.55054
730: RMSE(train) = 0.88392 RMSE(test) = 0.92807 iter_time = 3.53607
740: RMSE(train) = 0.88323 RMSE(test) = 0.92759 iter_time = 3.48384
750: RMSE(train) = 0.88259 RMSE(test) = 0.92722 iter_time = 3.54771
760: RMSE(train) = 0.88197 RMSE(test) = 0.92674 iter_time = 3.51956
770: RMSE(train) = 0.88141 RMSE(test) = 0.92632 

840: RMSE(train) = 0.87620 RMSE(test) = 0.92257 iter_time = 3.53189
850: RMSE(train) = 0.87576 RMSE(test) = 0.92237 iter_time = 3.51982
860: RMSE(train) = 0.87535 RMSE(test) = 0.92208 iter_time = 3.46344
870: RMSE(train) = 0.87500 RMSE(test) = 0.92192 iter_time = 3.53980
880: RMSE(train) = 0.87465 RMSE(test) = 0.92173 iter_time = 3.52422
890: RMSE(train) = 0.87422 RMSE(test) = 0.92144 iter_time = 3.52869
900: RMSE(train) = 0.87386 RMSE(test) = 0.92125 iter_time = 3.52146
910: RMSE(train) = 0.87353 RMSE(test) = 0.92107 iter_time = 3.49465
920: RMSE(train) = 0.87322 RMSE(test) = 0.92085 iter_time = 3.60635
930: RMSE(train) = 0.87288 RMSE(test) = 0.92069 iter_time = 3.47026
940: RMSE(train) = 0.87259 RMSE(test) = 0.92051 iter_time = 3.54898
950: RMSE(train) = 0.87228 RMSE(test) = 0.92025 iter_time = 3.49613
960: RMSE(train) = 0.87196 RMSE(test) = 0.92009 iter_time = 3.54469
970: RMSE(train) = 0.87164 RMSE(test) = 0.91986 iter_time = 3.54811
980: RMSE(train) = 0.87138 RMSE(test) = 0.91970 

[0.2631461910207641,
 0.28000941266604396,
 0.2819908433318903,
 0.2820953493687738,
 0.28361550318423523]

In [21]:


print(time.time())

1577379853.03


In [22]:
print(time.time())

1577379854.13


In [159]:
RMSE(y, np.mean(y) * np.ones(shape=y.shape))

1.0394660359078236

In [160]:
1 - (0.3064003742476023) **2 / (1.0394660359078236)**2

0.9131123650977964