# Task 2:

Netflix Prize Dataset: https://www.kaggle.com/netflix-inc/netflix-prize-data/data#combined_data_1.txt

Есть матрица рейтингов User-Item; по кросс-валидации бьём её на фолды, затем пытаемся предсказать скрытые рейтинги. Качество проверяем по RMSE, только для тех точек, в которых прогноз есть.

Используем факторизационную машину 2-го порядка с квадратичной функцией потерь (аналогично линейной регрессии).

Статья про факторизационные машины: https://www.csie.ntu.edu.tw/~b97053/paper/Rendle2010FM.pdf



In [1]:
import time

import numpy
import pandas
from sklearn.model_selection import KFold
from sklearn.preprocessing import OneHotEncoder
from sklearn.utils import shuffle



PARAMETERS

In [2]:
# Shared count: the driver script assigns it to `epochs`, which is then used
# both as the number of KFold splits and as the number of training epochs.
kfold_epochs = 8
# Learning rate: passed as the multiplier `c` to every update_* function.
step = 0.001



METRICS

In [3]:
def get_RMSE(Y, Ypred):
    """Return the root-mean-square error between targets and predictions.

    The squared residuals are summed over every element and divided by
    ``len(Y)`` (the size of the first axis of ``Y``) before the square
    root is taken.
    """
    residuals = Ypred - Y
    squared_error_total = numpy.sum(residuals ** 2)
    return numpy.sqrt(squared_error_total / len(Y))


def get_R2(Y, Ypred):
    """Return the coefficient of determination (R^2) of the predictions.

    Computed as one minus the ratio of the residual sum of squares to the
    total sum of squares of ``Y`` around its mean.
    """
    residual_ss = numpy.sum((Ypred - Y) ** 2)
    total_ss = numpy.sum((Y - numpy.mean(Y)) ** 2)
    return 1 - residual_ss / total_ss



PREDICTION FUNCTION

In [4]:
def prediction(W0, W, X, V):
    """Evaluate a second-order factorization machine on the rows of ``X``.

    ``X`` must provide ``.power`` (as scipy sparse matrices do). The result
    is the bias ``W0`` plus the linear term ``X @ W`` plus the pairwise
    interaction term, computed per latent factor as
    ``0.5 * ((X @ V) ** 2 - (X ** 2) @ (V ** 2))`` and summed over factors.

    Returns one prediction per row, keeping a trailing axis of length 1.
    """
    linear_term = X @ W
    squared_of_sum = (X @ V) ** 2
    sum_of_squares = X.power(2) @ (V ** 2)
    pairwise_term = (1 / 2) * numpy.sum(
        squared_of_sum - sum_of_squares, axis=1, keepdims=True
    )
    return W0 + linear_term + pairwise_term



DATA

In [5]:
def get_data(path):
    """Load the ratings CSV and one-hot encode the user/movie identifiers.

    Parameters:
        path: location of a CSV file containing the columns ``User``,
            ``Movie`` and ``Rate``.

    Returns:
        A pair ``(X, Y)`` where ``X`` is the one-hot encoding of the
        ``User`` and ``Movie`` columns produced by ``OneHotEncoder`` and
        ``Y`` is the ``Rate`` column as a two-dimensional array with a
        single column.
    """
    ratings = pandas.read_csv(path)
    # Sparse output is OneHotEncoder's default. The explicit ``sparse=True``
    # keyword is dropped because newer scikit-learn releases renamed it to
    # ``sparse_output`` and reject the old spelling; relying on the default
    # yields the same sparse matrix on both old and new versions.
    encoder = OneHotEncoder()
    Y = ratings[['Rate']].values
    X = encoder.fit_transform(ratings[['User', 'Movie']].values)
    return X, Y


def get_batches(X, Y, size, counter, shuffle_indices):
    """Return the ``counter``-th mini-batch of ``X`` and ``Y``.

    The batch covers positions ``counter * size`` up to (but excluding)
    ``(counter + 1) * size`` of ``shuffle_indices``, with the upper bound
    clamped to the number of rows in ``X``. The selected index values are
    then used to pick the matching rows from both ``X`` and ``Y``.
    """
    start = size * counter
    stop = start + size
    if stop > X.shape[0]:
        stop = X.shape[0]
    rows = shuffle_indices[start:stop]
    return X[rows], Y[rows]



UPDATING

In [6]:
def update_W0(dY, c, W0):
    """Return the bias after one gradient step.

    The gradient is the sum of all entries of ``dY``; it is scaled by the
    step size ``c`` and subtracted from the current bias ``W0``.
    """
    bias_gradient = numpy.sum(dY)
    return W0 - (c * bias_gradient)


def update_W(dY, c, X, W):
    """Return the linear weights after one gradient step.

    The gradient is ``transpose(X) @ dY``; it is scaled by the step size
    ``c`` and subtracted from the current weights ``W``. ``W`` itself is
    not modified.
    """
    weight_gradient = numpy.transpose(X) @ dY
    return W - c * weight_gradient


def update_V(dY, c, V, X):
    """Apply one gradient step to the factor matrix ``V`` in place.

    For every feature ``i`` and latent factor ``f`` the gradient is
    ``sum_n dY[n] * (X[n, i] * (X @ V)[n, f] - X[n, i] ** 2 * V[i, f])``.
    This is evaluated for all factors at once as
    ``X.T @ (dY * (X @ V)) - ((X ** 2).T @ dY) * V``.

    The number of factors is taken from ``V.shape[1]``, so the function
    does not depend on any module-level variable and stays correct when
    the caller trains with a different number of factors.

    Parameters:
        dY: per-sample gradient of the loss with respect to the
            prediction, one value per row of ``X`` (any shape that can be
            reshaped into a column).
        c: step size.
        V: dense factor matrix with one row per feature; overwritten with
            the updated values.
        X: batch design matrix providing ``.power`` and ``.T`` (as scipy
            sparse matrices do).

    Returns:
        The same ``V`` object, after the update.
    """
    dY = numpy.asarray(dY).reshape(-1, 1)
    XV = X @ V
    # Sum over samples of dY[n] * X[n, i] * (X @ V)[n, f].
    interaction_term = X.T @ (dY * XV)
    # Sum over samples of dY[n] * X[n, i] ** 2, scaled by V[i, f].
    self_term = numpy.asarray(X.power(2).T @ dY) * V
    # Slice assignment keeps the update in place, as callers may rely on it.
    V[:] = V - c * (interaction_term - self_term)
    return V



LEARNING

In [7]:
def training(factors, epochs, size, X, Y):
    """Fit a second-order factorization machine by mini-batch gradient descent.

    Parameters:
        factors: number of latent factors (columns of ``V``).
        epochs: number of passes over the training rows.
        size: mini-batch size.
        X: design matrix with one row per sample.
        Y: targets aligned with the rows of ``X``.

    Returns:
        ``(W0, W, V)``: the bias, the linear weights of shape
        ``(n_features, 1)`` and the factor matrix of shape
        ``(n_features, factors)``.

    The step size is read from the module-level ``step``. The row order is
    shuffled once and reused for every epoch.
    """
    Xsize, Xfeats = X.shape
    limits = 1.0 / numpy.sqrt(Xsize)
    W0 = 0
    W = numpy.random.uniform(-limits, limits, size=(Xfeats, 1))
    V = numpy.random.uniform(-limits, limits, size=(Xfeats, factors))
    shuffle_indices = shuffle(numpy.arange(Xsize))

    # Ceiling division: the trailing partial batch is trained on as well.
    # Floor division skipped those rows entirely and performed no update at
    # all when the data set was smaller than one batch. get_batches already
    # clamps the end of the last batch to the number of rows.
    batches_per_epoch = -(-Xsize // size)

    for _ in range(epochs):
        for counter in range(batches_per_epoch):
            Xbatch, Ybatch = get_batches(X, Y, size, counter, shuffle_indices)
            pred = prediction(W0, W, Xbatch, V)
            # Gradient of the mean squared error with respect to the
            # predictions of this batch.
            dY = 2 * (pred - Ybatch) / len(pred)
            W0 = update_W0(dY, step, W0)
            W = update_W(dY, step, Xbatch, W)
            V = update_V(dY, step, V, Xbatch)
    return W0, W, V



FACTORIZATION MACHINE & RESULTS

In [8]:
# Cross-validated evaluation of the factorization machine on the ratings file.
path = '/Users/nikitavolkov/Desktop/ml/ml_tasks/task2_factorization_machine/netflix-prize-data/NetflixData.csv'
# NOTE: `epochs` is used below both as the KFold split count and as the number
# of training epochs per fold.
epochs = kfold_epochs
factors = 2
batch_size = 1024
X, Y = get_data(path)

folds = KFold(n_splits=epochs, random_state=None, shuffle=False)

RMSE_train, RMSE_test = [], []
R2_train, R2_test = [], []
iterator = 0

for iterator, (indices_train, indices_test) in enumerate(folds.split(X), start=1):
    fold_start = time.time()

    Xtrain, Ytrain = X[indices_train], Y[indices_train]
    Xtest, Ytest = X[indices_test], Y[indices_test]

    W0, W, V = training(factors, epochs, batch_size, Xtrain, Ytrain)

    # Score the fitted model on the rows it was trained on ...
    prediction_train = prediction(W0, W, Xtrain, V)
    RMSE_train.append(get_RMSE(Ytrain, prediction_train))
    R2_train.append(get_R2(Ytrain, prediction_train))

    # ... and on the held-out rows of this fold.
    prediction_test = prediction(W0, W, Xtest, V)
    RMSE_test.append(get_RMSE(Ytest, prediction_test))
    R2_test.append(get_R2(Ytest, prediction_test))

    elapsed = time.time() - fold_start
    print('fold %d finished. time: %d' % (iterator, elapsed))

# One row per metric, one column per fold (the column labels say "Epoch").
res = pandas.DataFrame(
    numpy.vstack([RMSE_train, RMSE_test, R2_train, R2_test]),
    index=['RMSE Train', 'RMSE Test', 'R2 Train', 'R2 Test'],
    columns=['Epoch %d' % number for number in range(1, epochs + 1)],
)
res


In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.


fold 1 finished. time: 4503
fold 2 finished. time: 3925
fold 3 finished. time: 3893
fold 4 finished. time: 3864
fold 5 finished. time: 3891
fold 6 finished. time: 3745
fold 7 finished. time: 3878
fold 8 finished. time: 3946


Unnamed: 0,Epoch 1,Epoch 2,Epoch 3,Epoch 4,Epoch 5,Epoch 6,Epoch 7,Epoch 8
RMSE Train,1.042129,1.044798,1.051102,1.052983,1.052026,1.049424,1.045907,1.046572
RMSE Test,1.103722,1.081952,1.032815,1.015873,1.020655,1.038828,1.064706,1.062175
R2 Train,0.065798,0.065415,0.066625,0.068278,0.069559,0.070651,0.070939,0.070918
R2 Test,0.012116,0.057369,0.073613,0.062757,0.058867,0.051503,0.044792,0.039988
