In [98]:
import pandas as pd
import numpy as np
import re

from io import StringIO

from sklearn.externals import joblib
from scipy.sparse import csr_matrix, lil_matrix
from sklearn.datasets import make_regression
from sklearn.linear_model import SGDRegressor
from sklearn.linear_model import LinearRegression
from sklearn.feature_extraction import DictVectorizer


In [99]:
###читаем даатасет c фичами фильмов
df_f = pd.read_csv('./movie_titles.csv', names=['MovieID', 'Year', 'Title'])
df_f = df_f.set_index('MovieID')

###читаем даатасет с инфой о пользователях
f = open('./combined_data_1.txt')
file = f.read()

movieID_u = list(map(lambda x: int(x[:-1]), re.findall(r'\d+\:', file)))
df_u = re.split(r'\d+\:', file)
df_u = df_u[1:]    

df = []
for df_i, movieID_i in zip(df_u, movieID_u):
    sub_df = pd.read_csv(StringIO(df_i), names=['UserID', 'Score', 'Date'])
    sub_df['MovieID'] = movieID_i
    df.append(sub_df)


df = pd.concat(df)
df_f = df_f.drop(df_f[df_f.index>df.MovieID.unique().shape[0]].index)
#print(df_f)
#print(df)


In [100]:
u_size = df.UserID.unique().shape[0]
m_size = df.MovieID.unique().shape[0]
#y_size = df_f.Year.unique().shape[0]
#s_size = df.Date.unique().shape[0]

In [103]:
df.sort_values(by=['UserID', 'MovieID'], inplace=True)
df.reset_index(drop=True, inplace=True)

In [120]:
### разреженная матрица
sm = lil_matrix((df.shape[0], u_size + m_size))
#print(sm.shape)

In [121]:
j=0
temp = df.UserID.iat[0]

for i in range(df.shape[0]):
    
    if(temp<df.UserID.iat[i]):
        j+=1
        temp = df.UserID.iat[i]
        
    sm[i, j] = 1
    sm[i, u_size + df.MovieID.iat[i]-1] = 1
    
    if i % 50000 == 0:
        print("\r Progress: {}/{} ({}%)".format(i, df.shape[0], int(((i+1) / df.shape[0]) * 100)), end="")
    #print(i, j, sm[i,j])
    #print(i, j1, sm[i,j1])
print(sm[0,0])

 Progress: 24050000/24053764 (99%)1.0


In [122]:
sm = csr_matrix(sm)
joblib.dump(sm, 'sparse_df_1.bin')

['sparse_df_1.bin']

In [123]:
sm = joblib.load('sparse_df_1.bin')

In [127]:
print(sm[453, :])

  (0, 6)	1.0
  (0, 473670)	1.0


По лемме 1 нелинейную часть можно представить в виде: $\sum_{i=1}^n \sum_{j=i+1}^n \langle {v}_i, {v}_f\rangle x_ix_j = \frac{1}{2}\sum_{f=1}^k \left( \left(\sum_{i=1}^n v_{i,f}x_i \right)^2 - \sum_{i=1}^n v^2_{i,f}x^2_i \right)$
<br>
Производная $\frac{dy}{dv}$ нелинейной части бдует следующей:
<br>
(1) = $\left( \left( \sum_{i=1}^n v_{i,f}x_i \right)^2 \right)' _{v_if} = 2\sum_{i=1}^n \left(v_{i,f}x_i \sum_{k=1}^n x_k\right)$ или в матричном виде $\sum_{f=1}^k2 (X\mathbb{1})^TXV_f$
<br>
(2) = $ \left( \sum_{i=1}^n v_{i,f}^2x_i^2 \right)' _{v_if} = 2\sum_{i=1}^n v_{i,f}x_i^2 $ или в матричном виде $\sum_{f=1}^k2 X^TXV_f$
<br>
<br>
Тогда
<br>
$\frac{dy}{dv_f} = \frac{1}{2}\sum_{f=1}^k \left( (1) - (2) \right)$ или в матричном виде $\sum_{f=1}^k(X\mathbb{1}-X)^TXV_f$
<br>
<br>
И тогда $\frac{dy}{dv}$ в матричном виде $(X\mathbb{1}-X)^TXV\mathbb{1}$

In [87]:
def RMSE(x, y):
    return np.sqrt(np.sum(np.power(y - x, 2)) / y.shape[0])

def narrTheta(lr, iter_num):
	return lr/np.sqrt(iter_num)

def sgdFFM(X, y, max_iter = 1000, step = 1e-3, batch_size = 256, k = 8):
	w0 = 0
	iter_num = 1
	N = X.shape[0]
	np.random.seed(42)
	w1 = np.random.normal(size=X.shape[1])


	while iter_num <= max_iter:
		random_batch = np.random.choice(N, batch_size)
		new_y = predict(X[random_batch, :], w0, w1)
		#print(X[random_batch].shape)

		dy = 2*(new_y - y[random_batch])/batch_size
		#print(iter_num, dy)
		w0 -= dy.mean()* step
		w1 -= X[random_batch, :].T@dy * narrTheta(step, iter_num)
		iter_num += 1
	return w0, w1

def predict(X, w0, w1):
	return w0 + X@w1

In [96]:
X, y, coefs = make_regression(n_samples=1000000, n_features=8, n_targets=1, n_informative=4, coef=True)

folds_index = 5
fold_size = round(df.shape[0] / folds_index)
RMSE_test = []
RMSE_train = []

for i in range(folds_index):
    test = X[i * fold_size:(i + 1) * fold_size]
    testT = y[i * fold_size:(i + 1) * fold_size]
    if i == 0:
        train = X[(i + 1) * fold_size:]
        trainT = y[(i + 1) * fold_size:]
    else:
        train = X[:i * fold_size]
        trainT = y[:i * fold_size]
        if i != 4:
            train = np.concatenate((train, X[(i + 1) * fold_size:]))
            trainT = np.concatenate([trainT, y[(i + 1) * fold_size:]])

    Target = trainT
    Features = train
    w0, w1 = sgdFFM(Features, Target, 1e3, 1e-3, 256)

    train_pred = predict(Features, w0, w1)

    #RMSE_train.append(RMSE(train_pred, Target))
    TargetT = testT
    FeaturesT = test
    test_pred = predict(FeaturesT, w0, w1)
    #RMSE_test.append(RMSE(test_pred, Target))

    print('Fold {}'.format(i+1), '\nОбучающая: \n',train_pred-Target, '\nТестовая: \n',test_pred-TargetT, '\n', '\n')


Fold 1 
Обучающая: 
 [ 126.37195716   -6.73739439  -86.9637686  ... -328.71416459  194.64693737
 -103.24970393] 
Тестовая: 
 [ 201.80298459   27.95291549 -133.38659516 ... -302.74481683   -7.28493081
  135.63554295] 
 

Fold 2 
Обучающая: 
 [ 201.80297263   27.95291799 -133.38659145 ... -328.71415688  194.64694361
 -103.24970361] 
Тестовая: 
 [126.37194611  -6.73739554 -86.96376867 ... 142.29564483  18.83588498
 139.10109659] 
 

Fold 3 
Обучающая: 
 [ 201.8029694    27.95291607 -133.38660349 ... -328.71415829  194.64693776
 -103.24969413] 
Тестовая: 
 [  44.22443774  -68.8575299    87.24377208 ... -354.02619836 -145.24500191
   85.99620224] 
 

Fold 4 
Обучающая: 
 [ 201.80299122   27.9529079  -133.38658592 ... -328.71416304  194.64694529
 -103.24970761] 
Тестовая: 
 [ -96.55201672 -136.20803204  180.01644109 ...  100.79417176   18.3395286
 -217.34144083] 
 

Fold 5 
Обучающая: 
 [ 201.47780518   27.90003474 -133.1579091  ...  100.63397262   18.31251632
 -216.98041063] 
Тестовая: 
 [ 