In [1]:
import pandas as pd
import numpy as np
import re

from io import StringIO

from sklearn.externals import joblib
from scipy.sparse import csr_matrix, lil_matrix
from sklearn.datasets import make_regression
from sklearn.linear_model import SGDRegressor
from sklearn.linear_model import LinearRegression
from sklearn.feature_extraction import DictVectorizer


In [2]:
###читаем даатасет c фичами фильмов
df_f = pd.read_csv('./movie_titles.csv', names=['MovieID', 'Year', 'Title'])
df_f = df_f.set_index('MovieID')

###читаем даатасет с инфой о пользователях
f = open('./combined_data_1.txt')
file = f.read()

movieID_u = list(map(lambda x: int(x[:-1]), re.findall(r'\d+\:', file)))
df_u = re.split(r'\d+\:', file)
df_u = df_u[1:]    

df = []
for df_i, movieID_i in zip(df_u, movieID_u):
    sub_df = pd.read_csv(StringIO(df_i), names=['UserID', 'Score', 'Date'])
    sub_df['MovieID'] = movieID_i
    df.append(sub_df)


df = pd.concat(df)
df_f = df_f.drop(df_f[df_f.index>df.MovieID.unique().shape[0]].index)
#print(df_f)
#print(df)


In [3]:
u_size = df.UserID.unique().shape[0]
m_size = df.MovieID.unique().shape[0]
#y_size = df_f.Year.unique().shape[0]
#s_size = df.Date.unique().shape[0]

In [4]:
df.sort_values(by=['UserID', 'MovieID'], inplace=True)
df.reset_index(drop=True, inplace=True)
df_targ = df.Score

In [8]:
### разреженная матрица
sm = lil_matrix((df.shape[0], u_size + m_size))

In [9]:
j=0
temp = df.UserID.iat[0]

for i in range(df.shape[0]):
    
    if(temp<df.UserID.iat[i]):
        j+=1
        temp = df.UserID.iat[i]
        
    sm[i, j] = 1
    sm[i, u_size + df.MovieID.iat[i]-1] = 1
    
    if i % 50000 == 0:
        print("\r Progress: {}/{} ({}%)".format(i, df.shape[0], int(((i+1) / df.shape[0]) * 100)), end="")
    #print(i, j, sm[i,j])
    #print(i, j1, sm[i,j1])
#print(sm[0,0])

 Progress: 1600000/24053764 (6%)

KeyboardInterrupt: 

In [None]:
sm = csr_matrix(sm)
joblib.dump(sm, 'sparse_df_1.bin')

In [5]:
sm = joblib.load('sparse_df_1.bin')

In [119]:
print(sm.shape)
print(df_targ.shape)

(24053764, 475257)
(24053764,)


По лемме 1 нелинейную часть можно представить в виде: $\sum_{i=1}^n \sum_{j=i+1}^n \langle {v}_i, {v}_f\rangle x_ix_j = \frac{1}{2}\sum_{f=1}^k \left( \left(\sum_{i=1}^n v_{i,f}x_i \right)^2 - \sum_{i=1}^n v^2_{i,f}x^2_i \right)$
<br>
Производная $\frac{dy}{dv}$ нелинейной части бдует следующей:
<br>
(1) = $\left( \left( \sum_{i=1}^n v_{i,f}x_i \right)^2 \right)' _{v_if} = 2\sum_{i=1}^n \left(v_{i,f}x_i \sum_{k=1}^n x_k\right)$ или в матричном виде $\sum_{f=1}^k2 (X\mathbb{1})^TXV_f$
<br>
(2) = $ \left( \sum_{i=1}^n v_{i,f}^2x_i^2 \right)' _{v_if} = 2\sum_{i=1}^n v_{i,f}x_i^2 $ или в матричном виде $\sum_{f=1}^k2 X^TXV_f$
<br>
<br>
Тогда
<br>
$\frac{dy}{dv_f} = \frac{1}{2}\sum_{f=1}^k \left( (1) - (2) \right)$ или в матричном виде $\sum_{f=1}^k(X\mathbb{1}-X)^TXV_f$
<br>
<br>
И тогда $\frac{dy}{dv}$ в матричном виде $(X\mathbb{1}-X)^TXV\mathbb{1}$

In [6]:
def RMSE(x, y):
    return np.sqrt(np.sum(np.power(y - x, 2)) / y.shape[0])


def narrTheta(lr, iter_num):
    return lr / np.sqrt(iter_num)


def sgdFFM(X, y, max_iter=1e3, step=1e-3, epsilon=1e-4, batch_size=256, k=9):
    w0 = 0
    iter_num = 1
    N = X.shape[0]
    np.random.seed(42)
    w1 = np.random.normal(size=X.shape[1])
    V = np.random.normal(size=(X.shape[1], k))

    while iter_num <= max_iter:
        random_batch = np.random.choice(N, batch_size)
        new_y = predict(X[random_batch, :], V, w0, w1)

        dy = 2 * (new_y - y[random_batch])
        w0 -= dy.mean() * step
        
        if hasattr(X, 'todense'):
            w1 -= X[random_batch, :].T @ dy * step / batch_size  
            #np.multiply(dy, X[random_batch, :]).mean(axis=0).T * step
        else:
            w1 -= X[random_batch, :].T @ dy * step / batch_size        
        
        for f in range(k):    
            if hasattr(X[random_batch, :], 'todense'):            
                dA = np.multiply(X[random_batch, :], (X[random_batch, :]@V[:,f]).reshape(-1, 1))            
                dB = (X[random_batch, :].power(2)).multiply(V[:,f])           
                V[:,f] -= step*np.asarray(((dA-dB).multiply(dy)).mean(axis=0))
            else:
                dA = X[random_batch, :]*(X[random_batch, :]@V[:,f]).reshape(-1, 1)          
                dB = (X[random_batch, :]**2)*V[:,f]        
                V[:,f] -= step*((dy.reshape(-1, 1)*(dA-dB)).mean(axis=0).T)
        
        iter_num += 1
        if (np.linalg.norm(y[random_batch] - new_y) < epsilon): 
            print(np.linalg.norm(y[random_batch] - new_y))
            return w0, w1, V
        
        if (iter_num%1000==0):
            print(np.linalg.norm(y[random_batch] - new_y))
        #print(np.linalg.norm(y[random_batch] - new_y))
        
    return w0, w1, V

def nonLin(X, V):
    if hasattr(X, 'todense'):
        A = (X@V)**2
        B = (X.power(2))@V**2
    else:
        A = (X@V)**2
        B = (X**2)@(V**2)
    
    return 1/2*((A-B)@np.ones(V.shape[1]))

def predict(X, V, w0, w1):
    return w0 + X @ w1 + nonLin(X, V)


<h1>Проверим на генерации</h1>

In [7]:
X, y, coefs = make_regression(n_samples=10000, n_features=8, n_targets=1, n_informative=4, coef=True)

folds_index = 5
fold_size = round(X.shape[0] / folds_index)
RMSE_test = []
RMSE_train = []

for i in range(folds_index):
    test = X[i * fold_size:(i + 1) * fold_size]
    testT = y[i * fold_size:(i + 1) * fold_size]
    if i == 0:
        train = X[(i + 1) * fold_size:, :]
        trainT = y[(i + 1) * fold_size:]
    else:
        train = X[:i * fold_size, :]
        trainT = y[:i * fold_size]
        if i != 4:
            train = np.concatenate((train, X[(i + 1) * fold_size:, :]))
            trainT = np.concatenate([trainT, y[(i + 1) * fold_size:]])
    
    Target = trainT
    Features = train
    w0, w1, V = sgdFFM(Features, Target, 1e4, 1e-3)

    train_pred = predict(Features, V, w0, w1)

    RMSE_train.append(RMSE(train_pred, Target))
    TargetT = testT
    FeaturesT = test
    test_pred = predict(FeaturesT, V, w0, w1)
    RMSE_test.append(RMSE(test_pred, TargetT))

print('Fold {}'.format(i + 1), '\nОбучающая: \n', train_pred - Target, '\nТестовая: \n', test_pred - TargetT, '\n', '\n')


162.86414344024138
22.4714031701688
3.466270955104093
0.47848650288152494
0.0678996866351257
0.009678311699661511
0.0014041348570196027
0.00020581446587118464
9.984488851270882e-05
158.88613073013812
22.342249650310134
3.2090986924241904
0.46553902796153646
0.06572528205273231
0.009143636289346046
0.0013404572196622
0.00018170017863419448
9.975910592368813e-05
156.9739123400919
22.096028473066163
3.0911646193880853
0.4030935037348191
0.05547992899950314
0.007591053328727859
0.0011455642581463259
0.00014657461189413373
9.758676864292708e-05
163.1305003676962
25.22490175797429
3.5342355735372486
0.4541991985875971
0.06681026557646229
0.009287119033901227
0.0015698131752767304
0.00021134975188909005
9.889562322818028e-05
164.34217658959165
25.38628792167202
3.408839128088837
0.4338258962262225
0.06754828160352498
0.008850233849077906
0.0014901408248083874
0.00021644460068296955
9.996066284383349e-05
Fold 5 
Обучающая: 
 [ 2.68228216e-06 -3.27938693e-07  3.62110456e-06 ...  5.68071250e-06


In [8]:
df = pd.DataFrame(np.vstack([RMSE_train, RMSE_test]), 
                  index=['rmse_train','rmse_test'])

df = pd.concat([df, df.mean(axis=1).rename('mean'),
                df.std(axis=1).rename('std')], axis=1)

df

Unnamed: 0,0,1,2,3,4,mean,std
rmse_train,7e-06,7e-06,7e-06,7e-06,7e-06,7e-06,2.394659e-07
rmse_test,7e-06,7e-06,7e-06,7e-06,7e-06,7e-06,1.846684e-07


<h1>Фигачим на данных1</h1>

In [None]:
print(sm.shape)
print(type(sm))
X = sm
y = df_targ.values

print(type(df_targ))


folds_index = 5
fold_size = round(X.shape[0] / folds_index)
RMSE_test = []
RMSE_train = []

for i in range(folds_index):
    print('{}-th fold'.format(i))
    test = sm[i * fold_size:(i + 1) * fold_size]
    testT = y[i * fold_size:(i + 1) * fold_size]
    if i == 0:
        train = sm[(i + 1) * fold_size:, :]
        trainT = y[(i + 1) * fold_size:]
    else:
        train = sm[:i * fold_size, :]
        trainT = y[:i * fold_size]
        if i != 4:
            train = np.concatenate((train, sm[(i + 1) * fold_size:, :]))
            trainT = np.concatenate([trainT, y[(i + 1) * fold_size:]])
    
    Target = trainT
    Features = train
    w0, w1, V = sgdFFM(Features, Target, 1e3, 1e-3)

    train_pred = predict(Features, V, w0, w1)

    RMSE_train.append(RMSE(train_pred, Target))
    TargetT = testT
    FeaturesT = test
    test_pred = predict(FeaturesT, V, w0, w1)
    RMSE_test.append(RMSE(test_pred, TargetT))

print('Fold {}'.format(i + 1), '\nОбучающая: \n', train_pred - Target, '\nТестовая: \n', test_pred - TargetT, '\n', '\n')


(24053764, 475257)
<class 'scipy.sparse.csr.csr_matrix'>
<class 'pandas.core.series.Series'>
0-th fold


In [None]:
df = pd.DataFrame(np.vstack([RMSE_train, RMSE_test]), 
                  index=['rmse_train','rmse_test'])

df = pd.concat([df, df.mean(axis=1).rename('mean'),
                df.std(axis=1).rename('std')], axis=1)

df