In [1]:
import numpy as np
import pandas as pd
import re
import os
from io import StringIO

from sklearn.externals import joblib
from scipy.sparse import csr_matrix, lil_matrix
from sklearn.utils import shuffle
from sklearn.model_selection import KFold

import matplotlib.pyplot as plt
import sys
import time

%matplotlib inline

# Релизация модели FM

In [3]:
def RMSE_score(y_pred, y_real):
    return np.sqrt(np.sum(np.power(y_pred - y_real, 2)) / y_pred.shape[0])

def R2_score(y_pred, y_real):
    return 1 - np.sum(np.power(y_pred - y_real, 2)) / np.sum(np.power(y_real - np.mean(y_real), 2))

def R2_adj_score(y_pred, y_real, features):
    R2 = R2_score(y_pred, y_real)
    return 1 - (1 - R2) * (y_real.shape[0] - 1) / (y_real.shape[0] - features - 1)

In [10]:
class SGD2WAYFactorizationMachine(object):
    def __init__(self, 
                 k=5, 
                 epochs=5, 
                 batch_size = 64, 
                 epsilon=1e-4, 
                 step=0.001, 
                 step_V=None, 
                 lr_decay_rate=5e-7,
                 verbose=0, 
                 nobs_verbose=10
        ):
        self.epochs = epochs
        self.batch_size = batch_size
        self.epsilon = epsilon
        self.step = step
        self.verbose = verbose
        self.nobs_verbose = nobs_verbose
        self.lr_decay_rate = lr_decay_rate
        if step_V:
            self.stepV = step_V
        else:
            self.stepV = self.step / 10
        self.k = k
        
        self.w0 = None
        self.w1 = None
        self.V = None
        
        self.Z = None
        self.Z_squared = None
    
    def exp_decay(self, initial_lrate, epoch):
        k = self.lr_decay_rate
        lrate = initial_lrate * np.exp(-k*epoch)
        return lrate
    
    def fit(self, X, Y):
        # works only with sparse matrices
        self.w0 = 0
        self.w1 = np.random.normal(size=(X.shape[1], 1), scale=.1)
        self.V = np.random.normal(size=(X.shape[1], self.k), scale=.1)
        
        n = X.shape[0]
        timedelta = time.time()
        for epoch in range(2, self.epochs + 2):
            if self.verbose and epoch % self.nobs_verbose == 0:
                print("\r Progress: {}/{} ({}%), duration: {} sec".format(
                    epoch, self.epochs + 1, int(((epoch+1) / (self.epochs)) * 100), int(time.time() - timedelta)
                ), end="")
                timedelta = time.time()
            
            batch_indices = np.random.choice(n, self.batch_size)
            
            # densify sparse matrices
            x = X[batch_indices, :]
            y = Y[batch_indices]
            y_hat = self.predict(x)
            
            dy = -2 * (y - y_hat)
            
            self.w0 -= np.mean(dy) * self.step
            if hasattr(x, 'todense'):
                self.w1 -= x.multiply(dy).mean(axis=0).T * self.exp_decay(self.step, epoch)
            else:
                self.w1 -= np.mean(x * dy, axis=0, keepdims=True).T * self.exp_decay(self.step, epoch)
            
            for f in range(self.k):
                if hasattr(x, 'todense'):
                    left = x.multiply(self.Z[:, f].reshape(-1, 1))
                    right = self.X_squared.multiply(self.V[:, f])
                    self.V[:, f] -= (np.asarray((left - right).multiply(dy).mean(axis=0).T) * self.exp_decay(self.stepV, epoch)).squeeze()
                else:
                    left = x * self.Z[:, f].reshape(-1, 1)
                    right = self.X_squared * self.V[:, f]
                    self.V[:, f] -=  np.mean(dy * (left - right), axis=0) * self.exp_decay(self.stepV, epoch)
    
    def calculate_nonlinear_part(self, X):
        self.Z = X @ self.V
        
        if hasattr(X, 'todense'):
            self.X_squared = X.power(2)#np.power(X, 2)
        else:
            self.X_squared = np.power(X, 2)
        
        return 1/2 * np.sum(
            np.power(self.Z, 2) - self.X_squared @ np.power(self.V, 2), 
            axis=1, keepdims=True
        )
        
    def predict(self, X):
        return self.w0 + X @ self.w1 + self.calculate_nonlinear_part(X)

# Проверка корректности работы алгоритма на примере искусственного датасета

# и сравнение с SGD регрессором sklearn

In [11]:
from sklearn.datasets import make_regression
from sklearn.linear_model import SGDRegressor

In [12]:
X, y, coefs = make_regression(n_samples=10000, n_features=4, n_targets=4, n_informative=2, coef=True)

In [13]:
X = np.concatenate([X, y[:, :-1]], axis=1)
y = y[:, -1].reshape(-1, 1)

In [22]:
fm = SGD2WAYFactorizationMachine(epochs=20000, batch_size=256, k=8, step=1e-5, step_V=1e-8, verbose=1, nobs_verbose=5000, lr_decay_rate=1e-5)
reg = SGDRegressor(fit_intercept=True, penalty='none', shuffle=False, learning_rate='constant', eta0=0.001, max_iter=10000)

In [23]:
reg.fit(X, y.squeeze())



SGDRegressor(alpha=0.0001, average=False, early_stopping=False, epsilon=0.1,
       eta0=0.001, fit_intercept=True, l1_ratio=0.15,
       learning_rate='constant', loss='squared_loss', max_iter=10000,
       n_iter=None, n_iter_no_change=5, penalty='none', power_t=0.25,
       random_state=None, shuffle=False, tol=None, validation_fraction=0.1,
       verbose=0, warm_start=False)

In [24]:
fm.fit(X, y)

 Progress: 20000/20001 (100%), duration: 3 sec

In [25]:
reg.coef_, reg.intercept_

(array([ 1.75329105e+09, -1.17467670e+09, -7.18091047e+10,  8.85153006e+09,
         1.00692797e+11, -3.43837503e+10,  3.66537441e+10]),
 array([9.38799482e+10]))

In [26]:
fm.w1, fm.w0, fm.V

(array([[ 0.10038559],
        [-0.02807447],
        [ 0.16951733],
        [-0.02117882],
        [-0.28724782],
        [ 0.2635045 ],
        [ 0.93081021]]),
 0.053373035151704586,
 array([[-0.05867992,  0.1301452 ,  0.1888644 ,  0.00655247,  0.05789429,
         -0.05814951, -0.11663405,  0.00558362],
        [ 0.09327705,  0.11236647, -0.07478194, -0.18236786, -0.07602433,
         -0.15691578, -0.02714165,  0.0772217 ],
        [-0.14407354,  0.19513548,  0.06879782,  0.03924446,  0.04392671,
          0.01467825, -0.02790113, -0.03484998],
        [-0.14528486, -0.05072692, -0.04303462, -0.05974964, -0.01978388,
          0.02746084, -0.16375715, -0.18189178],
        [-0.09042885, -0.1907012 , -0.09081672,  0.14776575,  0.09549876,
          0.02505465, -0.00708806,  0.11401723],
        [ 0.06120799, -0.00096996, -0.11455083,  0.01317796, -0.07844233,
          0.17829282,  0.18398215, -0.01070418],
        [ 0.11176101,  0.06013751, -0.11215329,  0.07775149,  0.13295085,
  

In [27]:
print('предсказания полученные SGD регрессией')
reg.predict(X[:5])

предсказания полученные SGD регрессией


array([-3.66206158e+12,  1.75817059e+13, -3.38099890e+10,  4.49951384e+12,
        3.39950471e+12])

In [28]:
print('предсказания полученные FM')
fm.predict(X[:5])

предсказания полученные FM


array([[-21.24971036],
       [309.61409397],
       [ 93.40545585],
       [ 91.35701438],
       [-18.08437594]])

In [29]:
print('реальные значения таргета')
y[:5]

реальные значения таргета


array([[-20.72177077],
       [311.99005337],
       [ 93.37714163],
       [ 90.39896741],
       [-18.37828621]])

# Работа с данными

## Загрузка датасета

In [34]:
def process_document(doc):
    # movies ids
    film_ids = list(map(lambda x: int(x[:-1]), re.findall(r'\d+\:', doc)))
    # separate frames
    frames_raw = re.split(r'\d+\:', doc)
    frames_raw = frames_raw[1:]
    
    frames_totale = []

    for frame, movie_id in zip(frames_raw, film_ids):
        sub_df = pd.read_csv(StringIO(frame), names=['CustomerID','Rating','Date'])
        sub_df['MovieID'] = movie_id

        frames_totale.append(sub_df)

    dataset = pd.concat(frames_totale)
    
    return dataset

In [35]:
txtfile = None

In [36]:
if txtfile:
    del txtfile

with open('Task2/netflix-prize-data/combined_data_1.txt', 'r') as f:
    txtfile = f.read()
    
dataset_p1 = process_document(txtfile)

In [37]:
#if txtfile:
#    del txtfile
#
#with open('Task2/netflix-prize-data/combined_data_2.txt', 'r') as f:
#    txtfile = f.read()
#    
#dataset_p2 = process_document(txtfile)

In [38]:
dataset = dataset_p1
dataset = dataset[dataset.MovieID.isin(movie_table.index.unique().values)]

In [27]:
#dataset = pd.concat([dataset_p1, dataset_p2], axis=0)
#dataset = dataset.reset_index()

# Таблица с фичами для фильмов

In [32]:
txtfile = None

In [33]:
if txtfile:
    del txtfile
    
with open('Task2/netflix-prize-data/movie_titles.csv', 'r') as f:
    txtfile = f.read()

subs = re.compile(r'\d+,\d+,')
new_lines = []
for i in txtfile.split('\n'):
    sub_part = subs.sub('', i)
    if 'NULL' not in sub_part:
        if ',' not in sub_part:
            new_lines.append(i)
        else:
            left_part = i.replace(sub_part, '')
            right_part = sub_part
            new_lines.append(left_part + '"' + right_part + '"')
        
movie_info = '\n'.join(new_lines)
movie_table = pd.read_csv(StringIO(movie_info), names = ['MovieID', 'Year', 'Name'])

movie_table.MovieID = movie_table.MovieID.astype(int)
movie_table.Year = movie_table.Year.astype(int)

movie_table = movie_table.set_index('MovieID')

## Генерируем спарс матрицу

In [74]:
customer_alias = {j:i for i, j in enumerate(dataset.CustomerID.unique())}
movie_alias = {j:i for i, j in enumerate(dataset.MovieID.unique())}
year_alias = {j:i for i, j in enumerate(movie_table.Year.unique())}


f1_size = dataset.MovieID.unique().shape[0]
f2_size = dataset.CustomerID.unique().shape[0]
f3_size = movie_table.Year.unique().shape[0]

In [75]:
#sparse_dataset = csr_matrix((f1_size + f2_size + f3_size, dataset.size))
sparse_dataset = lil_matrix((dataset.shape[0], f1_size + f2_size + f3_size))

In [76]:
for j, i in enumerate(dataset.itertuples()):
    if j % 50000 == 0:
        print("\r Progress: {}/{} ({}%)".format(j, dataset.shape[0], int(((j+1) / dataset.shape[0]) * 100)), end="")
    #print(i)
    customer = customer_alias[i.CustomerID]
    movie = movie_alias[i.MovieID]
    year = year_alias[movie_table.loc[i.MovieID].Year]
    
    sparse_dataset[j, customer] = 1
    sparse_dataset[j, f1_size + movie] = 1
    sparse_dataset[j, f1_size + f2_size + year] = 1

 Progress: 24050000/24053575 (99%)

In [39]:
y = dataset.Rating.values

In [78]:
csr_sparse_dataset = csr_matrix(sparse_dataset)

In [81]:
joblib.dump(csr_sparse_dataset, 'sparse_df_1.bin')

['sparse_df_1.bin']

In [40]:
csr_sparse_dataset = joblib.load('sparse_df_1.bin')

## FM на полной матрице

In [46]:
rmse_train = []
rmse_test = []
r2_train = []
r2_test = []
r2_adj_train = []
r2_adj_test = []

kfold = KFold(n_splits=4)

count = 1
for ix_train, ix_test in kfold.split(csr_sparse_dataset):
    print('{}-th fold'.format(count))
    mdl = SGD2WAYFactorizationMachine(epochs=csr_sparse_dataset.shape[0] // 128, batch_size=256, k=2, step=.001, step_V=0.00001, verbose=True, nobs_verbose=100)
    
    X_train = csr_sparse_dataset[ix_train]
    y_train = y[ix_train].reshape(-1, 1)
    
    X_test = csr_sparse_dataset[ix_test]
    y_test = y[ix_test].reshape(-1, 1)
    
    mdl.fit(X_train, y_train)
    
    rmse_test.append(RMSE_score(mdl.predict(X_test), y_test))
    r2_test.append(R2_score(mdl.predict(X_test), y_test))
    r2_adj_test.append(R2_adj_score(mdl.predict(X_test), y_test, csr_sparse_dataset.shape[1]))
    
    rmse_train.append(RMSE_score(mdl.predict(X_train), y_train))
    r2_train.append(R2_score(mdl.predict(X_train), y_train))
    r2_adj_train.append(R2_adj_score(mdl.predict(X_train), y_train, csr_sparse_dataset.shape[1]))
    
    count += 1

1-th fold
 Progress: 187900/187919 (99%), duration: 3 sec2-th fold
 Progress: 187900/187919 (99%), duration: 3 sec3-th fold
 Progress: 187900/187919 (99%), duration: 3 sec4-th fold
 Progress: 187900/187919 (99%), duration: 4 sec

KeyboardInterrupt: 

In [48]:
rmse_test

[1.0870387847803527,
 1.0912097795611546,
 1.1055237437197836,
 1.1061612819509088]

In [49]:
rmse_train

[1.0478473775797363, 1.0490578400791624, 1.048496854418923, 1.0475487576807785]

# ----------------------------------------------------------------------------

In [50]:
df = pd.DataFrame(np.vstack([rmse_test, rmse_train]), index=[
    'rmse_test',
    'rmse_train',
])

df = pd.concat([df, df.mean(axis=1).rename('mean'), df.std(axis=1).rename('std')], axis=1)

df

Unnamed: 0,0,1,2,3,mean,std
rmse_test,1.087039,1.09121,1.105524,1.106161,1.097483,0.009805
rmse_train,1.047847,1.049058,1.048497,1.047549,1.048238,0.000675
