In [3]:
!pip install matrix_factorization

Collecting matrix_factorization
  Using cached matrix_factorization-1.3.tar.gz (12 kB)
  Preparing metadata (setup.py): started
  Preparing metadata (setup.py): finished with status 'done'
Building wheels for collected packages: matrix-factorization
  Building wheel for matrix-factorization (setup.py): started
  Building wheel for matrix-factorization (setup.py): finished with status 'done'
  Created wheel for matrix-factorization: filename=matrix_factorization-1.3-py3-none-any.whl size=16467 sha256=f8451f2bdcd9a3b4f09442f6675d0e0ecf19b3b8cd334b5056bcfe5e61cf1326
  Stored in directory: c:\users\taiminh\appdata\local\pip\cache\wheels\f6\34\7b\11dd792f060e9c2d7230855147d3bfadbd2b6bbcf5132dcbbc
Successfully built matrix-factorization
Installing collected packages: matrix-factorization
Successfully installed matrix-factorization-1.3


You should consider upgrading via the 'c:\users\taiminh\anaconda3\python.exe -m pip install --upgrade pip' command.


In [1]:
# Number of training epochs; used as the default for MF(..., num_epochs=...).
NUM_EPOCHS = 100
# Number of epochs for each incremental update; used as the default for MF.update(..., n_epochs=...).
UPDATE_N_EPOCHS = 10

In [2]:
# Data manipulation
import numpy as np
import pandas as pd

# Modeling
from matrix_factorization import BaselineModel, KernelMF, train_update_test_split
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

# Other
import os
import random
import sys
    
# Saving model
import pickle

# Seed both NumPy's and Python's global random number generators with the same value.
rand_seed = 2
np.random.seed(rand_seed)
random.seed(rand_seed)

# Location of the ratings CSV, relative to the notebook's working directory.
DATA_PATH = './data/Ratings.csv'
# Column names that read_data assigns to the first three CSV columns.
COL_NAMES = ['user_id', 'item_id', 'rating']

def read_data(data_path, col_names):
    """Read the first three columns of a CSV file into a DataFrame.

    The file's own header row (row 0) is consumed and replaced by
    ``col_names``; any columns beyond the third are ignored.

    Args:
        data_path: path (or file-like object) handed to ``pd.read_csv``.
        col_names: names to assign to the three columns that are kept.

    Returns:
        pd.DataFrame with the columns named as in ``col_names``.
    """
    kept_columns = [0, 1, 2]
    ratings = pd.read_csv(
        data_path,
        usecols=kept_columns,
        names=col_names,
        header=0,
    )
    return ratings
    
def split_train_test(dataframe, test_size=0.2, random_state=None):
    """Split a ratings frame into train/test features and targets.

    Args:
        dataframe: DataFrame with ``user_id``, ``item_id`` and ``rating`` columns.
        test_size: fraction (or count) of rows held out for testing; forwarded
            to ``train_test_split``. Defaults to 0.2.
        random_state: forwarded to ``train_test_split``. With the default
            ``None`` the shuffle draws from NumPy's global RNG, so a prior
            ``np.random.seed`` call still controls the split.

    Returns:
        Tuple ``(X_train, X_test, y_train, y_test)`` where the ``X_*`` parts
        hold the ``user_id``/``item_id`` columns and the ``y_*`` parts hold
        the ``rating`` column.
    """
    # Read from the argument, not from a notebook-level global, so the function
    # works on any frame and does not depend on cell execution order.
    X = dataframe[['user_id', 'item_id']]
    y = dataframe['rating']
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )
    return (X_train, X_test, y_train, y_test)

class MF:
    """Wrapper around ``KernelMF`` adding training, incremental updates,
    evaluation, recommendation and pickle-based persistence.

    The fitted ``KernelMF`` instance is stored in ``self.model``.
    """

    def __init__(self, X_train=None, y_train=None, num_epochs=None, save_model_path=None):
        """Train a new model, or restore a previously saved one.

        Args:
            X_train: training features (``user_id``/``item_id`` frame).
                Required unless ``save_model_path`` is given.
            y_train: training ratings. Required unless ``save_model_path`` is given.
            num_epochs: number of training epochs. ``None`` means "use the
                current value of ``NUM_EPOCHS``", resolved at call time.
            save_model_path (str): path of a model written by ``save_model``,
                without the ".pkl" extension. When given, the saved wrapper is
                loaded and all other parameters are ignored.

        Raises:
            ValueError: if neither training data nor ``save_model_path`` is supplied.
        """
        if save_model_path is None:
            if X_train is None or y_train is None:
                raise ValueError(
                    "X_train and y_train are required when save_model_path is not given"
                )
            if num_epochs is None:
                num_epochs = NUM_EPOCHS
            self.model = self.__train(X_train, y_train, num_epochs)
        else:
            # Rebinding ``self`` would only change a local name and leave this
            # instance empty, so copy the restored state onto it instead.
            restored = MF.load_model(save_model_path)
            self.__dict__.update(vars(restored))

    def __train(self, X_train, y_train, num_epochs):
        """Fit and return a ``KernelMF`` with this notebook's fixed hyperparameters."""
        matrix_fact = KernelMF(n_epochs = num_epochs, n_factors = 100, verbose = 1, lr = 0.001, reg = 0.2)
        matrix_fact.fit(X_train, y_train)
        return matrix_fact

    def update(self, new_X_train: pd.DataFrame, new_y_train: pd.Series, n_epochs=None):
        """Incrementally update the model with new ratings via ``update_users``.

        Args:
            new_X_train: a dataframe with two cols: user_id, item_id
            new_y_train: a series of ratings corresponding to new_X_train
            n_epochs: number of update epochs. ``None`` means "use the current
                value of ``UPDATE_N_EPOCHS``", resolved at call time.
        """
        if n_epochs is None:
            n_epochs = UPDATE_N_EPOCHS
        self.model.update_users(new_X_train, new_y_train, lr=0.001, n_epochs=n_epochs, verbose=1)

    def evaluate(self, X_test, y_test):
        """Compute the RMSE of the model's predictions on test data and print it."""
        pred = self.model.predict(X_test)
        # Take the square root explicitly: the ``squared=False`` keyword of
        # ``mean_squared_error`` is deprecated/removed in newer scikit-learn.
        rmse = np.sqrt(mean_squared_error(y_test, pred))
        print(f'\nTest RMSE: {rmse:.4f}')

    def recommend(self, user_id, items_known, numMovie: int):
        """Return the model's recommendations for ``user_id``.

        Args:
            user_id: id of the user to recommend for.
            items_known: item ids forwarded as ``items_known`` to the model.
            numMovie: number of items requested (forwarded as ``amount``).
        """
        return self.model.recommend(user=user_id, items_known=items_known, amount=numMovie)

    def save_model(self, save_path):
        """Pickle this wrapper (including the fitted model) to ``save_path + ".pkl"``.

        Args:
            save_path (str): destination path; do not include ".pkl".
        """
        target = save_path + ".pkl"
        parent = os.path.dirname(target)
        if parent:
            # Create the destination folder on first use instead of failing.
            os.makedirs(parent, exist_ok=True)
        with open(target, "wb") as file_to_store:
            pickle.dump(self, file_to_store)

    @staticmethod
    def load_model(save_path):
        """Load and return a wrapper previously written by ``save_model``.

        Args:
            save_path (str): source path; do not include ".pkl".

        Returns:
            The unpickled object (an ``MF`` instance when written by ``save_model``).
        """
        # SECURITY: unpickling executes arbitrary code; only load files you trust.
        with open(save_path + ".pkl", "rb") as file_to_read:
            return pickle.load(file_to_read)

In [3]:
# Load the ratings file into `data` and display it as the cell output.
data = read_data(DATA_PATH, COL_NAMES)
data

Unnamed: 0,user_id,item_id,rating
0,1,110,1.0
1,1,147,4.5
2,1,858,5.0
3,1,1246,5.0
4,1,1968,4.0
...,...,...,...
996462,23624,1358,3.5
996463,23624,1392,4.0
996464,23624,1394,4.0
996465,23624,1914,3.5


In [4]:
# Split the ratings into train and test parts (features: user_id/item_id, target: rating).
X_train, X_test, y_train, y_test = split_train_test(data)

In [5]:
# Create the MF wrapper, which fits a KernelMF model on the training split
# for NUM_EPOCHS epochs (the wrapper trains with verbose=1, hence the per-epoch log).
mf = MF(X_train, y_train, num_epochs=NUM_EPOCHS)

Epoch  1 / 100  -  train_rmse: 0.9734767464666649
Epoch  2 / 100  -  train_rmse: 0.9476491848061338
Epoch  3 / 100  -  train_rmse: 0.9327470177034417
Epoch  4 / 100  -  train_rmse: 0.9223976472407741
Epoch  5 / 100  -  train_rmse: 0.9146888815854218
Epoch  6 / 100  -  train_rmse: 0.9087601617231631
Epoch  7 / 100  -  train_rmse: 0.9038944851521955
Epoch  8 / 100  -  train_rmse: 0.8997712370422365
Epoch  9 / 100  -  train_rmse: 0.8963526498505504
Epoch  10 / 100  -  train_rmse: 0.8934368046993567
Epoch  11 / 100  -  train_rmse: 0.8907429555531162
Epoch  12 / 100  -  train_rmse: 0.8885654429673283
Epoch  13 / 100  -  train_rmse: 0.886582828965991
Epoch  14 / 100  -  train_rmse: 0.8847367506729163
Epoch  15 / 100  -  train_rmse: 0.8830523092231902
Epoch  16 / 100  -  train_rmse: 0.8815375444203958
Epoch  17 / 100  -  train_rmse: 0.8801285960809492
Epoch  18 / 100  -  train_rmse: 0.8789785835748605
Epoch  19 / 100  -  train_rmse: 0.8777834428479324
Epoch  20 / 100  -  train_rmse: 0.8766901

In [6]:
# Print the RMSE of the trained model on the held-out test split.
mf.evaluate(X_test, y_test)


Test RMSE: 0.8798


In [7]:
# Ask the model for 12 recommendations for one example user.
user = 200
# Items this user has already rated; forwarded as `items_known`
# (presumably so they are excluded from the result — confirm against KernelMF.recommend).
items_known = data.query('user_id == @user')['item_id']
recommendation = mf.recommend(user, items_known, numMovie=12)

In [8]:
# Display the recommendations computed in the previous cell.
recommendation

Unnamed: 0,user_id,item_id,rating_pred
114,200,318,4.322971
1718,200,5121,4.318037
2477,200,668,4.298154
3456,200,632,4.289271
2716,200,8199,4.26789
1340,200,5056,4.266398
343,200,2019,4.263767
850,200,3134,4.258894
1302,200,8197,4.256908
1740,200,27033,4.25344


In [9]:
# One new (user, item) pair and its rating, in the shapes MF.update expects:
# a two-column frame plus a matching series of ratings.
user_item = pd.DataFrame({'user_id': [1], 'item_id': [102899]})
rating = pd.Series([4.3])

In [10]:
# Incrementally update the trained model with the new rating (UPDATE_N_EPOCHS epochs by default).
mf.update(user_item, rating)

Epoch  1 / 10  -  train_rmse: 0.8276972097820896
Epoch  2 / 10  -  train_rmse: 0.826098435531974
Epoch  3 / 10  -  train_rmse: 0.8245030677328469
Epoch  4 / 10  -  train_rmse: 0.8229110991267063
Epoch  5 / 10  -  train_rmse: 0.8213225224710121
Epoch  6 / 10  -  train_rmse: 0.8197373305386586
Epoch  7 / 10  -  train_rmse: 0.8181555161179359
Epoch  8 / 10  -  train_rmse: 0.8165770720125014
Epoch  9 / 10  -  train_rmse: 0.8150019910413455
Epoch  10 / 10  -  train_rmse: 0.813430266038758


In [11]:
# Pickle the wrapper to ./model/mf.pkl (save_model appends the ".pkl" extension itself).
mf.save_model('./model/mf')

In [12]:
# Unbind the in-memory model so the following cells can only use the copy reloaded from disk.
del mf

In [13]:
# Restore the wrapper from ./model/mf.pkl (load_model appends the ".pkl" extension itself).
load_mf = MF.load_model('./model/mf') 

In [14]:
# Request 10 recommendations for the same user from the reloaded model.
load_mf.recommend(user, items_known, 10)

Unnamed: 0,user_id,item_id,rating_pred
114,200,318,4.322971
1718,200,5121,4.318037
2477,200,668,4.298154
3456,200,632,4.289271
2716,200,8199,4.26789
1340,200,5056,4.266398
343,200,2019,4.263767
850,200,3134,4.258894
1302,200,8197,4.256908
1740,200,27033,4.25344


In [15]:
# Check that the reloaded model still accepts incremental updates.
load_mf.update(user_item, rating)

Epoch  1 / 10  -  train_rmse: 1.050448049142393
Epoch  2 / 10  -  train_rmse: 1.0483746676630536
Epoch  3 / 10  -  train_rmse: 1.0463057038633146
Epoch  4 / 10  -  train_rmse: 1.0442411483305842
Epoch  5 / 10  -  train_rmse: 1.042180991672327
Epoch  6 / 10  -  train_rmse: 1.0401252245160175
Epoch  7 / 10  -  train_rmse: 1.0380738375091019
Epoch  8 / 10  -  train_rmse: 1.0360268213189525
Epoch  9 / 10  -  train_rmse: 1.0339841666328269
Epoch  10 / 10  -  train_rmse: 1.0319458641578256
