# Models: Matrix Factorization (Surprise SVD)

In [1]:
%load_ext autoreload
%autoreload 2

In [4]:
import sys
sys.path.append('../../src')

import numpy as np
import pandas as pd

import logging
import random

import api

from surprise import BaselineOnly, Dataset, Reader
from surprise.model_selection import cross_validate
from surprise import SVD
from surprise import Dataset
from surprise.model_selection import cross_validate

In [5]:
import warnings
# Suppress every warning category for the rest of the kernel session.
# NOTE(review): this also hides deprecation notices raised by the libraries used below.
warnings.filterwarnings('ignore')

In [None]:
import os

# Scratch directory for the CSV files this notebook writes and then reads back.
TEMP_PATH                   = '../temp'
INTERACTIONS_PATH           = f'{TEMP_PATH}/interations.csv'
COMPLETED_INTERACTIONS_PATH = f'{TEMP_PATH}/completed_interations.csv'

# exist_ok=True keeps the cell re-runnable (no error when the directory is
# already there) and avoids depending on a shell `mkdir`.
os.makedirs(TEMP_PATH, exist_ok=True)

### Functions & Classes

In [7]:
def substring_after(s, delim):
    """Return the part of `s` that follows the first occurrence of `delim`.

    An empty string is returned when `delim` does not occur in `s`.
    """
    _head, _sep, tail = s.partition(delim)
    return tail

def to_interaction_row(e):
    """Coerce the 'user', 'item' and 'rating' fields of `e` in place and return `e`.

    'user' becomes an int, 'rating' a float. 'item' is reduced to the text that
    follows 'items/' with every '/' removed, then converted to an int.
    """
    converters = (
        ('user',   int),
        ('item',   lambda raw: int(substring_after(raw, 'items/').replace('/', ''))),
        ('rating', float),
    )
    # Fields are converted one after another, in the order listed above.
    for field, convert in converters:
        e[field] = convert(e[field])
    return e

In [8]:
class InteractionsService:
    """Per-user rated / unrated item look-ups over an interactions DataFrame.

    `df` must provide 'user', 'item' and 'rating' columns.
    """

    def __init__(self, df): self.df = df

    def rated_items_by_user(self):
        """Map each user to the set of items that user rated with a value > 0.

        Rows whose rating is missing (None/NaN) or not positive are ignored, so
        a user with no positive rating does not appear in the result. Users are
        returned in order of first appearance.
        """
        # Casting to float turns None into NaN, and NaN > 0 is False, so missing
        # ratings are filtered out by the same comparison.
        is_rated = self.df['rating'].astype('float64') > 0
        rated    = self.df.loc[is_rated, ['user', 'item']]

        # Grouping column-wise (instead of DataFrame.iterrows) keeps the ids in
        # their own dtype: iterrows upcasts a mixed int/float row to float, so
        # user 1 came back as 1.0 and str(user) produced '1.0'.
        return {
            user: set(items.tolist())
            for user, items in rated.groupby('user', sort=False)['item']
        }

    def unrated_items_by_user(self):
        """Map each user to a list of the items that user has no positive rating for.

        Only users returned by `rated_items_by_user` are included. The order of
        each list is unspecified.
        """
        all_items = set(self.df['item'].tolist())

        return {
            user: list(all_items - user_rated_items)
            for user, user_rated_items in self.rated_items_by_user().items()
        }

### Get interactions

In [9]:
import os
from getpass import getpass

# The API token is a credential and must not live in the notebook: read it from
# the RECSYS_API_TOKEN environment variable, or prompt for it when unset.
# NOTE(review): the token that used to be committed here should be revoked.
recsys = api.RecSysApi(
    host  = os.environ.get('RECSYS_API_HOST', 'http://nonosoft.ddns.net:8888'),
    token = os.environ.get('RECSYS_API_TOKEN') or getpass('RecSys API token: ')
)

# Page through the 'interactions' resource, 100000 records per request.
interactions_iterator = api.ResourceIterator(recsys, 'interactions', page_size = 100000)

# Every downloaded record is passed through to_interaction_row.
interactions = api.to_dataframe(interactions_iterator, to_interaction_row)

# interactions.info()

Page 1 downloaded. Items: 100000/1205418.
Page 2 downloaded. Items: 200000/1205418.
Page 3 downloaded. Items: 300000/1205418.
Page 4 downloaded. Items: 400000/1205418.
Page 5 downloaded. Items: 500000/1205418.
Page 6 downloaded. Items: 600000/1205418.
Page 7 downloaded. Items: 700000/1205418.
Page 8 downloaded. Items: 800000/1205418.
Page 9 downloaded. Items: 900000/1205418.
Page 10 downloaded. Items: 1000000/1205418.
Page 11 downloaded. Items: 1100000/1205418.
Page 12 downloaded. Items: 1200000/1205418.
Page 13 downloaded. Items: 1300000/1205418.


In [10]:
# interactions.shape

In [11]:
# Write exactly the three fields, in the order the Reader's
# line_format ("user item rating") expects, whatever other columns or column
# order the downloaded frame has. No header and no index column are written.
interactions.to_csv(
    INTERACTIONS_PATH,
    columns=['user', 'item', 'rating'],
    encoding='utf-8',
    index=False,
    header=False,
)

### Train Model

In [12]:
# Lines are parsed as comma-separated fields in the order user, item, rating.
reader = Reader(sep=",", line_format="user item rating")
data   = Dataset.load_from_file(INTERACTIONS_PATH, reader)

In [13]:
# Seed both global RNGs before evaluating so the fold assignment and the SVD
# initialisation (and therefore the reported RMSE/MAE) are repeatable across
# kernel restarts.
SEED = 0
random.seed(SEED)
np.random.seed(SEED)

model = SVD()
cross_validate(model, data, verbose=True)

Evaluating RMSE, MAE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.7704  0.7707  0.7705  0.7680  0.7722  0.7703  0.0013  
MAE (testset)     0.5802  0.5792  0.5794  0.5780  0.5802  0.5794  0.0008  
Fit time          14.77   17.06   16.28   15.98   16.18   16.06   0.74    
Test time         1.84    1.72    1.93    1.63    1.60    1.74    0.13    


In [14]:
# Build the training set from the whole dataset (no held-out split).
trainset = data.build_full_trainset()

In [15]:
# Fix the RNG used for factor initialisation so the final model (and every
# prediction derived from it) is the same on each run.
model = SVD(random_state=0)
model.fit(trainset)

In [16]:
# For each user, collect the items that user has no positive rating for.
interactions_service  = InteractionsService(interactions)
unrated_items_by_user = interactions_service.unrated_items_by_user()

In [None]:
# Predict a rating for every (user, unrated item) pair.
# The records are collected first and concatenated once: DataFrame.append
# returned a new frame (so calling it without assigning the result left
# `interactions` unchanged), copied the whole frame on every call, and no
# longer exists in pandas 2.x.
predicted_rows = [
    {
        'user'  : user,
        'item'  : item,
        'rating': model.predict(str(user), str(item)).est
    }
    for user, items in unrated_items_by_user.items()
    for item in items
]

# NOTE: this rebinds `interactions`; re-running the cell without re-running the
# cells above would append the predictions a second time.
if predicted_rows:
    interactions = pd.concat(
        [interactions, pd.DataFrame(predicted_rows)],
        ignore_index=True
    )

In [None]:
# Persist the frame as UTF-8 CSV, without a header row or an index column.
interactions.to_csv(
    COMPLETED_INTERACTIONS_PATH,
    header=False,
    index=False,
    encoding='utf-8',
)