### Install Beta-recsys

## Loading dataset

In [32]:
import random
import sys

import numpy as np

# Add the parent directory to the import path before importing beta_rec
# (presumably so the local beta_rec checkout is importable -- confirm).
sys.path.append("../")

from beta_rec.data.grocery_data import GroceryData
from beta_rec.datasets.instacart import Instacart_25

# One seed shared by both global RNGs so that repeated runs are reproducible.
seed = 2021
np.random.seed(seed)
random.seed(seed)

# Prerequisite: the Instacart data must already have been downloaded from
# https://www.kaggle.com/c/instacart-market-basket-analysis#
# and uncompressed into this folder: ../datasets/instacart_25/raw/*.csv

# The filtering conditions are specified when the dataset object is created.
dataset = Instacart_25(min_u_c=20, min_i_c=30, min_o_c=10)
dataset.preprocess()  # Resample 25%

# Split the data (temporal basket split); force_redo=True is passed explicitly.
split_dataset = dataset.load_temporal_basket_split(
    test_rate=0.2,
    n_test=10,
    force_redo=True,
)
data = GroceryData(split_dataset)

Start loading data from raw data
Start sampling 25% users from the raw data
Loading raw data completed
--------------------------------------------------------------------------------
Raw interaction statistics
+---------+------------+------------+--------------+-----------------+-------------+
|         | col_user   | col_item   | col_rating   | col_timestamp   | col_order   |
|---------+------------+------------+--------------+-----------------+-------------|
| count   | 8415977    | 8415977    | 8415977      | 8415977         | 8415977     |
| nunique | 51552      | 47254      | 1            | 8415977         | 833644      |
+---------+------------+------------+--------------+-----------------+-------------+
--------------------------------------------------------------------------------
filter_user_item_order under condition min_u_c=20, min_i_c=30, min_o_c=10
--------------------------------------------------------------------------------
Dataset statistics before filter
+---------

In [None]:
import random
import sys

import numpy as np

# Add the parent directory to the import path before importing beta_rec.
sys.path.append("../")

from beta_rec.data.grocery_data import GroceryData
from beta_rec.datasets.instacart import Instacart_25

# Fix random seeds for reproducibility. Both Python's `random` module and
# NumPy's global RNG are seeded: seeding only one of them leaves any
# NumPy-based sampling non-deterministic between runs.
random.seed(2021)
np.random.seed(2021)

# Filtering conditions are given at construction time.
dataset = Instacart_25(min_u_c=20, min_i_c=30, min_o_c=10)
dataset.preprocess()  # Resample 25%

# Temporal basket split, recomputed on every run because force_redo=True.
split_dataset = dataset.load_temporal_basket_split(
    test_rate=0.2, n_test=10, force_redo=True
)
data = GroceryData(split_dataset)

In [None]:
import random
import sys

# Add the parent directory to the import path before importing beta_rec.
sys.path.append("../")

from beta_rec.data.grocery_data import GroceryData
from beta_rec.datasets.instacart import Instacart_25

# This variant seeds Python's RNG with a different value (2029).
random.seed(2029)

dataset = Instacart_25(min_u_c=20, min_i_c=30, min_o_c=10)
dataset.preprocess()  # Resample 25%

# Same temporal basket split call as elsewhere, but with n_test=1.
split_dataset = dataset.load_temporal_basket_split(
    test_rate=0.2,
    n_test=1,
    force_redo=True,
)
# Wrapping the split in GroceryData is intentionally left disabled in this cell:
# data = GroceryData(split_dataset)

In [None]:
import sys

# Add the parent directory to the import path before importing beta_rec.
sys.path.append("../")

from beta_rec.data.grocery_data import GroceryData
from beta_rec.datasets.instacart import Instacart_25

# Same filtering conditions as before; the split is loaded with default arguments.
dataset = Instacart_25(min_u_c=20, min_i_c=30, min_o_c=10)
split_dataset = dataset.load_temporal_basket_split()
data = GroceryData(split_dataset)

In [None]:
# NOTE(review): this cell repeats the previous cell statement for statement;
# consider removing one of the two.
import sys

# Add the parent directory to the import path before importing beta_rec.
sys.path.append("../")

from beta_rec.data.grocery_data import GroceryData
from beta_rec.datasets.instacart import Instacart_25

# Filtering conditions at construction; split loaded with default arguments.
dataset = Instacart_25(min_u_c=20, min_i_c=30, min_o_c=10)
split_dataset = dataset.load_temporal_basket_split()
data = GroceryData(split_dataset)

In [None]:
!ls /home/zm324/workspace/beta_rec/datasets/instacart_25/raw

### Model config

In [None]:
# The "config_file" key is required: it names the default config to load.
# Any other key given here replaces the corresponding default setting.
config = {
    "config_file": "../configs/vbcar_default.json",
    "n_sample": 100_000,  # To reduce the test running time
    "max_epoch": 5,
}

### Model initialization and training

In [None]:
# Build a VBCAR recommender from `config` and train it on `data`.
# Both names must already exist in the kernel (they are assigned in earlier cells).
from beta_rec.recommenders import VBCAR

model = VBCAR(config)
model.train(data)

# @To be discussed -- alternative training signatures under consideration:
# model.train(train_df)
# Case 1: without validation; stop training by loss or max_epoch.

# model.train(train_df,valid_df[0])
# Case 2: with validation; stop training by performance on the validation set.

# model.train(train_df,valid_df[0],test_df[0])
# Case 3: same as Case 2, but also evaluate performance on the test set for each epoch.

# Note that the best model will be saved automatically, and the model-save-dir is recorded.

### Model testing

In [None]:
# Run the model's test routine on the first entry of `data.test`.
model.test(data.test[0])

### Load a pre-trained Model, and Predict for new dataset

In [None]:
from beta_rec.recommenders import MatrixFactorization

config = {
    "config_file": "../configs/mf_default.json",
}
# model_dir = model.config["system"]["model_save_dir"]  # default saving dir for the current model
# A specific, previously saved checkpoint. The path is given relative to the
# notebook (like the config path above) rather than as a user-specific absolute
# path, so it does not depend on one machine's home directory.
model_dir = "../checkpoints/MF_default_20200912_173445_gccwwj/mf.model"
model = MatrixFactorization(config)
# init_engine must be called before loading: the model cannot be initialized
# until the numbers of users and items are known, and those come from `data`.
model.init_engine(data)

model.load(model_dir=model_dir)
# Predict scores for the first entry of `data.test` and display them.
scores = model.predict(data.test[0])
scores

In [None]:
# Inspect the dimensions of the prediction output from the previous cell.
scores.shape

## Model tuning

In [None]:
# Prerequisite for tuning: beta_rec has to be installed into the local Python
# library. For example, from the project folder run:
#     python setup.py install --record files.txt
# If something goes wrong, uninstall with:
#     xargs rm -rf < files.txt

from beta_rec.data import BaseData
from beta_rec.datasets.movielens import Movielens_100k
from beta_rec.recommenders import MatrixFactorization

# "tune": True switches tuning on; max_epoch is kept small for this run.
config = {
    "config_file": "../configs/mf_default.json",
    "tune": True,
    "max_epoch": 2,
}

# MovieLens-100k with a leave-one-out split (n_test=1).
dataset = Movielens_100k()
split_dataset = dataset.load_leave_one_out(n_test=1)
data = BaseData(split_dataset)

model = MatrixFactorization(config)
tune_result = model.train(data)
tune_result

### Note that ray version should be 0.8.5.


In [None]:
import ray

# Only the last expression of a cell is displayed, so a bare `ray.__version__`
# in the middle of the cell is evaluated and silently discarded. Print it
# explicitly so the installed ray version is actually visible.
print(ray.__version__)
ray.init()