In [1]:
import logging
import logging.config
from pathlib import Path

import numpy as np
import optuna
import pandas as pd
import scipy
import yaml
from tqdm.notebook import tqdm

from implicit.als import AlternatingLeastSquares

from replay.metrics import NDCG


In [2]:
# Logging setup: one named logger that writes every record both to a log file
# and to stdout, using the same formatter for both handlers.
logger_config = """
version: 1
formatters:
    simple:
        class: logging.Formatter
        format: "%(asctime)s, %(name)s, %(levelname)s: %(message)s"
        datefmt: "%d-%b-%y %H:%M:%S"
handlers:
    file_handler:
        class: logging.FileHandler
        filename: logs/implicit_models.log
        level: DEBUG
        formatter: simple
    stream_handler:
        class: logging.StreamHandler
        stream: ext://sys.stdout
        level: DEBUG
        formatter: simple
loggers:
    implicit_models_logger:
        level: DEBUG
        handlers: [file_handler, stream_handler]
        propagate: no
"""
# The document only contains plain scalars, lists and mappings, so the safe
# loader is sufficient and avoids constructing arbitrary Python objects.
config = yaml.safe_load(logger_config)

# logging.FileHandler opens its target file as soon as the handler is built and
# does not create missing directories, so make sure the directory exists first.
log_file = Path(config["handlers"]["file_handler"]["filename"])
log_file.parent.mkdir(parents=True, exist_ok=True)

# `logging.config` is a submodule that `import logging` alone does not load;
# it is imported explicitly in the imports cell.
logging.config.dictConfig(config)

logger = logging.getLogger(name="implicit_models_logger")



In [3]:
# Dataset identifier; also the sub-directory name under data/ and the label
# written to the log when the data is loaded.
DATA_NAME = "lastfm"
# Directory prefix for all input/output files. It is concatenated with file
# names as a plain string, so the trailing slash is required.
DATA_PATH = f"data/{DATA_NAME}/"
TRAIN_FILE_NAME = "train.csv"
VAL_FILE_NAME = "val.csv"
TEST_FILE_NAME = "test.csv"

# Cutoff: number of items recommended per user and the depth passed to NDCG.
k = 10
# NOTE(review): not referenced by any cell visible here — confirm it is still needed.
shuffle = False
# Number of Optuna trials (passed as n_trials to study.optimize).
budget = 10
# Passed as random_state to the ALS models built inside the Optuna objective.
seed = 1909



In [4]:
def load_data(data_path, train_file_name, val_file_name, test_file_name):
    """Read the train, validation and test CSV files into DataFrames.

    Each file location is built by plain string concatenation of `data_path`
    and the file name, so `data_path` must already end with a path separator.

    Args:
        data_path: Directory prefix (including the trailing separator).
        train_file_name: File name of the training split.
        val_file_name: File name of the validation split.
        test_file_name: File name of the test split.

    Returns:
        A 3-tuple ``(train, val, test)`` of ``pandas.DataFrame`` objects.
    """
    file_names = (train_file_name, val_file_name, test_file_name)
    return tuple(pd.read_csv(f"{data_path}{file_name}") for file_name in file_names)



In [5]:
# Read the three splits from disk using the paths defined in the config cell.
train, val, test = load_data(DATA_PATH, TRAIN_FILE_NAME, VAL_FILE_NAME, TEST_FILE_NAME)
logger.info(f"{DATA_NAME} dataset!")

# Cell output: (rows, columns) of the train, validation and test frames.
tuple(split.shape for split in (train, val, test))


19-Jul-22 22:15:21, implicit_models_logger, INFO: lastfm dataset!


((13916045, 4), (110000, 4), (110000, 4))

In [6]:
# Build one CSR interaction matrix per split: rows are user_idx, columns are
# item_idx, values are relevance. No explicit shape is given, so each matrix
# is sized from the largest indices present in its own split and the three
# matrices may therefore have different shapes.
train_data, val_data, test_data = (
    scipy.sparse.csr_matrix(
        (split.relevance, (split.user_idx.values, split.item_idx.values))
    )
    for split in (train, val, test)
)


In [7]:
# Metric object from RePlay; later cells call it as ndcg(predictions, ground_truth, k).
ndcg = NDCG()

In [8]:
def predict(model, users=None):
    """Collect top-`k` recommendations from `model` into one long DataFrame.

    For every requested user, `model.recommend` is called against the global
    `train_data` matrix with `filter_already_liked_items=True`, and the
    returned (item, score) pairs are flattened into rows.

    Args:
        model: Fitted recommender exposing
            ``recommend(user_idx, user_items, N=..., filter_already_liked_items=...)``
            that yields ``(item_idx, score)`` pairs.
        users: Optional iterable of user indices to predict for. Defaults to
            the unique users of the global `test` frame, which is what this
            function always used before the parameter existed.

    Returns:
        ``pandas.DataFrame`` with columns ``user_idx``, ``item_idx`` and
        ``relevance``; one row per recommended item. Users for which the model
        returns fewer than `k` items simply contribute fewer rows.
    """
    if users is None:
        users = test.user_idx.unique()

    # Accumulate plain lists and build the frame once: concatenating a
    # DataFrame per user inside the loop copies the growing result each time.
    user_col, item_col, relevance_col = [], [], []
    for user_idx in tqdm(users):
        recommendations = model.recommend(
            user_idx,
            train_data,
            N=k,
            filter_already_liked_items=True,
        )
        # Emit one row per returned pair instead of assuming exactly k pairs,
        # so a short recommendation list cannot cause a length mismatch.
        for item_idx, rel in recommendations:
            user_col.append(user_idx)
            item_col.append(item_idx)
            relevance_col.append(rel)

    return pd.DataFrame(
        {"user_idx": user_col, "item_idx": item_col, "relevance": relevance_col},
        columns=["user_idx", "item_idx", "relevance"],
    )


In [9]:
def objective(trial):
    """Optuna objective: fit ALS with sampled hyper-parameters, return NDCG@k.

    Samples `factors`, `regularization`, `use_cg` and `iterations` from the
    trial, fits an `AlternatingLeastSquares` model on the transposed
    `train_data` matrix with `random_state=seed`, generates predictions with
    `predict` and scores them against the `val` frame.

    Args:
        trial: The `optuna` trial used to sample hyper-parameters.

    Returns:
        The value of ``ndcg(predictions, val, k)``.
    """
    # Sampling order is kept fixed (factors, regularization, use_cg, iterations).
    factors = trial.suggest_int("factors", 2**3, 2**9, log=True)
    regularization = trial.suggest_float("regularization", 1e-4, 100, log=True)
    use_cg = trial.suggest_categorical("use_cg", [True, False])
    iterations = trial.suggest_int("iterations", 5, 100)

    als = AlternatingLeastSquares(
        use_native=True,
        random_state=seed,
        factors=factors,
        regularization=regularization,
        use_cg=use_cg,
        iterations=iterations,
    )
    als.fit(train_data.T)

    # NOTE(review): predict() iterates over the users of `test` by default,
    # while the score is computed against `val` — confirm both splits cover
    # the same users.
    return ndcg(predict(als), val, k)


In [10]:
# Hyper-parameter search. The sampler is seeded with the notebook-wide `seed`
# so that the sequence of suggested parameters is repeatable across runs
# (TPE is Optuna's default sampler; only the seed is new here).
study = optuna.create_study(
    direction="maximize",
    sampler=optuna.samplers.TPESampler(seed=seed),
)
# Log the actual cutoff `k` rather than a hard-coded 10.
logger.info(f"ALS model optimizing by NDCG@{k} started")
# Stop after `budget` trials or 5 hours, whichever comes first.
study.optimize(objective, n_trials=budget, timeout=3600 * 5, show_progress_bar=True)
logger.info(f"ALS model optimizing by NDCG@{k} finished")
logger.info(f"ALS model best params:\n{study.best_params}")


19-Jul-22 22:15:22, implicit_models_logger, INFO: ALS model optimizing by NDCG@10 started


  self._init_valid()


  0%|          | 0/10 [00:00<?, ?it/s]



  0%|          | 0/80 [00:00<?, ?it/s]

In [None]:
# Refit on the training data with the best hyper-parameters found by the study.
# `study.best_params` only holds the four tuned values, so the fixed arguments
# used inside the objective (`use_native`, `random_state`) are passed again;
# otherwise the final model is unseeded and differs from the tuned ones.
model = AlternatingLeastSquares(
    use_native=True,
    random_state=seed,
    **study.best_params,
)
model.fit(train_data.T)


  0%|          | 0/33 [00:00<?, ?it/s]

In [None]:
# Recommendations of the final model for the users of the test split.
result = predict(model)
# Cell output: NDCG at cutoff k of those recommendations against the test frame.
ndcg(result, test, k)


  0%|          | 0/4030 [00:00<?, ?it/s]

                                                                                

0.019054172910326368

In [None]:
# Persist the test predictions. DataFrame.to_csv does not create missing
# directories, so make sure the target directory exists before writing.
predict_path = Path(f"{DATA_PATH}predicts/implicit_als_test_predict.csv")
predict_path.parent.mkdir(parents=True, exist_ok=True)
result.to_csv(predict_path)