In [1]:
# basic libs
import logging
import logging.config
import os
import sys
import time

import numpy as np
import pandas as pd
import yaml

# data preparation
from replay.utils import convert2spark
from replay.session_handler import (
    get_spark_session,
    State,
    logger_with_settings,
)

# metric for optimization
from replay.metrics import NDCG

# models
from replay.models import (
    ALSWrap,
    SLIM,
    MultVAE,
    KNN,
    PopRec,
    UserPopRec,
    RandomRec,
)


In [2]:
# Obtain the Spark session and hand it to State before anything else uses it.
spark = get_spark_session()
state = State(spark)

# NOTE(review): machine-specific absolute path; presumably this should come
# from configuration rather than a hard-coded home directory -- confirm.
spark_local_dir = "/home/baurzhan/tmp"
spark.conf.set("spark.local.dir", spark_local_dir)

# Show the effective Spark configuration as a list of (key, value) pairs.
spark_conf_entries = spark.sparkContext.getConf().getAll()
display(spark_conf_entries)


22/07/20 06:04:49 WARN Utils: Your hostname, recsys-automl resolves to a loopback address: 127.0.1.1; using 192.168.0.5 instead (on interface ens160)
22/07/20 06:04:49 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
22/07/20 06:04:49 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
22/07/20 06:04:49 WARN SparkConf: Note that spark.local.dir will be overridden by the value set by the cluster manager (via SPARK_LOCAL_DIRS in mesos/standalone/kubernetes and LOCAL_DIRS in YARN).


[('spark.app.id', 'local-1658297090466'),
 ('spark.local.dir', '/home/baurzhan/tmp'),
 ('spark.sql.shuffle.partitions', '36'),
 ('spark.executor.id', 'driver'),
 ('spark.driver.host', 'localhost'),
 ('spark.app.name', 'pyspark-shell'),
 ('spark.driver.bindAddress', '127.0.0.1'),
 ('spark.driver.extraJavaOptions',
  '-Dio.netty.tryReflectionSetAccessible=true'),
 ('spark.driver.memory', '66g'),
 ('spark.sql.catalogImplementation', 'hive'),
 ('spark.sql.warehouse.dir',
  'file:/home/baurzhan/model_comparison/spark-warehouse'),
 ('spark.rdd.compress', 'True'),
 ('spark.serializer.objectStreamReset', '100'),
 ('spark.kryoserializer.buffer.max', '256m'),
 ('spark.master', 'local[*]'),
 ('spark.submit.pyFiles', ''),
 ('spark.submit.deployMode', 'client'),
 ('spark.app.startTime', '1658297089877'),
 ('spark.ui.showConsoleProgress', 'true'),
 ('spark.sql.execution.arrow.pyspark.enabled', 'true'),
 ('spark.driver.port', '42931'),
 ('spark.driver.maxResultSize', '4g')]

In [3]:
# Logging setup for the notebook's own "replay_models" logger: every record
# (DEBUG and above) goes both to logs/replay_models.log and to stdout, and is
# not propagated to the root logger.
logger_config = """
version: 1
formatters:
    simple:
        class: logging.Formatter
        format: "%(asctime)s, %(name)s, %(levelname)s: %(message)s"
        datefmt: "%d-%b-%y %H:%M:%S"
handlers:
    file_handler:
        class: logging.FileHandler
        filename: logs/replay_models.log
        level: DEBUG
        formatter: simple
    stream_handler:
        class: logging.StreamHandler
        stream: ext://sys.stdout
        level: DEBUG
        formatter: simple
loggers:
    replay_models:
        level: DEBUG
        handlers: [file_handler, stream_handler]
        propagate: no
"""
logger = logging.getLogger("replay_models")

# logging.FileHandler does not create missing parent directories, so make sure
# the target directory exists before the configuration is applied.
os.makedirs("logs", exist_ok=True)

# The document contains only plain mappings, lists and scalars, so the safe
# loader is sufficient.
config = yaml.safe_load(logger_config)
logging.config.dictConfig(config)


In [4]:
# Mirror the records of the "replay" logger into logs/replay_logs.
# The cell is safe to re-run: an already attached handler for the same file is
# reused instead of stacking a second one (which would duplicate every line).
REPLAY_LOG_FILE = "logs/replay_logs"

# logging.FileHandler does not create missing parent directories.
os.makedirs(os.path.dirname(REPLAY_LOG_FILE), exist_ok=True)

replay_logger = logging.getLogger("replay")
simple_formatter = logging.Formatter(
    fmt="%(asctime)s, %(name)s, %(levelname)s: %(message)s",
    datefmt="%d-%b-%y %H:%M:%S",
)

# FileHandler stores its target as an absolute path in ``baseFilename``.
file_handler = next(
    (
        handler
        for handler in replay_logger.handlers
        if isinstance(handler, logging.FileHandler)
        and handler.baseFilename == os.path.abspath(REPLAY_LOG_FILE)
    ),
    None,
)
if file_handler is None:
    file_handler = logging.FileHandler(
        REPLAY_LOG_FILE,
        mode="a",
        encoding="utf-8",
    )
    replay_logger.addHandler(file_handler)

file_handler.setLevel(logging.DEBUG)
file_handler.setFormatter(simple_formatter)


In [5]:
# Dataset location: files are read from data/<DATA_NAME>/.
DATA_NAME = "bookcrossing"
DATA_PATH = "data/" + DATA_NAME + "/"
TRAIN_FILE_NAME, VAL_FILE_NAME, TEST_FILE_NAME = "train.csv", "val.csv", "test.csv"

# Experiment settings shared by the cells below.
k, budget = 10, 10  # k: passed as `k` to predict/optimize; budget: passed as `budget` to optimize
seed = 1909         # passed to RandomRec(seed=...)
shuffle = False     # not referenced by the cells visible here


In [6]:
def load_data(data_path, train_file_name, val_file_name, test_file_name):
    """Read the train, validation and test CSV files and convert each one.

    Every file location is formed by plain concatenation of ``data_path`` and
    the file name, so ``data_path`` must already end with a path separator.

    Args:
        data_path: directory prefix of the three files.
        train_file_name: name of the training CSV.
        val_file_name: name of the validation CSV.
        test_file_name: name of the test CSV.

    Returns:
        A 3-tuple ``(train, val, test)`` holding the result of
        ``convert2spark`` applied to each file read with ``pd.read_csv``.
    """
    file_names = (train_file_name, val_file_name, test_file_name)
    return tuple(
        convert2spark(pd.read_csv(f"{data_path}{file_name}"))
        for file_name in file_names
    )


In [7]:
# Load the three pre-split logs from DATA_PATH.
train, val, test = load_data(
    DATA_PATH,
    TRAIN_FILE_NAME,
    VAL_FILE_NAME,
    TEST_FILE_NAME,
)
logger.info(msg=f"{DATA_NAME} dataset!")

# Row counts of (train, val, test), shown as the cell output.
tuple(split.count() for split in (train, val, test))


20-Jul-22 06:04:53, replay_models, INFO: bookcrossing dataset!


22/07/20 06:04:54 WARN TaskSetManager: Stage 0 contains a task of very large size (1574 KiB). The maximum recommended task size is 1000 KiB.
                                                                                

(602879, 25508, 25515)

## Baselines

In [8]:
# Baseline recommenders; only RandomRec takes the shared seed.
# NOTE(review): user_pop_rec_model is not used by the cells visible here.
pop_rec_model, user_pop_rec_model, random_rec_model = (
    PopRec(),
    UserPopRec(),
    RandomRec(seed=seed),
)


In [9]:
# RandomRec baseline: fit on train, recommend k items for every test user,
# then dump the recommendations to CSV. Each stage logs its own failure and
# lets the notebook continue. Only Exception is caught, so interrupting the
# kernel (KeyboardInterrupt) still stops the cell instead of being swallowed.

# Reset first so that a result left over from an earlier run of this cell is
# never saved after a failed prediction.
random_rec_test_predict = None

try:
    logger.info(msg="random rec fitting started")
    random_rec_model.fit(log=train)
    logger.info(msg="random rec fitting finished")
except Exception as e:
    logger.warning(msg="random rec model fitting error")
    logger.warning(msg=f"{e}")

try:
    logger.info(msg="random rec test prediction started")
    random_rec_test_predict = random_rec_model.predict(
        log=train,
        k=k,
        users=test.select("user_idx").distinct(),
        filter_seen_items=True,
    )
    logger.info(msg="random rec test prediction finished")
except Exception as e:
    logger.warning(msg="random rec model prediction error")
    logger.warning(msg=f"{e}")


if random_rec_test_predict is None:
    logger.warning(msg="random rec test predict saving skipped: no prediction available")
else:
    try:
        logger.info(msg="random rec test predict saving")
        # to_csv does not create missing directories.
        os.makedirs(f"{DATA_PATH}predicts", exist_ok=True)
        random_rec_test_predict.toPandas().to_csv(
            f"{DATA_PATH}predicts/random_rec_test_predict.csv", index=False
        )
        logger.info(msg="random rec test predict saved")
    except Exception as e:
        logger.warning(msg="random rec test predict file saving failed")
        logger.warning(msg=f"{e}")


In [10]:
# PopRec baseline: fit on train, recommend k items for every test user, then
# dump the recommendations to CSV. Each stage logs its own failure and lets
# the notebook continue. Only Exception is caught, so interrupting the kernel
# (KeyboardInterrupt) still stops the cell instead of being swallowed.

# Reset first so that a result left over from an earlier run of this cell is
# never saved after a failed prediction.
pop_rec_test_predict = None

try:
    logger.info(msg="pop rec fitting started")
    pop_rec_model.fit(log=train)
    logger.info(msg="pop rec fitting finished")
except Exception as e:
    logger.warning(msg="pop rec model fitting error")
    logger.warning(msg=f"{e}")

try:
    logger.info(msg="pop rec test prediction started")
    pop_rec_test_predict = pop_rec_model.predict(
        log=train,
        k=k,
        users=test.select("user_idx").distinct(),
        filter_seen_items=True,
    )
    logger.info(msg="pop rec test prediction finished")
except Exception as e:
    logger.warning(msg="pop rec model prediction error")
    logger.warning(msg=f"{e}")


if pop_rec_test_predict is None:
    logger.warning(msg="pop rec test predict saving skipped: no prediction available")
else:
    try:
        logger.info(msg="pop rec test predict saving")
        # to_csv does not create missing directories.
        os.makedirs(f"{DATA_PATH}predicts", exist_ok=True)
        pop_rec_test_predict.toPandas().to_csv(
            f"{DATA_PATH}predicts/pop_rec_test_predict.csv", index=False
        )
        logger.info(msg="pop rec test predict saved")
    except Exception as e:
        logger.warning(msg="pop rec test predict file saving failed")
        logger.warning(msg=f"{e}")


In [14]:
# Models to tune: name -> {"model": class, "param_space": {param: [a, b]}}.
# "param_space" is handed to `optimize` as `param_borders`, so each pair is
# presumably a [low, high] search range -- confirm against the library docs.
models_dict = {
    "SLIM": dict(
        model=SLIM,
        param_space=dict(
            beta=[1e-6, 1e2],
            lambda_=[1e-6, 1e2],
        ),
    ),
    # Disabled candidates, kept for reference (same structure as "SLIM"):
    # "kNN": dict(
    #     model=KNN,
    #     param_space=dict(num_neighbours=[10, 50], shrink=[1e-2, 1e3 + 1e-2]),
    # ),
    # "ALS": dict(
    #     model=ALSWrap,
    #     param_space=dict(rank=[5, 100]),
    # ),
    # "MultVAE": dict(
    #     model=MultVAE,
    #     param_space=dict(
    #         learning_rate=[1e-4, 1],
    #         latent_dim=[100, 300],
    #         dropout=[0, 0.5],
    #         anneal=[0.0, 1.0],
    #         l2_reg=[1e-5, 5],
    #     ),
    # ),
}


In [15]:
def optimize_algo(model_name, model_dict, train, val):
    """Run the hyper-parameter search for one model and return its result.

    A fresh instance of ``model_dict["model"]`` is created with no arguments
    and its ``optimize`` method is called with ``train`` as the training data,
    ``val`` as the evaluation data, ``model_dict["param_space"]`` as the
    parameter borders, ``NDCG()`` as the criterion and the module-level ``k``
    and ``budget`` settings.

    Args:
        model_name: label used in log messages only.
        model_dict: mapping with the keys ``"model"`` (class to instantiate)
            and ``"param_space"``.
        train: data passed to ``optimize`` as ``train``.
        val: data passed to ``optimize`` as ``test``.

    Returns:
        Whatever ``optimize`` returns, or an empty dict when any ``Exception``
        is raised along the way. ``KeyboardInterrupt`` and ``SystemExit`` are
        deliberately not intercepted so a long search can still be aborted.
    """
    logger.info(msg=f"Replay {model_name} optimizing started!")
    try:
        logger.info(msg=f"{model_name} model optimizing.")
        model = model_dict["model"]()
        best_params = model.optimize(
            train=train,
            test=val,
            param_borders=model_dict["param_space"],
            criterion=NDCG(),
            k=k,
            budget=budget,
        )
        logger.info(msg=f"Best params of {model_name} model are:\n{best_params}")
        logger.info(msg=f"{model_name} model optimizing finished.")

        return best_params
    except Exception as e:
        logger.warning(msg=f"{model_name} model optimization failed")
        # Record the reason as well; without it the failure cannot be diagnosed.
        logger.warning(msg=f"{e}")

        return dict()


def train_fit_test_predict(model_name, model_dict, train, users=None):
    """Build a model from its best parameters, fit it and predict for users.

    Args:
        model_name: label used in log messages only.
        model_dict: mapping with the key ``"model"`` (class to instantiate)
            and, optionally, ``"best_params"`` (keyword arguments for the
            constructor). A missing or ``None`` ``"best_params"`` means the
            model is created with its defaults.
        train: data the model is fitted on; also passed to ``predict`` as
            ``log``.
        users: users to predict for. When omitted, the distinct ``user_idx``
            values of the module-level ``test`` frame are used, which is what
            this function always did before the parameter existed.

    Returns:
        The result of ``model.predict`` (``k`` items per user taken from the
        module-level ``k``, with ``filter_seen_items=True``), or ``None`` when
        any ``Exception`` is raised. ``KeyboardInterrupt`` and ``SystemExit``
        are deliberately not intercepted so a long fit can still be aborted.
    """
    logger.info(msg=f"Replay {model_name} training and prediction started!")
    try:
        # `**None` would raise, so fall back to default constructor arguments.
        best_params = model_dict.get("best_params") or {}
        model = model_dict["model"](**best_params)
        logger.debug(f"{model_name} model instance created.")

        logger.debug(f"{model_name} model training started.")
        start = time.time()
        model.fit(train)
        end = time.time()
        logger.debug(f"{model_name} model training finished.")
        logger.info(f"{model_name} model training time {end-start}")

        if users is None:
            users = test.select("user_idx").distinct()

        logger.debug(f"{model_name} model predict started.")
        pred = model.predict(
            log=train,
            k=k,
            users=users,
            filter_seen_items=True,
        )
        logger.debug(f"{model_name} model predict finished.")

        return pred
    except Exception as e:
        logger.warning(msg=f"{model_name} model fit-predict failed")
        logger.warning(msg=f"{e}")
        return None


def save_test_predict(model_name, model_dict):
    """Write a model's test prediction to a CSV file under DATA_PATH.

    The prediction is taken from ``model_dict["pred"]``, converted with
    ``toPandas()`` and written without the index to
    ``<DATA_PATH>predicts/replay_<model_name>_test_predict.csv``.

    Args:
        model_name: used in the output file name and in log messages.
        model_dict: mapping expected to hold the prediction under ``"pred"``.

    Returns:
        None. A missing or ``None`` prediction is logged and skipped; any
        ``Exception`` raised while saving is logged and not re-raised.
        ``KeyboardInterrupt`` and ``SystemExit`` are not intercepted.
    """
    pred = model_dict.get("pred")
    if pred is None:
        # Nothing to save, so report that rather than an AttributeError on None.
        logger.warning(
            msg=f"{model_name} model test predict saving skipped: no prediction available"
        )
        return

    try:
        logger.info(msg=f"{model_name} test predict saving started")
        # to_csv does not create missing directories.
        os.makedirs(f"{DATA_PATH}predicts", exist_ok=True)
        pred.toPandas().to_csv(
            f"{DATA_PATH}predicts/replay_{model_name}_test_predict.csv",
            index=False,
        )
        logger.info(msg=f"{model_name} test predict saving finished")
    except Exception as e:
        logger.warning(msg=f"{model_name} model test predict saving failed")
        logger.warning(msg=f"{e}")


In [16]:
# For every configured model: tune, refit with the tuned parameters, predict
# for the test users and save the result. Intermediate results are stored back
# into the model's own entry under "best_params" and "pred".
for model_name in models_dict:
    model_dict = models_dict[model_name]
    print(model_name, model_dict, sep="\n")

    tuned_params = optimize_algo(model_name, model_dict, train, val)
    model_dict.update(best_params=tuned_params)

    test_pred = train_fit_test_predict(model_name, model_dict, train)
    model_dict.update(pred=test_pred)

    save_test_predict(model_name, model_dict)


SLIM
{'model': <class 'replay.models.slim.SLIM'>, 'param_space': {'beta': [1e-06, 100.0], 'lambda_': [1e-06, 100.0]}}
20-Jul-22 06:06:47, replay_models, INFO: Replay SLIM training and prediction started!
20-Jul-22 06:06:47, replay_models, DEBUG: SLIM model instance creatred.
20-Jul-22 06:06:47, replay_models, DEBUG: SLIM model training started.


22/07/20 06:06:47 WARN TaskSetManager: Stage 6 contains a task of very large size (1574 KiB). The maximum recommended task size is 1000 KiB.
22/07/20 06:06:48 WARN TaskSetManager: Stage 9 contains a task of very large size (1574 KiB). The maximum recommended task size is 1000 KiB.
22/07/20 06:06:48 WARN TaskSetManager: Stage 18 contains a task of very large size (1574 KiB). The maximum recommended task size is 1000 KiB.
[Stage 20:>                                                       (0 + 12) / 36]