In [1]:
# basic libs
import logging
import logging.config
import os
import sys
import time

import yaml

import numpy as np
import pandas as pd

# data preparation
from replay.utils import convert2spark
from replay.session_handler import get_spark_session, State, logger_with_settings
from replay.splitters import UserSplitter


In [2]:
# Create the Spark session through RePlay's helper and hand it to RePlay's
# State object (presumably so library code reuses this session — TODO confirm).
spark = get_spark_session()
state = State(spark)
# NOTE(review): hard-coded, user-specific absolute path. The Spark warning
# printed under this cell says spark.local.dir can be overridden by the cluster
# manager; confirm that setting it on an already-created session has any effect.
spark.conf.set("spark.local.dir", "/home/baurzhan/tmp")
# Show the Spark configuration of the running context for the record.
display(spark.sparkContext.getConf().getAll())


22/07/20 16:02:07 WARN Utils: Your hostname, recsys-automl resolves to a loopback address: 127.0.1.1; using 192.168.0.5 instead (on interface ens160)
22/07/20 16:02:07 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
22/07/20 16:02:07 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
22/07/20 16:02:07 WARN SparkConf: Note that spark.local.dir will be overridden by the value set by the cluster manager (via SPARK_LOCAL_DIRS in mesos/standalone/kubernetes and LOCAL_DIRS in YARN).


[('spark.local.dir', '/home/baurzhan/tmp'),
 ('spark.sql.shuffle.partitions', '36'),
 ('spark.executor.id', 'driver'),
 ('spark.driver.host', 'localhost'),
 ('spark.app.name', 'pyspark-shell'),
 ('spark.app.id', 'local-1658332928155'),
 ('spark.driver.bindAddress', '127.0.0.1'),
 ('spark.driver.port', '41317'),
 ('spark.driver.extraJavaOptions',
  '-Dio.netty.tryReflectionSetAccessible=true'),
 ('spark.driver.memory', '66g'),
 ('spark.sql.catalogImplementation', 'hive'),
 ('spark.sql.warehouse.dir',
  'file:/home/baurzhan/model_comparison/spark-warehouse'),
 ('spark.rdd.compress', 'True'),
 ('spark.serializer.objectStreamReset', '100'),
 ('spark.kryoserializer.buffer.max', '256m'),
 ('spark.master', 'local[*]'),
 ('spark.submit.pyFiles', ''),
 ('spark.app.startTime', '1658332927563'),
 ('spark.submit.deployMode', 'client'),
 ('spark.ui.showConsoleProgress', 'true'),
 ('spark.sql.execution.arrow.pyspark.enabled', 'true'),
 ('spark.driver.maxResultSize', '4g')]

In [3]:
# Logging setup for this notebook, expressed as a dictConfig-style YAML document:
# one named logger ("data_preparation_logger") that sends DEBUG-and-above
# records to both a log file and stdout, without propagating to the root logger.
logger_config = """
version: 1
formatters:
    simple:
        class: logging.Formatter
        format: "%(asctime)s, %(name)s, %(levelname)s: %(message)s"
        datefmt: "%d-%b-%y %H:%M:%S"
handlers:
    file_handler:
        class: logging.FileHandler
        filename: logs/data_preparation.log
        level: DEBUG
        formatter: simple
    stream_handler:
        class: logging.StreamHandler
        stream: ext://sys.stdout
        level: DEBUG
        formatter: simple
loggers:
    data_preparation_logger:
        level: DEBUG
        handlers: [file_handler, stream_handler]
        propagate: no
"""
logger = logging.getLogger(name="data_preparation_logger")

# The document contains only plain mappings, lists and scalars, so the safe
# loader is sufficient and avoids constructing arbitrary Python objects.
config = yaml.safe_load(logger_config)

# logging.FileHandler opens its file on construction and does not create missing
# parent directories, so make sure the target directory exists on a fresh checkout.
log_dir = os.path.dirname(config["handlers"]["file_handler"]["filename"])
if log_dir:
    os.makedirs(log_dir, exist_ok=True)

# `logging.config` is a submodule and must be imported explicitly
# (`import logging` alone does not guarantee it is loaded).
logging.config.dictConfig(config)


In [44]:
# Dataset location. DATA_PATH is used as a plain string prefix in front of the
# file names below, so it must end with a path separator.
DATA_NAME = "lastfm"
DATA_PATH = f"data/{DATA_NAME}/"
TRAIN_FILE_NAME = "train.csv"
TEST_FILE_NAME = "test.csv"
VAL_FILE_NAME = "val.csv"

# NOTE(review): none of the settings below are referenced by the visible cells of
# this notebook; presumably they mirror the configuration of sibling experiment
# notebooks (k looks like a top-k cutoff, seed like a random seed) — confirm or remove.
k = 10
shuffle = False
budget = 10
trials_num = 10
seed = 1909


In [45]:
# Record in the log which dataset this run is preparing.
logger.info(msg=f"{DATA_NAME} dataset!")


20-Jul-22 16:10:23, data_preparation_logger, INFO: lastfm dataset!


In [46]:
def load_data(data_path, train_file_name, test_file_name):
    """Load the train and test CSV files and pass each through ``convert2spark``.

    Args:
        data_path: Prefix prepended verbatim to each file name. No separator is
            inserted, so it should normally end with a path separator.
        train_file_name: File name of the train split.
        test_file_name: File name of the test split.

    Returns:
        A ``(train, test)`` tuple with the ``convert2spark`` result for each
        CSV, where each CSV is first read with ``pandas.read_csv``.
    """
    train_pdf = pd.read_csv(f"{data_path}{train_file_name}")
    train_converted = convert2spark(train_pdf)

    test_pdf = pd.read_csv(f"{data_path}{test_file_name}")
    test_converted = convert2spark(test_pdf)

    return train_converted, test_converted


In [47]:
# Only the train and test splits exist on disk at this point; the validation
# split is carved out of the test split further down, so the log message names
# just the two files that are actually loaded.
logger.debug(msg="train, test loading")
train, test = load_data(
    data_path=DATA_PATH,
    train_file_name=TRAIN_FILE_NAME,
    test_file_name=TEST_FILE_NAME,
)


20-Jul-22 16:10:24, data_preparation_logger, DEBUG: train, val, test loading


In [48]:
# Bring both splits back into pandas; the splitting logic in the following cells
# is written against pandas DataFrames.
# NOTE(review): the data was read with pandas inside load_data and is converted
# back here, and train_ is only used to display its shape — consider whether the
# round trip through convert2spark is needed for this notebook.
train_ = train.toPandas()
test_ = test.toPandas()


22/07/20 16:10:30 WARN TaskSetManager: Stage 4 contains a task of very large size (36240 KiB). The maximum recommended task size is 1000 KiB.
22/07/20 16:10:32 WARN TaskSetManager: Stage 5 contains a task of very large size (9177 KiB). The maximum recommended task size is 1000 KiB.


In [49]:
# Display (rows, columns) of the train and test frames as a quick sanity check.
train_.shape, test_.shape


((13914378, 4), (3522481, 4))

In [50]:
# Number of users to keep from the test split: the next cell takes the first
# `sep` users after sorting by their row count, then divides them between
# validation and test.
sep = 1000 * 22
logger.info(msg=f"user_idx separation = {sep}")


20-Jul-22 16:10:33, data_preparation_logger, INFO: user_idx separation = 22000


In [51]:
# Rank users by their number of non-null "relevance" rows in the test split and
# keep the `sep` most active ones (index of test__ is user_idx).
test__ = (
    test_.groupby("user_idx")
    .count()
    .sort_values("relevance", ascending=False)
    .iloc[:sep]
)

# Alternate users between the two splits by rank so both get a similar activity
# profile: odd positions (1, 3, 5, ...) go to validation, even positions
# (0, 2, 4, ...) stay in test.
val_user_idxs = set(test__.index[1::2])
test_user_idxs = set(test__.index[::2])

# Boolean row masks over test_ (same index as test_), built with the vectorised
# Series.isin instead of a per-row Python lambda.
val_user_idxs_df = test_["user_idx"].isin(val_user_idxs)
test_user_idxs_df = test_["user_idx"].isin(test_user_idxs)


In [52]:
# val_user_idxs is a set, so its length is the number of distinct validation users.
val_nunique = len(val_user_idxs)
logger.debug(msg=f"num of unique users in val set = {val_nunique}")


20-Jul-22 16:10:35, data_preparation_logger, DEBUG: num of unique users in val set = 11000


In [53]:
# test_user_idxs is a set, so its length is the number of distinct test users.
test_nunique = len(test_user_idxs)
logger.debug(msg=f"num of unique users in test set = {test_nunique}")


20-Jul-22 16:10:35, data_preparation_logger, DEBUG: num of unique users in test set = 11000


In [54]:
# Select the rows of the original test frame that belong to each user group.
val_ = test_.loc[val_user_idxs_df]
# NOTE(review): this rebinds test_ to the reduced frame, so re-running the cells
# above without reloading the data would operate on already-filtered rows.
test_ = test_.loc[test_user_idxs_df]


In [55]:
# Sanity check: number of validation users with fewer than 10 non-null
# "relevance" rows. NOTE(review): the literal 10 equals k from the config cell;
# presumably it is meant to be k — confirm.
(val_.groupby("user_idx").count()["relevance"] < 10).sum()

0

In [56]:
# Sanity check: number of test users with fewer than 10 non-null "relevance"
# rows. NOTE(review): the literal 10 equals k from the config cell; presumably it
# is meant to be k — confirm.
(test_.groupby("user_idx").count()["relevance"] < 10).sum()


0

In [59]:
# Persist the validation split next to the other dataset files.
val_.to_csv(f"{DATA_PATH}{VAL_FILE_NAME}", index=False)
logger.info(msg=f"{DATA_NAME} data val set saved.")

# NOTE(review): TEST_FILE_NAME is the same file the test split was loaded from,
# so this overwrites the original test.csv with the reduced test set. A second
# run of the notebook would then split the already-reduced file again; keep a
# copy of the original or write to a different name if re-runs are expected.
test_.to_csv(f"{DATA_PATH}{TEST_FILE_NAME}", index=False)
logger.info(msg=f"{DATA_NAME} data test set saved.")


20-Jul-22 16:10:42, data_preparation_logger, INFO: lastfm data val set saved.
20-Jul-22 16:10:42, data_preparation_logger, INFO: lastfm data test set saved.
