In [None]:
import logging

import japanize_matplotlib
import matplotlib.pyplot as plt
import networkx as nx
import polars as pl
import rootutils
import seaborn as sns

# Notebook-wide log verbosity and seaborn plot theme.
logging.basicConfig(level=logging.INFO)
sns.set_style("whitegrid")

# Resolve the project root via rootutils; the call is also asked to add it to
# the Python path (pythonpath=True) and to switch the working directory (cwd=True).
ROOT = rootutils.setup_root(".", pythonpath=True, cwd=True)

# Directory layout: everything lives under <ROOT>/data.
DATA_DIR = ROOT / "data"
INPUT_DIR = DATA_DIR / "atmacup19_dataset"
OUTPUT_DIR = DATA_DIR / "output"
CACHE_DIR = DATA_DIR / "cache"

# Create any directory that does not exist yet (no error if it already does).
for d in (DATA_DIR, INPUT_DIR, OUTPUT_DIR, CACHE_DIR):
    d.mkdir(parents=True, exist_ok=True)

# Polars display limits: string width, number of columns, number of rows.
pl.Config.set_fmt_str_lengths(200)
pl.Config.set_tbl_cols(50)
pl.Config.set_tbl_rows(50)


### load data


In [None]:
# Raw competition tables. ec_log.csv is read with a schema-inference window of
# 200000 rows; the other files use the polars default.
ec_log_df = pl.read_csv(INPUT_DIR / "ec_log.csv", infer_schema_length=200000)
jan_df = pl.read_csv(INPUT_DIR / "jan.csv")
test_session_df = pl.read_csv(INPUT_DIR / "test_session.csv")
train_log_df = pl.read_csv(INPUT_DIR / "train_log.csv")

# Training sessions: cast the sale date ("売上日") to a Date, then combine its
# year/month/day with the "時刻" column (passed as the hour argument) into a
# single `session_datetime` column.
train_session_df = (
    pl.read_csv(INPUT_DIR / "train_session.csv")
    .with_columns(pl.col("売上日").cast(pl.Date))
    .with_columns(
        pl.datetime(
            year=pl.col("売上日").dt.year(),
            month=pl.col("売上日").dt.month(),
            day=pl.col("売上日").dt.day(),
            hour=pl.col("時刻"),
        ).alias("session_datetime")
    )
)

In [None]:
# Leak-free purchase history: for each log row, the quantity recorded for the
# same customer ("顧客CD") and the same JAN during the 6 months preceding the
# row's session. The result keeps the column name "売上数量".
noleak_log_df = (
    train_log_df.join(
        train_session_df.select("session_id", "session_datetime", "顧客CD"),
        how="left",
        on="session_id",
    )
    # Put rows in time order ahead of the time-based rolling window.
    .sort("session_datetime")
    .select(
        "session_id",
        "JAN",
        # closed="left" includes the window's start and excludes its end, so rows
        # stamped with the current session_datetime do not contribute to the sum.
        pl.col("売上数量")
        .rolling_sum_by("session_datetime", window_size="6mo", closed="left")
        .over("顧客CD", "JAN")
        # Null results (presumably: no earlier rows inside the window) become 0.
        .fill_null(0),
    )
)


In [None]:
import scipy.sparse as sp
from sklearn.preprocessing import LabelEncoder


def create_sparse_matrix(
    df: pl.DataFrame,
    session_col: str = "session_id",
    item_col: str = "JAN",
    value_col: str = "売上数量",
) -> tuple[sp.csr_matrix, LabelEncoder, LabelEncoder]:
    """Build a session x item CSR matrix from a long-format table.

    Args:
        df: Table with one row per (session, item) observation.
        session_col: Column whose distinct values become the matrix rows.
        item_col: Column whose distinct values become the matrix columns.
        value_col: Column holding the numeric cell values.

    Returns:
        A tuple ``(matrix, session_encoder, item_encoder)``. ``matrix`` has shape
        ``(n_sessions, n_items)``; row ``i`` corresponds to
        ``session_encoder.classes_[i]`` and column ``j`` to
        ``item_encoder.classes_[j]``.

    Notes:
        Rows of ``df`` that repeat the same (session, item) pair are summed by
        SciPy while the matrix is built. Cells whose final value is 0 are
        removed from the stored structure, so ``matrix.nnz`` counts only
        non-zero cells; the shape and the dense values are unaffected.
    """
    # Map `session_id` and `JAN` values to contiguous integer indices.
    session_encoder = LabelEncoder()
    item_encoder = LabelEncoder()
    row_idx = session_encoder.fit_transform(df[session_col].to_numpy().ravel())
    col_idx = item_encoder.fit_transform(df[item_col].to_numpy().ravel())

    values = df[value_col].to_numpy().ravel()

    # Assemble the sparse matrix from (value, (row, col)) triplets.
    matrix = sp.csr_matrix(
        (values, (row_idx, col_idx)),
        shape=(len(session_encoder.classes_), len(item_encoder.classes_)),
    )
    # Zero-valued inputs would otherwise be kept as explicit stored entries,
    # inflating memory without changing any result computed from the matrix.
    matrix.eliminate_zeros()

    return matrix, session_encoder, item_encoder


# Encode the leak-free purchase history as a session x JAN matrix; the two
# encoders map row / column indices back to the original ids.
sparse_matrix, session_encoder, item_encoder = create_sparse_matrix(noleak_log_df)


In [None]:
from sklearn.decomposition import TruncatedSVD

# Rank-32 truncated SVD of the session x JAN matrix, with a fixed random seed.
svd = TruncatedSVD(n_components=32, random_state=42)

# Session embeddings
session_embeddings = svd.fit_transform(sparse_matrix)
# JAN embeddings: transpose the fitted components so that each row corresponds
# to one column of `sparse_matrix`.
jan_embeddings = svd.components_.T


# Report both embedding shapes as a quick sanity check.
print(f"{session_embeddings.shape=}")
print(f"{jan_embeddings.shape=}")


In [None]:
import umap

# Project the JAN embeddings to 2-D for plotting. UMAP is stochastic, so the
# seed is pinned (same value as the TruncatedSVD seed) to keep the layout
# reproducible across Restart & Run All. Note that UMAP runs single-threaded
# when a random_state is given.
umap_model = umap.UMAP(n_components=2, random_state=42, verbose=True)
umap_jan = umap_model.fit_transform(jan_embeddings)

In [None]:
import plotly.express as px

# JAN -> division ("ディビジョン") and JAN -> product name ("商品名") lookups used to
# colour and label the scatter. `maintain_order=True` keeps the de-duplicated
# rows in their original order, so the `.first()` pick is deterministic when a
# JAN appears with more than one distinct value (the default row order after
# `unique()` is not guaranteed).
jan2division = (
    jan_df.select(["JAN", "ディビジョン"])
    .unique(maintain_order=True)
    .to_pandas()
    .groupby("JAN")["ディビジョン"]
    .first()
    .to_dict()
)
jan2name = (
    jan_df.select(["JAN", "商品名"])
    .unique(maintain_order=True)
    .to_pandas()
    .groupby("JAN")["商品名"]
    .first()
    .to_dict()
)

# One point per JAN. The rows of `umap_jan` follow `item_encoder.classes_`,
# i.e. the column order of the matrix the embeddings were fitted on, so the
# colour and hover lists are built in that same order.
fig = px.scatter(
    x=umap_jan[:, 0],
    y=umap_jan[:, 1],
    color=[jan2division[jan] for jan in item_encoder.classes_],
    hover_data={"商品名": [jan2name[jan] for jan in item_encoder.classes_]},
    opacity=0.5,
)
fig.show()