In [171]:
%cd /workspace/src
import sys

sys.path.append("./")

import hashlib
from functools import cached_property, wraps
from pathlib import Path
from typing import Callable

import joblib
import numpy as np
import pandas as pd
import torch
from custom.config_types import CONFIG_TYPES
from logger import Logger
from pytorch_pfn_extras.config import Config
from torch import nn
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence, pad_sequence
from torch.utils.data import DataLoader, Dataset
from tqdm import tqdm
from util import load_yaml, reduce_mem_usage, sort_df_numpy

/workspace/src


In [9]:
# Notebook-wide logger; read_parquet_from_csv (defined in a later cell) logs
# through this module-level name.
logger = Logger(name="baseline")

# set config
# load_yaml() supplies the raw settings (presumably the parsed YAML mapping);
# Config wraps them so entries are looked up by path, e.g. config["/fe/dataset"].
pre_eval_config = load_yaml()
config = Config(pre_eval_config, types=CONFIG_TYPES)

# set const
# DEBUG=True makes the data-loading cell keep only a small sample of users.
DEBUG = True

## Data Load

In [141]:
class TaskDatset:
    """Access point for the task's raw tables and their cached train/test splits.

    Paths are derived from ``config["/global/resources"]`` (an ``input``
    sub-directory) and ``config["/fe/dataset"]`` (the file-name stem).

    ``overwrite`` only governs the two split caches; the CSV -> parquet
    conversion of the raw table follows ``config["/fe/overwrite"]`` instead.
    """

    def __init__(self, config, overwrite=False) -> None:
        input_dir = Path(config["/global/resources"]) / "input"
        dataset_name = config["/fe/dataset"]

        self.config = config
        self.dirpath = input_dir
        self.dataset_name = dataset_name

        self.raw_train_filepath = input_dir / f"{dataset_name}_raw_train.parquet"
        self.raw_test_filepath = input_dir / f"{dataset_name}_raw_test.parquet"

        self.overwrite = overwrite

    def _load_or_build_split(self, cache_path, uid_query):
        """Return the cached split at ``cache_path`` or rebuild it from ``raw_data``.

        A rebuilt split holds every row of every uid that has at least one row
        matching ``uid_query``; it is written to ``cache_path`` before returning.
        """
        if cache_path.is_file() and (not self.overwrite):
            return pd.read_parquet(cache_path)

        raw = self.raw_data
        selected_uids = raw.query(uid_query)["uid"].unique()
        split_df = raw[raw["uid"].isin(selected_uids)].reset_index(drop=True)
        split_df.to_parquet(cache_path)
        return split_df

    @property
    def raw_train_data(self):
        """All rows of the uids that have at least one row with ``x != 999``.

        Evaluated on every access (plain property, not cached in memory).
        NOTE(review): a uid that has both ``x == 999`` and ``x != 999`` rows is
        selected here *and* by ``raw_test_data`` - confirm that overlap is intended.
        """
        return self._load_or_build_split(self.raw_train_filepath, "x != 999")

    @property
    def raw_test_data(self):
        """All rows of the uids that have at least one row with ``x == 999``.

        Evaluated on every access (plain property, not cached in memory).
        """
        return self._load_or_build_split(self.raw_test_filepath, "x == 999")

    @cached_property
    def raw_data(self):
        """The full task table, loaded once per instance via ``read_parquet_from_csv``.

        ``reduce_mem_usage`` and ``sort_df_numpy`` are applied when the CSV has
        to be (re)converted.
        """
        csv_path = self.dirpath / f"{self.dataset_name}.csv.gz"
        return read_parquet_from_csv(
            filepath=csv_path,
            dirpath=self.dirpath,
            process_fns=[reduce_mem_usage, sort_df_numpy],
            overwrite=self.config["/fe/overwrite"],
        )

    @property
    def poi_data(self):
        """The ``cell_POIcat`` table; re-read through ``read_parquet_from_csv`` on each access."""
        poi_csv_path = self.dirpath / "cell_POIcat.csv.gz"
        return read_parquet_from_csv(filepath=poi_csv_path, dirpath=self.dirpath)


def read_parquet_from_csv(
    filepath: Path,
    dirpath: Path,
    process_fns: list[Callable] | None = None,
    overwrite: bool = False,
) -> pd.DataFrame:
    name = filepath.name.split(".")[0]
    parquet_filepath = dirpath / f"{name}.parquet"
    if parquet_filepath.is_file() and (not overwrite):
        logger.info(f"load parquet file ({str(filepath)})")
        return pd.read_parquet(parquet_filepath)

    logger.info(f"load csv & convert to parquet ({str(filepath)})")
    df = pd.read_csv(filepath)

    if process_fns is not None:
        for fn in process_fns:
            logger.info(f"excute {fn.__name__}")
            df = fn(df)

    df.to_parquet(parquet_filepath)
    return df

In [142]:
# overwrite=True makes the train/test split properties rebuild (and rewrite)
# their parquet caches instead of reusing existing files.
task_dataset = TaskDatset(config, overwrite=True)
raw_train_df = task_dataset.raw_train_data

# Number of users kept when DEBUG is on.
DEBUG_N_USERS = 100

if DEBUG:
    # Sample distinct users rather than rows: sampling the raw "uid" column can
    # draw the same user several times (yielding fewer users than requested)
    # and raises when the frame has fewer rows than the sample size.
    unique_uids = raw_train_df["uid"].drop_duplicates()
    user_ids = unique_uids.sample(
        n=min(DEBUG_N_USERS, len(unique_uids)), random_state=config["/global/seed"]
    ).tolist()
    raw_train_df = raw_train_df[raw_train_df["uid"].isin(user_ids)].reset_index(drop=True)

raw_test_df = task_dataset.raw_test_data
# Work on a copy so later feature engineering leaves raw_train_df untouched.
train_df = raw_train_df.copy()

[2023-08-06 21:32:46] [32m[baseline] [INFO] - load parquet file (/workspace/resources/input/task2_dataset.csv.gz)[0m


In [143]:
def make_sequences(df: pd.DataFrame, group_key: str, group_values: list[str]):
    """Split ``df`` into one tensor per ``group_key`` value.

    Groups are emitted in order of first appearance (``sort=False``). Each
    tensor has one row per frame row of the group and one column per entry of
    ``group_values``, in that order.

    Returns:
        A list of 2-D tensors, one per group.
    """
    sequences = []
    for _, rows in df.groupby(group_key, sort=False):
        values = rows[group_values].to_numpy()
        sequences.append(torch.tensor(values))
    return sequences


# feature_names = [x for x in train_df.columns if x.startswith("f_")]
feature_seqs = make_sequences(df=train_df, group_key="uid", group_values=["d", "t"])

# Rows of the prediction zone (d >= 60); filtered once and shared by both calls below.
prediction_zone_df = train_df.query("d >= 60")

auxiliary_seqs = make_sequences(
    df=prediction_zone_df, group_key="uid", group_values=["d", "t"]
)  # features for prediction zone

target_seqs = make_sequences(
    df=prediction_zone_df,
    group_key="uid",
    group_values=["x", "y"],
)  # target is x & y over 60 zone

# The three lists are paired by position downstream, so a user without any
# d >= 60 row would silently shift every following pair.
assert len(feature_seqs) == len(auxiliary_seqs) == len(target_seqs), (
    "feature/auxiliary/target sequences are misaligned: "
    f"{len(feature_seqs)}, {len(auxiliary_seqs)}, {len(target_seqs)}"
)

len(feature_seqs), len(target_seqs), len(auxiliary_seqs)

In [91]:
class TrainDataset(Dataset):
    """Index-aligned feature / auxiliary / target sequences for training.

    Item ``i`` is built from ``feature_seqs[i]``, ``auxiliary_seqs[i]`` and
    ``target_seqs[i]``, so the three containers must have the same length.

    Raises:
        ValueError: if the three containers differ in length.
    """

    def __init__(self, feature_seqs, auxiliary_seqs, target_seqs):
        n_features, n_auxiliary, n_targets = (
            len(feature_seqs),
            len(auxiliary_seqs),
            len(target_seqs),
        )
        if not (n_features == n_auxiliary == n_targets):
            raise ValueError(
                "feature_seqs, auxiliary_seqs and target_seqs must have the same length, "
                f"got {n_features}, {n_auxiliary} and {n_targets}"
            )
        self.feature_seqs = feature_seqs
        self.auxiliary_seqs = auxiliary_seqs
        self.target_seqs = target_seqs

    def __len__(self):
        return len(self.feature_seqs)

    def __getitem__(self, index: int) -> dict[str, torch.Tensor]:
        """Return the ``index``-th sample as float32 tensors keyed by sequence kind."""
        # torch.as_tensor converts tensors, arrays and nested lists of any dtype
        # without relying on the legacy torch.Tensor(...) constructor.
        return {
            "feature_seqs": torch.as_tensor(self.feature_seqs[index], dtype=torch.float32),
            "auxiliary_seqs": torch.as_tensor(self.auxiliary_seqs[index], dtype=torch.float32),
            "target_seqs": torch.as_tensor(self.target_seqs[index], dtype=torch.float32),
        }


class TestDataset(Dataset):
    """Index-aligned feature / auxiliary sequences for inference (no targets).

    ``auxiliary_seqs`` must be passed explicitly. It used to be picked up from
    a notebook-level global of the same name, which silently paired test
    features with whatever that global held at construction time.

    Raises:
        ValueError: if ``auxiliary_seqs`` is missing or its length differs
            from ``feature_seqs``.
    """

    def __init__(self, feature_seqs, auxiliary_seqs=None):
        if auxiliary_seqs is None:
            raise ValueError(
                "auxiliary_seqs is required: TestDataset(feature_seqs, auxiliary_seqs)"
            )
        if len(feature_seqs) != len(auxiliary_seqs):
            raise ValueError(
                "feature_seqs and auxiliary_seqs must have the same length, "
                f"got {len(feature_seqs)} and {len(auxiliary_seqs)}"
            )
        self.feature_seqs = feature_seqs
        self.auxiliary_seqs = auxiliary_seqs

    def __len__(self):
        return len(self.feature_seqs)

    def __getitem__(self, index: int) -> dict[str, torch.Tensor]:
        """Return the ``index``-th sample as float32 tensors keyed by sequence kind."""
        return {
            "feature_seqs": torch.as_tensor(self.feature_seqs[index], dtype=torch.float32),
            "auxiliary_seqs": torch.as_tensor(self.auxiliary_seqs[index], dtype=torch.float32),
        }

In [193]:
class PadSequenceCollateFn:
    """Collate variable-length samples into zero-padded batch tensors.

    Every sample is a dict holding ``"feature_seqs"`` and ``"auxiliary_seqs"``
    (plus ``"target_seqs"`` in train mode). Each kind is padded with
    ``pad_sequence(..., batch_first=True)``, i.e. to
    ``(batch, longest_sequence_in_batch, ...)``. The unpadded lengths of the
    feature and auxiliary sequences are returned as plain Python lists.
    """

    def __init__(self, is_train_mode=True):
        self.is_train_mode = is_train_mode

    @staticmethod
    def _collect_and_pad(batch, key):
        """Return ``(padded_tensor, original_lengths)`` for one sequence kind."""
        sequences = [sample[key] for sample in batch]
        lengths = [len(sequence) for sequence in sequences]
        return pad_sequence(sequences, batch_first=True), lengths

    def __call__(self, batch):
        features, feature_lengths = self._collect_and_pad(batch, "feature_seqs")
        auxiliary, auxiliary_lengths = self._collect_and_pad(batch, "auxiliary_seqs")

        collated = {"feature_seqs": features, "auxiliary_seqs": auxiliary}
        if self.is_train_mode:
            # Targets are only read in train mode; their lengths are not returned.
            targets, _ = self._collect_and_pad(batch, "target_seqs")
            collated["target_seqs"] = targets

        collated["feature_lengths"] = feature_lengths
        collated["auxiliary_lengths"] = auxiliary_lengths
        return collated


def to_device(batch, device):
    """Move the entries of ``batch`` to ``device`` in place and return ``batch``.

    Entries whose key ends with ``"lengths"`` are left untouched; every other
    value is replaced by ``value.to(device)``.
    """
    movable_keys = [key for key in batch if not key.endswith("lengths")]
    for key in movable_keys:
        batch[key] = batch[key].to(device)
    return batch


def train_fn(config, wandb_logger=None):
    """Run one training epoch over ``config["/dataloader/train"]``.

    Args:
        config: Path-indexed config providing ``/model``, ``/dataloader/train``,
            ``/criterion``, ``/optimizer``, ``/scheduler`` and the ``/nn/*``
            training settings read below.
        wandb_logger: Optional object with a ``log(dict)`` method; receives the
            per-step loss and learning rate.

    Returns:
        ``{"loss": mean of the per-step losses, "step": index of the last batch}``.
        The per-step loss is the criterion value divided by
        ``gradient_accumulation_steps`` (the quantity that is back-propagated).
    """
    model = config["/model"]
    dataloader = config["/dataloader/train"]
    criterion = config["/criterion"]
    optimizer = config["/optimizer"]
    scheduler = config["/scheduler"]

    # training settings
    device = config["/nn/device"]
    use_amp = config["/nn/fp16"]
    gradient_accumulation_steps = config["/nn/gradient_accumulation_steps"]
    clip_grad_norm = config["/nn/clip_grad_norm"]
    # NOTE(review): this used to be read as the attribute `config.batch_scheduler`,
    # unlike every other setting; the "/nn/batch_scheduler" path is assumed from
    # the neighbouring keys - confirm it against the YAML.
    batch_scheduler = config["/nn/batch_scheduler"]

    model.train()
    scaler = torch.cuda.amp.GradScaler(enabled=use_amp)
    losses = []

    iteration_bar = tqdm(enumerate(dataloader), total=len(dataloader))
    for step, batch in iteration_bar:
        batch = to_device(batch, device)

        with torch.cuda.amp.autocast(enabled=use_amp):
            batch_outputs = model(batch)
            loss = criterion(batch_outputs, batch)
            loss = torch.div(loss, gradient_accumulation_steps)

        scaler.scale(loss).backward()

        if (step + 1) % gradient_accumulation_steps == 0:
            if clip_grad_norm is not None:
                # Gradients are still multiplied by the loss scale here; unscale
                # them first so the threshold applies to their true norm. Doing
                # it inside this branch keeps it to once per optimizer step.
                scaler.unscale_(optimizer)
                torch.nn.utils.clip_grad_norm_(model.parameters(), clip_grad_norm)

            scaler.step(optimizer)
            scaler.update()
            optimizer.zero_grad()

            if batch_scheduler:
                scheduler.step()

        # get_last_lr() is the public accessor for the rate currently in use.
        current_lr = scheduler.get_last_lr()[0]
        # Detach from the graph once; the same float feeds the logger and the mean.
        loss_value = float(loss)

        if wandb_logger is not None:
            wandb_logger.log({"train_loss": loss_value, "lr": current_lr})
        losses.append(loss_value)
        iteration_bar.set_description(f"loss: {np.mean(losses):.4f} lr: {current_lr:.6f}")

    loss = np.mean(losses)
    return {"loss": loss, "step": step}


def _concat_padded_outputs(batch_outputs):
    """Stack per-batch arrays along axis 0, zero-padding axis 1 to a common length."""
    if not batch_outputs or batch_outputs[0].ndim < 2:
        return np.concatenate(batch_outputs)

    longest = max(array.shape[1] for array in batch_outputs)
    padded = []
    for array in batch_outputs:
        pad_width = [(0, 0)] * array.ndim
        pad_width[1] = (0, longest - array.shape[1])
        padded.append(np.pad(array, pad_width))
    return np.concatenate(padded)


def valid_fn(config):
    """Evaluate ``config["/model"]`` on ``config["/dataloader/valid"]`` without gradients.

    Returns:
        ``{"loss": mean per-batch loss, "outputs": ndarray of all batch outputs}``.
        Batches whose outputs differ in length along axis 1 (the usual case
        when each batch is padded to its own longest sequence) are zero-padded
        to the longest one before being concatenated along axis 0; equal-length
        batches are concatenated unchanged.

    The loss is divided by ``gradient_accumulation_steps`` exactly as in
    ``train_fn`` so both functions report values on the same scale.
    """
    model = config["/model"]
    dataloader = config["/dataloader/valid"]
    criterion = config["/criterion"]

    # training settings
    device = config["/nn/device"]
    gradient_accumulation_steps = config["/nn/gradient_accumulation_steps"]

    model.eval()
    outputs, losses = [], []

    iteration_bar = tqdm(enumerate(dataloader), total=len(dataloader))
    for _, batch in iteration_bar:
        batch = to_device(batch, device)

        with torch.no_grad():
            batch_outputs = model(batch)
            loss = criterion(batch_outputs, batch)
            loss = torch.div(loss, gradient_accumulation_steps)

        outputs.append(batch_outputs.to("cpu").numpy())
        losses.append(float(loss))

        iteration_bar.set_description(f"loss: {np.mean(losses):.4f}")

    outputs = _concat_padded_outputs(outputs)
    loss = np.mean(losses)
    return {"loss": loss, "outputs": outputs}

In [194]:
# Rebinds pre_eval_config to a new load_yaml() result. The `config` object was
# constructed from the earlier result and is not rebuilt in this cell.
pre_eval_config = load_yaml()

In [195]:
# Use the GPU when CUDA is available, otherwise the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Recorded on the raw settings mapping only.
# NOTE(review): `config` is not re-created from pre_eval_config after this
# assignment - confirm that config["/nn/device"] resolves to this device.
pre_eval_config["nn"]["device"] = device

In [199]:
class CustomLSTMModel(nn.Module):
    """Two chained LSTMs followed by a per-step linear head.

    ``lstm1`` reads the feature sequences; its final hidden and cell states
    initialise ``lstm2``, which reads the auxiliary sequences. ``out`` maps
    every step of ``lstm2``'s output to ``output_size`` values.
    """

    def __init__(self, input_size1, input_size2, hidden_size, output_size):
        super().__init__()
        self.hidden_size = hidden_size

        # Both LSTMs share hidden_size so lstm1's final state fits lstm2.
        self.lstm1 = nn.LSTM(input_size1, hidden_size, batch_first=True)
        self.lstm2 = nn.LSTM(input_size2, hidden_size, batch_first=True)

        self.out = nn.Linear(hidden_size, output_size)

    @staticmethod
    def _pack(padded, lengths):
        """Pack a batch-first padded tensor; the batch need not be sorted by length."""
        return pack_padded_sequence(padded, lengths, batch_first=True, enforce_sorted=False)

    def forward(self, batch):
        """Predict one output vector per auxiliary step.

        ``batch`` provides ``feature_seqs`` / ``auxiliary_seqs`` (batch-first
        padded tensors) and the matching ``feature_lengths`` /
        ``auxiliary_lengths``. The result is batch-first and padded to the
        longest auxiliary sequence in the batch.
        """
        # to variable length
        packed_features = self._pack(batch["feature_seqs"], batch["feature_lengths"])
        packed_auxiliary = self._pack(batch["auxiliary_seqs"], batch["auxiliary_lengths"])

        _, (final_hidden, final_cell) = self.lstm1(packed_features)

        # Use the final hidden and cell state of lstm1 as initial state for lstm2
        packed_decoded, _ = self.lstm2(packed_auxiliary, (final_hidden, final_cell))

        # back to a padded, fixed-length tensor
        decoded, _ = pad_packed_sequence(packed_decoded, batch_first=True)
        return self.out(decoded)

In [200]:
max_epochs = 2
batch_size = 2

input_size1 = 2
input_size2 = 2
output_size = 2
hidden_size = 2

train_dataset = TrainDataset(
    feature_seqs=feature_seqs, auxiliary_seqs=auxiliary_seqs, target_seqs=target_seqs
)
# NOTE(review): the loader is built with batch_size=1, not the `batch_size`
# constant defined above - confirm which value is intended.
train_dataloader = DataLoader(
    train_dataset,
    batch_size=1,
    collate_fn=PadSequenceCollateFn(is_train_mode=True),
    shuffle=False,
)

# Batches are moved to `device` before the forward pass, so the model's
# parameters must live there too; otherwise the forward pass fails on CUDA.
model = CustomLSTMModel(
    input_size1=input_size1,
    input_size2=input_size2,
    hidden_size=hidden_size,
    output_size=output_size,
).to(device)

In [201]:
# Forward-only smoke test over the training loader.
model.eval()
iteration_bar = tqdm(enumerate(train_dataloader), total=len(train_dataloader))
# No gradients are needed here; without no_grad every forward pass would build
# and retain an autograd graph.
with torch.no_grad():
    for _, batch in iteration_bar:
        batch = to_device(batch, device)
        output = model(batch)

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 100/100 [00:06<00:00, 15.88it/s]


In [202]:
# `output` is reassigned on every iteration of the loop above, so this is the
# shape of the prediction for the last batch only.
output.shape

torch.Size([1, 218, 2])