In [None]:
import sys

sys.path.append("../../")


from pathlib import Path

import pandas as pd
import torch
import torch.nn as nn
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence, pad_sequence
from torch.utils.data import DataLoader, Dataset

from src.logger import Logger
from src.util import read_parquet_from_csv

In [None]:
class Config:
    """Notebook-wide run settings, read through the module-level ``config`` instance."""

    # When true, the data-loading cell shrinks the training frame to a sample of users.
    debug: bool = True
    # Passed as ``random_state`` when the debug subset of users is sampled.
    seed: int = 8823

In [None]:
# Directory layout: every input file is resolved relative to INPUT.
# NOTE(review): HOME is an absolute path, presumably a container mount point;
# adjust it when running outside that environment.
HOME = Path("/workspace")
RESOURCES = HOME / "resources"
INPUT = RESOURCES / "input"


config = Config()
# Seed PyTorch's global RNG so model weight initialisation (and therefore the
# printed losses / predictions) is reproducible under Restart & Run All.
torch.manual_seed(config.seed)
logger = Logger(__name__)

In [None]:
# NOTE(review): read_parquet_from_csv lives in src.util; presumably it caches the
# CSV as parquet under `dirpath` and returns a DataFrame -- confirm there.
raw_train_df = read_parquet_from_csv(filepath=INPUT / "task1_dataset.csv.gz", dirpath=INPUT)
poi_df = read_parquet_from_csv(filepath=INPUT / "cell_POIcat.csv.gz", dirpath=INPUT)


if config.debug:
    # Sample from the *distinct* uids. Sampling the raw "uid" column draws rows,
    # which over-represents users with many rows and returns duplicate ids, so the
    # debug subset would end up with fewer users than requested.
    n_debug_users = 100
    unique_uids = raw_train_df["uid"].drop_duplicates()
    user_ids = unique_uids.sample(
        n=min(n_debug_users, len(unique_uids)),  # never ask for more users than exist
        random_state=config.seed,
    ).tolist()
    raw_train_df = raw_train_df[raw_train_df["uid"].isin(user_ids)].reset_index(drop=True)

raw_train_df.head()

In [None]:
class GroupedDiffFeatureExtractor:
    """Compute lagged differences of selected columns within each group.

    For every ``interval`` in ``intervals`` and every column ``v`` in
    ``group_values`` the extractor produces a column named
    ``f_{v}_grpby_{group_key}_diff_{interval}`` holding ``v`` minus the value
    ``interval`` rows earlier inside the same ``group_key`` group.

    Args:
        group_key: Column to group by.
        group_values: Columns to difference. Defaults to ``["t", "d"]``.
        intervals: Row lags to difference over. Defaults to ``[1, 2]``.
    """

    def __init__(
        self,
        group_key: str = "uid",
        group_values: "list[str] | None" = None,
        intervals: "list[int] | None" = None,
    ):
        self.group_key = group_key
        # Build fresh lists per instance: list literals used as default arguments
        # are created once and would be shared (and mutated) across instances.
        self.group_values = ["t", "d"] if group_values is None else list(group_values)
        self.intervals = [1, 2] if intervals is None else list(intervals)

    def __call__(self, df):
        """Return a frame of diff features aligned to ``df``'s index.

        Rows that have no predecessor at the requested lag are filled with ``-1``.
        NOTE(review): ``-1`` can also be a genuine difference, so the fill value
        is ambiguous -- confirm this sentinel is intended.
        """
        # The grouping does not depend on the interval, so build it once.
        grouped = df.groupby(self.group_key)[self.group_values]
        diff_frames = []
        for interval in self.intervals:
            renames = {
                v: f"{v}_grpby_{self.group_key}_diff_{interval}" for v in self.group_values
            }
            diff_frames.append(grouped.diff(interval).rename(columns=renames))
        out_df = pd.concat(diff_frames, axis=1)
        return out_df.add_prefix("f_").fillna(-1)


def sort_df(df):
    """Return a copy of ``df`` ordered by per-user row count, then uid, day and time.

    A ``nunique_uid`` column is added; despite its name it holds the number of
    rows belonging to each ``uid``. The result has a fresh 0..n-1 index.
    """
    rows_per_uid = df["uid"].value_counts()
    with_counts = df.assign(nunique_uid=df["uid"].map(rows_per_uid))
    ordered = with_counts.sort_values(by=["nunique_uid", "uid", "d", "t"])
    return ordered.reset_index(drop=True)


def make_features(df):
    """Sort ``df`` with ``sort_df`` and append the extractor outputs as extra columns.

    The extractors run on the sorted frame, so their outputs share its index and
    can be concatenated column-wise.
    """
    sorted_df = sort_df(df)
    extractors = [
        GroupedDiffFeatureExtractor(group_key="uid", group_values=["t", "d"], intervals=[1, 2]),
    ]
    feature_frames = [extract(sorted_df) for extract in extractors]
    return pd.concat([sorted_df, *feature_frames], axis=1)


# Build the training table: rows sorted per user plus the "f_"-prefixed diff features.
train_df = make_features(raw_train_df)
train_df.head()

In [None]:
def make_sequences(df: pd.DataFrame, group_key: str, group_values: list[str]):
    """Split ``df`` into one tensor per ``group_key`` group.

    Groups are emitted in order of first appearance (``sort=False``). Each tensor
    holds that group's ``group_values`` columns, one row per original row.
    """
    sequences = []
    for _key, rows in df.groupby(group_key, sort=False):
        values = rows[group_values].to_numpy()
        sequences.append(torch.tensor(values))
    return sequences


# Model inputs are the columns carrying the "f_" prefix.
feature_names = [column for column in train_df.columns if column.startswith("f_")]
# One tensor per user for the inputs and one for the (x, y) targets; both calls
# group the same frame by "uid", so the two lists line up index by index.
feature_seqs, target_seqs = (
    make_sequences(df=train_df, group_key="uid", group_values=columns)
    for columns in (feature_names, ["x", "y"])
)

In [None]:
class TrainDataset(Dataset):
    """Map-style dataset pairing each feature sequence with its target sequence.

    Args:
        feature_seqs: Indexable collection of per-sample feature sequences.
        target_seqs: Indexable collection of per-sample target sequences, aligned
            with ``feature_seqs`` by position.

    Raises:
        ValueError: If the two collections differ in length.
    """

    def __init__(self, feature_seqs, target_seqs):
        if len(feature_seqs) != len(target_seqs):
            raise ValueError(
                f"feature_seqs and target_seqs must have the same length, "
                f"got {len(feature_seqs)} and {len(target_seqs)}"
            )
        self.feature_seqs = feature_seqs
        self.target_seqs = target_seqs

    def __len__(self):
        return len(self.feature_seqs)

    def __getitem__(self, index: int) -> dict[str, torch.Tensor]:
        """Return sample ``index`` as float32 tensors under "feature_seqs" / "target_seqs"."""
        # as_tensor converts any input dtype (e.g. integer targets) in one step; the
        # legacy torch.Tensor(...) constructor is dtype-sensitive for tensor inputs.
        feature_seq = torch.as_tensor(self.feature_seqs[index], dtype=torch.float32)
        target_seq = torch.as_tensor(self.target_seqs[index], dtype=torch.float32)
        return {"feature_seqs": feature_seq, "target_seqs": target_seq}


class TestDataset(Dataset):
    """Map-style dataset yielding feature sequences only (no targets).

    Args:
        feature_seqs: Indexable collection of per-sample feature sequences.
    """

    def __init__(self, feature_seqs):
        self.feature_seqs = feature_seqs

    def __len__(self):
        return len(self.feature_seqs)

    def __getitem__(self, index: int) -> dict[str, torch.Tensor]:
        """Return sample ``index`` as a float32 tensor under "feature_seqs"."""
        # as_tensor converts any input dtype in one step; the legacy
        # torch.Tensor(...) constructor is dtype-sensitive for tensor inputs.
        feature_seq = torch.as_tensor(self.feature_seqs[index], dtype=torch.float32)
        return {"feature_seqs": feature_seq}


class CustomCollate:
    """Collate variable-length samples into zero-padded, batch-first tensors.

    Each sample is a dict with "feature_seqs" (and "target_seqs" in train mode).
    The returned dict holds the padded tensors, shaped
    (batch, longest_sequence_in_batch, dim), plus "lengths": the unpadded length
    of every feature sequence as a plain Python list.
    """

    def __init__(self, is_train_mode=True):
        self.is_train_mode = is_train_mode

    def __call__(self, batch):
        features = [sample["feature_seqs"] for sample in batch]
        collated = {"feature_seqs": pad_sequence(features, batch_first=True)}

        # Targets are only present (and only read) in train mode.
        if self.is_train_mode:
            targets = [sample["target_seqs"] for sample in batch]
            collated["target_seqs"] = pad_sequence(targets, batch_first=True)

        collated["lengths"] = [len(seq) for seq in features]
        return collated


# Smoke test for the collate function: within each batch every padded sequence
# should have the same length, equal to the largest entry of "lengths".
ds = TrainDataset(feature_seqs=feature_seqs, target_seqs=target_seqs)
dl = DataLoader(
    ds,
    batch_size=10,
    shuffle=False,
    collate_fn=CustomCollate(is_train_mode=True),
)

for x in dl:
    padded_feature_lens = [len(x_) for x_ in x["feature_seqs"]]
    padded_target_lens = [len(x_) for x_ in x["target_seqs"]]
    print(x["lengths"], padded_feature_lens, padded_target_lens)

In [None]:
class CustomLSTMModel(nn.Module):
    """Stacked LSTM followed by a per-timestep linear projection.

    ``forward`` takes a ``PackedSequence`` and returns a padded, batch-first
    tensor of shape (batch, longest_sequence_in_batch, output_dim).
    """

    def __init__(self, input_dim, hidden_dim, num_layers, output_dim):
        super().__init__()

        self.hidden_dim = hidden_dim
        self.num_layers = num_layers

        self.lstm = nn.LSTM(
            input_size=input_dim,
            hidden_size=hidden_dim,
            num_layers=num_layers,
            batch_first=True,
        )
        self.fc = nn.Linear(in_features=hidden_dim, out_features=output_dim)

    def forward(self, x):
        packed_hidden, _state = self.lstm(x)
        # Unpack to a dense tensor; positions past each sequence's length are zeros.
        padded_hidden, _lengths = pad_packed_sequence(packed_hidden, batch_first=True)
        return self.fc(padded_hidden)


def noopt_train_loop(model, dataloader, criterion, device):
    """Run one pass over ``dataloader`` and return the mean per-batch loss.

    No backward pass or optimizer step is performed ("noopt"); the loop only
    measures the loss. The loss is computed on real timesteps only: positions
    added by padding are masked out using each batch's ``lengths``.

    Args:
        model: Module whose forward takes a ``PackedSequence`` and returns a
            padded, batch-first tensor (batch, max_len, out_dim). It is expected
            to already be on ``device``.
        dataloader: Sized iterable of dicts with "feature_seqs" and "target_seqs"
            (padded, batch-first tensors) and "lengths" (unpadded lengths).
        criterion: Loss callable returning a scalar tensor.
        device: Device the batch tensors are moved to.

    Returns:
        float: Sum of the batch losses divided by ``len(dataloader)``.

    Raises:
        ZeroDivisionError: If ``dataloader`` yields no batches.
    """
    total_loss = 0.0
    # Nothing is back-propagated here, so skip building the autograd graph.
    with torch.no_grad():
        for batch in dataloader:
            feature_seqs = batch["feature_seqs"].to(device)
            target_seqs = batch["target_seqs"].to(device)
            lengths = batch["lengths"]

            # packing : remove padding
            packed_feature_seqs = pack_padded_sequence(
                feature_seqs,
                lengths,
                batch_first=True,
                enforce_sorted=False,
            )
            output = model(packed_feature_seqs)

            # Padded positions hold zero targets and arbitrary model outputs;
            # including them would bias the loss, so keep valid timesteps only.
            seq_lengths = torch.as_tensor(lengths, device=output.device)
            positions = torch.arange(output.size(1), device=output.device)
            valid = positions.unsqueeze(0) < seq_lengths.unsqueeze(1)  # (batch, max_len)

            loss = criterion(output[valid], target_seqs[valid])
            total_loss += loss.item()

    return total_loss / len(dataloader)

In [None]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

max_epochs = 2
# Single source of truth for the loader below (which previously hard-coded 5
# while this variable, set to 2, went unused).
batch_size = 5

# Derive the input width from the feature columns so it follows the extractors.
input_dim = len(feature_names)
output_dim = 2  # the targets are the two columns "x" and "y"
hidden_dim = 32
num_layers = 2

train_dataset = TrainDataset(feature_seqs=feature_seqs, target_seqs=target_seqs)
train_dataloader = DataLoader(
    train_dataset,
    batch_size=batch_size,
    collate_fn=CustomCollate(is_train_mode=True),
    shuffle=False,
)

criterion = nn.MSELoss()
# The loop moves every batch to `device`, so the model's parameters must live
# there too; otherwise the forward pass fails on a CUDA machine.
model = CustomLSTMModel(
    input_dim=input_dim,
    hidden_dim=hidden_dim,
    num_layers=num_layers,
    output_dim=output_dim,
).to(device)

# A named loop variable avoids rebinding `_`, which IPython uses for the last output.
for epoch in range(max_epochs):
    train_loss = noopt_train_loop(
        model=model,
        dataloader=train_dataloader,
        criterion=criterion,
        device=device,
    )
    print(train_loss)

In [None]:
def infer(model, dataloader, device):
    """Run ``model`` over every batch of ``dataloader`` and collect the raw outputs.

    The model is switched to evaluation mode and gradients are disabled. Returns
    a list with one output tensor per batch, in iteration order.
    """
    model.eval()  # put the model in evaluation mode
    predictions = []

    with torch.no_grad():  # disable gradient tracking
        for batch in dataloader:
            packed_inputs = pack_padded_sequence(
                batch["feature_seqs"].to(device),
                batch["lengths"],
                batch_first=True,
                enforce_sorted=False,
            )
            # compute the model's predictions for this batch
            predictions.append(model(packed_inputs))

    return predictions


# Inference smoke test: the "test" set reuses the training feature sequences.
test_dataset = TestDataset(feature_seqs=feature_seqs)
test_dataloader = DataLoader(
    test_dataset,
    batch_size=5,
    collate_fn=CustomCollate(is_train_mode=False),
    shuffle=False,
)
# A freshly initialised model replaces the previous `model`. It must be on
# `device` because `infer` moves every batch there before the forward pass.
model = CustomLSTMModel(
    input_dim=input_dim,
    hidden_dim=hidden_dim,
    num_layers=num_layers,
    output_dim=output_dim,
).to(device)
outputs = infer(model=model, dataloader=test_dataloader, device=device)

In [None]:
# Shape of the first batch's predictions, then the number of batches returned by infer.
print(outputs[0].shape, len(outputs))