In [1]:
%cd /workspace

from pathlib import Path

import numpy as np
import pandas as pd
import polars as pl
from tqdm import tqdm

/workspace


In [2]:
INPUT = Path("/workspace/resources/input")
task_df = pd.read_parquet(INPUT / "task2_dataset_raw_train.parquet")
poi_df = pd.read_parquet(INPUT / "cell_POIcat.parquet")

# task_df = pd.concat([task_df] * 4)

In [18]:
task_df

Unnamed: 0,uid,d,t,x,y
0,2381,0,15,158,99
1,2381,0,16,167,90
2,2381,0,19,167,88
3,2381,0,20,167,88
4,2381,0,23,168,88
...,...,...,...,...,...
26621231,1792,74,42,75,135
26621232,1792,74,43,75,135
26621233,1792,74,45,75,135
26621234,1792,74,46,75,135


In [6]:
class GroupedSimpleFeatureExtoractor:
    def __init__(self, group_key, group_values, agg_methods):
        self.group_key = group_key
        self.group_values = group_values
        self.agg_methods = agg_methods

        self.group_key_name = "_".join(group_key)

    def __call__(self, df):
        agg_df = df.groupby(self.group_key)[self.group_values].agg(self.agg_methods)
        agg_df.columns = [
            f"{x[0]}_grpby_{self.group_key_name}_agg_{x[1]}" for x in agg_df.columns
        ]
        return (
            pd.merge(
                df[self.group_key],
                agg_df,
                how="left",
                left_on=self.group_key,
                right_index=True,
            )
            .drop(self.group_key, axis=1)
            .add_prefix("f_")
        )


df = task_df.copy()
group_key = ["uid"]
group_values = ["x", "y"]
agg_methods = ["mean", "max"]

e = GroupedSimpleFeatureExtoractor(group_key, group_values, agg_methods)
e(df)

Unnamed: 0,f_x_grpby_uid_agg_mean,f_x_grpby_uid_agg_max,f_y_grpby_uid_agg_mean,f_y_grpby_uid_agg_max
0,152.626970,190,94.427320,199
1,152.626970,190,94.427320,199
2,152.626970,190,94.427320,199
3,152.626970,190,94.427320,199
4,152.626970,190,94.427320,199
...,...,...,...,...
26621231,75.556635,93,131.611904,171
26621232,75.556635,93,131.611904,171
26621233,75.556635,93,131.611904,171
26621234,75.556635,93,131.611904,171


In [35]:
class TimeGroupedSimpleFeatureExtoractor:
    def __init__(self, group_key, group_values, time_range, agg_methods):
        self.group_key = group_key
        self.group_values = group_values
        self.time_range = time_range
        self.agg_methods = agg_methods

        self.group_key_name = "_".join(group_key)

        self.d_range = list(range(*time_range["d"])) if "d" in self.time_range else None
        self.t_range = list(range(*time_range["t"])) if "t" in self.time_range else None

        self.time_range_name = self.format_dict(time_range)

    @staticmethod
    def format_dict(d):
        result = []
        for key, values in d.items():
            result.append(f"{key}{values[0]}_{values[1]}")
        return "_".join(result)

    def __call__(self, df):
        selected_df = (
            df[df["d"].isin(self.d_range)].reset_index(drop=True)
            if self.d_range is not None
            else df.copy()
        )
        selected_df = (
            df[df["t"].isin(self.t_range)].reset_index(drop=True)
            if self.t_range is not None
            else selected_df
        )

        agg_df = selected_df.groupby(self.group_key)[self.group_values].agg(
            self.agg_methods
        )
        agg_df.columns = [
            f"{x[0]}_grpby_{self.group_key_name}_agg_{x[1]}_{self.time_range_name}"
            for x in agg_df.columns
        ]
        return (
            pd.merge(
                df[self.group_key],
                agg_df,
                how="left",
                left_on=self.group_key,
                right_index=True,
            )
            .drop(self.group_key, axis=1)
            .add_prefix("f_")
        )


group_key = ["uid"]
group_values = ["x", "y"]
time_range = {"d": [0, 7], "t": [0, 30]}
agg_methods = ["mean", "max"]

e = TimeGroupedSimpleFeatureExtoractor(group_key, group_values, time_range, agg_methods)
e(df)

Unnamed: 0,f_x_grpby_uid_agg_mean_d0_7_t0_30,f_x_grpby_uid_agg_max_d0_7_t0_30,f_y_grpby_uid_agg_mean_d0_7_t0_30,f_y_grpby_uid_agg_max_d0_7_t0_30
0,152.807843,190,92.968627,199
1,152.807843,190,92.968627,199
2,152.807843,190,92.968627,199
3,152.807843,190,92.968627,199
4,152.807843,190,92.968627,199
...,...,...,...,...
99995,134.755556,141,94.533333,107
99996,134.755556,141,94.533333,107
99997,134.755556,141,94.533333,107
99998,134.755556,141,94.533333,107


### GCN Examples

In [15]:
import torch
import torch.nn as nn
import torch.nn.functional as F


# Graph Convolution 層の定義
class GraphConvolution(nn.Module):
    def __init__(self, in_features, out_features):
        super(GraphConvolution, self).__init__()
        self.weight = nn.Parameter(torch.FloatTensor(in_features, out_features))
        self.reset_parameters()

    def reset_parameters(self):
        nn.init.xavier_uniform_(self.weight)

    def forward(self, input, adj):
        support = torch.matmul(input, self.weight)
        output = torch.matmul(adj, support)
        return output


# Mesh GCN モデルの定義
class MeshGCN(nn.Module):
    def __init__(self, feature_dim, hidden_dim, num_classes):
        super(MeshGCN, self).__init__()
        self.gc1 = GraphConvolution(feature_dim, hidden_dim)
        self.gc2 = GraphConvolution(hidden_dim, num_classes)

    def forward(self, x, adj):
        x = F.relu(self.gc1(x, adj))
        x = self.gc2(x, adj)
        return x


# メッシュの特徴データと隣接行列の準備
mesh_features = torch.rand((4, 10))
adjacency_matrix = torch.tensor(
    [
        [0, 1, 0, 0],
        [1, 0, 1, 0],
        [0, 1, 0, 1],
        [0, 0, 1, 0],
    ],
    dtype=torch.float32,
)

# ユーザーのメッシュ位置のインデックス
user_indices = torch.tensor([0, 2, 3])  # user1はm1, user2はm3, user3はm4に位置

# ユーザーデータの取得
user_data = mesh_features[user_indices]  # features

# ユーザーに対応する隣接行列の部分の取得
user_adj = adjacency_matrix[user_indices][:, user_indices]

# モデルのインスタンス化と予測の実行
model = MeshGCN(10, 64, 1)
updated_features = model(user_data, user_adj)

print(updated_features)

tensor([[0.0000],
        [0.2945],
        [0.5518]], grad_fn=<MmBackward0>)


In [16]:
mesh_features.shape

torch.Size([4, 10])

In [17]:
user_indices

tensor([0, 2, 3])

In [18]:
user_data.shape

torch.Size([3, 10])

In [19]:
user_adj

tensor([[0., 0., 0.],
        [0., 0., 1.],
        [0., 1., 0.]])

In [20]:
updated_features.shape

torch.Size([3, 1])

### Transformer Example

In [74]:
import torch
import torch.nn as nn


class CustomTransformerModelV1(nn.Module):
    def __init__(
        self,
        input_size1,
        input_size2,
        d_model,
        output_size,
        nhead=8,
        num_encoder_layers=6,
        num_decoder_layers=6,
    ):
        super().__init__()
        self.embedding_src = nn.Linear(input_size1, d_model)
        self.embedding_tgt = nn.Linear(input_size2, d_model)

        self.transformer = nn.Transformer(
            d_model,
            nhead,
            num_encoder_layers,
            num_decoder_layers,
            batch_first=True,
        )
        self.out = nn.Linear(d_model, output_size)

    def forward(self, batch):
        x_src = self.embedding_src(batch["feature_seqs"])
        x_tgt = self.embedding_tgt(batch["auxiliary_seqs"])

        src_mask = batch["feature_padding_mask"]
        tgt_mask = batch["auxiliary_padding_mask"]

        x = self.transformer(
            src=x_src,
            tgt=x_tgt,
            src_key_padding_mask=src_mask,
            tgt_key_padding_mask=tgt_mask,
        )
        x = self.out(x)

        return x


# ハイパーパラメータ
batch_size = 32
input_size1 = 5
input_size2 = 12
d_model = 128
seq_len_src = 20
seq_len_tgt = 10
output_size = 1
nhead = 2

# モデルのインスタンス化
model = CustomTransformerModelV1(
    nhead=nhead,
    input_size1=input_size1,
    input_size2=input_size2,
    d_model=d_model,
    output_size=output_size,
)

# 入力データの準備
batch = {
    "feature_seqs": torch.randn(batch_size, seq_len_src, input_size1),
    "auxiliary_seqs": torch.randn(batch_size, seq_len_tgt, input_size2),
    "feature_padding_mask": (torch.randn(batch_size, seq_len_src) > 0.5),  # 仮のマスク
    "auxiliary_padding_mask": (torch.randn(batch_size, seq_len_tgt) > 0.5),  # 仮のマスク
}
# モデルの実行
output = model(batch)
print(output.shape)  # [batch_size, seq_len_tgt, output_size] の形になるはず

torch.Size([32, 10, 1])
