In [1]:
%cd /kaggle/working

/kaggle/working


In [2]:
import numpy as np
import pandas as pd
import yaml
import sys
import os
import pickle
from glob import glob
from pathlib import Path
import pickle
from tqdm import tqdm
import gc

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.optim import lr_scheduler
from torch.autograd import Variable
from torch.optim.lr_scheduler import ReduceLROnPlateau

import pytorch_lightning as pl
from torchmetrics import MetricCollection, MeanSquaredError
from pytorch_lightning.callbacks import ModelCheckpoint, EarlyStopping

import warnings
warnings.filterwarnings('ignore')

sys.path.append('/kaggle/src')
from utils.set_seed import seed_base_torch
from utils.feature_contena import Features
from utils.lightning_utils import MyLightningDataModule

PACKAGE_DIR = Path("/kaggle/src")
CFG = yaml.safe_load(open(PACKAGE_DIR / "config.yaml", "r"))
print(CFG["wavenet"]["execution"]["exp_id"])

CFG["output_dir"] = f"/kaggle/output/{CFG['wavenet']['execution']['exp_id']}"
!rm -r {CFG["output_dir"]}
os.makedirs(CFG["output_dir"], exist_ok=True)

seed_base_torch(CFG["env"]["seed"])

2023-10-05 10:33:18.328104: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /usr/local/cuda/lib64:/usr/local/cuda/lib:/usr/local/lib/x86_64-linux-gnu:/usr/local/nvidia/lib:/usr/local/nvidia/lib64:/usr/local/nvidia/lib:/usr/local/nvidia/lib64:/opt/conda/lib
2023-10-05 10:33:18.328453: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /usr/local/cuda/lib64:/usr/local/cuda/lib:/usr/local/lib/x86_64-linux-gnu:/usr/local/nvidia/lib:/usr/local/nvidia/lib64:/usr/local/nvidia/lib:/usr/local/nvidia/lib64:/opt/conda/lib


exp_070


In [3]:
EPOCHS = 150
NNBATCHSIZE = 64
GROUP_BATCH_SIZE = 4000
SEED = 321
LR = 0.001
SPLITS = 5

ENMO_MEAN = 0.041315
ANGLEZ_MEAN = -8.810453
ENMO_STD = 0.101829
ANGLEZ_STD = 35.521877

In [4]:
def generate_features(train: pd.DataFrame):
    features = Features()
    features.add_num_features(["anglez", "enmo"])

    # 時刻
    timestamp = pd.to_datetime(train["timestamp"].values[0])
    total_seconds = (timestamp - timestamp.replace(hour=0, minute=0, second=0, microsecond=0)).total_seconds()
    train["total_seconds"] = (total_seconds + train.index * 5) % (24 * 60 * 60) # [sec]
    train["total_seconds"] /= train["total_seconds"].max() - 0.5
    features.add_num_feature("total_seconds")    

    columns = ["anglez", "enmo"]

    # その人のその時刻での平均的な測定値
    gb = train.groupby("total_seconds")[columns].mean()
    gb.columns = [f"{c}_mean" for c in columns]
    train["anglez_mean"] = train["total_seconds"].map(gb["anglez_mean"])
    train["enmo_mean"] = train["total_seconds"].map(gb["enmo_mean"])
    features.add_num_features(gb.columns.tolist())
    columns += gb.columns.tolist()

    # diff
    for c in features.all_features():
        for t in [-1, 1]:
            train[f"{c}_diff_{t}"] = (train[c] - train[c].shift(t)).fillna(0)
            features.add_num_feature(f"{c}_diff_{t}")
    return train, features

In [5]:
files = glob(f"{CFG['dataset']['step_csv_dir']}/*.csv")
cv_split = pd.read_csv(CFG['dataset']['cv_split_path'])
dfs = []
for f in tqdm(files):
    df = pd.read_csv(f)

    # normalize
    df["enmo"] = (df["enmo"] - ENMO_MEAN) / ENMO_STD
    df["anglez"] = (df["anglez"] - ANGLEZ_MEAN) / ANGLEZ_STD

    # feature engineering
    train, features = generate_features(df)

    sid = df["series_id"].values[0]
    df["fold"] = cv_split.loc[cv_split["series_id"] == sid, "fold"].values[0]
    for start in range(0, len(df), GROUP_BATCH_SIZE // 2):
        end = start + GROUP_BATCH_SIZE
        if end > len(df):
            end = len(df)
            start = end - GROUP_BATCH_SIZE
            assert start >= 0
        dfs.append(df.iloc[start: end])
gc.collect()
print(len(dfs))
dfs[0]

100%|██████████| 277/277 [01:23<00:00,  3.34it/s]


64107


Unnamed: 0,series_id,step,timestamp,anglez,enmo,event,target,total_seconds,anglez_mean,enmo_mean,...,anglez_diff_1,anglez_mean_diff_-1,anglez_mean_diff_1,enmo_diff_-1,enmo_diff_1,enmo_mean_diff_-1,enmo_mean_diff_1,total_seconds_diff_-1,total_seconds_diff_1,fold
0,af91d9a50547,0,2018-02-05T11:15:00-0500,2.747173,-0.405729,,1,0.468780,-0.337581,0.476960,...,0.000000,-0.001133,0.000000,0.000000,0.000000,0.080977,0.000000,-0.000058,0.000000,3
1,af91d9a50547,1,2018-02-05T11:15:05-0500,2.747199,-0.405729,,1,0.468838,-0.336448,0.395982,...,0.000025,0.039054,0.001133,0.000000,0.000000,-0.062687,-0.080977,-0.000058,0.000058,3
2,af91d9a50547,2,2018-02-05T11:15:10-0500,2.747283,-0.405729,,1,0.468896,-0.375502,0.458669,...,0.000084,-0.064311,-0.039054,0.000000,0.000000,-0.039977,0.062687,-0.000058,0.000058,3
3,af91d9a50547,3,2018-02-05T11:15:15-0500,2.747182,-0.405729,,1,0.468953,-0.311192,0.498646,...,-0.000101,0.039922,0.064311,0.000000,0.000000,-0.114448,0.039977,-0.000058,0.000058,3
4,af91d9a50547,4,2018-02-05T11:15:20-0500,2.747308,-0.405729,,1,0.469011,-0.351114,0.613095,...,0.000127,0.053927,-0.039922,0.000000,0.000000,0.163387,0.114448,-0.000058,0.000058,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3995,af91d9a50547,3995,2018-02-05T16:47:55-0500,-1.019410,-0.152363,,1,0.699987,-0.203260,0.601801,...,0.224729,0.146731,0.118163,-0.122755,-0.118827,0.133394,0.039732,-0.000058,0.000058,3
3996,af91d9a50547,3996,2018-02-05T16:48:00-0500,-1.153043,-0.029608,,1,0.700045,-0.349991,0.468408,...,-0.133633,0.086902,-0.146731,0.164982,0.122755,-0.032776,-0.133394,-0.000058,0.000058,3
3997,af91d9a50547,3997,2018-02-05T16:48:05-0500,-0.906871,-0.194591,,1,0.700102,-0.436893,0.501183,...,0.246172,0.084591,-0.086902,0.055976,-0.164982,0.159213,0.032776,-0.000058,0.000058,3
3998,af91d9a50547,3998,2018-02-05T16:48:10-0500,-0.871929,-0.250567,,1,0.700160,-0.521484,0.341970,...,0.034942,-0.071669,-0.084591,-0.054012,-0.055976,-0.597570,-0.159213,-0.000058,0.000058,3


In [6]:
class ZzzDataset(torch.utils.data.Dataset):
    def __init__(self, dfs: list[pd.DataFrame], mode: str, features: Features):
        self.dfs = dfs
        self.mode = mode
        self.features = features

    def __len__(self):
        return len(self.dfs)

    def __getitem__(self, index):
        df = self.dfs[index]

        feats = df[self.features.all_features()].values.astype(np.float32)

        if self.mode == "train":
            targets = df["target"].values.astype(np.float32)
            return feats, targets
        else:
            return feats

In [7]:
from utils.torch_models import WaveBlock    
from utils.lightning_utils import MyLightningDataModule
    
class WaveNetModel(pl.LightningModule):
    def __init__(self, dim_input: int, loss_fn=nn.CrossEntropyLoss(), lr=0.001, weight_decay=0):
        super().__init__()
        self.lr = lr
        self.weight_decay = weight_decay    
        self.loss_fn = loss_fn

        input_size = 128
        self.LSTM1 = nn.GRU(input_size=dim_input, hidden_size=64, num_layers=2, batch_first=True, bidirectional=True)

        self.LSTM = nn.GRU(input_size=input_size, hidden_size=64, num_layers=2, batch_first=True, bidirectional=True)
        #self.attention = Attention(input_size,4000)
        #self.rnn = nn.RNN(input_size, 64, 2, batch_first=True, nonlinearity='relu')
               
        self.wave_block1 = WaveBlock(128, 16, 4)
        self.wave_block2 = WaveBlock(16, 32, 4)
        self.wave_block3 = WaveBlock(32, 64, 2)
        self.wave_block4 = WaveBlock(64, 128, 1)
        self.fc = nn.Linear(128, 1)
        
        self.train_metrics = MetricCollection([], prefix="")
        self.valid_metrics = MetricCollection([], prefix="val_")
        
        self.val_step_outputs = []
        self.val_step_labels = []
        

    def forward(self, x):
        bs = x.shape[0]
        slen = x.shape[1]

        x,_ = self.LSTM1(x)
        x = x.permute(0, 2, 1)
      
        x = self.wave_block1(x)
        x = self.wave_block2(x)
        x = self.wave_block3(x)
        
        #x,_ = self.LSTM(x)
        x = self.wave_block4(x)
        x = x.permute(0, 2, 1)
        x,_ = self.LSTM(x)
        #x = self.conv1(x)
        #print(x.shape)
        #x = self.rnn(x)
        #x = self.attention(x)
        x = self.fc(x)
        logits = x.view(bs, slen)
        return logits

    def configure_optimizers(self):
        optimizer = optim.Adam(self.parameters(), lr=self.lr, weight_decay=self.weight_decay)
        scheduler = ReduceLROnPlateau(optimizer, factor=0.1, patience=3, verbose=True)
        return {"optimizer": optimizer, "lr_scheduler": scheduler, "monitor": "val_loss"}
    

    def training_step(self, batch, batch_idx):
        X, y = batch
        preds = self.forward(X)

        loss = self.loss_fn(preds, y)

        self.train_metrics(preds, y)
        self.log(
            "loss",
            loss,
            prog_bar=True,
            logger=True,
            on_epoch=True,
            on_step=True,
        )
        self.log_dict(
            self.train_metrics,
            prog_bar=True,
            logger=True,
            on_epoch=True,
            on_step=True,
        )
        return {"loss": loss}

    def validation_step(self, batch, batch_idx):
        X, y = batch
        preds = self.forward(X)

        self.val_step_outputs.append(preds)
        self.val_step_labels.append(y)

    def on_validation_epoch_end(self):
        preds = torch.cat(self.val_step_outputs)
        labels = torch.cat(self.val_step_labels)
        self.val_step_outputs.clear()
        self.val_step_labels.clear()
        loss = self.loss_fn(preds, labels)

        self.valid_metrics(preds, labels)
        self.log(
            "val_loss",
            loss,
            prog_bar=False,
            logger=True,
            on_epoch=True,
            on_step=False,
        )
        self.log_dict(
            self.valid_metrics,
            prog_bar=False,
            logger=True,
            on_epoch=True,
            on_step=False,
        )

        # ログをprint
        self.print_metric(preds, labels, "valid")

    def print_metric(self, y_hat, y, train_or_valid="train"):
        """
        ログをprintする。次のepochが終わると上書きされてしまうので。
        TODO: たぶんもっとマシな方法があるので探す。
        """
        if train_or_valid == "train":
            metrics = self.train_metrics
        else:
            metrics = self.valid_metrics
        loss = self.loss_fn(y_hat, y)

        print(f"[epoch {self.trainer.current_epoch}] {train_or_valid}: ", end="")
        print(f"{type(self.loss_fn).__name__}={loss:.4f}", end=", ")
        for name in metrics:
            v = metrics[name](y_hat, y)
            print(f"{name}={v:.4f}", end=", ")
        print()


In [8]:
features = Features()
features.add_num_features(["anglez", "enmo"])

In [9]:
!rm -r logs

rm: cannot remove 'logs': No such file or directory


In [10]:
fold_oof_dfs = []
for fold in range(5):
    print(f"== fold {fold} ==")
    
    # 学習・評価データ
    train_dfs = [df for df in dfs if df["fold"].unique()[0] != fold]
    valid_dfs = [df for df in dfs if df["fold"].unique()[0] == fold]
    train_dataset = ZzzDataset(train_dfs, mode="train", features=features)
    valid_dataset = ZzzDataset(valid_dfs, mode="train", features=features)
    data_module = MyLightningDataModule(train_dataset, valid_dataset, batch_size=NNBATCHSIZE)

    # モデル
    model = WaveNetModel(lr=LR, dim_input=len(features.all_features()), loss_fn=nn.BCEWithLogitsLoss())
    
    # コールバック
    cp_callback = ModelCheckpoint(
        "logs/", 
        filename=f"best_model_fold{fold}",
        monitor="val_loss",
        mode="min",
        save_top_k=1,
        save_last=False,
    )
    es_callback = EarlyStopping(
        monitor="val_loss",
        mode="min",
        patience=5,
    )

    # 学習
    trainer = pl.Trainer(
        callbacks=[cp_callback, es_callback],
        )
    trainer.fit(model, datamodule=data_module)

    # 推論
    preds = []
    model = WaveNetModel.load_from_checkpoint(f"logs/best_model_fold{fold}.ckpt", dim_impput=len(features.all_features()), lr=0.001, loss_fn=nn.BCEWithLogitsLoss()).to("cuda")
    model.eval()
    with torch.no_grad():
        for X, y in data_module.val_dataloader():
            pred = F.sigmoid(model(X.to("cuda"))).detach().cpu().numpy()
            preds.append(pred)

    oof_dfs = []
    for pred, df in zip(np.vstack(preds), valid_dfs):
        df["oof"] = pred
        df = df.drop(columns=features.all_features())
        oof_dfs.append(df)

    oof_df = pd.concat(oof_dfs)
    oof_df = oof_df.groupby(["series_id", "step"]).mean().reset_index().sort_values(["series_id", "step"])
    fold_oof_dfs.append(oof_df)
    break

== fold 0 ==


Sanity Checking: 0it [00:00, ?it/s]

[epoch 0] valid: BCEWithLogitsLoss=0.6766, 


Training: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

[epoch 0] valid: BCEWithLogitsLoss=0.1569, 


Validation: 0it [00:00, ?it/s]

[epoch 1] valid: BCEWithLogitsLoss=0.1151, 


Validation: 0it [00:00, ?it/s]

[epoch 2] valid: BCEWithLogitsLoss=0.1183, 


Validation: 0it [00:00, ?it/s]

[epoch 3] valid: BCEWithLogitsLoss=0.1111, 


Validation: 0it [00:00, ?it/s]

[epoch 4] valid: BCEWithLogitsLoss=0.1110, 


Validation: 0it [00:00, ?it/s]

[epoch 5] valid: BCEWithLogitsLoss=0.1106, 


Validation: 0it [00:00, ?it/s]

[epoch 6] valid: BCEWithLogitsLoss=0.1142, 


Validation: 0it [00:00, ?it/s]

[epoch 7] valid: BCEWithLogitsLoss=0.1248, 
