# Directory Settings

In [None]:
# ====================================================
# directory settings
# ====================================================

import os

# Directory that receives the training log and any other run artifacts.
OUTPUT_DIR = './'
# exist_ok=True makes the creation idempotent and removes the gap between
# checking for the directory and creating it.
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Libraries

In [None]:
import os
import gc
from glob import glob
import sys
import math
import time
import random
import shutil
from pathlib import Path
from typing import Dict, List
from scipy.stats import entropy
from scipy.signal import butter, lfilter, freqz
from contextlib import contextmanager
from collections import defaultdict, Counter

from scipy.interpolate import interp1d
import numpy as np
import pandas as pd
from sklearn import preprocessing
from sklearn.preprocessing import LabelEncoder, PowerTransformer
from sklearn.metrics import accuracy_score, log_loss
from tqdm.auto import tqdm
from functools import partial
import cv2
from PIL import Image
import torch
import torch.nn as nn
import pytorch_lightning as pl
import torch.nn.functional as F
from torch.optim import Adam, SGD, AdamW
import torchvision.models as models
from torch.nn.parameter import Parameter
from torch.utils.data import DataLoader, Dataset
from torch.optim.lr_scheduler import ReduceLROnPlateau, OneCycleLR, CosineAnnealingLR, CosineAnnealingWarmRestarts
from sklearn.preprocessing import LabelEncoder
from torchvision.transforms import v2
from sklearn.model_selection import GroupKFold, StratifiedGroupKFold
from sklearn.model_selection import train_test_split
import albumentations as A
from albumentations import (Compose, Normalize, Resize, RandomResizedCrop, HorizontalFlip, VerticalFlip, ShiftScaleRotate, Transpose)
from albumentations.pytorch import ToTensorV2
from albumentations import ImageOnlyTransform
import timm
import warnings
# Silence every Python warning for the rest of the session.
warnings.filterwarnings('ignore')
# Restrict CUDA to GPUs 0 and 1.
# NOTE(review): this is set after torch has been imported — presumably still
# before CUDA is initialised; confirm it takes effect in the target runtime.
os.environ["CUDA_VISIBLE_DEVICES"]="0,1"
# Use CUDA when torch reports it as available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
from matplotlib import pyplot as plt
import joblib
# Version tag of this notebook (not referenced by the code visible here).
VERSION=17
# Dataset roots. Both end with "/" because file names are appended to them by
# plain string concatenation.
base_path = "/kaggle/input/final-deepmind-comp-dataset/final_deepmind_comp_dataset/zindi_data/"
additional_path = "/kaggle/input/final-deepmind-comp-dataset/final_deepmind_comp_dataset/image_classifier_results/"

# Config

In [None]:
# ====================================================
# CFG
# ====================================================

class CFG:
    """Static experiment configuration, read as class attributes (never instantiated here)."""

    # --- run switches ---
    wandb = False   # enables the Weights & Biases cell and per-step logging
    debug = False   # runs the one-batch smoke-test cell
    train = True
    apex = True     # turns on autocast + GradScaler in the training step
    t4_gpu = False

    # --- learning-rate schedule ---
    # One of: 'ReduceLROnPlateau', 'CosineAnnealingLR',
    # 'CosineAnnealingWarmRestarts', 'OneCycleLR'
    scheduler = 'OneCycleLR'
    # Keyword arguments for CosineAnnealingLR
    cosanneal_params = dict(T_max=6, eta_min=1e-5, last_epoch=-1)
    # Keyword arguments for ReduceLROnPlateau
    reduce_params = dict(mode='min', factor=0.2, patience=4, eps=1e-6, verbose=True)
    # Keyword arguments for CosineAnnealingWarmRestarts
    cosanneal_res_params = dict(T_0=20, eta_min=1e-6, T_mult=1, last_epoch=-1)

    # --- logging / data loading ---
    print_freq = 15   # progress is printed every `print_freq` steps
    num_workers = 1

    # --- model / optimiser ---
    cnn_model_name = 'resnet1d'
    model_name = 'resnet1d'
    optimizer = 'Adan'
    epochs = 25
    factor = 0.9
    patience = 2
    eps = 1e-6
    lr = 1e-3
    min_lr = 1e-6
    batch_size = 32
    weight_decay = 1e-2
    batch_scheduler = True            # step the scheduler after every optimiser step
    gradient_accumulation_steps = 1
    max_grad_norm = 1e6               # threshold passed to clip_grad_norm_
    seed = 2025

    # --- task definition / cross-validation ---
    target_cols = "label"
    target_size = 1
    in_channels = 1
    n_fold = 10                       # number of StratifiedGroupKFold splits
    # trn_fold = [2]
    trn_fold = [4, 5, 6]              # presumably the folds that get trained — confirm in the run cell

# Utils

In [None]:
def init_logger(log_file=OUTPUT_DIR+'train.log'):
    """Return this module's logger, writing bare messages to the console and to a file.

    Args:
        log_file: Path of the log file handed to ``FileHandler``.

    Returns:
        The logger registered under ``__name__`` at INFO level with exactly
        one stream handler and one file handler attached.
    """
    from logging import getLogger, INFO, FileHandler,  Formatter,  StreamHandler
    logger = getLogger(__name__)
    logger.setLevel(INFO)
    # getLogger() returns the same object on every call, so re-running this
    # cell used to stack extra handlers and repeat every message. Drop (and
    # close) handlers left over from an earlier call first.
    for stale_handler in list(logger.handlers):
        logger.removeHandler(stale_handler)
        stale_handler.close()
    for handler in (StreamHandler(), FileHandler(filename=log_file)):
        handler.setFormatter(Formatter("%(message)s"))
        logger.addHandler(handler)
    return logger

LOGGER = init_logger()

def get_score(preds, targets):
    """Return the log loss of ``preds`` scored against ``targets``.

    Mind the argument order: predictions come first here, whereas
    ``log_loss`` receives the ground truth first.
    """
    score = log_loss(targets, preds)
    return score


class ContrastiveLoss(torch.nn.Module):
    """
    Margin-based pairwise loss in the form of the contrastive loss from
    http://yann.lecun.com/exdb/publis/pdf/hadsell-chopra-lecun-06.pdf

    NOTE(review): the pairwise quantity used here is the cosine similarity of
    the L2-normalised embeddings, not a Euclidean distance as in the paper.
    Pairs with label 0 are therefore driven towards zero similarity and pairs
    with label 1 towards a similarity of at least ``margin`` — confirm this is
    the intended label convention.
    """

    def __init__(self, margin=1.0):
        super().__init__()
        self.margin = margin

    def forward(self, output1, output2, label):
        """Return the mean loss over all pairs in the batch."""
        similarity = F.cosine_similarity(F.normalize(output1), F.normalize(output2))
        # Shortfall of the similarity with respect to the margin, floored at 0.
        shortfall = torch.clamp(self.margin - similarity, min=0.0)
        per_pair = (1 - label) * similarity.pow(2) + label * shortfall.pow(2)
        return per_pair.mean()

def get_location(value):
    """Return the location id: the first two "_"-separated tokens of ``value``.

    The string is split once (at most two cuts) and the leading tokens are
    re-joined, which matches how ``location_id`` is derived from ``event_id``
    when the data is loaded. A value with fewer than two tokens is returned
    unchanged instead of raising ``IndexError``.
    """
    return "_".join(value.split("_", 2)[:2])


# def seed_torch(seed=42):
#     random.seed(seed)
#     os.environ['PYTHONHASHSEED'] = str(seed)
#     np.random.seed(seed)
#     torch.manual_seed(seed)
#     torch.cuda.manual_seed(seed)
#     torch.cuda.manual_seed_all(seed)  # If using multi-GPU
#     torch.backends.cudnn.deterministic = True
#     torch.backends.cudnn.benchmark = False

def seed_torch(seed=CFG.seed):
    """Seed the Python, NumPy and PyTorch RNGs and set the cuDNN / TF32 backend flags."""
    os.environ['PYTHONHASHSEED'] = str(seed)

    # Every generator receives the same seed.
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed,
        torch.cuda.manual_seed_all,
    )
    for seeder in seeders:
        seeder(seed)

    # cuDNN stays enabled, in deterministic mode and without auto-tuning.
    cudnn = torch.backends.cudnn
    cudnn.enabled = True
    cudnn.deterministic = True
    cudnn.benchmark = False
    # TF32 matmuls remain allowed.
    torch.backends.cuda.matmul.allow_tf32 = True

seed_torch(seed=CFG.seed)

# Load data

## Load precipitation data

In [None]:
# Event-level train / test tables.
data = pd.read_csv(f"{base_path}Train.csv")
data_test = pd.read_csv(f"{base_path}Test.csv")
# Flood probability per location, produced by the image classifier.
data_with_cv = pd.read_csv(f"{additional_path}train_with_cv_results.csv")[['location_id', 'flood_probability']]
data_test_with_cv = pd.read_csv(f"{additional_path}test_with_cv_results.csv")[['location_id', 'flood_probability']]

for df in (data, data_test):
    # Location id = the first two "_"-separated tokens of the event id.
    df['location_id'] = df['event_id'].apply(lambda event_id: '_'.join(event_id.split('_')[:2]))
    # Integer id per location, numbered in order of first appearance.
    df['event_idx'] = df.groupby('location_id', sort=False).ngroup()
    # Running position of each row inside its location.
    df['event_t'] = df.groupby('location_id').cumcount()

# Attach the per-location flood probability to every event row.
data = data.merge(data_with_cv, on='location_id', how='left')
data_test = data_test.merge(data_test_with_cv, on='location_id', how='left')

# Number of distinct locations in train / test, then how many they share.
print(len(set(data['location_id'])), len(set(data_test['location_id'])))
print(len(set(data['location_id']) & set(data_test['location_id'])))

In [None]:
data.head()

In [None]:
# Give every row a validation-fold id. Rows of one location always share a
# fold (grouping by location_id) and folds are stratified on the label.
gkf = StratifiedGroupKFold(n_splits=CFG.n_fold)

data['fold'] = -1

for fold_id, (_, val_idx) in enumerate(
    gkf.split(data, y=data['label'], groups=data['location_id'])
):
    print(f"Fold {fold_id}")
    # split() yields positional indices, so address rows with .iloc; this is
    # correct whatever the index labels of `data` happen to be.
    print(f"group={data['location_id'].iloc[val_idx].unique()}")
    data.iloc[val_idx, data.columns.get_loc('fold')] = fold_id

del gkf
_ = gc.collect()

In [None]:
def norm_feature_engineering(df):
    """Build the feature matrix for one time series of precipitation.

    The precipitation column is log-transformed (``log(x + 0.01)``, float32)
    and expanded into lagged / lead copies (1..181 steps), rolling
    mean / std / median and rolling max / min features. ``flood_probability``
    is appended as the last feature. Missing values are replaced by 0.

    The input frame is left untouched: all features are collected first and
    assembled into a single frame, instead of inserting several hundred
    columns into ``df`` one by one.

    Args:
        df: Frame with at least ``precipitation`` and ``flood_probability``
            columns, rows already in time order.

    Returns:
        ``numpy.ndarray`` of shape ``(n_features, n_rows)``. Features follow
        the order: columns of ``df`` whose name contains ``'precipitation'``,
        the shift features, the rolling statistics, the rolling extrema and
        finally ``flood_probability``.
    """
    # df = apply_boxcox_transformation(df, "precipitation")

    precip = np.log(df["precipitation"] + 0.01).astype("float32")

    # Seed the mapping with every column that already mentions
    # 'precipitation' so the output order is the same as when the features
    # were written into `df` directly (dicts keep insertion order, and
    # re-assigning a key keeps its position).
    columns = {col: df[col] for col in df.columns if 'precipitation' in col}
    columns['precipitation'] = precip

    for w in range(1, 182):
        columns['precipitation_shift_pos_' + str(w)] = precip.shift(w).fillna(0)
        columns['precipitation_shift_neg_' + str(w)] = precip.shift(-w).fillna(0)

    for w in [3, 7, 14, 30, 60, 120]:
        window = precip.rolling(w, min_periods=1)
        columns[f'precipitation_rolling_mean_{w}'] = window.mean()
        columns[f'precipitation_rolling_std_{w}'] = window.std()
        columns[f'precipitation_rolling_median_{w}'] = window.median()

    for w in [3, 7, 14, 30, 60, 90, 120, 180]:
        window = precip.rolling(w, min_periods=1)
        columns[f'precipitation_max_{w}'] = window.max()
        columns[f'precipitation_min_{w}'] = window.min()

    columns['flood_probability'] = df['flood_probability']

    features = pd.DataFrame(columns).fillna(0)

    return features.to_numpy().transpose(1, 0)

def apply_boxcox_transformation(df, col):
    """
    Box-Cox transform one column, treating exact zeros separately.

    The transformer is fitted on the non-zero entries only; zero entries are
    then set to a value just below the smallest transformed one.

    Args:
        df (pd.DataFrame): Input DataFrame (not modified).
        col (str): Name of the column to transform.

    Returns:
        pd.DataFrame: Copy of ``df`` with ``col`` replaced by the transformed values.
    """
    result = df.copy()

    # True where the value takes part in the Box-Cox fit
    is_nonzero = result[col] != 0.0

    transformer = PowerTransformer(method='box-cox')
    fitted = transformer.fit_transform(result.loc[is_nonzero, [col]]).flatten()

    # Output buffer: fitted values where non-zero, a floor value elsewhere
    values = np.zeros_like(result[col], dtype=float)
    floor_value = fitted.min() - 0.001  # Slightly below min
    values[is_nonzero] = fitted
    values[~is_nonzero] = floor_value

    result[col] = values
    return result

def time_warp(features, sigma=0.2, knot=4):
    """
    Warp the time axis of every feature row independently.

    For each row, ``knot`` random factors drawn around 1.0 (std ``sigma``)
    are linearly interpolated over the time steps; the row is then resampled
    at ``step * factor``.
    """
    n_steps = features.shape[1]
    steps = np.arange(n_steps)
    knot_positions = np.linspace(0, n_steps, num=knot)

    warped = np.zeros_like(features)
    for row, series in enumerate(features):
        factors = np.random.normal(loc=1.0, scale=sigma, size=(knot,))
        factor_curve = interp1d(knot_positions, factors, kind='linear', fill_value='extrapolate')
        warped[row] = np.interp(steps * factor_curve(steps), steps, series)

    return warped

def time_shift(features, max_shift=10):
    """Circularly shift ``features`` along axis 1 by a random number of steps.

    The shift is drawn uniformly from ``[-max_shift, max_shift]`` with both
    ends included. ``np.random.randint`` excludes its upper bound, hence the
    ``+ 1``; without it ``+max_shift`` could never be drawn and
    ``max_shift=0`` raised ``ValueError``.
    """
    shift = np.random.randint(-max_shift, max_shift + 1)
    return np.roll(features, shift, axis=1)  # Shift along the time dimension


def add_gaussian_noise(features, std=0.1):
    """Return ``features`` plus element-wise zero-mean Gaussian noise of standard deviation ``std``."""
    return features + np.random.normal(loc=0, scale=std, size=features.shape)

def feature_dropout(features, drop_prob=0.2):
    """Zero out whole feature rows, each one independently with probability ``drop_prob``."""
    keep_prob = 1 - drop_prob
    # One 0/1 draw per row; the trailing axis of size 1 broadcasts it over time.
    keep_mask = np.random.binomial(n=1, p=keep_prob, size=(features.shape[0], 1))
    return features * keep_mask

def scale_features(features, scale_range=(0.8, 1.2)):
    """Multiply every feature row by its own random factor drawn uniformly from ``scale_range``."""
    low = scale_range[0]
    high = scale_range[1]
    # One factor per row, broadcast along the time axis.
    row_factors = np.random.uniform(low, high, size=(features.shape[0], 1))
    return features * row_factors

def freq_perturbation(features, alpha=0.1):
    """Rescale each FFT coefficient (taken along axis 1) by a random real gain around 1.

    Gains are drawn from a normal distribution with mean 1 and standard
    deviation ``alpha``; the real part of the inverse transform is returned.
    """
    spectrum = np.fft.fft(features, axis=1)
    gains = np.random.normal(1, alpha, size=spectrum.shape)
    perturbed = np.fft.ifft(spectrum * gains, axis=1)
    return perturbed.real

def random_flip(precipitation, label):
    """Reverse the time axis (last axis) of the features and of the label sequence.

    ``np.flip`` without ``axis`` reverses *every* axis, which for a 2-D
    feature matrix also reversed the order of the feature rows. Only the
    last axis is flipped here so each row keeps its position.

    Returns:
        Tuple ``(precipitation, label)`` of flipped copies.
    """
    precipitation = np.flip(precipitation, axis=-1).copy()  # Ensure positive strides
    label = np.flip(label, axis=-1).copy()  # Ensure positive strides
    return precipitation, label



class CustomDataset(Dataset):
    """Dataset with one sample per location.

    Each sample holds the engineered feature matrix of a location's time
    series and the matching label sequence.

    Args:
        df: Event-level frame with ``location_id``, ``event_t``,
            ``precipitation`` and ``flood_probability`` columns, plus
            ``label`` unless ``mode == 'test'``.
        augment: Apply the random augmentations in ``__getitem__``.
        mode: ``'test'`` yields an all-zero placeholder label; any other
            value (including the default ``True``) reads labels from ``df``.
    """

    def __init__(
        self, df: pd.DataFrame, augment: bool = False,
        mode: "str | bool" = True
    ):
        self.df = df
        self.augment = augment
        self.mode = mode
        # Distinct locations in order of first appearance; position = sample index.
        self.location_ids = df['location_id'].unique()

    def __len__(self):
        # Reuse the ids computed once in __init__ rather than re-scanning the frame.
        return len(self.location_ids)

    def __getitem__(self, index):
        """Return ``{'precipitation': float32 tensor, 'label': float32 tensor}`` for one location."""
        precipitation, label = self.__data_generation(index)

        if self.augment:
            # Each augmentation fires independently with probability 0.3.
            if np.random.rand() < 0.3:
                precipitation = time_shift(precipitation)

            if np.random.rand() < 0.3:
                precipitation = add_gaussian_noise(precipitation)

            if np.random.rand() < 0.3:
                precipitation = feature_dropout(precipitation)

            if np.random.rand() < 0.3:
                precipitation, label = random_flip(precipitation, label)

        return {'precipitation': torch.tensor(precipitation, dtype=torch.float32), 'label': torch.tensor(label, dtype=torch.float32)}

    def __data_generation(self, index):
        """Return ``(features, label)`` for the location at position ``index``."""
        location_id = self.location_ids[index]
        # Select the location's rows once, in time order, and use the same
        # selection for both the labels and the features.
        rows = self.df[self.df['location_id'] == location_id].sort_values('event_t')

        if self.mode != 'test':
            label = rows['label'].to_numpy()
        else:
            # Placeholder labels; 730 is the hard-coded sequence length.
            label = np.zeros(730, dtype='float32')

        precipitation = norm_feature_engineering(rows)

        return precipitation, label

# W&B Settings

In [None]:
# ====================================================
# wandb
# ====================================================
if CFG.wandb:

    import wandb

    # Log in with the Kaggle secret "wandb_key"; fall back to anonymous mode
    # when the secret (or the kaggle_secrets package) is unavailable.
    try:
        from kaggle_secrets import UserSecretsClient
        user_secrets = UserSecretsClient()
        secret_value_0 = user_secrets.get_secret("wandb_key")
        wandb.login(key=secret_value_0)
        anony = None
    except Exception:
        # `except Exception` (not a bare except) so Ctrl-C / SystemExit still propagate.
        anony = "must"
        # The hint names the secret label that is actually requested above.
        print('If you want to use your W&B account, go to Add-ons -> Secrets and provide your W&B access token. Use the Label name as wandb_key. \nGet your W&B access token from here: https://wandb.ai/authorize')


    def class2dict(f):
        """Return the non-dunder attributes of ``f`` as a ``{name: value}`` dict."""
        return {name: getattr(f, name) for name in dir(f) if not name.startswith('__')}

# Models

In [None]:
class SinusoidalPositionalEmbedding(torch.nn.Module):
    """Fixed (non-learned) sine / cosine positional embedding added to a sequence."""

    def __init__(self, seq_len, dim_model):
        """
        Initialize the sinusoidal positional embedding.

        Args:
        seq_len (int): The length of the sequence (e.g., 730 days).
        dim_model (int): The model's embedding size (e.g., 512). Odd sizes are supported.
        """
        super().__init__()
        self.seq_len = seq_len
        self.dim_model = dim_model
        # Non-persistent buffer: it follows the module through .to()/.cuda(),
        # so forward() no longer copies it from the CPU on every call, and it
        # stays out of state_dict(), so checkpoint keys are unchanged.
        self.register_buffer(
            'positional_embedding', self._create_positional_embedding(), persistent=False
        )

    def _create_positional_embedding(self):
        """
        Create the sinusoidal positional embedding tensor.

        Returns:
        torch.Tensor: Positional embeddings of shape (seq_len, dim_model).
        """
        position = torch.arange(self.seq_len, dtype=torch.float).unsqueeze(1)  # Shape: (seq_len, 1)
        div_term = torch.exp(
            torch.arange(0, self.dim_model, 2, dtype=torch.float) *
            (-math.log(10000.0) / self.dim_model)
        )  # Frequencies: Shape (ceil(dim_model / 2))
        angles = position * div_term

        pos_embedding = torch.zeros(self.seq_len, self.dim_model)
        pos_embedding[:, 0::2] = torch.sin(angles)  # Even indices
        # For an odd dim_model there is one odd index fewer than frequencies;
        # trim so the assignment shapes match.
        pos_embedding[:, 1::2] = torch.cos(angles[:, :self.dim_model // 2])  # Odd indices

        return pos_embedding

    def forward(self, x):
        """
        Add positional embedding to input tensor.

        Args:
        x (torch.Tensor): Input tensor of shape (batch_size, seq_len, dim_model).

        Returns:
        torch.Tensor: Input tensor with positional embeddings added.

        Raises:
        ValueError: If the sequence length or embedding size of ``x`` differs
        from the values given at construction.
        """
        batch_size, seq_len, dim_model = x.size()
        if seq_len != self.seq_len or dim_model != self.dim_model:
            raise ValueError(
                f"Input shape mismatch: Expected (batch_size, {self.seq_len}, {self.dim_model}), got {x.size()}"
            )
        # .to() is a no-op once the module lives on the same device as x.
        return x + self.positional_embedding.to(x.device).unsqueeze(0)  # Broadcast positional embeddings


class SEBlock(nn.Module):
    """Squeeze-and-Excitation block for channel attention.

    Args:
        channels: Number of channels of the input ``(batch, channels, length)``.
        reduction: Divisor applied to ``channels`` to size the bottleneck.
            The bottleneck always keeps at least one unit, so
            ``channels < reduction`` no longer produces an empty layer.
    """
    def __init__(self, channels, reduction=16):
        super().__init__()
        hidden = max(1, channels // reduction)
        self.global_avg_pool = nn.AdaptiveAvgPool1d(1)
        self.fc1 = nn.Linear(channels, hidden, bias=False)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(hidden, channels, bias=False)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        """Return ``x`` with every channel rescaled by its learned gate in (0, 1)."""
        # Squeeze: Global Average Pooling over the length axis
        b, c, _ = x.size()
        y = self.global_avg_pool(x).view(b, c)

        # Excitation: Fully Connected Layers
        y = self.fc1(y)
        y = self.relu(y)
        y = self.fc2(y)
        y = self.sigmoid(y).view(b, c, 1)

        # Scale: the gate broadcasts over the length axis
        return x * y

class ResNetBlock1d(nn.Module):
    """Residual 1D-convolution block (conv-bn-relu-conv-bn) with a Squeeze-and-Excitation gate."""

    def __init__(self, in_channels, filters, se_reduction=16):
        super().__init__()
        self.filters = filters

        # Both 3-wide convolutions use stride 1 and padding 1, so the length
        # of the sequence is preserved.
        conv_kwargs = dict(kernel_size=3, stride=1, padding=1, bias=False)

        # Main path
        self.conv1 = nn.Conv1d(in_channels=in_channels, out_channels=filters, **conv_kwargs)
        self.bn1 = nn.BatchNorm1d(filters)
        self.conv2 = nn.Conv1d(in_channels=filters, out_channels=filters, **conv_kwargs)
        self.bn2 = nn.BatchNorm1d(filters)

        # Channel attention applied to the main path
        self.se_block = SEBlock(filters, reduction=se_reduction)

        # Shared activation
        self.act = nn.ReLU()

        # The shortcut needs a 1x1 projection only when the channel count changes.
        self.projection = (
            nn.Sequential(
                nn.Conv1d(
                    in_channels=in_channels,
                    out_channels=filters,
                    kernel_size=1,
                    stride=1,
                    bias=False,
                ),
                nn.BatchNorm1d(filters),
            )
            if in_channels != filters
            else None
        )

    def forward(self, x):
        """Return ``act(shortcut(x) + se(main(x)))``."""
        main = self.act(self.bn1(self.conv1(x)))
        main = self.se_block(self.bn2(self.conv2(main)))

        shortcut = x if self.projection is None else self.projection(x)

        return self.act(shortcut + main)


class ResNet1d(nn.Module):
    """1D ResNet + bidirectional GRU features fused by a Transformer encoder.

    The convolutional stack and a two-stage bidirectional GRU both read the
    input sequence; their outputs are concatenated per time step, summed with
    a positionally-encoded linear projection of the raw input and passed
    through a 2-layer Transformer encoder. ``forward`` maps the fused
    sequence to one logit per time step.

    The fused width is derived as ``cnn_channels + 2 * rnn_hidden`` instead
    of being hard-coded, so the branches always agree. For
    ``num_filters=16`` with three stages this gives 576, the previously
    fixed value.

    Args:
        stage_sizes: Number of residual blocks per stage; stage ``i`` uses
            ``num_filters * 2**i`` channels.
        num_filters: Channels of the first convolution / first stage.
        embed: Stored on the instance; not used by the code in this class.
        in_channels: Number of input features per time step (default 398).
        seq_len: Sequence length expected by the positional embedding (default 730).
        rnn_hidden: Hidden size of each GRU direction (default 256).

    Note:
        The Transformer uses ``fusion_dim // 32`` heads (at least 1);
        ``fusion_dim`` must be divisible by that head count.
    """

    def __init__(self, stage_sizes, num_filters=64, embed=True,
                 in_channels=398, seq_len=730, rnn_hidden=256):
        super().__init__()
        self.num_filters = num_filters
        self.stage_sizes = stage_sizes
        self.embed = embed
        self.in_channels = in_channels
        self.seq_len = seq_len
        self.rnn_hidden = rnn_hidden

        # Channels leaving the last non-empty stage (mirrors _make_stages).
        cnn_channels = num_filters
        for i, block_size in enumerate(stage_sizes):
            if block_size > 0:
                cnn_channels = num_filters * (2 ** i)
        # CNN channels concatenated with the bidirectional GRU output.
        fusion_dim = cnn_channels + 2 * rnn_hidden
        self.fusion_dim = fusion_dim

        # Initial layers
        self.conv_init = nn.Conv1d(
            in_channels=in_channels,
            out_channels=num_filters,
            kernel_size=7,
            stride=1,  # Preserve sequence length
            padding=3,
            bias=False
        )
        self.bn_init = nn.BatchNorm1d(num_filters)
        self.act = nn.ReLU()

        # Raw-input projection that carries the positional encoding
        self.project_embedding = nn.Linear(in_channels, fusion_dim)
        self.pos_emb = SinusoidalPositionalEmbedding(seq_len, fusion_dim)
        self.transformer = nn.TransformerEncoder(
            nn.TransformerEncoderLayer(d_model=fusion_dim, nhead=max(1, fusion_dim//32), dim_feedforward=2*fusion_dim,
                dropout=0.2, activation=nn.GELU(), batch_first=True, norm_first=True), 2)

        # self.attn = nn.MultiheadAttention(embed_dim=512, num_heads=8, batch_first=True)
        self.rnn1 = nn.GRU(input_size=in_channels, hidden_size=rnn_hidden, num_layers=1, bidirectional=True, batch_first=True)
        self.rnn2 = nn.GRU(input_size=in_channels, hidden_size=rnn_hidden, num_layers=1, bidirectional=True, batch_first=True)
        # Per-time-step logit head
        self.logit = nn.Conv1d(in_channels=fusion_dim, out_channels=1, kernel_size=7, padding=3, stride=1)
        # Build ResNet stages
        self.stages = self._make_stages()

    def _make_stages(self):
        """Create the residual stages; the channel count doubles from one stage to the next."""
        stages = nn.ModuleList()
        in_channels = self.num_filters

        for i, block_size in enumerate(self.stage_sizes):
            blocks = nn.ModuleList()
            filters = self.num_filters * (2 ** i)
            for _ in range(block_size):
                blocks.append(ResNetBlock1d(in_channels=in_channels, filters=filters))
                in_channels = filters  # Update in_channels for the next block
            stages.append(blocks)

        return stages

    def extract_features(self, x):
        """Return ``(cnn_rnn_features, transformer_features)`` for ``x``.

        Args:
            x: Tensor of shape ``(batch, in_channels, seq_len)``.

        Returns:
            Two tensors of shape ``(batch, seq_len, fusion_dim)``: the
            concatenated CNN/GRU features and the Transformer output.
        """
        # Sequence-first view shared by the projection and both GRUs.
        x_seq = x.permute(0, 2, 1)
        project_precipitation = self.project_embedding(x_seq)

        # Convolutional branch
        out = self.conv_init(x)
        out = self.bn_init(out)
        out = self.act(out)
        for blocks in self.stages:
            for block in blocks:
                out = block(out)

        # Recurrent branch: the first GRU only supplies the initial hidden
        # state of the second one; its sequence output is not used.
        _, hidden = self.rnn1(x_seq)
        rnn_out2, _ = self.rnn2(x_seq, hidden)
        # rnn_out2, _ = self.attn(rnn_out2, rnn_out2, rnn_out2)
        new_out1 = torch.cat([out.permute(0, 2, 1), rnn_out2], dim=2)

        pos_embedding = self.pos_emb(project_precipitation)
        transformer_input = pos_embedding  + new_out1
        transformer_fusion = self.transformer(transformer_input)
        return new_out1, transformer_fusion

    def forward(self, x):
        """Return per-time-step logits of shape ``(batch, 1, seq_len)``."""
        _, x = self.extract_features(x)
        x = self.logit(x.permute(0, 2, 1))
        return x

In [None]:
# Smoke test (only when CFG.debug is set): push one batch from fold 0 through
# a small ResNet1d and print the output shape, the BCE loss and the log loss.
if CFG.debug:

    train_data = data[data['fold'] == 0]

    # Any mode other than 'test' makes the dataset return the real labels.
    dataset = CustomDataset(train_data, augment=True, mode='valid')
    dataloader = DataLoader(dataset,
                            batch_size=CFG.batch_size,
                            shuffle=True,
                            num_workers=CFG.num_workers, pin_memory=True, drop_last=True)

    # drop_last=True: this raises StopIteration when fold 0 holds fewer
    # locations than one full batch.
    batch = next(iter(dataloader))



    model = ResNet1d(stage_sizes=[2, 3, 3], num_filters=16).to(device)

    # Forward pass on a freshly initialised model (default train mode, gradients tracked).
    pred_precipitation = model(batch['precipitation'].to(device))


    criteria = nn.BCEWithLogitsLoss()
    # The model emits a singleton channel axis at dim 1; drop it to match the labels.
    loss = criteria(pred_precipitation.squeeze(1), batch['label'].to(device))
    # contrastive_target = torch.zeros(image_emb.size(0)).to(device)  # Assuming all pairs are not similar

    # assert torch.all(torch.isfinite(image_emb)), "Non-finite values in output1"
    # assert torch.all(torch.isfinite(precipitation_emb)), "Non-finite values in output2"
    # assert torch.all((contrastive_target == 0) | (contrastive_target == 1)), "Labels must be 0 or 1"
    # contrastive_loss = ContrastiveLoss()(image_emb, precipitation_emb, contrastive_target)

    # Log loss over every (sample, time step) pair, computed on sigmoid probabilities.
    metric = log_loss(batch['label'].flatten().detach().cpu().numpy(), pred_precipitation.sigmoid().flatten().detach().cpu().numpy())

    # print("early_fusion: ", early_fusion.shape)
    # print("pred_image: ", pred_image.shape)
    print("pred_precipitation: ", pred_precipitation.shape)
    # print("image_emb: ", image_emb.shape)
    # print("precipitation_emb: ", precipitation_emb.shape)
    print("loss: ", loss.item())
    # print("contrastive loss: ", contrastive_loss.item())
    print("metric: ", metric)

# Adan Optimizer

In [None]:
import math
import torch
from torch.optim.optimizer import Optimizer


class Adan(Optimizer):
    """
    Implements a pytorch variant of Adan
    Adan was proposed in
    Adan: Adaptive Nesterov Momentum Algorithm for Faster Optimizing Deep Models[J]. arXiv preprint arXiv:2208.06677, 2022.
    https://arxiv.org/abs/2208.06677
    Arguments:
        params (iterable): iterable of parameters to optimize or dicts defining parameter groups.
        lr (float, optional): learning rate. (default: 1e-3)
        betas (Tuple[float, float, float], optional): coefficients used for computing
            running averages of gradient and its norm. (default: (0.98, 0.92, 0.99))
        eps (float, optional): term added to the denominator to improve
            numerical stability. (default: 1e-8)
        weight_decay (float, optional): decoupled weight decay (L2 penalty) (default: 0.2)
        max_grad_norm (float, optional): value used to clip
            global grad norm (default: 0.0 no clip)
        no_prox (bool): how to perform the decoupled weight decay (default: False)
    Raises:
        ValueError: if ``max_grad_norm``, ``lr`` or ``eps`` is negative, or a
            beta lies outside ``[0, 1)``.
    """

    def __init__(self, params, lr=1e-3, betas=(0.98, 0.92, 0.99), eps=1e-8,
                 weight_decay=0.2, max_grad_norm=0.0, no_prox=False):
        if not 0.0 <= max_grad_norm:
            raise ValueError("Invalid Max grad norm: {}".format(max_grad_norm))
        if not 0.0 <= lr:
            raise ValueError("Invalid learning rate: {}".format(lr))
        if not 0.0 <= eps:
            raise ValueError("Invalid epsilon value: {}".format(eps))
        if not 0.0 <= betas[0] < 1.0:
            raise ValueError("Invalid beta parameter at index 0: {}".format(betas[0]))
        if not 0.0 <= betas[1] < 1.0:
            raise ValueError("Invalid beta parameter at index 1: {}".format(betas[1]))
        if not 0.0 <= betas[2] < 1.0:
            raise ValueError("Invalid beta parameter at index 2: {}".format(betas[2]))
        defaults = dict(lr=lr, betas=betas, eps=eps,
                        weight_decay=weight_decay,
                        max_grad_norm=max_grad_norm, no_prox=no_prox)
        super(Adan, self).__init__(params, defaults)

    def __setstate__(self, state):
        super(Adan, self).__setstate__(state)
        # Groups restored from a state saved without 'no_prox' get the default.
        for group in self.param_groups:
            group.setdefault('no_prox', False)

    @torch.no_grad()
    def restart_opt(self):
        """Reset the step counter and zero all moving averages."""
        for group in self.param_groups:
            group['step'] = 0
            for p in group['params']:
                if p.requires_grad:
                    state = self.state[p]
                    # State initialization

                    # Exponential moving average of gradient values
                    state['exp_avg'] = torch.zeros_like(p)
                    # Exponential moving average of squared gradient values
                    state['exp_avg_sq'] = torch.zeros_like(p)
                    # Exponential moving average of gradient difference
                    state['exp_avg_diff'] = torch.zeros_like(p)

    @torch.no_grad()
    def step(self, closure=None):
        """
            Performs a single optimization step.

            Arguments:
                closure (callable, optional): re-evaluates the model and
                    returns the loss; it is run with gradients enabled.
            Returns:
                The value returned by ``closure``, or ``None`` without one.
        """
        loss = None
        if closure is not None:
            # step() itself runs under no_grad; the closure needs autograd.
            with torch.enable_grad():
                loss = closure()

        if self.defaults['max_grad_norm'] > 0:
            device = self.param_groups[0]['params'][0].device
            global_grad_norm = torch.zeros(1, device=device)

            max_grad_norm = torch.tensor(self.defaults['max_grad_norm'], device=device)
            # Squared L2 norm accumulated over every gradient of every group.
            for group in self.param_groups:

                for p in group['params']:
                    if p.grad is not None:
                        grad = p.grad
                        global_grad_norm.add_(grad.pow(2).sum())

            global_grad_norm = torch.sqrt(global_grad_norm)

            # eps of the last parameter group, i.e. the value the loop
            # variable held once the loop above had finished.
            clip_eps = self.param_groups[-1]['eps']
            clip_global_grad_norm = torch.clamp(max_grad_norm / (global_grad_norm + clip_eps), max=1.0)
        else:
            clip_global_grad_norm = 1.0

        for group in self.param_groups:
            beta1, beta2, beta3 = group['betas']
            # assume same step across group now to simplify things
            # per parameter step can be easily support by making it tensor, or pass list into kernel
            if 'step' in group:
                group['step'] += 1
            else:
                group['step'] = 1

            bias_correction1 = 1.0 - beta1 ** group['step']

            bias_correction2 = 1.0 - beta2 ** group['step']

            bias_correction3 = 1.0 - beta3 ** group['step']

            for p in group['params']:
                if p.grad is None:
                    continue

                state = self.state[p]
                if len(state) == 0:
                    state['exp_avg'] = torch.zeros_like(p)
                    state['exp_avg_sq'] = torch.zeros_like(p)
                    state['exp_avg_diff'] = torch.zeros_like(p)

                # Clipping rescales p.grad in place.
                grad = p.grad.mul_(clip_global_grad_norm)
                # On the first step (or after restart_opt) the previous
                # gradient is the current one, so the difference term is zero.
                if 'pre_grad' not in state or group['step'] == 1:
                    state['pre_grad'] = grad

                # Snapshot kept as "previous gradient" for the next step.
                copy_grad = grad.clone()

                exp_avg, exp_avg_sq, exp_avg_diff = state['exp_avg'], state['exp_avg_sq'], state['exp_avg_diff']
                diff = grad - state['pre_grad']

                update = grad + beta2 * diff
                exp_avg.mul_(beta1).add_(grad, alpha=1 - beta1)  # m_t
                exp_avg_diff.mul_(beta2).add_(diff, alpha=1 - beta2)  # diff_t
                exp_avg_sq.mul_(beta3).addcmul_(update, update, value=1 - beta3)  # n_t

                denom = ((exp_avg_sq).sqrt() / math.sqrt(bias_correction3)).add_(group['eps'])
                update = ((exp_avg / bias_correction1 + beta2 * exp_avg_diff / bias_correction2)).div_(denom)

                if group['no_prox']:
                    # Decay the weights first, then apply the update.
                    p.data.mul_(1 - group['lr'] * group['weight_decay'])
                    p.add_(update, alpha=-group['lr'])
                else:
                    # Apply the update, then shrink by 1 / (1 + lr * weight_decay).
                    p.add_(update, alpha=-group['lr'])
                    p.data.div_(1 + group['lr'] * group['weight_decay'])

                state['pre_grad'] = copy_grad

        return loss

# Helper Functions

In [None]:
# ====================================================
# Helper functions
# ====================================================
class AverageMeter(object):
    """Keeps the most recent value together with a count-weighted running mean."""
    def __init__(self):
        self.reset()

    def reset(self):
        """Clear the latest value, the mean, the weighted sum and the count."""
        self.val = self.avg = self.sum = self.count = 0

    def update(self, val, n=1):
        """Record ``val`` as observed ``n`` times and refresh the mean."""
        self.val = val
        self.count += n
        self.sum += val * n
        self.avg = self.sum / self.count


def asMinutes(s):
    """Format a duration in seconds as ``'<minutes>m <seconds>s'`` (fractions are truncated)."""
    minutes = math.floor(s / 60)
    seconds = s - minutes * 60
    return '%dm %ds' % (minutes, seconds)


def timeSince(since, percent):
    """Return ``'<elapsed> (remain <estimate>)'`` for a task started at ``since``.

    ``percent`` is the fraction of the work already done; the total duration
    is extrapolated as ``elapsed / percent``.
    """
    elapsed = time.time() - since
    remaining = elapsed / (percent) - elapsed
    return '%s (remain %s)' % (asMinutes(elapsed), asMinutes(remaining))


def train_fn(fold, train_loader, model, criterion, optimizer, epoch, scheduler, device):
    """Train ``model`` for one epoch and return the average training loss.

    Mixed precision (autocast + GradScaler) is active when ``CFG.apex`` is
    set. Gradients are accumulated over ``CFG.gradient_accumulation_steps``
    batches; at each optimiser step they are first unscaled and then clipped
    to ``CFG.max_grad_norm``, so the threshold and the reported norm refer to
    the true gradients rather than to the loss-scaled ones.

    Args:
        fold: Fold id, used only to label the W&B metrics.
        train_loader: Yields dicts with ``'precipitation'`` and ``'label'`` tensors.
        model: Network producing logits with a singleton axis at dim 1.
        criterion: Loss called as ``criterion(logits, labels)``.
        optimizer: Optimiser stepped through the gradient scaler.
        epoch: Zero-based epoch index (printed as ``epoch + 1``).
        scheduler: LR scheduler, stepped per optimiser step when ``CFG.batch_scheduler``.
        device: Device the batch tensors are moved to.

    Returns:
        Batch-size-weighted mean of the (accumulation-normalised) loss.
    """
    model.train()
    scaler = torch.amp.GradScaler('cuda', enabled=CFG.apex)
    losses = AverageMeter()
    start = time.time()
    # Shown in the progress line until the first optimiser step has run.
    grad_norm = 0.0
    for step, batch in enumerate(train_loader):
        precipitation = batch['precipitation'].to(device)
        labels = batch['label'].to(device)
        batch_size = labels.size(0)
        with torch.amp.autocast('cuda', enabled=CFG.apex):

            pred_precipitation = model(precipitation)

            #loss = criterion(F.log_softmax(y_preds, dim=1), labels)
            loss = criterion(pred_precipitation.squeeze(1), labels)
        if CFG.gradient_accumulation_steps > 1:
            loss = loss / CFG.gradient_accumulation_steps
        losses.update(loss.item(), batch_size)
        scaler.scale(loss).backward()
        if (step + 1) % CFG.gradient_accumulation_steps == 0:
            # Gradients still carry the loss scale here; unscale them once per
            # optimiser step before measuring / clipping their norm.
            scaler.unscale_(optimizer)
            grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), CFG.max_grad_norm)
            scaler.step(optimizer)
            scaler.update()
            optimizer.zero_grad()
            if CFG.batch_scheduler:
                scheduler.step()
        # The optimiser holds the learning rate currently in effect, whatever
        # scheduler type is used.
        current_lr = optimizer.param_groups[0]['lr']
        if step % CFG.print_freq == 0 or step == (len(train_loader)-1):
            print('Epoch: [{0}][{1}/{2}] '
                  'Elapsed {remain:s} '
                  'Loss: {loss.val:.4f}({loss.avg:.4f}) '
                  'Grad: {grad_norm:.4f}  '
                  'LR: {lr:.8f}  '
                  .format(epoch+1, step, len(train_loader),
                          remain=timeSince(start, float(step+1)/len(train_loader)),
                          loss=losses,
                          grad_norm=grad_norm,
                          lr=current_lr))
        if CFG.wandb:
            wandb.log({f"[fold{fold}] loss": losses.val,
                       f"[fold{fold}] lr": current_lr})
    return losses.avg



def valid_fn(valid_loader, model, criterion, device):
    """Evaluate the model on a validation loader.

    Args:
        valid_loader: Iterable of batches. Each batch is indexed with the keys
            'precipitation' (model input) and 'label' (target).
        model: Network called as ``model(precipitation)``; put in eval mode.
        criterion: Loss callable invoked as ``criterion(logits, labels)``
            where the logits are the model output squeezed on dim 1.
        device: Device the batch tensors are moved to.

    Returns:
        A tuple ``(avg_loss, predictions, targets)``:
        ``avg_loss`` is ``losses.avg`` from the ``AverageMeter``;
        ``predictions`` is a 1-D NumPy array of sigmoid-activated outputs,
        concatenated over all batches; ``targets`` is the matching 1-D array
        of flattened labels.
    """
    losses = AverageMeter()
    model.eval()
    preds = []
    targets = []
    start = time.time()
    num_batches = len(valid_loader)
    for step, batch in enumerate(valid_loader):
        precipitation = batch['precipitation'].to(device)
        labels = batch['label'].to(device)
        batch_size = labels.size(0)
        with torch.no_grad():
            pred_precipitation = model(precipitation)
            loss = criterion(pred_precipitation.squeeze(1), labels)
        # No gradient accumulation happens during evaluation, so the loss is
        # recorded as-is instead of being divided by the accumulation factor.
        losses.update(loss.item(), batch_size)
        preds.append(pred_precipitation.sigmoid().flatten().detach().cpu().numpy())
        targets.append(labels.flatten().detach().cpu().numpy())
        if step % CFG.print_freq == 0 or step == (num_batches - 1):
            print('EVAL: [{0}/{1}] '
                  'Elapsed {remain:s} '
                  'Loss: {loss.val:.4f}({loss.avg:.4f}) '
                  .format(step, num_batches,
                          loss=losses,
                          remain=timeSince(start, float(step+1)/num_batches)))
    predictions = np.concatenate(preds)
    targets = np.concatenate(targets)
    return losses.avg, predictions, targets

# Train Loop

In [None]:
# ====================================================
# train loop
# ====================================================
def train_loop(folds, fold):
    """Train and validate the model for a single cross-validation fold.

    Rows of ``folds`` whose 'fold' column equals ``fold`` form the validation
    split; all other rows form the training split. After every epoch the
    validation score from ``get_score`` is compared with the best so far
    (lower is better) and, on improvement, the model weights and validation
    predictions are saved under ``OUTPUT_DIR``.

    Args:
        folds: DataFrame with a 'fold' column, consumable by ``CustomDataset``.
        fold: Value of the 'fold' column to hold out for validation.

    Returns:
        A tuple ``(valid_folds, best_score)`` where ``valid_folds`` is the
        validation DataFrame with the best epoch's predictions stored in the
        'resnet1d_oof_preds' column, and ``best_score`` is that epoch's score.

    Raises:
        ValueError: If ``CFG.optimizer`` or ``CFG.scheduler`` is not one of
            the supported names.
        RuntimeError: If no epoch produced a score below infinity (for
            example ``CFG.epochs`` is 0 or every score is NaN), so there are
            no best predictions to return.
    """

    LOGGER.info(f"========== fold: {fold} training ==========")

    # ====================================================
    # loader
    # ====================================================
    train_folds = folds[folds['fold'] != fold].reset_index(drop=True)
    valid_folds = folds[folds['fold'] == fold].reset_index(drop=True)

    train_dataset = CustomDataset(train_folds, augment=True, mode='train')
    valid_dataset = CustomDataset(valid_folds, augment=False, mode="train")

    train_loader = DataLoader(train_dataset,
                              batch_size=CFG.batch_size,
                              shuffle=True,
                              num_workers=CFG.num_workers, pin_memory=True, drop_last=True)
    valid_loader = DataLoader(valid_dataset,
                              batch_size=CFG.batch_size,
                              shuffle=False,
                              num_workers=CFG.num_workers, pin_memory=True, drop_last=False)

    # ====================================================
    # model & optimizer
    # ====================================================
    model = ResNet1d(stage_sizes=[2, 3, 3], num_filters=16).to(device)
    if CFG.t4_gpu:
        model = nn.DataParallel(model)

    def build_optimizer(cfg, optimizer_parameters, device):
        """Instantiate the optimizer named by ``cfg.optimizer``.

        Only the "AdamW" and "Adan" branches use ``optimizer_parameters``;
        every other branch optimizes ``model.parameters()`` directly.
        ``device`` is accepted for signature compatibility and is not used.
        """
        lr = cfg.lr
        if cfg.optimizer == "SAM":
            base_optimizer = torch.optim.SGD  # define an optimizer for the "sharpness-aware" update
            optimizer_model = SAM(model.parameters(), base_optimizer, lr=lr, momentum=0.9, weight_decay=cfg.weight_decay, adaptive=True)
        elif cfg.optimizer == "Ranger21":
            optimizer_model = Ranger21(model.parameters(), lr=lr, weight_decay=cfg.weight_decay,
                                       num_epochs=cfg.epochs, num_batches_per_epoch=len(train_loader))
        elif cfg.optimizer == "SGD":
            optimizer_model = torch.optim.SGD(model.parameters(), lr=lr, weight_decay=cfg.weight_decay, momentum=0.9)
        elif cfg.optimizer == "Adam":
            optimizer_model = Adam(model.parameters(), lr=lr, weight_decay=cfg.weight_decay)
        elif cfg.optimizer == "AdamW":
            optimizer_model = AdamW(optimizer_parameters, lr=lr)
        elif cfg.optimizer == "Lion":
            optimizer_model = Lion(model.parameters(), lr=lr, weight_decay=cfg.weight_decay)
        elif cfg.optimizer == "Adan":
            optimizer_model = Adan(optimizer_parameters, lr=lr)
        else:
            # Fail with a clear message instead of an UnboundLocalError.
            raise ValueError(f"Unsupported optimizer: {cfg.optimizer!r}")

        return optimizer_model

    def get_optimizer_params(model, lr, weight_decay=0.0):
        """Split parameters into a weight-decayed group and a no-decay group.

        Parameters whose name contains "bias", "LayerNorm.bias" or
        "LayerNorm.weight" get ``weight_decay=0.0``; the rest get
        ``weight_decay``. Both groups use learning rate ``lr``.
        """
        no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
        decay_params = []
        no_decay_params = []
        for name, param in model.named_parameters():
            if any(nd in name for nd in no_decay):
                no_decay_params.append(param)
            else:
                decay_params.append(param)

        return [
            {'params': decay_params, 'lr': lr, 'weight_decay': weight_decay},
            {'params': no_decay_params, 'lr': lr, 'weight_decay': 0.0},
        ]

    optimizer_parameters = get_optimizer_params(model,
                                                CFG.lr,
                                                weight_decay=CFG.weight_decay)

    optimizer = build_optimizer(CFG, optimizer_parameters, device)

    # ====================================================
    # scheduler
    # ====================================================
    def get_scheduler(optimizer):
        """Instantiate the LR scheduler named by ``CFG.scheduler``."""
        if CFG.scheduler == 'ReduceLROnPlateau':
            scheduler = ReduceLROnPlateau(optimizer, **CFG.reduce_params)
        elif CFG.scheduler == 'CosineAnnealingLR':
            scheduler = CosineAnnealingLR(optimizer, **CFG.cosanneal_params)
        elif CFG.scheduler == 'CosineAnnealingWarmRestarts':
            scheduler = CosineAnnealingWarmRestarts(optimizer, **CFG.cosanneal_res_params)
        elif CFG.scheduler == 'OneCycleLR':
            scheduler = OneCycleLR(optimizer=optimizer, epochs=CFG.epochs, anneal_strategy="cos",
                                   pct_start=0.05, steps_per_epoch=len(train_loader),
                                   max_lr=CFG.lr, final_div_factor=100)
        else:
            # Fail with a clear message instead of an UnboundLocalError.
            raise ValueError(f"Unsupported scheduler: {CFG.scheduler!r}")
        return scheduler

    scheduler = get_scheduler(optimizer)

    # ====================================================
    # loop
    # ====================================================
    criterion = nn.BCEWithLogitsLoss()

    checkpoint_path = OUTPUT_DIR+f"{CFG.model_name}_fold{fold}_best_version{VERSION}.pth"
    best_score = np.inf
    # Kept in memory so the OOF column never depends on re-reading the
    # checkpoint: torch.load with weights_only=True rejects the pickled NumPy
    # array, and a checkpoint left by an earlier run could be picked up.
    best_predictions = None

    for epoch in range(CFG.epochs):

        start_time = time.time()

        # train
        avg_loss = train_fn(fold, train_loader, model, criterion, optimizer, epoch, scheduler, device)

        # eval
        avg_val_loss, predictions, targets = valid_fn(valid_loader, model, criterion, device)
        score = get_score(predictions, targets)

        # train_fn only steps the scheduler when CFG.batch_scheduler is set;
        # otherwise step once per epoch here so the schedule is applied.
        if not CFG.batch_scheduler:
            if isinstance(scheduler, ReduceLROnPlateau):
                scheduler.step(avg_val_loss)
            else:
                scheduler.step()

        elapsed = time.time() - start_time

        LOGGER.info(f'Epoch {epoch+1} - avg_train_loss: {avg_loss:.4f}  avg_val_loss: {avg_val_loss:.4f}  logloss: {score}  time: {elapsed:.0f}s')
        if CFG.wandb:
            wandb.log({f"[fold{fold}] epoch": epoch+1,
                       f"[fold{fold}] avg_train_loss": avg_loss,
                       f"[fold{fold}] avg_val_loss": avg_val_loss,
                       f"[fold{fold}] score": score})

        if best_score > score:
            best_score = score
            best_predictions = predictions
            LOGGER.info(f'Epoch {epoch+1} - Save Best valid logloss: {score:.7f} Model')
            # CPMP: save the original model. It is stored as the module attribute of the DP model.
            state_dict = model.module.state_dict() if CFG.t4_gpu else model.state_dict()
            torch.save({'model': state_dict,
                        'predictions': predictions},
                       checkpoint_path)

    if best_predictions is None:
        raise RuntimeError(
            f"fold {fold}: no epoch produced a valid score, so there are no "
            "out-of-fold predictions to return"
        )

    valid_folds["resnet1d_oof_preds"] = best_predictions
    gc.collect()
    torch.cuda.empty_cache()

    return valid_folds, best_score

In [None]:
if __name__ == '__main__':

    if CFG.train:
        if CFG.wandb:
            run = wandb.init(project='Inundata competition',
                         name=CFG.model_name + f'version{VERSION}',
                         config=class2dict(CFG),
                         group=CFG.model_name,
                         job_type="train",
                         )

        # Collect the per-fold out-of-fold frames and concatenate once at the
        # end instead of re-copying a growing DataFrame on every fold.
        oof_frames = []
        scores = []
        for fold in range(CFG.n_fold):
            # Only the folds listed in CFG.trn_fold are trained.
            if fold not in CFG.trn_fold:
                continue
            _oof_df, score = train_loop(data, fold)
            oof_frames.append(_oof_df)
            scores.append(score)
            LOGGER.info(f"========== fold: {fold} result ==========")
            LOGGER.info(f'Score with best logloss weights: {score}')

        # An empty frame is kept when no fold was trained, matching the
        # previous behavior of starting from pd.DataFrame().
        oof_df = pd.concat(oof_frames).reset_index(drop=True) if oof_frames else pd.DataFrame()
        # Mean of the per-fold best scores; NaN when no fold was trained.
        cv_score = np.mean(scores) if scores else float('nan')
        LOGGER.info("========== CV ==========")
        LOGGER.info(f'Score with best logloss weights: {cv_score}')
        oof_df.to_csv(OUTPUT_DIR+f'{CFG.model_name}_oof_df_version{VERSION}.csv', index=False)

    if CFG.wandb:
        wandb.finish()

### Normalizing the predictions based on the Flood Probability

In [None]:
from sklearn.metrics import log_loss

# Baseline: log loss of the raw out-of-fold predictions.
print(f"logloss before normalizing: {log_loss(oof_df['label'], oof_df['resnet1d_oof_preds'])}")

# Locations with at least one row whose flood_probability is >= 0.5.
locations_to_normalize = oof_df.loc[oof_df['flood_probability'] >= 0.5, 'location_id'].unique()

# Sum of the raw predictions over all rows of the same location.
oof_df['oof_sum_prob'] = oof_df.groupby('location_id')['resnet1d_oof_preds'].transform('sum')

# Small constant added to the denominator to avoid division by zero.
epsilon = 1e-8

# Row mask for the selected locations, built once and reused below.
normalize_mask = oof_df['location_id'].isin(locations_to_normalize)

# Start from the raw predictions; rows outside the mask keep these values.
oof_df['oof_resnet1d_norm'] = oof_df['resnet1d_oof_preds']

# Inside the mask, divide each prediction by its location's total.
selected_preds = oof_df.loc[normalize_mask, 'resnet1d_oof_preds']
selected_totals = oof_df.loc[normalize_mask, 'oof_sum_prob'] + epsilon
oof_df.loc[normalize_mask, 'oof_resnet1d_norm'] = selected_preds / selected_totals

print(f"logloss after normalizing: {log_loss(oof_df['label'], oof_df['oof_resnet1d_norm'])}")