In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import DataLoader, TensorDataset, random_split
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import log_loss
import numpy as np
import os
import pandas as pd
from tqdm import tqdm
from sklearn.model_selection import StratifiedKFold, StratifiedGroupKFold
from sklearn.metrics import log_loss, roc_auc_score
import gc

In [None]:
!pip install fastai -q

In [None]:
import warnings
# Suppress every warning category for the rest of the session.
# NOTE(review): this also hides pandas/fastai deprecation and performance warnings.
warnings.filterwarnings("ignore")

In [None]:
from fastai.tabular.all import *

In [None]:
# Input locations of the competition tables and of the extra per-location results.
base_path = "/kaggle/input/final-deepmind-comp-dataset/final_deepmind_comp_dataset/zindi_data/"
additional_path = "/kaggle/input/final-deepmind-comp-dataset/final_deepmind_comp_dataset/image_classifier_results/"

# Competition tables, sample submission and the image archive.
train = pd.read_csv(base_path + "Train.csv")
test = pd.read_csv(base_path + "Test.csv")
submission = pd.read_csv(base_path + "SampleSubmission.csv")
images = np.load(base_path + "composite_images.npz")

# Only the join key and the flood probability are kept from the extra result files.
train_with_cv_results = pd.read_csv(
    additional_path + "train_with_cv_results.csv"
).loc[:, ['location_id', 'flood_probability']]
test_with_cv_results = pd.read_csv(
    additional_path + "test_with_cv_results.csv"
).loc[:, ['location_id', 'flood_probability']]

display(
    train.head(), train.shape,
    train_with_cv_results.head(), train_with_cv_results.shape,
    test.head(), test.shape,
)


In [None]:
def get_location(value):
  """Return the first two underscore-separated tokens of ``value`` joined by '_'.

  Raises IndexError when ``value`` contains no underscore.
  """
  tokens = value.split("_")
  return tokens[0] + '_' + tokens[1]

def get_event_id(value):
  """Return the fourth underscore-separated token of ``value``.

  Raises IndexError when ``value`` has fewer than four tokens.
  """
  tokens = value.split("_")
  return tokens[3]
for df in (train, test):
  # location_id: the first two '_'-separated tokens of event_id.
  df['location_id'] = df['event_id'].map(lambda event: '_'.join(event.split('_')[:2]))

  by_location = df.groupby('location_id', sort=False)
  # event_idx: running number of the location, in order of first appearance.
  df['event_idx'] = by_location.ngroup()
  # event_t: position of the row within its location (0-based, in file order).
  df['event_t'] = by_location.cumcount()

# Number of distinct locations per split, then how many are shared by both.
print(len(set(train['location_id'])), len(set(test['location_id'])))
print(len(set(train['location_id']) & set(test['location_id'])))
display(train.head(), test.head())

In [None]:
# Attach the per-location flood probability to every row; the left join keeps
# all rows of train/test even when a location has no match.
train_df = train.merge(train_with_cv_results, how='left', on='location_id')
test_df = test.merge(test_with_cv_results, how='left', on='location_id')

display(
    train_df.head(), train_df.shape,
    test_df.head(), test_df.shape,
)

In [None]:
# NOTE(review): `seed` is not passed to the splitter below, and `selected_columns`
# is reassigned further down the notebook.
seed = 2024
selected_columns = []
n_splits = 10
# Stratified on the label, grouped by location so that all rows of one location
# fall into the same validation fold.
gkf = StratifiedGroupKFold(n_splits = n_splits)

train_df['fold'] = -1
for fold, (_, val_idx) in enumerate(gkf.split(train_df, train_df['label'], groups = train_df['location_id'])):
    # split() yields positional row indices, so write through .iloc rather than
    # label-based .loc; this stays correct even if the index is not 0..n-1.
    train_df.iloc[val_idx, train_df.columns.get_loc('fold')] = fold

# Every row must have been placed in exactly one validation fold.
assert (train_df['fold'] >= 0).all(), "some rows were not assigned to a fold"

train_df['fold'].value_counts()

In [None]:
from types import new_class
def apply_expanding_combinations(df, group_cols_list, target_col='Sales', shift_periods=[1], min_periods=1, stats=['mean', 'std']):
    """Add shifted expanding-window statistics of ``target_col`` per group.

    For every grouping in ``group_cols_list``, every shift in ``shift_periods``
    and every statistic in ``stats`` a column named
    ``expanding_grouped_<keys>_<target_col>_shift_<shift>_<stat>`` is written
    into ``df``.

    Args:
        df: frame to extend; it is modified in place.
        group_cols_list: iterable of column-name lists, one per grouping.
        target_col: column the statistics are computed on.
        shift_periods: shifts applied inside each group before expanding.
        min_periods: minimum number of observations for the expanding window.
        stats: aggregation names handed to ``Expanding.agg``.

    Returns:
        The same ``df`` object, with the new columns added.
    """
    def _shifted_expanding(series, shift, stat_name):
        # Shift first so that, for a positive shift, the row's own value is excluded.
        return series.shift(shift).expanding(min_periods=min_periods).agg(stat_name)

    for keys in group_cols_list:
        prefix = 'expanding_grouped_' + '_'.join(keys) + f'_{target_col}'
        pairs = [(shift, stat_name) for shift in shift_periods for stat_name in stats]
        for shift, stat_name in pairs:
            feature = f'{prefix}_shift_{shift}_{stat_name}'
            df[feature] = df.groupby(keys)[target_col].transform(
                lambda values: _shifted_expanding(values, shift, stat_name)
            )

    return df

def smoothen_target(df, group_cols, target_col, n_std=10):
  """Replace per-group outliers of ``target_col`` with the group mean.

  A value counts as an outlier when it exceeds ``mean + n_std * std`` of its
  group. Groups whose standard deviation is NaN (e.g. a single row) get a NaN
  threshold, so their values are left untouched.

  Args:
      df: input frame; it is not modified.
      group_cols: column name or list of column names defining the groups.
      target_col: column to smooth.
      n_std: number of standard deviations above the mean that defines the
          outlier threshold.

  Returns:
      A new frame with ``target_col`` smoothed and two helper columns added,
      ``<target_col>_outlier`` (the threshold) and ``<target_col>_mean``.
  """
  outlier_col = f'{target_col}_outlier'
  mean_col = f'{target_col}_mean'

  grouped = df.groupby(group_cols, as_index=False)
  df_id_outlier = grouped.agg({
      target_col: lambda x: x.mean() + n_std*x.std()
  }).rename(columns={target_col: outlier_col})
  df_id_mean = grouped.agg({
      target_col: 'mean'
  }).rename(columns={target_col: mean_col})

  # Join on the full grouping key. Joining on group_cols[0] alone duplicates
  # rows and splits the remaining key columns into _x/_y variants as soon as
  # more than one grouping column is used.
  df = df.merge(df_id_outlier, on=group_cols, how='left')
  df = df.merge(df_id_mean, on=group_cols, how='left')

  df[target_col] = np.where(
      df[target_col] > df[outlier_col],
      df[mean_col],
      df[target_col]
  )

  return df


def create_rolling_features(data, group_cols, target_col, windows, shift_period, min_period, statistics):
    """Add per-group rolling statistics of ``target_col`` to ``data``.

    Within each group the series is shifted by ``shift_period`` and a rolling
    window is applied. One column is written per (window, statistic) pair,
    named ``rolling_previous_grouped_<target_col>_<stat>_<window>_<shift_period>``;
    for ``quantile_<q>`` statistics the ``<stat>`` part is ``<q>th``.

    Args:
        data: frame to extend; it is modified in place.
        group_cols: column(s) to group by.
        target_col: column the statistics are computed on.
        windows: iterable of rolling window sizes.
        shift_period: shift applied inside each group before rolling.
        min_period: ``min_periods`` of the rolling window.
        statistics: names among mean, median, std, min, max, skew, sum,
            ``quantile`` (fixed at 0.95) or ``quantile_<q>``.

    Returns:
        The same ``data`` object, with the new columns added.

    Raises:
        ValueError: if a statistic name is not recognised.
    """
    plain_reducers = ('mean', 'median', 'std', 'min', 'max', 'skew', 'sum')

    def _rolling_stat(series, window, stat):
        rolled = series.shift(shift_period).rolling(window=window, min_periods=min_period)
        if stat in plain_reducers:
            # These names map one-to-one onto methods of the Rolling object.
            return getattr(rolled, stat)()
        if stat == 'quantile':
            return rolled.quantile(0.95)
        if stat.startswith('quantile_'):
            return rolled.quantile(float(stat.split('_')[1]))
        raise ValueError(f"Unknown statistic: {stat}")

    for window in windows:
        for stat in statistics:
            if stat.startswith('quantile_'):
                stat_name = f"{stat.split('_')[1]}th"
            else:
                stat_name = stat
            col_name = f'rolling_previous_grouped_{target_col}_{stat_name}_{window}_{shift_period}'

            data[col_name] = data.groupby(group_cols)[target_col].transform(
                lambda values: _rolling_stat(values, window, stat)
            )

    return data



def custom_agg(x):
    """Return the spread of ``x``: its maximum minus its minimum."""
    highest, lowest = x.max(), x.min()
    return highest - lowest

def get_date_features(df):
  """Derive simulated calendar columns from the integer ``event_t`` column.

  The calendar is an approximation built from 365-day years, 30-day months and
  7-day weeks. Consequently ``month`` ranges over 1..13 and ``quarter`` over
  1..5 (days 360-364 of a year fall into a 13th month), and ``day_of_month``
  is taken from ``event_t`` directly rather than from the day of the year.

  Args:
      df: frame with an ``event_t`` column; it is modified in place.

  Returns:
      The same ``df`` object with ``year``, ``month``, ``week_of_year``,
      ``day_of_month``, ``day_of_week`` and ``quarter`` added.
  """
  t = df['event_t']
  day_of_year = t % 365

  df['year'] = t // 365 + 1
  df['month'] = day_of_year // 30 + 1
  df['week_of_year'] = day_of_year // 7 + 1
  df['day_of_month'] = t % 30 + 1
  df['day_of_week'] = t % 7
  df['quarter'] = (df['month'] - 1) // 3 + 1
  return df




def feature_engineering(train, test):
  """Build precipitation time-series features on the stacked train/test rows.

  Train and test are stacked and sorted by (location_id, event_t) so that the
  per-location windows see the complete series. The following columns are added:

  * ``event_binary``: 1 when 296 <= event_t <= 435, else 0.
  * rolling means of precipitation per location (via create_rolling_features,
    shift 0) for the windows listed below.
  * ``precipitation_shift_<k>`` / ``precipitation_next_shift_<k>`` for
    k = 1..364: the value k steps earlier / later within the same location.
  * ``precipitation_ewm_grouped_mean_7``: per-location exponentially weighted
    mean (span 7, adjust=False).

  smoothen_target, apply_expanding_combinations and get_date_features are not
  applied here; only the unshifted rolling mean is generated.

  Args:
      train: labelled rows (``label`` not null).
      test: unlabelled rows; ``label`` is NaN after stacking.

  Returns:
      (train, test): the engineered rows split back on ``label`` being
      non-null / null, each with a fresh 0..n-1 index and ordered by
      (location_id, event_t).
  """
  # The inputs may share index labels. A unique index keeps every column
  # assignment below aligned by construction, not by coincidence of row order.
  data = pd.concat([train, test], ignore_index=True)
  data = data.sort_values(by=['location_id', 'event_t']).reset_index(drop=True)
  data['event_t'] = data['event_t'].astype(int)

  data['event_binary'] = data['event_t'].between(296, 435).astype('int64')

  group_cols = ['location_id']
  statistics = ['mean']
  min_period = 1
  shift_period = 0
  windows = [3, 4, 10, 20, 25, 30, 55, 60, 75, 296]
  data = create_rolling_features(data, group_cols, 'precipitation', windows, shift_period, min_period, statistics)

  for col in ['precipitation']:
    by_location = data.groupby('location_id')[col]

    # Build all lag/lead columns first and attach them with a single concat.
    # Inserting 728 columns one at a time fragments the frame.
    shifted = {}
    for shift in range(1, 365):
      shifted[f'{col}_shift_{shift}'] = by_location.shift(shift)
      shifted[f'{col}_next_shift_{shift}'] = by_location.shift(-shift)
    data = pd.concat([data, pd.DataFrame(shifted, index=data.index)], axis=1)
    del shifted

    for span in [7]:
      # transform() returns a result on the original row index, so no manual
      # index reset is needed to line it up with `data`.
      data[f'{col}_ewm_grouped_mean_{span}'] = data.groupby('location_id')[col].transform(
          lambda s: s.ewm(span=span, adjust=False).mean()
      )

  train = data[data['label'].notna()].reset_index(drop = True)
  test = data[data['label'].isna()].reset_index(drop = True)

  return train, test

# Build the engineered train/test frames from the merged inputs and preview them.
new_train, new_test = feature_engineering(train_df, test_df)
display(new_train.head(), new_train.shape, new_test.head(), new_test.shape)

In [None]:
# Class balance of the training labels after feature engineering.
new_train['label'].value_counts()

In [None]:
# Label distribution inside each validation fold, one block per fold.
for i in range(n_splits):
  print(new_train.loc[new_train['fold'] == i, 'label'].value_counts())
  print("-" * 100)

In [None]:
# Base inputs, followed by every engineered column whose name contains one of
# the substrings 'diff', 'shift' or 'grouped' (in frame column order).
selected_columns = ['precipitation', 'flood_probability', 'event_binary', 'event_t']
selected_columns += [
    col for col in new_train
    if any(tag in col for tag in ('diff', 'shift', 'grouped'))
]
target_col = 'label'

In [None]:
# Number of model input features.
len(selected_columns)

In [None]:
import random
def random_seed(seed_value, use_cuda):
    """Seed the NumPy, PyTorch and Python random number generators.

    Args:
        seed_value: seed handed to every generator.
        use_cuda: when true, also seed the CUDA generators and switch cuDNN to
            deterministic mode with benchmarking disabled.

    Remember to use num_workers=0 when creating the DataBunch.
    """
    np.random.seed(seed_value)      # NumPy
    torch.manual_seed(seed_value)   # PyTorch
    random.seed(seed_value)         # Python stdlib

    if not use_cuda:
        return

    # GPU generators (current device, then all devices).
    torch.cuda.manual_seed(seed_value)
    torch.cuda.manual_seed_all(seed_value)
    # cuDNN settings.
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

# Seed all generators with 42 and enable the CUDA/cuDNN settings.
# NOTE(review): this is a different value from the `seed = 2024` variable defined earlier.
random_seed(42,True)

In [None]:
class TabTransformer(nn.Module):
    """Transformer-encoder classifier over a flat vector of continuous features.

    The feature vector is projected to ``dim_embedding`` dimensions, treated as
    a sequence of length one, passed through a stack of transformer encoder
    layers, averaged over the sequence axis and mapped to ``num_classes``
    outputs. No final activation is applied.

    Args:
        num_features: width of the continuous input vector.
        num_classes: number of outputs.
        dim_embedding: embedding size (``d_model`` of the encoder).
        num_heads: attention heads per encoder layer.
        num_layers: number of stacked encoder layers.
    """

    def __init__(self, num_features=743, num_classes=1, dim_embedding=96, num_heads=4, num_layers=2):
        super().__init__()
        # Sub-modules are created in this order so that seeded weight
        # initialisation stays reproducible.
        self.embedding = nn.Linear(in_features=num_features, out_features=dim_embedding)
        self.transformer = nn.TransformerEncoder(
            nn.TransformerEncoderLayer(d_model=dim_embedding, nhead=num_heads, batch_first=True),
            num_layers=num_layers,
        )
        self.classifier = nn.Linear(in_features=dim_embedding, out_features=num_classes)

    def forward(self, _, x):
        """Compute the outputs for ``x``; the first positional input is ignored."""
        tokens = self.embedding(x).unsqueeze(1)  # insert a sequence axis of length 1
        encoded = self.transformer(tokens)
        pooled = encoded.mean(dim=1)  # average over the sequence axis
        return self.classifier(pooled)

# Binary cross-entropy on logits.
# NOTE(review): `criterion` is not referenced elsewhere in the visible notebook;
# the Learner is given F.binary_cross_entropy_with_logits directly.
criterion = nn.BCEWithLogitsLoss()

In [None]:
# Prefer the GPU when one is available.
# NOTE(review): `device` is not referenced elsewhere in the visible notebook.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [None]:
def _predict_proba(learn, frame, feature_cols):
    """Return sigmoid-transformed predictions of ``learn`` for ``frame`` as a 1-D array."""
    dl = learn.dls.test_dl(frame[feature_cols])
    raw_preds, _ = learn.get_preds(dl=dl)
    # flatten() rather than squeeze(): a single-row frame still yields shape (1,).
    return raw_preds.sigmoid().flatten().numpy()


def fit_fastai_model(train, test, target_col, selected_columns, n_splits):
    """Train one TabTransformer per fold and collect OOF and test predictions.

    For each fold the rows with ``train['fold'] == fold`` are used for
    validation and the rest for training. The checkpoint with the lowest
    validation loss is saved by SaveModelCallback. Test predictions are
    averaged over the folds.

    Args:
        train: training frame holding ``selected_columns``, ``target_col`` and
            an integer ``fold`` column; an ``oof_preds`` column is added in place.
        test: test frame holding ``selected_columns``; a ``preds`` column is
            added in place.
        target_col: name of the binary target column.
        selected_columns: feature columns, all treated as continuous.
        n_splits: number of folds; must match the values in ``train['fold']``.

    Returns:
        (train, test): the same objects that were passed in, with the
        prediction columns filled.
    """
    train['oof_preds'] = 0.0
    test_preds = np.zeros(len(test), dtype=np.float32)
    scores_auc = []
    scores_logloss = []

    cat_feats = []  # no categorical inputs are used
    cont_feats = [col for col in selected_columns if col not in cat_feats]

    for fold in range(n_splits):
        print("*" * 100)
        print(f"======================================TRAINING FOLD: {fold}=============================================")
        # Size the input layer from the feature list instead of relying on the
        # class default, so a changed feature set cannot cause a shape mismatch.
        model = TabTransformer(num_features=len(cont_feats))

        training = train[train['fold'] != fold]
        validation = train[train['fold'] == fold]

        # TabularPandas takes positional splits: training rows first, then validation.
        splits = (
            list(range(len(training))),
            list(range(len(training), len(training) + len(validation)))
        )
        combined_data = pd.concat(
            [training[selected_columns + [target_col]], validation[selected_columns + [target_col]]]
        )

        dls = TabularPandas(
            combined_data,
            cat_names=cat_feats,
            cont_names=cont_feats,
            y_names=target_col,
            splits=splits,
            procs=[Categorify, FillMissing, Normalize]
        ).dataloaders(bs=4096*2)

        learn = Learner(
            dls,
            model=model,
            metrics=[AccumMetric(roc_auc_score, invert_arg=True)],
            loss_func=F.binary_cross_entropy_with_logits,
        )

        learn.fit_one_cycle(25, 1e-3, cbs=[SaveModelCallback(monitor='valid_loss', fname=f'nn_approach_fold_{fold}')])

        # Out-of-fold predictions and scores for this fold.
        val_preds = _predict_proba(learn, validation, selected_columns)
        auc_score = roc_auc_score(validation[target_col], val_preds)
        logloss_score = log_loss(validation[target_col], val_preds)

        scores_auc.append(auc_score)
        scores_logloss.append(logloss_score)

        print(f"Fold {fold} AUC: {auc_score:.4f}, LogLoss: {logloss_score:.4f}")

        train.loc[validation.index, 'oof_preds'] = val_preds

        # Accumulate test predictions; they are averaged after the loop.
        test_preds += _predict_proba(learn, test, selected_columns)

        # Release the fold's objects before the next model is built.
        del dls, learn, model
        gc.collect()
        torch.cuda.empty_cache()

    combined_test_preds = test_preds / n_splits

    print(f"\nAverage AUC across {n_splits} folds: {np.mean(scores_auc):.4f} (+/- {np.std(scores_auc):.4f})")
    print(f"Average LogLoss across {n_splits} folds: {np.mean(scores_logloss):.4f} (+/- {np.std(scores_logloss):.4f})")

    overall_auc = roc_auc_score(train[target_col], train['oof_preds'])
    overall_logloss = log_loss(train[target_col], train['oof_preds'])
    print(f"Overall OOF AUC: {overall_auc:.4f}, LogLoss: {overall_logloss:.4f}")

    test['preds'] = combined_test_preds

    return train, test

In [None]:
# Must equal the number of folds used when the 'fold' column was assigned.
n_splits = 10
# Train one model per fold; returns the frames with 'oof_preds' / 'preds' filled.
sub_train, sub_test =  fit_fastai_model(new_train, new_test, target_col, selected_columns, n_splits)

In [None]:
# Preview the test frame with the fold-averaged 'preds' column.
display(sub_test.head(), sub_test.shape)

In [None]:
# Submission frame: one row per test event with its predicted probability.
sub = sub_test[['event_id', 'preds']]
sub.head()

In [None]:
# Write the submission file without the index column.
sub.to_csv("baseline_fastai_tabtransformer_10_folds.csv", index = False)

In [None]:
# Persist the out-of-fold predictions next to the true labels.
sub_train[['event_id','label', 'oof_preds']].to_csv("oof.csv", index=False)

In [None]:
# Wider exports of the OOF and test predictions, including location, time index
# and flood probability. 'label' is NaN for the test rows.
sub_train[['event_id', 'location_id', 'event_t', 'flood_probability','label','oof_preds', ]].to_csv("transformer_tab_oof.csv", index=False)
sub_test[['event_id', 'location_id', 'event_t', 'flood_probability','label', 'preds']].to_csv("transformer_tab_test.csv", index=False)