In [None]:
!pip install fastai -q

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedKFold, StratifiedGroupKFold
from sklearn.metrics import log_loss, roc_auc_score


import matplotlib.pyplot as plt
import seaborn as sns


from itertools import combinations
import random

from PIL import Image


from sklearn.metrics import roc_auc_score, log_loss

import gc

import torch
from fastai.tabular.all import *


import warnings
warnings.filterwarnings("ignore")




In [None]:
# Two sibling folders of the Kaggle dataset: the raw competition files and the
# outputs of a separate image classifier (per the folder name).
base_path = "/kaggle/input/final-deepmind-comp-dataset/final_deepmind_comp_dataset/zindi_data/"
additional_path = "/kaggle/input/final-deepmind-comp-dataset/final_deepmind_comp_dataset/image_classifier_results/"

# Event-level tables and the submission template.
train = pd.read_csv(base_path + "Train.csv")
test = pd.read_csv(base_path + "Test.csv")
submission = pd.read_csv(base_path + "SampleSubmission.csv")

# From the classifier results only the join key and the predicted probability are kept.
cv_result_columns = ['location_id', 'flood_probability']
train_with_cv_results = pd.read_csv(additional_path + "train_with_cv_results.csv")[cv_result_columns]
test_with_cv_results = pd.read_csv(additional_path + "test_with_cv_results.csv")[cv_result_columns]

# Archive of image arrays; members are looked up by key later on.
images = np.load(base_path + "composite_images.npz")

display(train.head(), train.shape, train_with_cv_results.head(), train_with_cv_results.shape, test.head(), test.shape)


In [None]:
# Show which arrays the archive contains.
print("Keys in the .npz file:", images.files)

# Peek at the first stored array as a representative sample.
key_name = images.files[0]
image_array = images[key_name]
print("Shape of the image array:", image_array.shape)

# A 3-D array is read as (height, width, bands); anything else has no band axis.
if image_array.ndim != 3:
    print("The image array does not have multiple bands (2D array).")
else:
    num_bands = image_array.shape[-1]
    print("Number of bands in the image:", num_bands)

In [None]:
def get_location(value):
  """Return the first two '_'-separated tokens of `value`, re-joined with '_'."""
  tokens = value.split("_")
  return f"{tokens[0]}_{tokens[1]}"

def get_event_id(value):
  """Return the fourth '_'-separated token of `value` (index 3)."""
  tokens = value.split("_")
  return tokens[3]
# Derive location and per-location time indices on both tables (mutates train/test in place).
for df in [train, test]:

  # location_id = first two '_'-separated tokens of event_id
  df['location_id'] = df['event_id'].apply(lambda x: '_'.join(x.split('_')[0:2]))
  # Integer id per location, numbered in order of first appearance (sort=False)
  df['event_idx'] = df.groupby('location_id', sort=False).ngroup()

  # Running counter within each location: 0 for its first row, 1 for the next, ...
  # NOTE(review): this follows the current row order, so it only equals the event's
  # day index if each location's rows are already stored chronologically - confirm.
  df['event_t'] = df.groupby('location_id').cumcount()

# Number of distinct locations in train and in test
print(len(set(train['location_id'])), len(set(test['location_id'])))
# Number of locations shared by train and test
print(len(set(train['location_id']).intersection(set(test['location_id']))))
# Number of arrays stored in the image archive
print(len(images))
display(train.head(), test.head())

* each image has 730 events
* no intersection of images between the two data sets (unique sets)
* The numpy file has 898 images covering both train and test

* The images are annual cloud-free composite images from Sentinel-2 satellite imagery. They are of size 128x128 and contain the following 6 channels:

      Sentinel-2 B2 (Blue)
      Sentinel-2 B3 (Green)
      Sentinel-2 B4 (Red)
      Sentinel-2 B8 (NIR)
      Sentinel-2 B11 (SWIR)
      Slope (derived from NASA SRTM)

* the images are essentially static for any event/location pair over the study period.
  * the images only serve as spatial representations of the environment for that location over the 730 day period
  * it reflects static or semi-static environmental conditions (e.g. land use, vegetation, water bodies, topography) that could influence flood occurrence
  * so the images cannot provide temporal insights, but we can extract spatial features such as NDVI, NDWI and NDBI, plus topographic features like slope and elevation changes from the slope channel
  * combine the spatial features with temporal precipitation data to enrich the dataset by treating the spatial features as fixed covariates that describe each location.
    * Areas with high NDWI might flood more frequently with heavy precipitation
    * Locations with high slope values might experience flash floods after intense rainfall

  * Image processing:
    * Use pretrained models to extract image embeddings or use PCA for dimensionality reduction
    * create a binary classifier where the label is 1 for images where a flood has occurred in any of the 730 events and 0 if no flood has occurred, to create a soft flag for flood-prone locations. Even if not perfect, these flags can serve as a proxy for environmental vulnerability to floods
    * The image classifier naturally reduces the extreme imbalance in the dataset by focusing on binary flood/non-flood classification
  
  * clustering locations:
    * group events/locations based on spatial features (e.g NDVI, NDWI) to identify patterns in flood susceptibility
  * correlating spatial features with precipitation thresholds:
    * study how spatial features interact with specific precipitation thresholds that lead to floods

  * You can think of event_id_X_1 being the 01/01/2024 and event_id_X_2 being 02/01/2024 (dd/mm/yyyy).

### Data Preprocessing + Feature Engineering

In [None]:
import numpy as np
import pandas as pd
from scipy.ndimage import uniform_filter

def calculate_statistics(array, stats):
    """
    Compute the requested summary statistics of an array.

    Every statistic except 'skew' uses the NaN-ignoring NumPy reducer over the whole
    array; 'skew' flattens the array and uses ``pandas.Series.skew``.

    Args:
        array (numpy.ndarray or array-like): Input values of any shape.
        stats (iterable of str): Statistics to compute. Supported names are
            'mean', 'std', 'median', 'skew', 'min' and 'max'.

    Returns:
        dict: Maps each requested name to its value, in the order requested.

    Raises:
        ValueError: If `stats` contains an unsupported name. Unsupported names used
            to be dropped silently, which hid typos and produced missing columns.
    """
    reducers = {
        'mean': np.nanmean,
        'std': np.nanstd,
        'median': np.nanmedian,
        'skew': lambda values: pd.Series(np.asarray(values).flatten()).skew(),
        'min': np.nanmin,
        'max': np.nanmax,
    }

    # Validate up front so no work is done for a request that cannot be satisfied.
    unknown = [stat for stat in stats if stat not in reducers]
    if unknown:
        raise ValueError(f"Unknown statistic(s): {unknown}; supported: {sorted(reducers)}")

    return {stat: reducers[stat](array) for stat in stats}


def generate_flood_features(df, images, band_names, statistics=['mean', 'std']):
    """
    Append per-location spectral/topographic summary statistics to `df`.

    For every distinct ``location_id`` in `df`, the matching image is fetched from
    `images`, split into bands along its last axis, and turned into five per-pixel
    layers: NDVI, NDWI, MNDWI, MSI and the raw slope band. Each requested statistic
    of each layer becomes one column named ``<Layer>_<stat>`` (e.g. ``NDVI_mean``).

    Args:
        df (pd.DataFrame): Must contain a 'location_id' column whose values are keys
            of `images`.
        images (mapping): Image arrays indexed by location id, bands on the last axis
            (e.g. the object returned by ``np.load`` on an .npz archive).
        band_names (tuple): Band names in the order they are stored on the last axis.
            Must include 'B3', 'B4', 'B8', 'B11' and 'SLOPE'.
        statistics (list): Statistic names forwarded to ``calculate_statistics``.
            The default list is never mutated here.

    Returns:
        pd.DataFrame: A new frame holding `df` left-joined with the feature columns
        on 'location_id'. `df` itself is not modified.
    """
    band_indices = {name: band_names.index(name) for name in band_names}
    features = []

    # Work on distinct locations only: features are a pure function of the location,
    # and duplicated location rows would otherwise be multiplied by the merge below.
    for location_id in df['location_id'].drop_duplicates():
        # Fetch the image once; an .npz archive re-reads the member on every lookup.
        image = np.asarray(images[location_id])
        # Integer rasters (unsigned ones in particular) would wrap around in the
        # band differences below, so promote them to float first.
        if not np.issubdtype(image.dtype, np.floating):
            image = image.astype(np.float64)
        bands = {name: image[..., idx] for name, idx in band_indices.items()}

        # The 1e-6 terms guard against division by zero on empty/dark pixels.
        # Vegetation index
        ndvi = (bands['B8'] - bands['B4']) / (bands['B8'] + bands['B4'] + 1e-6)

        # Water indices
        ndwi = (bands['B3'] - bands['B8']) / (bands['B3'] + bands['B8'] + 1e-6)
        mndwi = (bands['B3'] - bands['B11']) / (bands['B3'] + bands['B11'] + 1e-6)

        # Moisture stress index
        msi = bands['B11'] / (bands['B8'] + 1e-6)

        # Topography
        slope = bands['SLOPE']

        # Disabled experiments (kept out of the feature set): SAVI, a 3x3-smoothed
        # slope variability layer, the slope gradient, and B2/B4, B8/B11, B2/B8 ratios.
        index_features = {
            'NDVI': ndvi,
            'NDWI': ndwi,
            'MNDWI': mndwi,
            'MSI': msi,
            'Slope': slope,
        }

        location_features = {'location_id': location_id}
        for feature_name, feature_array in index_features.items():
            stats = calculate_statistics(feature_array, statistics)
            for stat_name, value in stats.items():
                location_features[f'{feature_name}_{stat_name}'] = value

        features.append(location_features)

    # Nothing to join when `df` has no rows; return an independent copy like merge would.
    if not features:
        return df.copy()

    feature_df = pd.DataFrame(features)

    # feature_df has exactly one row per location, so the left join keeps df's row count.
    return df.merge(feature_df, on='location_id', how='left')

# Statistics computed for every index layer ('min', 'max' and 'skew' are also
# supported by calculate_statistics but are not used here).
stats_to_calculate = ['mean', 'median', 'std']
# Band order passed to generate_flood_features, which indexes the image's last axis by position.
# NOTE(review): assumes this matches the storage order in the .npz - confirm.
BAND_NAMES = ('B2', 'B3', 'B4', 'B8', 'B11', 'SLOPE')

# Build the feature tables from the classifier-result tables, so the output keeps
# 'location_id' and 'flood_probability' alongside the new statistic columns.
train_features = generate_flood_features(train_with_cv_results, images, BAND_NAMES, statistics=stats_to_calculate)
test_features = generate_flood_features(test_with_cv_results, images, BAND_NAMES, statistics=stats_to_calculate)

display(train_features.head(), train_features.shape)

In [None]:
# Broadcast the per-location features onto every event row of that location.
train_df = train.merge(train_features, how='left', on='location_id')
test_df = test.merge(test_features, how='left', on='location_id')

display(train_df.head(), train_df.shape, test_df.head(), test_df.shape)

### More Feature Engineering

In [None]:
from types import new_class
def apply_expanding_combinations(df, group_cols_list, target_col='Sales', shift_periods=[1], min_periods=1, stats=['mean', 'std']):
    """
    Add lagged expanding-window statistics of `target_col` for several groupings.

    For each grouping in `group_cols_list`, each lag in `shift_periods` and each
    statistic in `stats`, the target is shifted by the lag within its group and the
    expanding statistic of the shifted values is written to a column named
    ``expanding_grouped_<cols joined by '_'>_<target_col>_shift_<lag>_<stat>``.

    `df` is modified in place and also returned.
    """
    for keys in group_cols_list:
        label = '_'.join(keys)

        for lag in shift_periods:
            for stat in stats:
                # Bind the current lag/stat explicitly so the helper does not rely on
                # the loop variables of the enclosing scope.
                def lagged_expanding(series, lag=lag, stat=stat):
                    shifted = series.shift(lag)
                    return shifted.expanding(min_periods=min_periods).agg(stat)

                out_col = f'expanding_grouped_{label}_{target_col}_shift_{lag}_{stat}'
                df[out_col] = df.groupby(keys)[target_col].transform(lagged_expanding)

    return df

def smoothen_target(df, group_cols, target_col, n_std=10):
  """
  Replace extreme values of `target_col` with their group mean.

  Within each group defined by `group_cols`, a value counts as an outlier when it
  exceeds ``group mean + n_std * group std``; outliers are overwritten with the
  group mean. Groups whose std is NaN (e.g. a single row) are left untouched because
  the comparison against NaN is False.

  Args:
      df (pd.DataFrame): Input frame; not modified.
      group_cols (str or list of str): Column(s) defining the groups.
      target_col (str): Column to smooth.
      n_std (float): Number of standard deviations above the mean that marks an
          outlier. Defaults to 10.

  Returns:
      pd.DataFrame: New frame with `target_col` smoothed and two helper columns
      added: ``<target_col>_outlier`` (the threshold) and ``<target_col>_mean``.
  """
  # Accept a single column name as well as a list of names.
  group_cols = [group_cols] if isinstance(group_cols, str) else list(group_cols)
  outlier_col = f'{target_col}_outlier'
  mean_col = f'{target_col}_mean'

  group_stats = df.groupby(group_cols, as_index=False).agg(**{
      outlier_col: (target_col, lambda x: x.mean() + n_std * x.std()),
      mean_col: (target_col, 'mean'),
  })

  # Join on every grouping column. Joining on the first one only mismatches rows and
  # duplicates them as soon as more than one grouping column is used.
  df = df.merge(group_stats, on=group_cols, how='left')

  df[target_col] = np.where(df[target_col] > df[outlier_col], df[mean_col], df[target_col])

  return df


def create_rolling_features(data, group_cols, target_col, windows, shift_period, min_period, statistics):
    """
    Add grouped rolling-window statistics of `target_col` to `data`.

    Within each group the target is shifted by `shift_period` rows, then a rolling
    window (size from `windows`, at least `min_period` observations) is reduced with
    each entry of `statistics`. Supported entries: 'mean', 'median', 'std', 'min',
    'max', 'skew', 'sum', 'quantile' (0.95) and 'quantile_<q>' for an explicit q.

    Columns are named
    ``rolling_previous_grouped_<target_col>_<stat>_<window>_<shift_period>``, where
    'quantile_<q>' appears as '<q>th'. `data` is modified in place and returned.

    Raises:
        ValueError: For an unsupported statistic name.
    """
    direct_reducers = ('mean', 'median', 'std', 'min', 'max', 'skew', 'sum')

    def rolled_stat(series, stat, window):
        # The window object is built before the statistic name is checked.
        roller = series.shift(shift_period).rolling(window=window, min_periods=min_period)
        if stat in direct_reducers:
            return getattr(roller, stat)()
        if stat == 'quantile':
            return roller.quantile(0.95)
        if stat.startswith('quantile_'):
            return roller.quantile(float(stat.split('_')[1]))
        raise ValueError(f"Unknown statistic: {stat}")

    for window in windows:
        for stat in statistics:
            if stat.startswith('quantile_'):
                suffix = f"{stat.split('_')[1]}th"
            else:
                suffix = stat
            feature_name = f'rolling_previous_grouped_{target_col}_{suffix}_{window}_{shift_period}'

            data[feature_name] = data.groupby(group_cols)[target_col].transform(
                lambda x, stat=stat, window=window: rolled_stat(x, stat, window)
            )

    return data



def custom_agg(x):
    """Return the spread of `x`: its maximum minus its minimum."""
    highest = x.max()
    lowest = x.min()
    return highest - lowest

def get_date_features(df):
  """
  Add approximate calendar columns derived from the integer day counter 'event_t'.

  A year is taken as 365 days and a month as 30 days, so the columns are rough:
  'month' reaches 13 (and 'quarter' 5) for the last five days of a year, and
  'day_of_month' is computed from the raw counter rather than the day of the year.

  Adds 'year', 'month', 'week_of_year', 'day_of_month', 'day_of_week' and 'quarter'
  to `df` in place and returns `df`.
  """
  event_t = df['event_t']
  day_of_year = event_t % 365

  df['year'] = event_t // 365 + 1            # 1-based year number
  df['month'] = day_of_year // 30 + 1        # 30-day pseudo months
  df['week_of_year'] = day_of_year // 7 + 1  # 1-based week within the year
  df['day_of_month'] = event_t % 30 + 1      # position inside a 30-day cycle
  df['day_of_week'] = event_t % 7            # 0..6 position inside a 7-day cycle
  df['quarter'] = (df['month'] - 1) // 3 + 1
  return df




def feature_engineering(train, test, windows=(3, 4, 10, 20, 25, 30, 55, 60, 75, 296),
                        max_shift=364, ewm_spans=(7,), event_window=(296, 435)):
  """
  Build per-location precipitation time-series features on train and test together.

  The two frames are stacked, ordered by ('location_id', 'event_t'), and enriched with:
    * 'event_binary': 1 when event_t lies inside `event_window` (inclusive), else 0;
    * rolling means of precipitation per location for every size in `windows`
      (no lag, so the current row is part of its own window);
    * 'precipitation_shift_<k>' / 'precipitation_next_shift_<k>' for k = 1..max_shift:
      the location's precipitation k rows earlier / k rows later;
    * exponentially weighted means of precipitation per location for `ewm_spans`.

  Other experiments (lagged rolling windows, expanding means, per-location mean and
  quantile differences, rolling max-min range, calendar features) are not applied;
  the helpers for most of them remain available in this notebook.

  Args:
      train (pd.DataFrame): Rows with a non-null 'label'.
      test (pd.DataFrame): Rows whose 'label' is missing after stacking.
      windows (iterable of int): Rolling window sizes.
      max_shift (int): Largest backward/forward shift to materialise.
      ewm_spans (iterable of int): Spans for the exponentially weighted means.
      event_window (tuple): (first, last) event_t flagged by 'event_binary'.

  Returns:
      (pd.DataFrame, pd.DataFrame): Train and test rows, split on whether 'label' is
      present, each with a fresh 0..n-1 index. The inputs are not modified.
  """
  # A unique 0..n-1 index is required below: stacking two frames otherwise leaves
  # duplicate labels, which breaks index-aligned column assignment.
  data = pd.concat([train, test], ignore_index=True)
  data = data.sort_values(by=['location_id', 'event_t']).reset_index(drop=True)
  data['event_t'] = data['event_t'].astype(int)

  first_t, last_t = event_window
  data['event_binary'] = data['event_t'].between(first_t, last_t).astype(int)

  group_cols = ['location_id']
  col = 'precipitation'

  data = create_rolling_features(
      data, group_cols, col, list(windows),
      shift_period=0, min_period=1, statistics=['mean'],
  )

  # Collect all shifted series first and attach them in one concat: inserting
  # hundreds of columns one at a time fragments the frame and is much slower.
  by_location = data.groupby('location_id')[col]
  shifted = {}
  for shift in range(1, max_shift + 1):
    shifted[f'{col}_shift_{shift}'] = by_location.shift(shift)
    shifted[f'{col}_next_shift_{shift}'] = by_location.shift(-shift)
  data = pd.concat([data, pd.DataFrame(shifted, index=data.index)], axis=1)

  for span in ewm_spans:
    data[f'{col}_ewm_grouped_mean_{span}'] = (
        data.groupby('location_id')[col]
        .ewm(span=span, adjust=False)
        .mean()
        .reset_index(level=0, drop=True)  # drop the group level so values align on the row index
    )

  # Rows without a label are the test rows.
  has_label = data['label'].notna()
  train = data[has_label].reset_index(drop=True)
  test = data[~has_label].reset_index(drop=True)

  return train, test

# Build the time-series features on train and test jointly; the function returns them
# split again on whether 'label' is present.
new_train, new_test = feature_engineering(train_df, test_df)
display(new_train.head(), new_train.shape, new_test.head(), new_test.shape)

### MODELLING
Train covers 674 locations and test covers 224 locations (898 in total, matching the number of images).

In [None]:
# Class balance of the target across all training rows.
new_train['label'].value_counts()

In [None]:

n_splits = 10
# NOTE(review): `seed` is not passed to the splitter below, so it has no effect here.
seed = 2024
# Stratify on the label while keeping all rows of a location in the same fold.
gkf = StratifiedGroupKFold(n_splits = n_splits)

# -1 marks rows that have not been assigned to a validation fold.
new_train['fold'] = -1
for fold, (_, val_idx) in enumerate(gkf.split(new_train, new_train['label'], groups = new_train['location_id'])):
    # val_idx holds row positions; using them with .loc relies on new_train having
    # a 0..n-1 index (feature_engineering resets it).
    new_train.loc[val_idx, "fold"] = fold
# new_train['fold'] = new_train['fold'].astype(int)
new_train['fold'].value_counts()



In [None]:
# Label distribution inside each validation fold, to check the stratification.
for i in range(n_splits):
  fold_labels = new_train.loc[new_train['fold'] == i, 'label']
  print(fold_labels.value_counts())
  print("-"* 100)

### MODELLING

In [None]:

# Names of per-location image-index summary columns.
# NOTE(review): this list is not used when building selected_columns below, and the
# 'EVI_*' names are not produced by generate_flood_features - confirm whether the
# image-index features were meant to be model inputs.
indices_cols = [
  'EVI_mean',
 'EVI_median',
 'EVI_std',
 'MNDWI_mean',
 'MNDWI_median',
 'MNDWI_std',
 'MSI_mean',
 'MSI_median',
 'MSI_std',
 'NDVI_mean',
 'NDVI_median',
 'NDVI_std',
 'NDWI_mean',
 'NDWI_median',
 'NDWI_std',
 'Slope_mean',
 'Slope_median',
 'Slope_std',
]

# Model inputs: four base columns plus every engineered column whose name contains
# 'diff', 'shift' or 'grouped' (rolling, shift and ewm features).
selected_columns =['precipitation','flood_probability','event_binary', 'event_t', ] + [col for col in new_train if 'diff' in col or 'shift' in col or 'grouped' in col ]

print(selected_columns)
target_col = 'label'


In [None]:

def random_seed(seed_value, use_cuda):
    """
    Seed the NumPy, PyTorch and Python random generators with `seed_value`.

    When `use_cuda` is true, the CUDA generators are seeded as well and cuDNN is
    switched to deterministic mode with benchmarking disabled.
    """
    # CPU-side generators, seeded in the order NumPy, PyTorch, Python.
    for seed_fn in (np.random.seed, torch.manual_seed, random.seed):
        seed_fn(seed_value)

    if not use_cuda:
        return

    # GPU generators (current device, then all devices).
    torch.cuda.manual_seed(seed_value)
    torch.cuda.manual_seed_all(seed_value)
    # Deterministic cuDNN kernels, no auto-tuning between runs.
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    # Remember to use num_workers=0 when creating the DataBunch.

# Seed all generators once, before any model is built (CUDA seeding enabled).
random_seed(2024,True)

def fit_fastai_model(train, test, target_col, selected_columns, n_splits,
                     epochs=10, lr=2e-3, bs=4096, layers=(256, 512, 1024, 512, 256)):
    """
    Train one fastai tabular network per CV fold and collect OOF and test predictions.

    For each fold id in ``range(n_splits)``, rows of `train` with ``fold != id`` are
    used for training and rows with ``fold == id`` for validation. The network has a
    single output trained with binary cross-entropy on logits. The checkpoint with
    the lowest validation loss is saved as ``nn_approach_fold_<id>`` by
    SaveModelCallback.

    Args:
        train (pd.DataFrame): Must contain `selected_columns`, `target_col` and a
            'fold' column. Modified in place: column 'oof_fastai' receives the
            out-of-fold probabilities.
        test (pd.DataFrame): Must contain `selected_columns`. Modified in place:
            column 'fastai_preds' receives the mean probability over all folds.
        target_col (str): Name of the binary target column.
        selected_columns (list of str): Model inputs, all treated as continuous.
        n_splits (int): Number of folds present in ``train['fold']``.
        epochs (int): Epochs for ``fit_one_cycle``.
        lr (float): Maximum learning rate for ``fit_one_cycle``.
        bs (int): Batch size of the dataloaders.
        layers (iterable of int): Hidden layer sizes of the tabular network.

    Returns:
        (pd.DataFrame, pd.DataFrame): The same `train` and `test` objects.
    """
    # Out-of-fold predictions are stored in 'oof_fastai'. Start from NaN so that rows
    # never covered by a validation fold stay visibly missing instead of looking like
    # a confident 0.0 prediction.
    train['oof_fastai'] = np.nan
    test_preds = np.zeros(len(test), dtype=np.float32)  # running sum over folds
    scores_auc = []
    scores_logloss = []

    cat_feats = []  # no categorical inputs
    cont_feats = [col for col in selected_columns if col not in cat_feats]
    model_columns = selected_columns + [target_col]

    for fold in range(n_splits):
        print("*" * 100)
        print(f"======================================TRAINING FOLD: {fold}=============================================")

        training = train[train['fold'] != fold]
        validation = train[train['fold'] == fold]

        # Positional splits over the stacked frame: training rows first, then validation.
        splits = (
            list(range(len(training))),
            list(range(len(training), len(training) + len(validation)))
        )

        combined_data = pd.concat([training[model_columns], validation[model_columns]])

        dls = TabularPandas(
            combined_data,
            cat_names=cat_feats,
            cont_names=cont_feats,
            y_names=target_col,
            splits=splits,
            procs=[Categorify, FillMissing, Normalize]
        ).dataloaders(bs=bs)

        learn = tabular_learner(
            dls,
            layers=list(layers),
            n_out=1,
            loss_func=F.binary_cross_entropy_with_logits,
            metrics=[AccumMetric(roc_auc_score, invert_arg=True)]
        )

        learn.fit_one_cycle(epochs, lr, cbs=[SaveModelCallback(monitor='valid_loss', fname=f'nn_approach_fold_{fold}')])

        # Validation predictions. The sigmoid assumes get_preds returns raw logits for
        # this plain-function loss - TODO confirm against the fastai version in use.
        val_dl = learn.dls.test_dl(validation[selected_columns])
        preds, _ = learn.get_preds(dl=val_dl)
        val_preds = preds.sigmoid().squeeze().numpy()
        auc_score = roc_auc_score(validation[target_col], val_preds)
        logloss_score = log_loss(validation[target_col], val_preds)

        scores_auc.append(auc_score)
        scores_logloss.append(logloss_score)

        print(f"Fold {fold} AUC: {auc_score:.4f}, LogLoss: {logloss_score:.4f}")

        # validation keeps train's row labels, so this writes back to the right rows.
        train.loc[validation.index, 'oof_fastai'] = val_preds

        # Test predictions of this fold's model, accumulated for averaging.
        test_dl = learn.dls.test_dl(test[selected_columns])
        preds, _ = learn.get_preds(dl=test_dl)
        test_preds += preds.sigmoid().squeeze().numpy()

        # Release the fold's model and loaders before building the next one.
        del dls, learn, val_dl, test_dl, preds, _
        gc.collect()
        torch.cuda.empty_cache()

    # Mean test prediction across folds.
    combined_test_preds = test_preds / n_splits

    print(f"\nAverage AUC across {n_splits} folds: {np.mean(scores_auc):.4f} (+/- {np.std(scores_auc):.4f})")
    print(f"Average LogLoss across {n_splits} folds: {np.mean(scores_logloss):.4f} (+/- {np.std(scores_logloss):.4f})")

    # Scores over the pooled out-of-fold predictions.
    overall_auc = roc_auc_score(train[target_col], train['oof_fastai'])
    overall_logloss = log_loss(train[target_col], train['oof_fastai'])
    print(f"Overall OOF AUC: {overall_auc:.4f}, LogLoss: {overall_logloss:.4f}")

    test['fastai_preds'] = combined_test_preds

    return train, test


In [None]:
# Train the per-fold models. The returned frames are the same objects as new_train and
# new_test, with the prediction columns added by fit_fastai_model.
sub_train, sub_test =  fit_fastai_model(new_train, new_test, target_col, selected_columns, n_splits)


Average AUC across 10 folds: 0.9386 (+/- 0.0274)
Average LogLoss across 10 folds: 0.0027 (+/- 0.0003)
Overall OOF AUC: 0.9310, LogLoss: 0.0027

In [None]:
# Quick look at the test frame, which now carries the 'fastai_preds' column.
display(sub_test.head(), sub_test.shape)

In [None]:

# Submission frame: event id plus the fold-averaged probability.
# NOTE(review): the prediction column keeps the name 'fastai_preds'; presumably the
# header should match the sample submission (`submission`, loaded earlier) - verify.
sub = sub_test[['event_id', 'fastai_preds']]
sub.head()

In [None]:
# Write the raw (un-normalised) predictions to the working directory.
sub.to_csv("baseline_fastai_0.0027_10_folds_v2.csv", index = False)

#### Normalizing the Probabilities

In [None]:
from sklearn.metrics import log_loss

print(f"logloss before normalizing: {log_loss(sub_train['label'], sub_train['oof_fastai'])}")

# Locations whose image-level flood probability is at least 0.5.
# NOTE(review): the test predictions are later normalised with a 0.7 cut-off, so this
# OOF check does not measure exactly what is applied to the submission - confirm.
locations_to_normalize = sub_train.loc[sub_train['flood_probability'] >= 0.5, 'location_id'].unique()
sub_train['oof_sum_prob'] = sub_train.groupby('location_id')['oof_fastai'].transform('sum')

# Avoid division by zero
epsilon = 1e-8
normalize_mask = sub_train['location_id'].isin(locations_to_normalize)

# Start from the raw predictions, then rescale the selected locations so that their
# predictions sum to (almost exactly) 1 per location.
sub_train['oof_fastai_norm'] = sub_train['oof_fastai']
sub_train.loc[normalize_mask, 'oof_fastai_norm'] = (
    sub_train.loc[normalize_mask, 'oof_fastai']
    / (sub_train.loc[normalize_mask, 'oof_sum_prob'] + epsilon)
)

print(f"logloss after normalizing: {log_loss(sub_train['label'], sub_train['oof_fastai_norm'])}")


In [None]:
# Persist raw OOF and test predictions with their identifying columns.
# 'label' is exported for test as well; it holds no values there, since test rows are
# the ones whose label is missing.
sub_train[['event_id', 'location_id', 'event_t', 'flood_probability','label','oof_fastai', ]].to_csv("fastai_train_with_oof.csv", index=False)
sub_test[['event_id', 'location_id', 'event_t', 'flood_probability','label', 'fastai_preds']].to_csv("fastai_test_with_oof.csv", index=False)

### Normalize the predictions based on the flood probability


In [None]:
# Locations whose image-level flood probability is at least 0.7.
# NOTE(review): the OOF evaluation of this normalisation uses a 0.5 cut-off, so the
# reported "after normalizing" log loss was measured with a different threshold - confirm.
locations_to_normalize = sub_test.loc[sub_test['flood_probability'] >= 0.7, 'location_id'].unique()
# Per-location sum of the test predictions (the column name mirrors the OOF cell).
sub_test['oof_sum_prob'] = sub_test.groupby('location_id')['fastai_preds'].transform('sum')

# Avoid division by zero
epsilon = 1e-8
normalize_mask = sub_test['location_id'].isin(locations_to_normalize)

# Start from the raw predictions, then rescale the selected locations so that their
# predictions sum to (almost exactly) 1 per location.
sub_test['pred_norm'] = sub_test['fastai_preds']
sub_test.loc[normalize_mask, 'pred_norm'] = (
    sub_test.loc[normalize_mask, 'fastai_preds']
    / (sub_test.loc[normalize_mask, 'oof_sum_prob'] + epsilon)
)

sub_test.head()

In [None]:

# Submission frame built from the normalised predictions.
mod_sub = sub_test[['event_id', 'pred_norm']]
mod_sub.head()

In [None]:
# Write the normalised predictions.
# NOTE(review): the file name mentions 0_25 while the normalisation cell uses a 0.7 cut-off - confirm.
mod_sub.to_csv("fastai_mod_0_25(corrected).csv", index = False)