### FLOOD PROBABILITY PREDICTION
* The goal of this notebook is to use computer vision on the static images to predict the probability that a location experiences flooding.
* This probability is then used as a feature for subsequent models and to normalize their predictions, preventing overconfident predictions that hurt the log loss score.

In [None]:
!pip install albumentations -q
!pip install timm -q

In [None]:
import torch
import os
import random
import albumentations
from albumentations.pytorch import ToTensorV2
import pandas as pd
import numpy as np
import gc 
import timm
from fastai.vision.all import *
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import StratifiedKFold
from fastai.metrics import accuracy

# NOTE(review): `le` is not referenced anywhere else in the visible notebook.
le = LabelEncoder()
import warnings
# Suppress every warning for the rest of the session to keep cell output compact.
warnings.filterwarnings('ignore')

In [None]:
def random_seed(seed_value, use_cuda):
    """Seed NumPy, PyTorch and Python's `random` with the same value.

    Args:
        seed_value: Integer seed handed to every generator.
        use_cuda: When truthy, the CUDA generators are seeded as well and
            cuDNN is switched to deterministic mode with benchmarking off.

    Note:
        Remember to use num_workers=0 when creating the DataLoaders.
    """
    # CPU-side generators: NumPy, then torch, then the Python stdlib.
    for seed_fn in (np.random.seed, torch.manual_seed, random.seed):
        seed_fn(seed_value)

    if not use_cuda:
        return

    # GPU-side generators (current device, then all devices).
    torch.cuda.manual_seed(seed_value)
    torch.cuda.manual_seed_all(seed_value)
    # Force cuDNN onto deterministic kernels and disable its autotuner.
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

# Seed every RNG once, up front; use_cuda=True also seeds CUDA and sets the cuDNN flags.
random_seed(2024,True)

In [None]:
# Root directory holding the competition CSVs and the composite image archive.
path = "/kaggle/input/final-deepmind-comp-dataset/final_deepmind_comp_dataset/zindi_data/"
train, test = (
    pd.read_csv(os.path.join(path, csv_name))
    for csv_name in ("Train.csv", "Test.csv")
)
# Only the number of entries in this archive is inspected in this cell.
images = np.load(os.path.join(path, "composite_images.npz"))
def get_location(value):
  """Return the first two '_'-separated fields of `value`, joined by '_'.

  For example, "a_b_c_d" gives "a_b". Raises IndexError when `value`
  has fewer than two fields.
  """
  fields = value.split("_")
  return fields[0] + '_' + fields[1]

def get_event_id(value):
  """Return the fourth '_'-separated field of `value`.

  For example, "a_b_c_d" gives "d". Raises IndexError when `value`
  has fewer than four fields.
  """
  fields = value.split("_")
  return fields[3]
# Split the composite event_id string into a location key and an event key.
for df in (train, test):
  event_ids = df['event_id']
  df['location_id'] = event_ids.map(get_location)
  df['event'] = event_ids.map(get_event_id)

# Sanity checks: distinct locations per split, their overlap, and archive size.
train_locations = set(train['location_id'])
test_locations = set(test['location_id'])
print(len(train_locations), len(test_locations))
print(len(train_locations & test_locations))
print(len(images))
display(train.head(), test.head())

In [None]:
# One row per training location; its label is the maximum label over its events.
train_grouped = train.groupby('location_id')['label'].max().reset_index()

# One row per test location, with the number of events recorded for it.
test_grouped = (
    test.groupby('location_id')['event_id']
    .count()
    .rename('event_id_counts')
    .reset_index()
)
display(train_grouped.head(), test_grouped.head())

### Config

In [None]:
# List the timm architectures whose names match the 'eva*' pattern.
timm.list_models('eva*')

In [None]:
class Config:
    """Central place for the knobs used throughout the notebook."""

    # --- cross-validation ---
    seed = 2024      # random_state of the StratifiedKFold splitter
    n_splits = 10    # number of folds (one model is trained per fold)

    # --- data ---
    image_path = "/kaggle/input/final-deepmind-comp-dataset/final_deepmind_comp_dataset/Moisture Stress/"
    img_extension = '.png'  # appended to location_id to build each file name
    image_size = 224        # images are resized to image_size x image_size

    # --- model ---
    model_name = "eva02_tiny_patch14_224"  # architecture name given to vision_learner
    num_classes = 2                        # width of the prediction arrays

    # --- training / inference ---
    batch_size = 32
    epochs = 15  # passed to fine_tune
    tta = 5      # passed as `n` to learn.tta

# Sanity check: number of files in the image directory vs. shape of the per-location training frame.
len(os.listdir(Config.image_path)), train_grouped.shape

In [None]:
# Assign every training location to one of Config.n_splits label-stratified folds.
skf = StratifiedKFold(
    n_splits=Config.n_splits,
    shuffle=True,
    random_state=Config.seed,
)
train_grouped['fold'] = -1  # placeholder until a fold number is written below
fold_splits = skf.split(train_grouped, train_grouped['label'])
for fold, (_, val_idx) in enumerate(fold_splits):
    # val_idx holds row positions; they double as index labels because
    # train_grouped comes straight out of reset_index().
    train_grouped.loc[val_idx, "fold"] = fold

# Number of locations that landed in each fold.
train_grouped['fold'].value_counts()

In [None]:
# Class balance of the per-location labels.
train_grouped.label.value_counts()

In [None]:

# Each location has one image file named <location_id><img_extension> under Config.image_path.
for frame in (train_grouped, test_grouped):
    frame['image_path'] = Config.image_path + frame['location_id'] + Config.img_extension
display(train_grouped.head(), test_grouped.head())

### Albumentation Augmentations

In [None]:
class AlbumentationsTransform (RandTransform):
    """Item transform that runs one of two albumentations pipelines on a PIL image.

    `train_aug` is used when the split index seen in `before_call` is 0
    (presumably the training split, per fastai convention); `valid_aug`
    is used for every other split index.
    """
    split_idx = None
    order = 2

    def __init__(self, train_aug, valid_aug):
        # store_attr() saves both pipelines as self.train_aug / self.valid_aug.
        store_attr()

    def before_call(self, b, split_idx):
        """Remember which split the upcoming `encodes` call belongs to."""
        self.idx = split_idx

    def encodes(self, img: PILImage):
        """Apply the pipeline chosen for the current split and return a PILImage."""
        pipeline = self.train_aug if self.idx == 0 else self.valid_aug
        augmented = pipeline(image=np.array(img))['image']
        return PILImage.create(augmented)

In [None]:
def get_train_aug():
    """Build the training pipeline: resize, random flips and light colour jitter."""
    A = albumentations
    steps = [
        # Bring every image to the square input size configured for the model.
        A.Resize(Config.image_size, Config.image_size),
        A.HorizontalFlip(p=0.5),
        A.VerticalFlip(p=0.5),
        A.RandomBrightnessContrast(
            brightness_limit=(-0.1, 0.1),
            contrast_limit=(-0.1, 0.1),
            p=0.5,
        ),
        # NOTE(review): albumentations documents these limits in integer
        # hue/saturation/value units (defaults 20/30/20), so 0.2 is likely a
        # near no-op on uint8 images — confirm the intent before changing.
        A.HueSaturationValue(
            hue_shift_limit=0.2,
            sat_shift_limit=0.2,
            val_shift_limit=0.2,
            p=0.5,
        ),
    ]
    return A.Compose(steps, p=1.)



def get_valid_aug():
    """Build the validation pipeline: a resize to the configured size, nothing else."""
    resize = albumentations.Resize(Config.image_size, Config.image_size)
    return albumentations.Compose([resize], p=1.0)

# Per-item transform: train pipeline on split index 0, validation pipeline otherwise.
item_tfms = AlbumentationsTransform(get_train_aug(), get_valid_aug())
# Batch-level normalisation with ImageNet statistics.
# NOTE(review): the pretrained EVA02 weights may expect different mean/std
# than imagenet_stats — confirm against the timm model's pretrained config.
batch_tfms = [Normalize.from_stats(*imagenet_stats)]

### Data Loaders

In [None]:
# See the fastai documentation for a full description of the DataBlock API.
def get_datablock(Train, fold=0, bs=32):
    """Create the image-classification DataLoaders for one cross-validation fold.

    Args:
        Train: DataFrame with `image_path`, `label` and `fold` columns.
        fold: Rows whose `fold` column equals this value are handed to the
            splitter as the validation set.
        bs: Batch size.

    Returns:
        The DataLoaders built from `Train`, using the module-level
        `item_tfms` and `batch_tfms`.
    """
    # Index labels of the held-out rows; assumes they equal row positions
    # (default RangeIndex) — TODO confirm if Train is ever re-indexed.
    valid_rows = Train.loc[Train.fold == fold].index
    block = DataBlock(
        blocks=(ImageBlock, CategoryBlock),
        get_x=ColReader("image_path"),
        get_y=ColReader(['label']),
        splitter=IndexSplitter(valid_rows),
        item_tfms=item_tfms,
        batch_tfms=batch_tfms,
    )
    return block.dataloaders(Train, bs=bs)

In [None]:
# Visual sanity check: render one batch from the default (fold 0) loaders.
get_datablock(train_grouped).show_batch(figsize=(12,12))

### Model + Metrics

In [None]:
# Metrics handed to every fold's learner: binary ROC AUC and accuracy.
roc = RocAucBinary()
metrics = [roc, accuracy]


In [None]:
# Out-of-fold (OOF) class probabilities, one row per training location.
oof_preds = np.zeros((len(train_grouped), Config.num_classes))
# Test-set prediction tensors, one entry per fold.
all_preds = []
for i in range(Config.n_splits):
    print(f"=======================================TRAINING FOLD: {i+1}================================================")
    # Loaders whose validation split is fold i.
    dls = get_datablock(train_grouped, i, Config.batch_size)
    learn = vision_learner(
        dls, Config.model_name,
        loss_func=CrossEntropyLossFlat(),
        metrics=metrics,
        cbs=[SaveModelCallback()]
    )

    # Pick a learning rate with the LR finder (only the `valley` suggestion
    # is used), then fine-tune for Config.epochs epochs.
    _valley, _slide = learn.lr_find(suggest_funcs=(valley, slide))
    learn.fine_tune(Config.epochs, _valley)

    # OOF predictions for this fold's validation rows, with test-time augmentation.
    val_idx = dls.valid.items.index
    val_dl = learn.dls.valid
    val_preds, _ = learn.tta(dl=val_dl, n=Config.tta)

    # Fail loudly rather than silently broadcasting into the wrong rows.
    assert tuple(val_preds.shape) == (len(val_idx), Config.num_classes), (
        f"fold {i}: unexpected validation prediction shape {tuple(val_preds.shape)}"
    )
    # val_idx holds index labels; they are used as row positions here, which
    # relies on train_grouped keeping the RangeIndex from reset_index().
    oof_preds[val_idx] = val_preds.numpy()

    # Test-set predictions from this fold's model.
    test_dl = learn.dls.test_dl(test_grouped)
    preds, _ = learn.tta(dl=test_dl, n=Config.tta)
    all_preds.append(preds)

    # Drop the learner, collect garbage first, and only then release the
    # CUDA cache so memory held by collected objects can actually be freed.
    del learn
    gc.collect()
    torch.cuda.empty_cache()

# Store each location's OOF probability vector and preview the result once.
train_grouped['oof_preds'] = list(oof_preds)
display(train_grouped.head())


In [None]:
from sklearn.metrics import roc_auc_score, accuracy_score, log_loss

# Stack the per-row OOF arrays into one (n_locations, num_classes) matrix.
# Column 1 is taken as the probability of label == 1 — TODO confirm against
# the class order of the DataLoaders vocab.
oof_probs = np.stack(train_grouped['oof_preds'].values)

# Positive-class probability for every location.
train_grouped['flood_probability'] = oof_probs[:, 1]

roc_score = roc_auc_score(train_grouped['label'], train_grouped['flood_probability'])
print("ROC AUC Score:", roc_score)

# Predict class 1 only when its probability is strictly higher than class 0's.
train_grouped['predicted_class'] = (oof_probs[:, 1] > oof_probs[:, 0]).astype(int)

# Stored as `oof_accuracy` so fastai's `accuracy` metric function is not
# overwritten; rebinding it would break a re-run of the metrics/training cells.
oof_accuracy = accuracy_score(train_grouped['label'], train_grouped['predicted_class'])
print("Accuracy Score:", oof_accuracy)

# Log loss over the full per-class probability matrix.
logloss = log_loss(train_grouped['label'], oof_probs)
print("Log Loss:", logloss)


In [None]:
# Preview the per-location training frame.
train_grouped.head()

In [None]:
# Persist the per-location training frame.
# NOTE(review): `oof_preds` holds one NumPy array per row, so it is presumably
# written as its string repr — confirm that downstream readers do not parse it.
train_grouped.to_csv("train_with_cv_results.csv", index=False)

### INFERENCE

In [None]:
# Stack the per-fold test predictions into a single array,
# expected shape: (n_splits, num_test_samples, num_classes).
all_preds_array = np.stack([fold_preds.numpy() for fold_preds in all_preds])

# Average over the fold axis to get one probability vector per test location.
mean_test_preds = np.mean(all_preds_array, axis=0)

# Column 1 is taken as the positive-class (label == 1) probability — adjust if
# the class order differs.
test_grouped['flood_probability'] = mean_test_preds[:, 1]

# Hard prediction: the class with the highest fold-averaged probability.
test_grouped['predicted_class'] = np.argmax(mean_test_preds, axis=1)

display(test_grouped.head())


In [None]:
# Persist the per-location test frame.
test_grouped.to_csv("test_with_cv_results.csv", index=False)