<a href="https://colab.research.google.com/github/maya-halevy/Kaggle-Contrails/blob/main/vgg16_model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import numpy as np
import pandas as pd
import os
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

import os
import numpy as np
import tensorflow as tf
from tensorflow.keras import models, layers, regularizers
from tensorflow.keras import backend as K
from tensorflow.keras.optimizers import Adam
import cv2
from PIL import Image
# from focal_loss import BinaryFocalLoss
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping
# import matplotlib.pyplot as plt
# import seaborn as sns
from tensorflow.keras.callbacks import LearningRateScheduler

## Unet
import os
import numpy as np
from matplotlib import pyplot as plt
from tensorflow.keras.optimizers import Adam
#import tensorflow as tf
from datetime import datetime
import cv2
from PIL import Image

# Set folder path

In [None]:
# Locations of the three dataset splits used by the rest of the notebook.
train_folder, validation_folder, test_folder = (
    '/kaggle/input/google-research-identify-contrails-reduce-global-warming/train',
    '/kaggle/input/google-research-identify-contrails-reduce-global-warming/validation',
    '/kaggle/input/google-research-identify-contrails-reduce-global-warming/test',
)

# Metrics

In [None]:
def dice_coef(y_true, y_pred):
    """Return the smoothed Dice coefficient between ``y_true`` and ``y_pred``.

    Both tensors are flattened, then the score is computed as
    ``(2 * sum(t * p) + 1) / (sum(t) + sum(p) + 1)``. The ``+ 1`` terms keep
    the ratio defined when both inputs sum to zero.
    """
    truth = K.flatten(y_true)
    prediction = K.flatten(y_pred)
    overlap = K.sum(truth * prediction)
    numerator = 2.0 * overlap + 1.0
    denominator = K.sum(truth) + K.sum(prediction) + 1.0
    return numerator / denominator


def jacard_coef(y_true, y_pred):
    """Return the smoothed Jaccard (intersection-over-union) score.

    Both tensors are flattened, then the score is computed as
    ``(sum(t * p) + 1) / (sum(t) + sum(p) - sum(t * p) + 1)``.
    """
    truth = K.flatten(y_true)
    prediction = K.flatten(y_pred)
    overlap = K.sum(truth * prediction)
    union = K.sum(truth) + K.sum(prediction) - overlap
    return (overlap + 1.0) / (union + 1.0)


def jacard_coef_loss(y_true, y_pred):
    """Return the negated ``jacard_coef`` so that minimising it maximises the score."""
    score = jacard_coef(y_true, y_pred)
    return -score


def dice_coef_loss(y_true, y_pred):
    """Return the negated ``dice_coef`` so that minimising it maximises the score."""
    score = dice_coef(y_true, y_pred)
    return -score

# Data load functions

In [None]:
# Function to load a band file
def load_band_file(path):
    """Read the NumPy array stored in the band file at ``path`` and return it."""
    band_array = np.load(path)
    return band_array

# Function to load a mask file
def load_mask_file(path):
    """Read the NumPy array stored in the mask file at ``path`` and return it."""
    mask_array = np.load(path)
    return mask_array

# Function to get all the band file paths for a given folder
def get_band_file_paths(folder_path):
    """Return the sorted paths of all ``band_*.npy`` entries directly inside ``folder_path``.

    Only entries whose name starts with ``band_`` and ends with ``.npy`` are
    kept. Errors raised by ``os.scandir`` (for example when ``folder_path``
    is not a directory) propagate to the caller.
    """
    band_paths = []
    with os.scandir(folder_path) as entries:
        for entry in entries:
            filename = entry.name
            if filename.startswith("band_") and filename.endswith('.npy'):
                band_paths.append(os.path.join(folder_path, filename))
    band_paths.sort()
    return band_paths

# Function to get the mask file path for a given folder
def get_mask_file_path(folder_path):
    """Return the path of ``human_pixel_masks.npy`` inside ``folder_path``.

    The path is only constructed; the file is not checked for existence.
    """
    mask_filename = 'human_pixel_masks.npy'
    return os.path.join(folder_path, mask_filename)


In [None]:
def augment(images, mask):
    """Apply the same random flip and 90-degree rotation to ``images`` and ``mask``.

    One of three flip options is drawn (none, along axis 1, along axis 0),
    followed by a rotation of 0-3 quarter turns in the plane of axes 0 and 1.
    Both draws come from NumPy's global random state, flip first.

    Returns:
        The transformed ``(images, mask)`` pair.
    """
    # Keep the draw order (flip, then rotation) so seeded runs stay reproducible.
    flip_choice = np.random.randint(0, 3)  # 0 = no flip, 1 = axis 1, 2 = axis 0
    quarter_turns = np.random.randint(0, 4)  # number of 90-degree rotations

    flip_axis = {1: 1, 2: 0}.get(flip_choice)
    if flip_axis is not None:
        images = np.flip(images, axis=flip_axis)
        mask = np.flip(mask, axis=flip_axis)

    images, mask = (
        np.rot90(array, quarter_turns, axes=(0, 1)) for array in (images, mask)
    )
    return images, mask


In [None]:
import cv2
import numpy as np
import os

def _min_max_scale(values):
    """Linearly rescale ``values`` so its minimum maps to 0 and its maximum to 1.

    A constant array has zero range; it is divided by 1 instead, which yields
    all zeros rather than NaN/inf.
    """
    low = np.min(values)
    value_range = np.max(values) - low
    return (values - low) / (value_range if value_range > 0 else 1)


def _sobel_edge_channel(band_image):
    """Return the min-max normalised Sobel gradient magnitude of one 2-D band image."""
    normalized_image = _min_max_scale(band_image)
    sobel_x = cv2.Sobel(normalized_image, cv2.CV_64F, 1, 0)
    sobel_y = cv2.Sobel(normalized_image, cv2.CV_64F, 0, 1)
    sobel_mag = np.sqrt(np.square(sobel_x) + np.square(sobel_y))
    return _min_max_scale(sobel_mag)


def data_generator_random_preprocess(folder_path, batch_size):
    """Endlessly yield ``(images, masks)`` batches of Sobel-filtered band images.

    On every pass the entries of ``folder_path`` are listed and shuffled with
    NumPy's global random state, then consumed ``batch_size`` entries at a
    time. For each entry the band files whose names end in ``08.npy``,
    ``12.npy`` or ``16.npy`` are loaded, index 4 along the last axis is taken
    (the "5th image"), and each slice is turned into a normalised Sobel
    magnitude map; the maps are stacked on a new last axis.

    Entries are skipped when they are not directories, contain none of the
    selected band files, or have no mask file, so images and masks always
    stay paired. Batches that end up empty are not yielded.

    Args:
        folder_path: Directory containing one sub-directory per sample.
        batch_size: Maximum number of samples per yielded batch.

    Yields:
        Tuple ``(images, masks)`` of arrays stacked along a new first axis;
        masks are cast to ``float32``.

    Raises:
        ValueError: If a full pass over ``folder_path`` produces no batch
            (instead of spinning forever without yielding).
    """
    band_suffixes = ("08.npy", "12.npy", "16.npy")
    while True:
        subfolders = os.listdir(folder_path)
        np.random.shuffle(subfolders)
        produced_batch = False
        for i in range(0, len(subfolders), batch_size):
            batch_images = []
            batch_masks = []
            for subfolder in subfolders[i:i + batch_size]:
                subfolder_path = os.path.join(folder_path, subfolder)
                try:
                    band_file_paths = get_band_file_paths(subfolder_path)
                except NotADirectoryError:
                    # Stray plain files next to the sample folders are ignored.
                    continue
                band_file_paths = [path for path in band_file_paths if path.endswith(band_suffixes)]
                mask_file_path = get_mask_file_path(subfolder_path)
                # Require both inputs before loading anything so that no
                # image is ever paired with a missing or stale mask.
                if not band_file_paths or not os.path.exists(mask_file_path):
                    continue

                processed_band_images = [
                    _sobel_edge_channel(load_band_file(path)[..., 4])
                    for path in band_file_paths
                ]
                batch_images.append(np.stack(processed_band_images, axis=-1))
                batch_masks.append(load_mask_file(mask_file_path).astype(np.float32))

            if not batch_images:
                # np.stack rejects empty lists; nothing usable in this slice.
                continue
            produced_batch = True
            yield np.stack(batch_images, axis=0), np.stack(batch_masks, axis=0)

        if not produced_batch:
            raise ValueError(
                "No sample with band files and a mask found in {!r}".format(folder_path)
            )


In [None]:
def data_generator_random_augmented(folder_path, batch_size):
    """Endlessly yield augmented ``(images, masks)`` training batches.

    On every pass the entries of ``folder_path`` are listed and shuffled with
    NumPy's global random state, then consumed ``batch_size`` entries at a
    time. For each entry the band files whose names end in ``08.npy``,
    ``12.npy`` or ``16.npy`` are loaded, index 4 along the last axis is taken
    (the "5th image"), the slices are stacked on a new last axis and
    standardised with the mean and standard deviation of the whole stack.
    The stack and its mask are then passed through ``augment`` together.

    Entries are skipped when they are not directories, contain none of the
    selected band files, or have no mask file, so images and masks always
    stay paired. Batches that end up empty are not yielded.

    Args:
        folder_path: Directory containing one sub-directory per sample.
        batch_size: Maximum number of samples per yielded batch.

    Yields:
        Tuple ``(images, masks)`` of arrays stacked along a new first axis;
        masks are cast to ``float32`` before augmentation.

    Raises:
        ValueError: If a full pass over ``folder_path`` produces no batch
            (instead of spinning forever without yielding).
    """
    band_suffixes = ("08.npy", "12.npy", "16.npy")
    while True:
        subfolders = os.listdir(folder_path)
        np.random.shuffle(subfolders)
        produced_batch = False
        for i in range(0, len(subfolders), batch_size):
            batch_images = []
            batch_masks = []
            for subfolder in subfolders[i:i + batch_size]:
                subfolder_path = os.path.join(folder_path, subfolder)
                try:
                    band_file_paths = get_band_file_paths(subfolder_path)
                except NotADirectoryError:
                    # Stray plain files next to the sample folders are ignored.
                    continue
                band_file_paths = [path for path in band_file_paths if path.endswith(band_suffixes)]
                mask_file_path = get_mask_file_path(subfolder_path)
                # Require both inputs before loading anything so that no
                # image is ever paired with a missing or stale mask.
                if not band_file_paths or not os.path.exists(mask_file_path):
                    continue

                band_images = np.stack(
                    [load_band_file(path)[..., 4] for path in band_file_paths],
                    axis=-1,
                )
                # A constant stack has zero spread; divide by 1 so it becomes
                # all zeros instead of NaN.
                spread = np.std(band_images)
                band_images = (band_images - np.mean(band_images)) / (spread if spread > 0 else 1)

                mask = load_mask_file(mask_file_path).astype(np.float32)

                # Same random flip/rotation for the image stack and its mask.
                band_images, mask = augment(band_images, mask)

                batch_images.append(band_images)
                batch_masks.append(mask)

            if not batch_images:
                # np.stack rejects empty lists; nothing usable in this slice.
                continue
            produced_batch = True
            yield np.stack(batch_images, axis=0), np.stack(batch_masks, axis=0)

        if not produced_batch:
            raise ValueError(
                "No sample with band files and a mask found in {!r}".format(folder_path)
            )


In [None]:
def data_generator_random(folder_path, batch_size):
    """Endlessly yield un-augmented ``(images, masks)`` batches.

    On every pass the entries of ``folder_path`` are listed and shuffled with
    NumPy's global random state, then consumed ``batch_size`` entries at a
    time. For each entry the band files whose names end in ``08.npy``,
    ``12.npy`` or ``16.npy`` are loaded, index 4 along the last axis is taken
    (the "5th image"), the slices are stacked on a new last axis and
    standardised with the mean and standard deviation of the whole stack.

    Entries are skipped when they are not directories, contain none of the
    selected band files, or have no mask file. An image is only added
    together with its mask, so both yielded arrays always have the same
    length. Batches that end up empty are not yielded.

    Args:
        folder_path: Directory containing one sub-directory per sample.
        batch_size: Maximum number of samples per yielded batch.

    Yields:
        Tuple ``(images, masks)`` of arrays stacked along a new first axis;
        masks are cast to ``float32``.

    Raises:
        ValueError: If a full pass over ``folder_path`` produces no batch
            (instead of spinning forever without yielding).
    """
    band_suffixes = ("08.npy", "12.npy", "16.npy")
    while True:
        subfolders = os.listdir(folder_path)
        np.random.shuffle(subfolders)
        produced_batch = False
        for i in range(0, len(subfolders), batch_size):
            batch_images = []
            batch_masks = []
            for subfolder in subfolders[i:i + batch_size]:
                subfolder_path = os.path.join(folder_path, subfolder)
                try:
                    band_file_paths = get_band_file_paths(subfolder_path)
                except NotADirectoryError:
                    # Stray plain files next to the sample folders are ignored.
                    continue
                band_file_paths = [path for path in band_file_paths if path.endswith(band_suffixes)]
                mask_file_path = get_mask_file_path(subfolder_path)
                # Require both inputs before loading anything; appending an
                # image without its mask would desynchronise the two lists.
                if not band_file_paths or not os.path.exists(mask_file_path):
                    continue

                band_images = np.stack(
                    [load_band_file(path)[..., 4] for path in band_file_paths],
                    axis=-1,
                )
                # A constant stack has zero spread; divide by 1 so it becomes
                # all zeros instead of NaN.
                spread = np.std(band_images)
                band_images = (band_images - np.mean(band_images)) / (spread if spread > 0 else 1)

                batch_images.append(band_images)
                batch_masks.append(load_mask_file(mask_file_path).astype(np.float32))

            if not batch_images:
                # np.stack rejects empty lists; nothing usable in this slice.
                continue
            produced_batch = True
            yield np.stack(batch_images, axis=0), np.stack(batch_masks, axis=0)

        if not produced_batch:
            raise ValueError(
                "No sample with band files and a mask found in {!r}".format(folder_path)
            )


# Model set up

In [None]:
class CustomModelCheckpoint(tf.keras.callbacks.Callback):
    """Callback that saves the model at the end of an epoch under a loss-stamped name.

    ``filepath`` is a ``str.format`` template that receives ``epoch``
    (1-based), ``loss`` and ``val_loss``. Nothing is saved for an epoch whose
    logs lack either loss value.
    """

    def __init__(self, filepath, **kwargs):
        super().__init__(**kwargs)
        self.filepath = filepath

    def on_epoch_end(self, epoch, logs=None):
        """Save the model if both the training and validation loss were logged."""
        metrics = logs if logs else {}
        train_loss = metrics.get('loss')
        val_loss = metrics.get('val_loss')
        if train_loss is None or val_loss is None:
            return
        # `epoch` is 0-based here; the file name uses 1-based numbering.
        target_path = self.filepath.format(epoch=epoch + 1, loss=train_loss, val_loss=val_loss)
        self.model.save(target_path, overwrite=True)

In [None]:
# Checkpoint callback: the file name template is filled with the epoch number,
# the training loss and the validation loss.
checkpoint_cb = CustomModelCheckpoint(
    filepath='/kaggle/working/contrails_saved_models/test_att_resunet_{epoch:03d}_{loss:.3f}_{val_loss:.3f}.h5',
)


# Early-stopping callback: watches `val_loss` with a patience of 3 epochs and
# restores the best weights when it stops training.
early_stopping_cb = EarlyStopping(
    monitor='val_loss',
    patience=3,
    restore_best_weights=True,
)

In [None]:
# Model input geometry (height, width, channels).
IMG_HEIGHT, IMG_WIDTH, IMG_CHANNELS = 256, 256, 3
input_shape = (IMG_HEIGHT, IMG_WIDTH, IMG_CHANNELS)

num_labels = 1  # Binary
batch_size = 20  # samples per batch requested from the data generators

# Create data generators

In [None]:
# Training batches come from the augmenting generator; validation batches
# come from the plain one.
train_generator = data_generator_random_augmented(
    folder_path=train_folder,
    batch_size=batch_size,
)
validation_generator = data_generator_random(
    folder_path=validation_folder,
    batch_size=batch_size,
)

# Alternative input pipeline using the Sobel-preprocessing generator:
# train_generator = data_generator_random_preprocess(train_folder, batch_size=batch_size)
# validation_generator = data_generator_random_preprocess(validation_folder, batch_size=batch_size)

# Create model

In [None]:
!pip install -U -q segmentation-models

import os
# Set before `segmentation_models` is imported below; per that library's
# documentation this selects its tf.keras backend — keep this ordering.
os.environ["SM_FRAMEWORK"] = "tf.keras"
import tensorflow as tf
# NOTE(review): forces tf.function code to run eagerly, which is usually
# slower than graph execution — confirm this is still required.
tf.config.run_functions_eagerly(True)
from tensorflow import keras

import segmentation_models as sm


In [None]:
# Encoder architecture used for the segmentation model.
BACKBONE = 'vgg16'
# Preprocessing function matching the chosen backbone.
# NOTE(review): `preprocess_input` is not applied anywhere in the visible
# notebook code (the generators do their own normalisation) — confirm intent.
preprocess_input = sm.get_preprocessing(BACKBONE)

In [None]:
# U-Net with the selected backbone as encoder, initialised from ImageNet weights.
model = sm.Unet(
    backbone_name=BACKBONE,
    input_shape=input_shape,
    encoder_weights='imagenet',
)

In [None]:
# Adam with learning rate 1e-3; the negated Dice coefficient is the loss, and
# accuracy plus the Jaccard coefficient are tracked as metrics.
optimizer_adam = Adam(learning_rate=1e-3)
model.compile(
    optimizer=optimizer_adam,
    loss=dice_coef_loss,
    metrics=['accuracy', jacard_coef],
)

# Run Model

In [None]:
# The generators already yield complete batches, so `batch_size` is not
# passed (Keras documents that it must not be specified for generator input)
# and `shuffle` is omitted because it is ignored for generators.
# Both generators loop forever, so the number of steps per epoch and per
# validation run is given explicitly.
history = model.fit(
    train_generator,
    validation_data=validation_generator,
    verbose=1,
    epochs=6,
    steps_per_epoch=1000,
    validation_steps=90,
    callbacks=[checkpoint_cb, early_stopping_cb],
)

# Display Results

In [None]:
import matplotlib.pyplot as plt

# Create a function to visualize the masks
def visualize(**images):
    """Show the given images side by side in one row, one subplot per keyword.

    Each keyword name becomes the subplot title, with underscores replaced by
    spaces and the result title-cased. Axis ticks are hidden.
    """
    panel_count = len(images)
    plt.figure(figsize=(16, 5))
    for position, name in enumerate(images, start=1):
        plt.subplot(1, panel_count, position)
        plt.xticks([])
        plt.yticks([])
        plt.title(name.replace('_', ' ').title())
        plt.imshow(images[name])
    plt.show()

from itertools import islice

# Visualise the first 5 batches from the validation generator. The generator
# never ends, so islice bounds the loop and pulls exactly 5 batches (no extra
# batch is loaded just to hit a break condition).
for images, true_masks in islice(validation_generator, 5):
    # Predict the masks
    pred_masks = model.predict(images)

    # Threshold the model output at 0.5 to obtain binary masks
    pred_masks_t = (pred_masks > 0.5).astype(np.uint8)

    # Display the images, true masks, and predicted masks
    for image, true_mask, pred_mask in zip(images, true_masks, pred_masks_t):
        visualize(Image=image, True_Mask=true_mask[..., 0], Predicted_Mask=pred_mask[..., 0])


# Dataset EDA

In [None]:
# Locations of the three dataset splits used by the EDA cells below.
train_folder, validation_folder, test_folder = (
    '/kaggle/input/google-research-identify-contrails-reduce-global-warming/train',
    '/kaggle/input/google-research-identify-contrails-reduce-global-warming/validation',
    '/kaggle/input/google-research-identify-contrails-reduce-global-warming/test',
)

In [None]:
import os
import numpy as np

def count_ones_in_masks(folder_path, threshold=10):
    """Return the fraction of masks in ``folder_path`` that sum to less than ``threshold``.

    Every entry of ``folder_path`` is checked for a ``human_pixel_masks.npy``
    file; entries without one (including plain files) are ignored. For a
    binary mask the sum equals the number of 1-pixels.

    Args:
        folder_path: Directory containing one sub-directory per sample.
        threshold: A mask counts as sparse when its sum is strictly below
            this value. Defaults to 10.

    Returns:
        ``sparse / total`` over all masks found, or ``0`` when no mask exists.
    """
    sparse_masks = 0
    total_masks = 0
    for subfolder in os.listdir(folder_path):
        mask_file_path = os.path.join(folder_path, subfolder, 'human_pixel_masks.npy')
        # os.path.exists simply returns False for plain files and for folders
        # without a mask, so no exception handling is needed here.
        if not os.path.exists(mask_file_path):
            continue
        total_masks += 1
        if np.sum(np.load(mask_file_path)) < threshold:
            sparse_masks += 1

    return sparse_masks / total_masks if total_masks else 0
# Report the share of validation samples whose mask has fewer than ten 1s.
validation_folder = '/kaggle/input/google-research-identify-contrails-reduce-global-warming/validation'
validation_sparse_share = count_ones_in_masks(validation_folder)
print("Proportion of folders with less than ten 1s in the mask files:", validation_sparse_share)

In [None]:
# Report the share of training samples whose mask has fewer than ten 1s.
train_folder = '/kaggle/input/google-research-identify-contrails-reduce-global-warming/train'
train_sparse_share = count_ones_in_masks(train_folder)
print("Proportion of folders with less than ten 1s in the mask files:", train_sparse_share)

In [None]:
! find /kaggle/input/google-research-identify-contrails-reduce-global-warming/validation -mindepth 1 -type d | wc -l

In [None]:
! find /kaggle/input/google-research-identify-contrails-reduce-global-warming/train -mindepth 1 -type d | wc -l

In [None]:
! ls /kaggle/input/google-research-identify-contrails-reduce-global-warming/validation/6406117761842360513

In [None]:
! ls /kaggle/input/google-research-identify-contrails-reduce-global-warming/train/6547735628981251995