# **Library**

In [None]:
# Mount Google Drive so the notebook can read and write files under /content/drive
from google.colab import drive
drive.mount('/content/drive')

In [None]:
!pip install netCDF4

In [None]:
import os
import pandas as pd
import requests
import tensorflow as tf
from bs4 import BeautifulSoup
import tarfile
import numpy as np
import netCDF4
import matplotlib.pyplot as plt
import random
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

In [None]:
from contextlib import nullcontext
import gc
from keras import models, layers, metrics
from keras.callbacks import EarlyStopping
from tensorflow.keras.models import load_model, Model
from tensorflow.keras.applications.vgg16 import VGG16
from tensorflow.keras.applications.inception_v3 import InceptionV3
from tensorflow.keras.layers import Conv2D, Dropout, Dense, Flatten, MaxPooling2D
from keras.preprocessing import image
from keras.models import Sequential
from keras import losses
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
from sklearn.preprocessing import LabelBinarizer

# **Utilities Function**

This file downloads the satellite images which will then be processed

Outline of this file:
- Loops through each year and downloads .tar.gz files containing satellite images for Atlantic and Pacific hurricanes
- Only extracts files from the .tar.gz files that contain images of hurricanes that we know the wind speed of
- When script is finished, the Satellite Imagery folder contains all netcdf files, which hold the satellite images

Purpose: This builds and trains the neural network with the data processed. This file also validates the model, telling the user how accurate it is.

Outline of this file:
- Reads and augments images of hurricanes and their labels for use in convolutional neural network (CNN)
- Builds a CNN and trains it on those images and labels
- Validates the model using k-fold validation
- Prints MAE and saves two graphs that show additional details about the model's error

Purpose of this file: This file builds and trains the neural network with the data processed by assemble.py. This file also validates the model, telling the user how accurate it is.

Outline of this file:
- Reads and augments images of hurricanes and their labels for use in convolutional neural network (CNN)
- Builds a CNN and trains it on those images and labels
- Validates the model using k-fold validation
- Prints MAE and saves two graphs that show additional details about the model's error

In [None]:
def print_progress(action, progress, total):
    """Rewrite the current console line with a progress message.

    The text has the form ``<action>... <percent>% (<progress> of <total>)``.
    It starts with a carriage return and is printed without a trailing
    newline, so consecutive calls overwrite each other.

    Args:
        action: Description of the running task (a string).
        progress: Number of items handled so far.
        total: Total number of items.
    """
    percent_progress = round((progress / total) * 100, 1)
    pieces = [
        '\r', action, '... ',
        str(percent_progress), '% (',
        str(progress), ' of ', str(total), ')',
    ]
    print(''.join(pieces), end = '')

In [None]:
def read_and_prepare_data(validation_mode, k = 5, augment = True):
    """Load the saved images/labels and build the train/test sets for k-fold validation.

    The arrays are read from the notebook-level ``image_path`` and
    ``label_path``, cut into ``k`` contiguous folds (no shuffling) and, for
    every fold ``i``, combined into a training set made of all other folds and
    a test set made of fold ``i``. When ``augment`` is true, the samples that
    ``augment_images`` produces for the training folds are appended to the
    training set; the held-out fold is never augmented. Every train/test pair
    is passed through ``standardize_data``.

    NOTE(review): this notebook defines ``read_and_prepare_data`` in several
    cells; the definition executed last is the one in effect.

    Args:
        validation_mode: Only ``'k_fold'`` is supported.
        k: Number of folds (at least 2).
        augment: Whether to add augmented images to the training sets.

    Returns:
        ``(train_images, train_labels, test_images, test_labels)``: four lists
        of length ``k``; entry ``i`` belongs to the iteration that holds out
        fold ``i``.

    Raises:
        ValueError: If ``validation_mode`` is not ``'k_fold'`` or ``k < 2``.
            (Previously an unknown mode silently returned ``None``.)
    """
    if validation_mode != 'k_fold':
        raise ValueError("Unsupported validation_mode: " + repr(validation_mode) + " (expected 'k_fold')")
    if k < 2:
        raise ValueError('k must be at least 2, got ' + repr(k))

    # Read in data from files
    images = np.load(image_path)
    labels = np.load(label_path)

    # Split the image and label datasets into k contiguous subsets
    bounds = [int((i / k) * len(images)) for i in range(k + 1)]
    folded_images = [images[bounds[i]:bounds[i + 1]] for i in range(k)]
    folded_labels = [labels[bounds[i]:bounds[i + 1]] for i in range(k)]

    # Generate augmented images for each fold
    folded_augmented_images = []
    folded_augmented_labels = []
    if augment:
        for i in range(k):
            print('\nAugmenting Fold ' + str(i + 1) + ' of ' + str(k))
            augmented_images, augmented_labels = augment_images(folded_images[i], folded_labels[i])
            folded_augmented_images.append(augmented_images)
            folded_augmented_labels.append(augmented_labels)

    # Combine the folds into sets for each iteration of the model and standardize the data
    train_images = []
    train_labels = []
    test_images = []
    test_labels = []
    for i in range(k):
        image_parts = folded_images[:i] + folded_images[(i + 1):]
        label_parts = folded_labels[:i] + folded_labels[(i + 1):]
        if augment:
            for j in range(k):
                # Skip the held-out fold, and folds that produced no augmented
                # samples: an empty result may not have the rank np.concatenate needs.
                if j != i and len(folded_augmented_labels[j]) > 0:
                    image_parts.append(folded_augmented_images[j])
                    label_parts.append(folded_augmented_labels[j])
        fold_train_images, fold_test_images = standardize_data(np.concatenate(image_parts), folded_images[i])
        train_images.append(fold_train_images)
        train_labels.append(np.concatenate(label_parts))
        test_images.append(fold_test_images)
        test_labels.append(folded_labels[i])

    return train_images, train_labels, test_images, test_labels

In [None]:
def augment_images(images, labels):
    """Create extra samples for the stronger storms in a set of images.

    Each image is augmented according to its label (wind speed in knots):

    - ``50 < label < 75``: 2 randomly flipped copies
    - ``75 <= label < 100``: 6 randomly rotated copies
    - ``label >= 100``: 12 randomly rotated copies
    - anything else: no copies

    The original conditions were ``75 < label < 100``, so a label of exactly
    75 knots matched no branch and was never augmented; that gap is closed
    here. The lower bound of 50 is left exclusive as in the original.

    Args:
        images: Indexable collection of images; the first two dimensions of
            each image are used as height and width.
        labels: Wind speeds aligned with ``images``.

    Returns:
        ``(augmented_images, augmented_labels)`` as NumPy arrays holding only
        the newly generated samples, each image shaped ``(height, width, 1)``.
        When nothing was generated, the image array is empty but keeps four
        dimensions so it can still be concatenated with image batches.
    """
    # Imported locally so the function does not depend on the notebook-level
    # name `image`, which other cells may rebind.
    from keras.preprocessing.image import ImageDataGenerator
    flip_generator = ImageDataGenerator(horizontal_flip = True, vertical_flip = True)
    rotate_generator = ImageDataGenerator(rotation_range = 360, fill_mode = 'nearest')

    # Accumulate augmented images and labels
    augmented_images = []
    augmented_labels = []

    for i in range(len(images)):
        # Reshape image into a batch of one for the generator
        sample = np.reshape(images[i], (1, images[i].shape[0], images[i].shape[1], 1))
        label = labels[i]

        # Pick the generator and the number of copies from the wind speed
        if 50 < label < 75:
            generator, num_new_images = flip_generator, 2
        elif 75 <= label < 100:
            generator, num_new_images = rotate_generator, 6
        elif label >= 100:
            generator, num_new_images = rotate_generator, 12
        else:
            generator, num_new_images = None, 0

        if generator is not None:
            # flow() yields batches endlessly, so stop after the wanted number
            for count, batch in enumerate(generator.flow(sample, batch_size=1), start=1):
                # Kept from the original; presumably bounds memory use during
                # long augmentation runs - TODO confirm it is still needed.
                gc.collect()
                augmented_images.append(np.reshape(batch[0], (batch[0].shape[0], batch[0].shape[1], 1)))
                augmented_labels.append(label)
                if count == num_new_images:
                    break

        print_progress('Augmenting Images', i + 1, len(images))

    if not augmented_images:
        source = np.asarray(images)
        if source.ndim >= 3:
            # Keep the rank of a real batch so callers can concatenate the result
            empty_images = np.empty((0, source.shape[1], source.shape[2], 1), dtype = source.dtype)
            empty_labels = np.empty((0,), dtype = np.asarray(labels).dtype)
            return empty_images, empty_labels

    # Convert lists of images/labels into numpy arrays
    return np.array(augmented_images), np.array(augmented_labels)

In [None]:
def build_model():
    """Assemble and compile the CNN that regresses wind speed from an image.

    The network takes ``(50, 50, 1)`` inputs and stacks four blocks of
    3x3 convolution (32, 64, 128 and 128 filters, ReLU), 2x2 max pooling and
    20% dropout, followed by flatten, 40% dropout, a 512-unit ReLU dense
    layer, 30% dropout and a single linear output unit. The layer summary is
    printed, and the model is compiled with the RMSprop optimizer, MSE loss,
    and MAE / RMSE metrics.

    Returns:
        The compiled Keras ``Sequential`` model.
    """
    model = models.Sequential()

    # Convolutional feature extractor: one block per filter count
    for block_index, num_filters in enumerate((32, 64, 128, 128)):
        if block_index == 0:
            # Only the first layer declares the input shape
            convolution = layers.Conv2D(num_filters, (3, 3), activation = 'relu', input_shape = (50, 50, 1))
        else:
            convolution = layers.Conv2D(num_filters, (3, 3), activation = 'relu')
        model.add(convolution)
        model.add(layers.MaxPooling2D((2, 2)))
        model.add(layers.Dropout(0.2))

    # Regression head
    regression_head = (
        layers.Flatten(),
        layers.Dropout(0.4),
        layers.Dense(512, activation = 'relu'),
        layers.Dropout(0.3),
        layers.Dense(1, activation = None),
    )
    for head_layer in regression_head:
        model.add(head_layer)

    model.summary()

    # Configure model optimization
    tracked_metrics = [metrics.MeanAbsoluteError(), metrics.RootMeanSquaredError()]
    model.compile(optimizer = 'rmsprop', loss = 'mse', metrics = tracked_metrics)
    return model

In [None]:
# Build the network once here; build_model() prints the layer summary, and the returned model is not assigned to a variable
build_model()

In [None]:
def train_model(model, train_images, train_labels, test_images, test_labels, show_performance_by_epoch=False):
    """Fit ``model`` on the training data and return it.

    Training runs for at most 100 epochs with a batch size of 64 and uses
    ``(test_images, test_labels)`` as validation data. Early stopping watches
    ``val_mean_absolute_error`` with a patience of 5 epochs and restores the
    best weights.

    Args:
        model: Compiled Keras model to train.
        train_images, train_labels: Training samples and their targets.
        test_images, test_labels: Validation samples and their targets.
        show_performance_by_epoch: When true, the per-epoch history is passed
            to ``performance_by_epoch`` after training.

    Returns:
        The same ``model`` object after fitting.
    """
    early_stopping = EarlyStopping(
        monitor = 'val_mean_absolute_error',
        patience = 5,
        restore_best_weights = True,
    )
    fit_options = {
        'epochs': 100,
        'batch_size': 64,
        'validation_data': (test_images, test_labels),
        'callbacks': [early_stopping],
    }

    # Run model and get metrics for each epoch
    performance_log = model.fit(train_images, train_labels, **fit_options)

    if not show_performance_by_epoch:
        return model

    performance_by_epoch(performance_log)
    return model

In [None]:
def performance_by_epoch(performance_log):
    """Plot training/validation loss and MAE for every epoch of a training run.

    The history is reshaped into a long-format DataFrame with one row per
    (epoch, train/test, loss/mae) combination and drawn with
    ``seaborn.lmplot`` using a logarithmic fit: one facet row for the loss and
    one for the MAE.

    The table is built from a list of records in one go. The original grew it
    with ``DataFrame.append``, which was removed in pandas 2.0 and is
    quadratic in the number of rows.

    Args:
        performance_log: Object returned by ``model.fit``; its ``history``
            dict must contain ``loss``, ``val_loss``, ``mean_absolute_error``
            and ``val_mean_absolute_error``.
    """
    # Get metrics for each epoch after model finishes training
    history = performance_log.history
    metric_series = [
        ('train', 'loss', history['loss']),
        ('test', 'loss', history['val_loss']),
        ('train', 'mae', history['mean_absolute_error']),
        ('test', 'mae', history['val_mean_absolute_error']),
    ]
    num_epochs = len(history['loss'])

    # Build a dataframe storing epoch metrics (epochs are numbered from 1)
    records = [
        {'epoch': epoch_index + 1, 'train_or_test': split, 'loss_or_mae': metric, 'value': values[epoch_index]}
        for epoch_index in range(num_epochs)
        for split, metric, values in metric_series
    ]
    performance_df = pd.DataFrame(records, columns = ['epoch', 'train_or_test', 'loss_or_mae', 'value'])
    performance_df = performance_df.astype({'epoch': np.int64})

    # Plot metrics on graph, fitted with exponential decay curves
    # Note: If epoch = 1, this line causes an error. Make sure epoch >= 2
    lm = sns.lmplot(x = 'epoch', y = 'value', data = performance_df, row = 'loss_or_mae', logx = True, truncate = False, sharey = False, hue = 'train_or_test')
    axes = lm.axes

    # Give the MAE facet (second row) 20% headroom around the observed range
    mae_values = performance_df.loc[performance_df.loss_or_mae == 'mae', 'value']
    max_mae = mae_values.max()
    min_mae = mae_values.min()
    axes[1, 0].set_ylim(min_mae - min_mae * 0.2, max_mae + max_mae * 0.2)
    plt.show()

In [None]:
def generate_predictions(model, test_images, test_labels):
    """Run the model on the test images and tabulate the error of every prediction.

    The rows are collected in a list and turned into a DataFrame once. The
    original appended row by row with ``DataFrame.append``, which was removed
    in pandas 2.0.

    Args:
        model: Object with a ``predict`` method (the trained network).
        test_images: Inputs passed to ``model.predict``.
        test_labels: True wind speeds aligned with ``test_images``.

    Returns:
        DataFrame with the columns ``prediction``, ``actual``, ``abs_error``,
        ``category`` (from ``category_of`` applied to the true wind speed) and
        ``abs_error_squared``, one row per test sample.
    """
    # Run validation data through model and flatten the outputs into one value per sample
    raw_predictions = model.predict(test_images).flatten()
    num_predictions = len(raw_predictions)

    # Collect one record per prediction made by the model
    records = []
    for i in range(num_predictions):
        abs_error = abs(raw_predictions[i] - test_labels[i])
        records.append({
            'prediction': raw_predictions[i],
            'actual': test_labels[i],
            'abs_error': abs_error,
            'abs_error_squared': abs_error ** 2,
            'category': category_of(test_labels[i]),
        })
        print_progress('Processing Predictions', i + 1, num_predictions)

    # Same column order the row-wise append used to produce
    columns = ['prediction', 'actual', 'abs_error', 'category', 'abs_error_squared']
    return pd.DataFrame(records, columns = columns)

In [None]:
def show_validation_results(predictions, show_plots = True, print_error = True):
    """Print the overall error of the predictions and plot the error per storm category.

    Changes from the original:

    - The bar chart is displayed with ``plt.show()``. The original called
      ``plt.plot()`` (a no-op) followed by ``plt.clf()``, which cleared the
      chart before it could be rendered.
    - The density plot uses ``sns.kdeplot(..., fill=True)`` instead of the
      deprecated ``sns.distplot``, and is drawn on its own figure.
    - The message claiming the graph was saved as a PNG was removed, because
      no file is written.
    - RMSE is computed from ``abs_error`` directly, so the
      ``abs_error_squared`` column is not required.

    Args:
        predictions: DataFrame with at least the columns ``abs_error`` and
            ``category``.
        show_plots: Whether to draw the two plots.
        print_error: Whether to print MAE and RMSE.
    """
    print('\n\nRESULTS')

    if print_error:
        abs_errors = predictions['abs_error'].astype(float)
        mae = abs_errors.mean()
        print('\nMean Absolute Error: ' + str(round(float(mae), 2)) + ' knots')
        rmse = (abs_errors ** 2).mean() ** 0.5
        print('Root Mean Square Error: ' + str(round(float(rmse), 2)) + ' knots')

    if show_plots:
        # List of categories in order of ascending strength
        categories = ['T. Depression', 'T. Storm', 'Category 1', 'Category 2', 'Category 3', 'Category 4', 'Category 5']

        # Show bar graph of median absolute error for each category
        plt.figure(figsize = (8, 8), dpi = 300)
        sns.barplot(x = 'category', y = 'abs_error', data = predictions, estimator = np.median, order = categories)
        sns.despine()
        plt.xlabel("Hurricane Strength")
        plt.ylabel("Absolute Error")
        plt.title("Median Absolute Error in Neural Network's Predictions By Category")
        plt.tight_layout()
        plt.show()

        # Show density plot of error for each category
        plt.figure()
        for category in categories:
            category_errors = predictions.loc[predictions.category == category, 'abs_error'].astype(float)
            num_samples_tested = len(category_errors)
            if num_samples_tested < 2:
                # A density estimate needs at least two samples
                continue
            sns.kdeplot(x = category_errors, label = category + ' (' + str(num_samples_tested) + ' samples tested)', fill = True)
        sns.despine()
        plt.xlabel("Absolute Error")
        plt.title("Distribution of Absolute Error By Category")
        plt.legend()
        plt.xlim(0, None)
        plt.ylim(0, None)
        plt.show()

In [None]:
def standardize_data(train_images, test_images):
    """Clamp negative pixels to zero and standardize both sets with the training statistics.

    The mean and standard deviation are computed on the clamped training
    images only and applied to both sets, so no information from the test set
    enters the scaling.

    Unlike the original, the input arrays are not modified in place, and a
    zero standard deviation (constant training data) no longer yields NaN or
    infinite values: the data is then only centred.

    Args:
        train_images: Array of training images.
        test_images: Array of test images.

    Returns:
        ``(train_images, test_images)`` as new, standardized arrays.
    """
    # np.maximum returns new arrays, leaving the caller's data untouched
    train_images = np.maximum(train_images, 0)
    test_images = np.maximum(test_images, 0)

    mean = np.mean(train_images)
    st_dev = np.std(train_images)
    if st_dev == 0:
        st_dev = 1.0

    # Rebinding step by step lets each intermediate array be released early
    train_images = np.subtract(train_images, mean)
    train_images = np.divide(train_images, st_dev)
    test_images = np.subtract(test_images, mean)
    test_images = np.divide(test_images, st_dev)
    return train_images, test_images

In [None]:
def category_of(wind_speed):
    """Return the storm category for a maximum sustained wind speed in knots.

    The boundaries follow the Saffir-Simpson Hurricane Wind Scale expressed
    in knots:

    ===============  ==============
    Label            Wind speed
    ===============  ==============
    T. Depression    below 34
    T. Storm         34 - 63
    Category 1       64 - 82
    Category 2       83 - 95
    Category 3       96 - 112
    Category 4       113 - 136
    Category 5       137 and above
    ===============  ==============

    The original thresholds (``<= 64``, ``<= 83``, ``<= 113``, ``<= 134``)
    put 64, 83 and 113 knots one category too low and 135-136 knots in
    Category 5.

    Args:
        wind_speed: Wind speed in knots.

    Returns:
        One of the labels listed above.
    """
    # (lowest wind speed of the NEXT category, label of this category)
    category_limits = (
        (34, 'T. Depression'),
        (64, 'T. Storm'),
        (83, 'Category 1'),
        (96, 'Category 2'),
        (113, 'Category 3'),
        (137, 'Category 4'),
    )
    for next_category_start, label in category_limits:
        if wind_speed < next_category_start:
            return label
    return 'Category 5'

In [None]:
def download_hursat(years, path):
    """Download HURSAT-B1 archives and keep the images that have a best-track record.

    For every year, the NOAA archive index page is scraped for ``.tar.gz``
    links (one archive per storm). Archives whose storm name and year appear
    in the best-track CSV are downloaded. From each archive, the NetCDF files
    are written to the ``Satellite Imagery`` folder when their date and time
    match a best-track record, their satellite name starts with ``GOE`` and
    no other image with the same storm/date/time prefix was extracted before.
    The downloaded archive is deleted afterwards.

    Changes from the original: files and archives are closed via ``with``,
    HTTP requests use a timeout and fail loudly on error responses, links
    without ``href`` are ignored, archive members are written under their base
    name only (no paths taken from the archive), and the loop variable
    ``year`` is no longer overwritten inside the loop.

    Args:
        years: Years to download (strings such as ``'2005'``; integers are
            accepted as well).
        path: Path of the best-track CSV. Its ``year``, ``storm_name``,
            ``fulldate`` and ``time`` columns are read.

    Raises:
        requests.HTTPError: If an index page or archive returns an error status.
        requests.Timeout: If the server does not respond in time.
    """
    output_directory = 'Satellite Imagery'
    base_url = 'https://www.ncei.noaa.gov/data/hurricane-satellite-hursat-b1/archive/v06/'

    # Reads in the Best Track dataset, which contain records of the location and maximum wind speed of every recorded hurricane in the Atlantic and Eastern/Central Pacific basins
    best_track_data = pd.read_csv(path)

    for year in years:
        year_label = str(year)

        # Scrapes a webpage to get list of all .tar.gz files. Each file contains all the satellite images associated with a particular hurricane.
        year_directory_url = base_url + year_label
        year_directory_response = requests.get(year_directory_url, timeout = 60)
        year_directory_response.raise_for_status()
        year_directory_soup = BeautifulSoup(year_directory_response.text, 'html.parser')
        link_targets = [node.get('href') for node in year_directory_soup.find_all('a')]
        year_directory_file_urls = [
            year_directory_url + '/' + href
            for href in link_targets
            if href and href.endswith('tar.gz')
        ]
        print('\n' + year_label + ' file loaded.')

        files_processed = 0
        for storm_file_url in year_directory_file_urls:
            # Determine whether the best track dataset has information about this particular hurricane. This filters
            # out storms in basins other than the Atlantic or Pacific, since the best track dataset doesn't have information for those storms.
            archive_name = storm_file_url.split('/')[-1]
            archive_name_parts = archive_name.split('_')
            storm_name = archive_name_parts[-2]
            storm_year = int(archive_name_parts[3][:4])
            storm_in_best_track = (
                (best_track_data['year'] == storm_year)
                & (best_track_data['storm_name'] == storm_name)
            ).any()

            if storm_in_best_track:
                # Create the Satellite Imagery folder if it doesn't already exist
                os.makedirs(output_directory, exist_ok = True)
                storm_file_path = os.path.join(output_directory, archive_name)

                # Copy the .tar.gz from the web onto our computer
                with requests.get(storm_file_url, allow_redirects = True, timeout = 120) as archive_response:
                    archive_response.raise_for_status()
                    with open(storm_file_path, 'wb') as archive_file:
                        archive_file.write(archive_response.content)

                try:
                    # Each netcdf file inside the archive contains a satellite image of a hurricane at a moment in time
                    with tarfile.open(storm_file_path) as tar:
                        extracted_prefixes = set()
                        for member in tar.getmembers():
                            if not member.isfile():
                                continue

                            # Use the base name only, so a path stored in the archive can never steer where we write
                            member_name = os.path.basename(member.name)
                            name_parts = member_name.split('.')
                            if len(name_parts) < 8:
                                continue

                            # Get the date and time of the satellite image, and the name of the satellite that took the image
                            fulldate = name_parts[2] + name_parts[3] + name_parts[4]
                            time_of_day = name_parts[5]
                            satellite = name_parts[7][:3]
                            if not (fulldate.isdigit() and time_of_day.isdigit()):
                                continue

                            # Determine whether the best track dataset has a record for the date and time of this storm.
                            has_best_track_record = (
                                (best_track_data['fulldate'] == int(fulldate))
                                & (best_track_data['time'] == int(time_of_day))
                            ).any()

                            # Determine whether another image of this hurricane at this exact time has already been extracted from the .tar.gz
                            prefix = '.'.join(name_parts[:6])
                            is_redundant = prefix in extracted_prefixes

                            # If the requirements are met, extract the netcdf file from this .tar.gz and save it locally
                            if has_best_track_record and not is_redundant and satellite == "GOE":
                                with tar.extractfile(member) as source_file:
                                    with open(os.path.join(output_directory, member_name), 'wb') as target_file:
                                        target_file.write(source_file.read())
                                extracted_prefixes.add(prefix)
                finally:
                    # The archive is only a container; remove it even if extraction failed
                    os.remove(storm_file_path)

            files_processed += 1
            print_progress('Processing Files for ' + year_label, files_processed, len(year_directory_file_urls))

# **Run only once**
If the generated files are already saved in your Google Drive, do not run the code below again. Google Colab limits how long a session can run, and these cells may be stopped once the 90-minute threshold is reached. To stay within that limit, run the cells below only once for each basin.

In [None]:
# Replace the file paths for each basin. The commented-out lines below are an example of how the paths should be set.
# csv_path = '/content/drive/MyDrive/HURSAT/hursat/ibtracs_NA.csv'
# image_path = '/content/drive/MyDrive/HURSAT/hursat/NA_images.npy'
# label_path = '/content/drive/MyDrive/HURSAT/hursat/NA_labels.npy'

In [None]:
# File locations for the basin being processed (here the "NA" files).
# csv_path is the best-track CSV passed to download_hursat and read by the processing cell;
# image_path / label_path are the .npy files written by the processing cell and read by read_and_prepare_data.
csv_path = '/content/drive/MyDrive/HURSAT/hursat/ibtracs_NA.csv'
image_path = '/content/drive/MyDrive/HURSAT/hursat/NA_images.npy'
label_path = '/content/drive/MyDrive/HURSAT/hursat/NA_labels.npy'

In [None]:
if __name__ == "__main__":
    # Seasons whose hurricane imagery is downloaded: 2001 through 2016, given as strings.
    # A longer range gives the neural network more data, at the cost of a longer download.
    YEARS_TO_DOWNLOAD = [str(season) for season in range(2001, 2017)]
    download_hursat(YEARS_TO_DOWNLOAD, csv_path)

In [None]:
# Reads in the Best Track dataset, which contain records of the location and maximum wind speed of every recorded hurricane in the Atlantic and Eastern/Central Pacific basins
best_track_data = pd.read_csv(csv_path)

# The number of pixels wide and tall to crop the images of hurricanes to
side_length = 50

# Lists to hold the hurricane images and the wind speed associated with those images. These lists are aligned so that
# the first image in the images list corresponds to the first label in the labels list.
images = []
labels = []

# Gets list of names of files, each file containing a satellite image.
# Sorted because os.listdir returns names in arbitrary order; a fixed order makes the saved arrays
# (and therefore the contiguous k-fold split built from them) reproducible.
files = sorted(os.listdir('Satellite Imagery'))
num_files = len(files)

for i in range(len(files)):
    # Get IR satellite image from the file. The dataset is closed as soon as the image has been read,
    # so the handle is released even when the sample is skipped further down.
    with netCDF4.Dataset('Satellite Imagery/' + files[i]) as raw_data:
        ir_data = raw_data.variables['IRWIN'][0]

    # 'Crop' the image to a centered square by removing north, south, east, and west edges
    south_bound = (ir_data.shape[0] - side_length) // 2
    north_bound = south_bound + side_length
    west_bound = (ir_data.shape[1] - side_length) // 2
    east_bound = west_bound + side_length
    cropped_ir_data = ir_data[south_bound:north_bound, west_bound:east_bound]

    # An image smaller than the crop window cannot be stacked with the others
    if cropped_ir_data.shape != (side_length, side_length):
        print('\rSkipping ' + files[i] + ': image is smaller than ' + str(side_length) + 'x' + str(side_length), end='\n')
        continue

    # Get storm name, date, and time of the hurricane from the image's file name
    file_name = files[i]
    file_name = file_name.split('.')
    storm_name = file_name[1]
    date = int(file_name[2] + file_name[3] + file_name[4])
    time = int(file_name[5])

    # Filter the best track dataset to find the row that matches the name, date, and time of this hurricane image
    matching_best_track_data = best_track_data.loc[(best_track_data.storm_name == storm_name) & (best_track_data.fulldate == date) & (best_track_data.time == time)]

    # Skip to the next hurricane image if a wind speed could not be found for this hurricane image
    if matching_best_track_data.empty:
        print('\rCould not find label for image of ' + storm_name + ' at date ' + str(date) + ' and time ' + str(time), end='\n')
        continue

    # Get the wind speed from the first row that matches the name, date, and time of this hurricane image
    wind_speed = matching_best_track_data.max_sus_wind_speed.iloc[0]

    # Add the image and wind speed to these lists. This way, the lists of images and labels always line up. The first hurricane image in the images list is associated with the first wind speed in the labels list.
    images.append(cropped_ir_data)
    labels.append(wind_speed)

    print('\rProcessing Samples... ' + str(round(((i + 1) / num_files) * 100, 1)) + '% (' + str(i + 1) + ' of ' + str(num_files) + ')', end='')

print('\nSaving NumPy arrays...')

# Turn the list of images and labels into NumPy arrays
images = np.array(images)
labels = np.array(labels)

# Add a fourth dimension to the images array. This is one since we only have one color channel: grayscale. The fourth dimension would typically be 3 if we were working with color images
images = images.reshape((images.shape[0], side_length, side_length, 1))

# Save the NumPy arrays, which the training cells load to train and validate the neural network
np.save(image_path, images)
np.save(label_path, labels)

print("\nNumPy files saved. Processing complete.")

In [None]:
images = np.load(image_path)
labels = np.load(label_path)

print('Number of images:', len(images), '\n')
print('Number of labels:', len(labels), '\n')

# Show five randomly chosen samples together with their wind speed
for x in range(5):
    # randrange excludes the upper bound; random.randint(0, n) could return n and index past the end
    i = random.randrange(images.shape[0])
    # Drop the channel axis for plotting. A dedicated name is used so the `image` module
    # imported from keras.preprocessing is not overwritten.
    sample_image = np.reshape(images[i], (images[i].shape[0], images[i].shape[1]))
    plt.imshow(sample_image, cmap = 'binary')
    plt.title('Image #' + str(i) + '   ' + str(labels[i]) + ' knots')
    plt.show()

# **North Atlantic**

In [None]:
# Select the North Atlantic files: best-track CSV, image array and label array used by the cells in this section
csv_path = '/content/drive/MyDrive/HURSAT/hursat/ibtracs_NA.csv'
image_path = '/content/drive/MyDrive/HURSAT/hursat/NA_images.npy'
label_path = '/content/drive/MyDrive/HURSAT/hursat/NA_labels.npy'

In [None]:
# Load the saved label array (wind speeds) from label_path
data = np.load(label_path)

In [None]:
# Wrap the labels in a single-column DataFrame, then show the column dtype and the first rows
df = pd.DataFrame(data, columns = ['Wind Speed'])
display(df.dtypes)
df.head()

In [None]:
# Replace every wind speed with the name of its storm category (the column keeps its name)
df['Wind Speed'] = df['Wind Speed'].apply(category_of)
df.head()

In [None]:
# Write the labels DataFrame to Google Drive as CSV (the index is written as the first column)
df.to_csv('/content/drive/MyDrive/HURSAT/hursat/NA_labels.csv')

In [None]:
def read_and_prepare_data(validation_mode, k = 5, augment = True):
    """Load the saved images/labels and build the train/test sets for k-fold validation.

    The arrays are read from the notebook-level ``image_path`` and
    ``label_path``, cut into ``k`` contiguous folds (no shuffling) and, for
    every fold ``i``, combined into a training set made of all other folds and
    a test set made of fold ``i``. When ``augment`` is true, the samples that
    ``augment_images`` produces for the training folds are appended to the
    training set; the held-out fold is never augmented. Every train/test pair
    is passed through ``standardize_data``.

    NOTE(review): this notebook defines ``read_and_prepare_data`` in several
    cells; the definition executed last is the one in effect.

    Args:
        validation_mode: Only ``'k_fold'`` is supported.
        k: Number of folds (at least 2).
        augment: Whether to add augmented images to the training sets.

    Returns:
        ``(train_images, train_labels, test_images, test_labels)``: four lists
        of length ``k``; entry ``i`` belongs to the iteration that holds out
        fold ``i``.

    Raises:
        ValueError: If ``validation_mode`` is not ``'k_fold'`` or ``k < 2``.
            (Previously an unknown mode silently returned ``None``.)
    """
    if validation_mode != 'k_fold':
        raise ValueError("Unsupported validation_mode: " + repr(validation_mode) + " (expected 'k_fold')")
    if k < 2:
        raise ValueError('k must be at least 2, got ' + repr(k))

    # Read in data from files
    images = np.load(image_path)
    labels = np.load(label_path)

    # Split the image and label datasets into k contiguous subsets
    bounds = [int((i / k) * len(images)) for i in range(k + 1)]
    folded_images = [images[bounds[i]:bounds[i + 1]] for i in range(k)]
    folded_labels = [labels[bounds[i]:bounds[i + 1]] for i in range(k)]

    # Generate augmented images for each fold
    folded_augmented_images = []
    folded_augmented_labels = []
    if augment:
        for i in range(k):
            print('\nAugmenting Fold ' + str(i + 1) + ' of ' + str(k))
            augmented_images, augmented_labels = augment_images(folded_images[i], folded_labels[i])
            folded_augmented_images.append(augmented_images)
            folded_augmented_labels.append(augmented_labels)

    # Combine the folds into sets for each iteration of the model and standardize the data
    train_images = []
    train_labels = []
    test_images = []
    test_labels = []
    for i in range(k):
        image_parts = folded_images[:i] + folded_images[(i + 1):]
        label_parts = folded_labels[:i] + folded_labels[(i + 1):]
        if augment:
            for j in range(k):
                # Skip the held-out fold, and folds that produced no augmented
                # samples: an empty result may not have the rank np.concatenate needs.
                if j != i and len(folded_augmented_labels[j]) > 0:
                    image_parts.append(folded_augmented_images[j])
                    label_parts.append(folded_augmented_labels[j])
        fold_train_images, fold_test_images = standardize_data(np.concatenate(image_parts), folded_images[i])
        train_images.append(fold_train_images)
        train_labels.append(np.concatenate(label_parts))
        test_images.append(fold_test_images)
        test_labels.append(folded_labels[i])

    return train_images, train_labels, test_images, test_labels

This file takes the satellite image files downloaded and turns them into NumPy arrays that will be fed to the neural network

Outline of this file:
- Crops each satellite image
- Matches each satellite image of a hurricane with the maximum sustained wind speed of that hurricane
- Collects these images and their associated labels in arrays and saves them as numpy files

In [None]:
if __name__ == "__main__":
    # Specify whether the script should use Keras's ImageDataGenerator to augment the training dataset. Assigning
    # this variable to True will improve accuracy, but will also increase execution time.
    AUGMENT = True

    # Specify how many folds in the k-fold validation process. Can be any integer greater than or equal to 2. Larger
    # integers will increase execution time.
    NUM_FOLDS = 5

    train_images, train_labels, test_images, test_labels = read_and_prepare_data('k_fold', NUM_FOLDS, augment = AUGMENT)
    fold_predictions = []
    for i in range(NUM_FOLDS):
        print('\n\nTraining Fold ' + str(i + 1) + ' of ' + str(NUM_FOLDS) + '\n')
        # Start every fold from a freshly initialised network. Reusing one model across folds means
        # later folds are validated on samples the model was already trained on, which makes the
        # reported error too optimistic.
        model = build_model()
        model = train_model(model, train_images[i], train_labels[i], test_images[i], test_labels[i])
        kth_fold_predictions = generate_predictions(model, test_images[i], test_labels[i])
        fold_predictions.append(kth_fold_predictions)

    # Combine the folds once; DataFrame.append was removed in pandas 2.0.
    # After the loop, `model` is the network trained for the last fold.
    predictions = pd.concat(fold_predictions, ignore_index = True)
    show_validation_results(predictions)

In [None]:
# Save the model left in `model` by the training cell to Google Drive.
# NOTE(review): this writes under /content/drive/MyDrive/hursat/, while the data paths in this notebook
# use /content/drive/MyDrive/HURSAT/hursat/ - confirm the folder is the intended one.
model.save('/content/drive/MyDrive/hursat/NA.hdf5')

# **North Pacific**

In [None]:
# Select the North Pacific files: best-track CSV, image array and label array used by the cells in this section
csv_path = '/content/drive/MyDrive/HURSAT/hursat/ibtracs_NP.csv'
image_path = '/content/drive/MyDrive/HURSAT/hursat/NP_images.npy'
label_path = '/content/drive/MyDrive/HURSAT/hursat/NP_labels.npy'

In [None]:
# Load the saved wind speeds, replace each one with the name of its storm category and export them as CSV.
# The original called `categorise`, which this notebook does not define, and passed whole rows to it;
# `category_of` applied to the column matches the North Atlantic section.
data = np.load(label_path)
df = pd.DataFrame(data, columns = ['Wind Speed'])
df['Wind Speed'] = df['Wind Speed'].apply(category_of)
# NOTE(review): the North Atlantic labels are written under /content/drive/MyDrive/HURSAT/hursat/ - confirm this folder is intended.
df.to_csv('/content/drive/MyDrive/hursat/NP_labels.csv')

In [None]:
def read_and_prepare_data(validation_mode, k = 5, augment = True):
    """Load the saved images/labels and build the train/test sets for k-fold validation.

    The arrays are read from the notebook-level ``image_path`` and
    ``label_path``, cut into ``k`` contiguous folds (no shuffling) and, for
    every fold ``i``, combined into a training set made of all other folds and
    a test set made of fold ``i``. When ``augment`` is true, the samples that
    ``augment_images`` produces for the training folds are appended to the
    training set; the held-out fold is never augmented. Every train/test pair
    is passed through ``standardize_data``.

    NOTE(review): this notebook defines ``read_and_prepare_data`` in several
    cells; the definition executed last is the one in effect.

    Args:
        validation_mode: Only ``'k_fold'`` is supported.
        k: Number of folds (at least 2).
        augment: Whether to add augmented images to the training sets.

    Returns:
        ``(train_images, train_labels, test_images, test_labels)``: four lists
        of length ``k``; entry ``i`` belongs to the iteration that holds out
        fold ``i``.

    Raises:
        ValueError: If ``validation_mode`` is not ``'k_fold'`` or ``k < 2``.
            (Previously an unknown mode silently returned ``None``.)
    """
    if validation_mode != 'k_fold':
        raise ValueError("Unsupported validation_mode: " + repr(validation_mode) + " (expected 'k_fold')")
    if k < 2:
        raise ValueError('k must be at least 2, got ' + repr(k))

    # Read in data from files
    images = np.load(image_path)
    labels = np.load(label_path)

    # Split the image and label datasets into k contiguous subsets
    bounds = [int((i / k) * len(images)) for i in range(k + 1)]
    folded_images = [images[bounds[i]:bounds[i + 1]] for i in range(k)]
    folded_labels = [labels[bounds[i]:bounds[i + 1]] for i in range(k)]

    # Generate augmented images for each fold
    folded_augmented_images = []
    folded_augmented_labels = []
    if augment:
        for i in range(k):
            print('\nAugmenting Fold ' + str(i + 1) + ' of ' + str(k))
            augmented_images, augmented_labels = augment_images(folded_images[i], folded_labels[i])
            folded_augmented_images.append(augmented_images)
            folded_augmented_labels.append(augmented_labels)

    # Combine the folds into sets for each iteration of the model and standardize the data
    train_images = []
    train_labels = []
    test_images = []
    test_labels = []
    for i in range(k):
        image_parts = folded_images[:i] + folded_images[(i + 1):]
        label_parts = folded_labels[:i] + folded_labels[(i + 1):]
        if augment:
            for j in range(k):
                # Skip the held-out fold, and folds that produced no augmented
                # samples: an empty result may not have the rank np.concatenate needs.
                if j != i and len(folded_augmented_labels[j]) > 0:
                    image_parts.append(folded_augmented_images[j])
                    label_parts.append(folded_augmented_labels[j])
        fold_train_images, fold_test_images = standardize_data(np.concatenate(image_parts), folded_images[i])
        train_images.append(fold_train_images)
        train_labels.append(np.concatenate(label_parts))
        test_images.append(fold_test_images)
        test_labels.append(folded_labels[i])

    return train_images, train_labels, test_images, test_labels

In [None]:
if __name__ == "__main__":
    # Whether to add augmented images to the training folds
    AUGMENT = True

    # Number of folds used for k-fold validation
    NUM_FOLDS = 5

    train_images, train_labels, test_images, test_labels = read_and_prepare_data('k_fold', NUM_FOLDS, augment = AUGMENT)
    fold_predictions = []
    for i in range(NUM_FOLDS):
        print('\n\nTraining Fold ' + str(i + 1) + ' of ' + str(NUM_FOLDS) + '\n')
        # Start every fold from a freshly initialised network. Reusing one model across folds means
        # later folds are validated on samples the model was already trained on, which makes the
        # reported error too optimistic.
        model = build_model()
        model = train_model(model, train_images[i], train_labels[i], test_images[i], test_labels[i])
        kth_fold_predictions = generate_predictions(model, test_images[i], test_labels[i])
        fold_predictions.append(kth_fold_predictions)

    # Combine the folds once; DataFrame.append was removed in pandas 2.0.
    # After the loop, `model` is the network trained for the last fold.
    predictions = pd.concat(fold_predictions, ignore_index = True)
    show_validation_results(predictions)

In [None]:
# Save the model left in `model` by the training cell to Google Drive.
# NOTE(review): this writes under /content/drive/MyDrive/hursat/, while the data paths in this notebook
# use /content/drive/MyDrive/HURSAT/hursat/ - confirm the folder is the intended one.
model.save('/content/drive/MyDrive/hursat/NP.hdf5')

# **North Indian**

In [None]:
# Select the North Indian files: best-track CSV, image array and label array used by the cells in this section
csv_path = '/content/drive/MyDrive/HURSAT/hursat/ibtracs_NI.csv'
image_path = '/content/drive/MyDrive/HURSAT/hursat/NI_images.npy'
label_path = '/content/drive/MyDrive/HURSAT/hursat/NI_labels.npy'

In [None]:
def read_and_prepare_data(validation_mode, k = 5, augment = True):
    """Load the images/labels named by the globals image_path and label_path and build k-fold splits.

    Parameters:
        validation_mode: only 'k_fold' is implemented.
        k: number of contiguous folds to cut the dataset into; must be at least 2.
        augment: when True, the output of augment_images (defined elsewhere) for every training
            fold is added to that iteration's training set. The held-out fold is never augmented.

    Returns:
        (train_images, train_labels, test_images, test_labels), each a list with one entry per
        fold. Each train/test image pair has been passed through the two-argument
        standardize_data (defined elsewhere).

    Raises:
        ValueError: if validation_mode is not 'k_fold' or k is smaller than 2.
    """
    # Fail loudly up front; previously an unknown mode fell through and returned None, which only
    # surfaced as a confusing unpacking error in the caller.
    if validation_mode != 'k_fold':
        raise ValueError("Unsupported validation_mode " + repr(validation_mode) + "; only 'k_fold' is implemented")
    if k < 2:
        raise ValueError('k must be at least 2 so that every iteration has training data, got ' + repr(k))

    # Read in data from files
    images = np.load(image_path)
    labels = np.load(label_path)

    # Split the image and label datasets into k contiguous subsets
    folded_images = []
    folded_labels = []
    for i in range(k):
        start = int((i / k) * len(images))
        end = int(((i + 1) / k) * len(images))
        folded_images.append(images[start:end])
        folded_labels.append(labels[start:end])

    # Generate augmented images for each fold
    folded_augmented_images = []
    folded_augmented_labels = []
    if augment:
        for i in range(k):
            print('\nAugmenting Fold ' + str(i + 1) + ' of ' + str(k))
            augmented_images, augmented_labels = augment_images(folded_images[i], folded_labels[i])
            folded_augmented_images.append(augmented_images)
            folded_augmented_labels.append(augmented_labels)

    # Combine the folds into sets for each iteration of the model and standardize the data
    train_images = []
    train_labels = []
    test_images = []
    test_labels = []
    for i in range(k):
        # Every fold except fold i trains; originals come first, then their augmented copies.
        image_parts = folded_images[:i] + folded_images[(i + 1):]
        label_parts = folded_labels[:i] + folded_labels[(i + 1):]
        if augment:
            image_parts = image_parts + folded_augmented_images[:i] + folded_augmented_images[(i + 1):]
            label_parts = label_parts + folded_augmented_labels[:i] + folded_augmented_labels[(i + 1):]

        # A single concatenate avoids copying the un-augmented training data twice.
        fold_train_images, fold_test_images = standardize_data(np.concatenate(image_parts), folded_images[i])
        train_images.append(fold_train_images)
        train_labels.append(np.concatenate(label_parts))
        test_images.append(fold_test_images)
        test_labels.append(folded_labels[i])

    return train_images, train_labels, test_images, test_labels

In [None]:
if __name__ == "__main__":
    # Whether read_and_prepare_data should add augmented images to each training set.
    AUGMENT = True
    # Number of folds used for k-fold validation.
    NUM_FOLDS = 5

    train_images, train_labels, test_images, test_labels = read_and_prepare_data('k_fold', NUM_FOLDS, augment = AUGMENT)

    # NOTE(review): the same model object is carried through every fold, so unless train_model
    # re-initialises the weights, later folds are scored on images seen while training earlier folds.
    model = build_model()

    # DataFrame.append was removed in pandas 2.0, so collect the per-fold frames and join them once.
    fold_predictions = []
    for i in range(NUM_FOLDS):
        print('\n\nTraining Fold ' + str(i + 1) + ' of ' + str(NUM_FOLDS) + '\n')
        model = train_model(model, train_images[i], train_labels[i], test_images[i], test_labels[i])
        kth_fold_predictions = generate_predictions(model, test_images[i], test_labels[i])
        fold_predictions.append(kth_fold_predictions)
    predictions = pd.concat(fold_predictions, ignore_index = True)
    show_validation_results(predictions)

In [None]:
# Save the model currently bound to `model` (left behind by the training cell) to the mounted
# Google Drive as an HDF5 file.
model.save('/content/drive/MyDrive/hursat/NI.hdf5')

# **North Atlantic + North Pacific + North Indian**

In [None]:
# Input files for the combined North Pacific + North Indian + North Atlantic section.
# read_and_prepare_data reads image_path and label_path at call time, so these assignments
# select the dataset used by the cells below.
csv_path = '/content/drive/MyDrive/hursat/ibtracs_NP_NI_NA.csv'
image_path = '/content/drive/MyDrive/hursat/NP_NI_NA_images.npy'
label_path = '/content/drive/MyDrive/hursat/NP_NI_NA_labels.npy'

In [None]:
def read_and_prepare_data(validation_mode, k = 5, augment = True):
    """Load the images/labels named by the globals image_path and label_path and build k-fold splits.

    Parameters:
        validation_mode: only 'k_fold' is implemented.
        k: number of contiguous folds to cut the dataset into; must be at least 2.
        augment: when True, the output of augment_images (defined elsewhere) for every training
            fold is added to that iteration's training set. The held-out fold is never augmented.

    Returns:
        (train_images, train_labels, test_images, test_labels), each a list with one entry per
        fold. Each train/test image pair has been passed through the two-argument
        standardize_data (defined elsewhere).

    Raises:
        ValueError: if validation_mode is not 'k_fold' or k is smaller than 2.
    """
    # Fail loudly up front; previously an unknown mode fell through and returned None, which only
    # surfaced as a confusing unpacking error in the caller.
    if validation_mode != 'k_fold':
        raise ValueError("Unsupported validation_mode " + repr(validation_mode) + "; only 'k_fold' is implemented")
    if k < 2:
        raise ValueError('k must be at least 2 so that every iteration has training data, got ' + repr(k))

    # Read in data from files
    images = np.load(image_path)
    labels = np.load(label_path)

    # Split the image and label datasets into k contiguous subsets
    folded_images = []
    folded_labels = []
    for i in range(k):
        start = int((i / k) * len(images))
        end = int(((i + 1) / k) * len(images))
        folded_images.append(images[start:end])
        folded_labels.append(labels[start:end])

    # Generate augmented images for each fold
    folded_augmented_images = []
    folded_augmented_labels = []
    if augment:
        for i in range(k):
            print('\nAugmenting Fold ' + str(i + 1) + ' of ' + str(k))
            augmented_images, augmented_labels = augment_images(folded_images[i], folded_labels[i])
            folded_augmented_images.append(augmented_images)
            folded_augmented_labels.append(augmented_labels)

    # Combine the folds into sets for each iteration of the model and standardize the data
    train_images = []
    train_labels = []
    test_images = []
    test_labels = []
    for i in range(k):
        # Every fold except fold i trains; originals come first, then their augmented copies.
        image_parts = folded_images[:i] + folded_images[(i + 1):]
        label_parts = folded_labels[:i] + folded_labels[(i + 1):]
        if augment:
            image_parts = image_parts + folded_augmented_images[:i] + folded_augmented_images[(i + 1):]
            label_parts = label_parts + folded_augmented_labels[:i] + folded_augmented_labels[(i + 1):]

        # A single concatenate avoids copying the un-augmented training data twice.
        fold_train_images, fold_test_images = standardize_data(np.concatenate(image_parts), folded_images[i])
        train_images.append(fold_train_images)
        train_labels.append(np.concatenate(label_parts))
        test_images.append(fold_test_images)
        test_labels.append(folded_labels[i])

    return train_images, train_labels, test_images, test_labels

In [None]:
if __name__ == "__main__":
    # Whether read_and_prepare_data should add augmented images to each training set.
    AUGMENT = True
    # Number of folds used for k-fold validation.
    NUM_FOLDS = 5

    train_images, train_labels, test_images, test_labels = read_and_prepare_data('k_fold', NUM_FOLDS, augment = AUGMENT)

    # NOTE(review): the same model object is carried through every fold, so unless train_model
    # re-initialises the weights, later folds are scored on images seen while training earlier folds.
    model = build_model()

    # DataFrame.append was removed in pandas 2.0, so collect the per-fold frames and join them once.
    fold_predictions = []
    for i in range(NUM_FOLDS):
        print('\n\nTraining Fold ' + str(i + 1) + ' of ' + str(NUM_FOLDS) + '\n')
        model = train_model(model, train_images[i], train_labels[i], test_images[i], test_labels[i])
        kth_fold_predictions = generate_predictions(model, test_images[i], test_labels[i])
        fold_predictions.append(kth_fold_predictions)
    predictions = pd.concat(fold_predictions, ignore_index = True)
    show_validation_results(predictions)

In [None]:
# Save the model currently bound to `model` (left behind by the training cell) to the mounted
# Google Drive as an HDF5 file.
model.save('/content/drive/MyDrive/hursat/NP_NI_NA.hdf5')

# **North Atlantic + North Pacific**

In [None]:
# Input files for the combined North Atlantic + North Pacific section. read_and_prepare_data
# reads image_path and label_path at call time, and the label export cell below reads label_path.
csv_path = '/content/drive/MyDrive/hursat/ibtracs_NP_NA.csv'
image_path = '/content/drive/MyDrive/hursat/NP_NA_images.npy'
label_path = '/content/drive/MyDrive/hursat/NP_NA_labels.npy'

In [None]:
# Export the NP+NA labels as a CSV in which every wind speed has been replaced by the value
# that categorise (defined elsewhere) returns for its row.
data = np.load(label_path)
df = pd.DataFrame(data, columns = ['Wind Speed'])
wind_speed_categories = df.apply(categorise, axis = 1)
df['Wind Speed'] = wind_speed_categories
df.to_csv('/content/drive/MyDrive/hursat/NP_NA_labels.csv')

In [None]:
def read_and_prepare_data(validation_mode, k = 5, augment = True):
    """Load the images/labels named by the globals image_path and label_path and build k-fold splits.

    Parameters:
        validation_mode: only 'k_fold' is implemented.
        k: number of contiguous folds to cut the dataset into; must be at least 2.
        augment: when True, the output of augment_images (defined elsewhere) for every training
            fold is added to that iteration's training set. The held-out fold is never augmented.

    Returns:
        (train_images, train_labels, test_images, test_labels), each a list with one entry per
        fold. Each train/test image pair has been passed through the two-argument
        standardize_data (defined elsewhere).

    Raises:
        ValueError: if validation_mode is not 'k_fold' or k is smaller than 2.
    """
    # Fail loudly up front; previously an unknown mode fell through and returned None, which only
    # surfaced as a confusing unpacking error in the caller.
    if validation_mode != 'k_fold':
        raise ValueError("Unsupported validation_mode " + repr(validation_mode) + "; only 'k_fold' is implemented")
    if k < 2:
        raise ValueError('k must be at least 2 so that every iteration has training data, got ' + repr(k))

    # Read in data from files
    images = np.load(image_path)
    labels = np.load(label_path)

    # Split the image and label datasets into k contiguous subsets
    folded_images = []
    folded_labels = []
    for i in range(k):
        start = int((i / k) * len(images))
        end = int(((i + 1) / k) * len(images))
        folded_images.append(images[start:end])
        folded_labels.append(labels[start:end])

    # Generate augmented images for each fold
    folded_augmented_images = []
    folded_augmented_labels = []
    if augment:
        for i in range(k):
            print('\nAugmenting Fold ' + str(i + 1) + ' of ' + str(k))
            augmented_images, augmented_labels = augment_images(folded_images[i], folded_labels[i])
            folded_augmented_images.append(augmented_images)
            folded_augmented_labels.append(augmented_labels)

    # Combine the folds into sets for each iteration of the model and standardize the data
    train_images = []
    train_labels = []
    test_images = []
    test_labels = []
    for i in range(k):
        # Every fold except fold i trains; originals come first, then their augmented copies.
        image_parts = folded_images[:i] + folded_images[(i + 1):]
        label_parts = folded_labels[:i] + folded_labels[(i + 1):]
        if augment:
            image_parts = image_parts + folded_augmented_images[:i] + folded_augmented_images[(i + 1):]
            label_parts = label_parts + folded_augmented_labels[:i] + folded_augmented_labels[(i + 1):]

        # A single concatenate avoids copying the un-augmented training data twice.
        fold_train_images, fold_test_images = standardize_data(np.concatenate(image_parts), folded_images[i])
        train_images.append(fold_train_images)
        train_labels.append(np.concatenate(label_parts))
        test_images.append(fold_test_images)
        test_labels.append(folded_labels[i])

    return train_images, train_labels, test_images, test_labels

In [None]:
if __name__ == "__main__":
    # Whether read_and_prepare_data should add augmented images to each training set.
    AUGMENT = True

    # Number of folds used for k-fold validation.
    NUM_FOLDS = 5

    train_images, train_labels, test_images, test_labels = read_and_prepare_data('k_fold', NUM_FOLDS, augment=AUGMENT)

    # NOTE(review): the same model object is carried through every fold, so unless train_model
    # re-initialises the weights, later folds are scored on images seen while training earlier folds.
    model = build_model()

    # DataFrame.append was removed in pandas 2.0, so collect the per-fold frames and join them once.
    fold_predictions = []
    for i in range(NUM_FOLDS):
        print('\n\nTraining Fold ' + str(i + 1) + ' of ' + str(NUM_FOLDS) + '\n')
        model = train_model(model, train_images[i], train_labels[i], test_images[i], test_labels[i])
        kth_fold_predictions = generate_predictions(model, test_images[i], test_labels[i])
        fold_predictions.append(kth_fold_predictions)
    predictions = pd.concat(fold_predictions, ignore_index = True)
    show_validation_results(predictions)

In [None]:
# Save the model currently bound to `model` (left behind by the training cell) to the mounted
# Google Drive as an HDF5 file.
model.save('/content/drive/MyDrive/hursat/NP_NA.hdf5')

# **North Indian + North Atlantic**

In [None]:
if __name__ == "__main__":
    # Years (2001 through 2016, as strings) whose hurricane satellite images will be downloaded.
    YEARS_TO_DOWNLOAD = [str(year) for year in range(2001, 2017)]
    # Best track CSV handed to download_hursat together with the list of years.
    path = '/content/drive/MyDrive/hursat/ibtracs_NI_NA.csv'
    download_hursat(YEARS_TO_DOWNLOAD, path)

In [None]:
# This cell takes the downloaded satellite image files and turns them into NumPy arrays that will be fed to the neural network
# Outline of this cell:
# - Crops each satellite image
# - Matches each satellite image of a hurricane with the maximum sustained wind speed of that hurricane
# - Collects these images and their associated labels in arrays and saves them as numpy files

# Reads in the best track table for this section (the NI_NA file). The matching below relies on its
# storm_name, fulldate, time and max_sus_wind_speed columns.
best_track_data = pd.read_csv('/content/drive/MyDrive/hursat/ibtracs_NI_NA.csv')

# The number of pixels wide and tall to crop the images of hurricanes to
side_length = 50

# Lists to hold the hurricane images and the wind speed associated with those images. These lists are aligned so that
# the first image in the images list corresponds to the first label in the labels list.
images = []
labels = []

# Gets list of names of files, each file containing a satellite image
files = os.listdir('Satellite Imagery')
num_files = len(files)

for i in range(len(files)):
    # Get IR satellite image from the file. The context manager closes the NetCDF file on every path;
    # previously the `continue` below skipped raw_data.close() and leaked one open file per unlabeled image.
    with netCDF4.Dataset('Satellite Imagery/' + files[i]) as raw_data:
        ir_data = raw_data.variables['IRWIN'][0]

    # 'Crop' the image by removing north, south, east, and west edges
    south_bound = (ir_data.shape[0] - side_length) // 2
    north_bound = south_bound + side_length
    cropped_ir_data = ir_data[south_bound:north_bound]
    west_bound = (ir_data.shape[1] - side_length) // 2
    # After the western columns are deleted, the columns to keep are the first side_length ones
    east_bound = side_length
    cropped_ir_data = np.delete(cropped_ir_data, np.s_[:west_bound], axis=1)
    cropped_ir_data = np.delete(cropped_ir_data, np.s_[east_bound:], axis=1)

    # Get storm name, date, and time of the hurricane from the image's dot-separated file name
    file_name = files[i]
    file_name = file_name.split('.')
    storm_name = file_name[1]
    date = int(file_name[2] + file_name[3] + file_name[4])
    time = int(file_name[5])

    # Filter the best track dataset to find the row that matches the name, date, and time of this hurricane image
    matching_best_track_data = best_track_data.loc[(best_track_data.storm_name == storm_name) & (best_track_data.fulldate == date) & (best_track_data.time == time)]

    # Get the wind speed from the first row that matches the name, date, and time of this hurricane image
    try:
        wind_speed = matching_best_track_data.max_sus_wind_speed.reset_index(drop = True)[0]
    except Exception:
        print('\rCould not find label for image of ' + storm_name + ' at date ' + str(date) + ' and time ' + str(time), end='\n')
        continue  # Skip to the next hurricane image if a wind speed could not be found for this hurricane image

    # Add the image and wind speed to these lists. This way, the lists of images and labels always line up. The first
    # hurricane image in the images list is associated with the first wind speed in the labels list.
    images.append(cropped_ir_data)
    labels.append(wind_speed)

    print('\rProcessing Samples... ' + str(round(((i + 1) / num_files) * 100, 1)) + '% (' + str(i + 1) + ' of ' + str(num_files) + ')', end='')

print('\nSaving NumPy arrays...')

# Turn the list of images and labels into NumPy arrays
images = np.array(images)
labels = np.array(labels)

# Add a fourth dimension to the images array. This is one since we only have one color channel: grayscale. The fourth
# dimension would typically be 3 if we were working with color images
images = images.reshape((images.shape[0], side_length, side_length, 1))

# Save the NumPy arrays so the training cells can load them
np.save('/content/drive/MyDrive/hursat/NI_NA_images.npy', images)
np.save('/content/drive/MyDrive/hursat/NI_NA_labels.npy', labels)

print("\nNumPy files saved. Processing complete.")

In [None]:
# Outline:
# - Reads numpy files containing hurricane images and the wind speed they are associated with
# - Shows 10 random satellite images and their wind speeds, one figure per image

images = np.load('/content/drive/MyDrive/hursat/NI_NA_images.npy')
labels = np.load('/content/drive/MyDrive/hursat/NI_NA_labels.npy')

for _ in range(10):
    # randrange excludes the upper bound; random.randint(0, n) could return n and raise IndexError
    i = random.randrange(images.shape[0])
    # Named sample_image so the `image` module imported from keras.preprocessing is not overwritten
    sample_image = np.reshape(images[i], (images[i].shape[0], images[i].shape[1]))
    plt.imshow(sample_image, cmap='binary')
    plt.title('Image #' + str(i) + '   ' + str(labels[i]) + ' knots')
    plt.show()

In [None]:
if __name__ == "__main__":
    # Specify whether the script should use Keras's ImageDataGenerator to augment the training dataset. Assigning
    # this variable to True will improve accuracy, but will also increase execution time.
    AUGMENT = True

    # Specify how many folds in the k-fold validation process. Can be any integer greater than or equal to 2. Larger
    # integers will increase execution time.
    NUM_FOLDS = 5

    # read_and_prepare_data loads whatever image_path/label_path currently name. This section never set
    # them, so it silently trained on the previous section's NP_NA arrays; point them at the NI_NA
    # arrays saved by the assemble cell of this section.
    image_path = '/content/drive/MyDrive/hursat/NI_NA_images.npy'
    label_path = '/content/drive/MyDrive/hursat/NI_NA_labels.npy'

    train_images, train_labels, test_images, test_labels = read_and_prepare_data('k_fold', NUM_FOLDS, augment=AUGMENT)

    # NOTE(review): the same model object is carried through every fold, so unless train_model
    # re-initialises the weights, later folds are scored on images seen while training earlier folds.
    model = build_model()

    # DataFrame.append was removed in pandas 2.0, so collect the per-fold frames and join them once.
    fold_predictions = []
    for i in range(NUM_FOLDS):
        print('\n\nTraining Fold ' + str(i + 1) + ' of ' + str(NUM_FOLDS) + '\n')
        model = train_model(model, train_images[i], train_labels[i], test_images[i], test_labels[i])
        kth_fold_predictions = generate_predictions(model, test_images[i], test_labels[i])
        fold_predictions.append(kth_fold_predictions)
    predictions = pd.concat(fold_predictions, ignore_index = True)
    show_validation_results(predictions)

# **Testing NA model on NI dataset**

In [None]:
# Reset Keras' global state before a saved model is loaded for this test section.
tf.keras.backend.clear_session()

In [None]:
# Replace `model` with the network stored in NA.hdf5; the evaluation cell of this section uses it.
model = load_model('/content/drive/MyDrive/hursat/NA.hdf5')

In [None]:
# Evaluate on the North Indian arrays: the read_and_prepare_data(k) defined in this section
# loads image_path and label_path when it is called.
csv_path = '/content/drive/MyDrive/hursat/ibtracs_NI.csv'
image_path = '/content/drive/MyDrive/hursat/NI_images.npy'
label_path = '/content/drive/MyDrive/hursat/NI_labels.npy'

In [None]:
def standardize_data(images):
    """Clamp negative pixel values to zero and rescale to zero mean and unit standard deviation.

    The mean and standard deviation are computed over the whole clamped input array.
    The input array is left untouched; a new array is returned.

    NOTE: this one-argument definition replaces the two-argument standardize_data that the
    training cells call, so those cells need their own definition re-run after this one.

    Parameters:
        images: NumPy array of pixel values.

    Returns:
        A new array with the same shape as images. If every clamped value is identical
        (standard deviation of zero) the result is all zeros instead of NaN.
    """
    # np.maximum builds a new array; the previous in-place assignment overwrote the caller's data.
    clamped = np.maximum(images, 0)
    st_dev = np.std(clamped)
    mean = np.mean(clamped)
    centered = np.subtract(clamped, mean)
    if st_dev == 0:
        # Constant input: dividing would yield NaN for every pixel.
        return centered
    return np.divide(centered, st_dev)

In [None]:
def read_and_prepare_data(k):
    """Load the arrays named by the globals image_path and label_path as k standardized test folds.

    The dataset is cut into k contiguous slices. Each image slice is passed through the
    one-argument standardize_data on its own; label slices are returned unchanged.

    NOTE: this definition replaces the three-argument read_and_prepare_data used by the
    training cells.

    Parameters:
        k: number of folds to split the dataset into.

    Returns:
        (test_images, test_labels): two lists with one entry per fold.
    """
    all_images = np.load(image_path)
    all_labels = np.load(label_path)

    test_images, test_labels = [], []
    for fold in range(k):
        # Same float-based boundaries as the training split, so folds line up between the two
        lower = int((fold / k) * len(all_images))
        upper = int(((fold + 1) / k) * len(all_images))
        test_images.append(standardize_data(all_images[lower:upper]))
        test_labels.append(all_labels[lower:upper])

    return test_images, test_labels

In [None]:
def generate_predictions(model, images, labels):
    """Run the model on images and tabulate each prediction against its label.

    Parameters:
        model: object exposing predict(images); its output is flattened to one value per image.
        images: input passed to model.predict unchanged.
        labels: indexable of actual values, read by position for every prediction.

    Returns:
        A DataFrame with one row per prediction and the columns prediction, actual, abs_error,
        category (from category_of, defined elsewhere) and abs_error_squared.
    """
    raw_predictions = model.predict(images).flatten()
    num_predictions = len(raw_predictions)

    # Collect plain dicts and build the frame once: DataFrame.append was removed in pandas 2.0
    # and growing a frame row by row is quadratic.
    rows = []
    for i in range(num_predictions):
        abs_error = abs(raw_predictions[i] - labels[i])
        rows.append({'prediction': raw_predictions[i], 'actual': labels[i], 'abs_error': abs_error,
                     'abs_error_squared': abs_error ** 2, 'category': category_of(labels[i])})
        print_progress('Processing Predictions', i + 1, num_predictions)

    # Same column order the appended rows used to produce
    return pd.DataFrame(rows, columns = ['prediction', 'actual', 'abs_error', 'category', 'abs_error_squared'])

In [None]:
# Score the loaded model on the dataset selected by image_path/label_path, without any training.
num_folds = 1

images, labels = read_and_prepare_data(num_folds)

# DataFrame.append was removed in pandas 2.0, so collect the per-fold frames and join them once.
fold_predictions = []
for i in range(num_folds):
    print('\n\nTesting Fold ' + str(i + 1) + ' of ' + str(num_folds) + '\n')
    kth_fold_predictions = generate_predictions(model, images[i], labels[i])
    fold_predictions.append(kth_fold_predictions)
predictions = pd.concat(fold_predictions, ignore_index = True)
show_validation_results(predictions)

# **Testing NP model on NI dataset**

In [None]:
# Reset Keras' global state before a saved model is loaded for this test section.
tf.keras.backend.clear_session()

In [None]:
# Replace `model` with the network stored in NP.hdf5; the evaluation cell of this section uses it.
model = load_model('/content/drive/MyDrive/hursat/NP.hdf5')

In [None]:
# Evaluate on the North Indian arrays: the read_and_prepare_data(k) defined in this section
# loads image_path and label_path when it is called.
csv_path = '/content/drive/MyDrive/hursat/ibtracs_NI.csv'
image_path = '/content/drive/MyDrive/hursat/NI_images.npy'
label_path = '/content/drive/MyDrive/hursat/NI_labels.npy'

In [None]:
def standardize_data(images):
    """Clamp negative pixel values to zero and rescale to zero mean and unit standard deviation.

    The mean and standard deviation are computed over the whole clamped input array.
    The input array is left untouched; a new array is returned.

    NOTE: this one-argument definition replaces the two-argument standardize_data that the
    training cells call, so those cells need their own definition re-run after this one.

    Parameters:
        images: NumPy array of pixel values.

    Returns:
        A new array with the same shape as images. If every clamped value is identical
        (standard deviation of zero) the result is all zeros instead of NaN.
    """
    # np.maximum builds a new array; the previous in-place assignment overwrote the caller's data.
    clamped = np.maximum(images, 0)
    st_dev = np.std(clamped)
    mean = np.mean(clamped)
    centered = np.subtract(clamped, mean)
    if st_dev == 0:
        # Constant input: dividing would yield NaN for every pixel.
        return centered
    return np.divide(centered, st_dev)

In [None]:
def read_and_prepare_data(k):
    """Load the arrays named by the globals image_path and label_path as k standardized test folds.

    The dataset is cut into k contiguous slices. Each image slice is passed through the
    one-argument standardize_data on its own; label slices are returned unchanged.

    NOTE: this definition replaces the three-argument read_and_prepare_data used by the
    training cells.

    Parameters:
        k: number of folds to split the dataset into.

    Returns:
        (test_images, test_labels): two lists with one entry per fold.
    """
    all_images = np.load(image_path)
    all_labels = np.load(label_path)

    test_images, test_labels = [], []
    for fold in range(k):
        # Same float-based boundaries as the training split, so folds line up between the two
        lower = int((fold / k) * len(all_images))
        upper = int(((fold + 1) / k) * len(all_images))
        test_images.append(standardize_data(all_images[lower:upper]))
        test_labels.append(all_labels[lower:upper])

    return test_images, test_labels

In [None]:
def generate_predictions(model, images, labels):
    """Run the model on images and tabulate each prediction against its label.

    Parameters:
        model: object exposing predict(images); its output is flattened to one value per image.
        images: input passed to model.predict unchanged.
        labels: indexable of actual values, read by position for every prediction.

    Returns:
        A DataFrame with one row per prediction and the columns prediction, actual, abs_error,
        category (from category_of, defined elsewhere) and abs_error_squared.
    """
    raw_predictions = model.predict(images).flatten()
    num_predictions = len(raw_predictions)

    # Collect plain dicts and build the frame once: DataFrame.append was removed in pandas 2.0
    # and growing a frame row by row is quadratic.
    rows = []
    for i in range(num_predictions):
        abs_error = abs(raw_predictions[i] - labels[i])
        rows.append({'prediction': raw_predictions[i], 'actual': labels[i], 'abs_error': abs_error,
                     'abs_error_squared': abs_error ** 2, 'category': category_of(labels[i])})
        print_progress('Processing Predictions', i + 1, num_predictions)

    # Same column order the appended rows used to produce
    return pd.DataFrame(rows, columns = ['prediction', 'actual', 'abs_error', 'category', 'abs_error_squared'])

In [None]:
# Score the loaded model on the dataset selected by image_path/label_path, without any training.
num_folds = 1

test_images, test_labels = read_and_prepare_data(num_folds)

# DataFrame.append was removed in pandas 2.0, so collect the per-fold frames and join them once.
fold_predictions = []
for i in range(num_folds):
    print('\n\nTesting Fold ' + str(i + 1) + ' of ' + str(num_folds) + '\n')
    kth_fold_predictions = generate_predictions(model, test_images[i], test_labels[i])
    fold_predictions.append(kth_fold_predictions)
predictions = pd.concat(fold_predictions, ignore_index = True)
show_validation_results(predictions)

# **Testing NP+NA model on NI dataset**

In [None]:
# Reset Keras' global state before a saved model is loaded for this test section.
tf.keras.backend.clear_session()

In [None]:
# Replace `model` with the network stored in NP_NA.hdf5; the evaluation cell of this section uses it.
model = load_model('/content/drive/MyDrive/hursat/NP_NA.hdf5')

In [None]:
# Evaluate on the North Indian arrays: the read_and_prepare_data(k) defined in this section
# loads image_path and label_path when it is called.
csv_path = '/content/drive/MyDrive/hursat/ibtracs_NI.csv'
image_path = '/content/drive/MyDrive/hursat/NI_images.npy'
label_path = '/content/drive/MyDrive/hursat/NI_labels.npy'

In [None]:
def standardize_data(images):
    """Clamp negative pixel values to zero and rescale to zero mean and unit standard deviation.

    The mean and standard deviation are computed over the whole clamped input array.
    The input array is left untouched; a new array is returned.

    NOTE: this one-argument definition replaces the two-argument standardize_data that the
    training cells call, so those cells need their own definition re-run after this one.

    Parameters:
        images: NumPy array of pixel values.

    Returns:
        A new array with the same shape as images. If every clamped value is identical
        (standard deviation of zero) the result is all zeros instead of NaN.
    """
    # np.maximum builds a new array; the previous in-place assignment overwrote the caller's data.
    clamped = np.maximum(images, 0)
    st_dev = np.std(clamped)
    mean = np.mean(clamped)
    centered = np.subtract(clamped, mean)
    if st_dev == 0:
        # Constant input: dividing would yield NaN for every pixel.
        return centered
    return np.divide(centered, st_dev)

In [None]:
def read_and_prepare_data(k):
    """Load the arrays named by the globals image_path and label_path as k standardized test folds.

    The dataset is cut into k contiguous slices. Each image slice is passed through the
    one-argument standardize_data on its own; label slices are returned unchanged.

    NOTE: this definition replaces the three-argument read_and_prepare_data used by the
    training cells.

    Parameters:
        k: number of folds to split the dataset into.

    Returns:
        (test_images, test_labels): two lists with one entry per fold.
    """
    all_images = np.load(image_path)
    all_labels = np.load(label_path)

    test_images, test_labels = [], []
    for fold in range(k):
        # Same float-based boundaries as the training split, so folds line up between the two
        lower = int((fold / k) * len(all_images))
        upper = int(((fold + 1) / k) * len(all_images))
        test_images.append(standardize_data(all_images[lower:upper]))
        test_labels.append(all_labels[lower:upper])

    return test_images, test_labels

In [None]:
def generate_predictions(model, images, labels):
    """Run the model on images and tabulate each prediction against its label.

    Parameters:
        model: object exposing predict(images); its output is flattened to one value per image.
        images: input passed to model.predict unchanged.
        labels: indexable of actual values, read by position for every prediction.

    Returns:
        A DataFrame with one row per prediction and the columns prediction, actual, abs_error,
        category (from category_of, defined elsewhere) and abs_error_squared.
    """
    raw_predictions = model.predict(images).flatten()
    num_predictions = len(raw_predictions)

    # Collect plain dicts and build the frame once: DataFrame.append was removed in pandas 2.0
    # and growing a frame row by row is quadratic.
    rows = []
    for i in range(num_predictions):
        abs_error = abs(raw_predictions[i] - labels[i])
        rows.append({'prediction': raw_predictions[i], 'actual': labels[i], 'abs_error': abs_error,
                     'abs_error_squared': abs_error ** 2, 'category': category_of(labels[i])})
        print_progress('Processing Predictions', i + 1, num_predictions)

    # Same column order the appended rows used to produce
    return pd.DataFrame(rows, columns = ['prediction', 'actual', 'abs_error', 'category', 'abs_error_squared'])

In [None]:
# Score the loaded model on the dataset selected by image_path/label_path, without any training.
num_folds = 1

test_images, test_labels = read_and_prepare_data(num_folds)

# DataFrame.append was removed in pandas 2.0, so collect the per-fold frames and join them once.
fold_predictions = []
for i in range(num_folds):
    print('\n\nTesting Fold ' + str(i + 1) + ' of ' + str(num_folds) + '\n')
    kth_fold_predictions = generate_predictions(model, test_images[i], test_labels[i])
    fold_predictions.append(kth_fold_predictions)
predictions = pd.concat(fold_predictions, ignore_index = True)
show_validation_results(predictions)

# **North Pacific + North Indian + North Atlantic**

In [None]:
if __name__ == "__main__":
    # Years (2001 through 2016, as strings) whose hurricane satellite images will be downloaded.
    YEARS_TO_DOWNLOAD = [str(year) for year in range(2001, 2017)]
    # Best track CSV handed to download_hursat together with the list of years.
    # NOTE(review): this path has no 'hursat/' directory, unlike the other ibtracs paths in this notebook - confirm.
    path = '/content/drive/MyDrive/ibtracs_NP_NI_NA.csv'
    download_hursat(YEARS_TO_DOWNLOAD, path)

In [None]:
# This cell takes the downloaded satellite image files and turns them into NumPy arrays that will be fed to the neural network
# Outline of this cell:
# - Crops each satellite image
# - Matches each satellite image of a hurricane with the maximum sustained wind speed of that hurricane
# - Collects these images and their associated labels in arrays and saves them as numpy files

# Reads in the best track table for this section (the NP_NI_NA file). The matching below relies on its
# storm_name, fulldate, time and max_sus_wind_speed columns.
best_track_data = pd.read_csv('/content/drive/MyDrive/ibtracs_NP_NI_NA.csv')

# The number of pixels wide and tall to crop the images of hurricanes to
side_length = 50

# Lists to hold the hurricane images and the wind speed associated with those images. These lists are aligned so that
# the first image in the images list corresponds to the first label in the labels list.
images = []
labels = []

# Gets list of names of files, each file containing a satellite image
files = os.listdir('Satellite Imagery')
num_files = len(files)

for i in range(len(files)):
    # Get IR satellite image from the file. The context manager closes the NetCDF file on every path;
    # previously the `continue` below skipped raw_data.close() and leaked one open file per unlabeled image.
    with netCDF4.Dataset('Satellite Imagery/' + files[i]) as raw_data:
        ir_data = raw_data.variables['IRWIN'][0]

    # 'Crop' the image by removing north, south, east, and west edges
    south_bound = (ir_data.shape[0] - side_length) // 2
    north_bound = south_bound + side_length
    cropped_ir_data = ir_data[south_bound:north_bound]
    west_bound = (ir_data.shape[1] - side_length) // 2
    # After the western columns are deleted, the columns to keep are the first side_length ones
    east_bound = side_length
    cropped_ir_data = np.delete(cropped_ir_data, np.s_[:west_bound], axis=1)
    cropped_ir_data = np.delete(cropped_ir_data, np.s_[east_bound:], axis=1)

    # Get storm name, date, and time of the hurricane from the image's dot-separated file name
    file_name = files[i]
    file_name = file_name.split('.')
    storm_name = file_name[1]
    date = int(file_name[2] + file_name[3] + file_name[4])
    time = int(file_name[5])

    # Filter the best track dataset to find the row that matches the name, date, and time of this hurricane image
    matching_best_track_data = best_track_data.loc[(best_track_data.storm_name == storm_name) & (best_track_data.fulldate == date) & (best_track_data.time == time)]

    # Get the wind speed from the first row that matches the name, date, and time of this hurricane image
    try:
        wind_speed = matching_best_track_data.max_sus_wind_speed.reset_index(drop = True)[0]
    except Exception:
        print('\rCould not find label for image of ' + storm_name + ' at date ' + str(date) + ' and time ' + str(time), end='\n')
        continue  # Skip to the next hurricane image if a wind speed could not be found for this hurricane image

    # Add the image and wind speed to these lists. This way, the lists of images and labels always line up. The first
    # hurricane image in the images list is associated with the first wind speed in the labels list.
    images.append(cropped_ir_data)
    labels.append(wind_speed)

    print('\rProcessing Samples... ' + str(round(((i + 1) / num_files) * 100, 1)) + '% (' + str(i + 1) + ' of ' + str(num_files) + ')', end='')

print('\nSaving NumPy arrays...')

# Turn the list of images and labels into NumPy arrays
images = np.array(images)
labels = np.array(labels)

# Add a fourth dimension to the images array. This is one since we only have one color channel: grayscale. The fourth
# dimension would typically be 3 if we were working with color images
images = images.reshape((images.shape[0], side_length, side_length, 1))

# Save the NumPy arrays (relative to the current working directory) so later cells can load them
np.save('images.npy', images)
np.save('labels.npy', labels)

print("\nNumPy files saved. Processing complete.")

In [None]:
# Outline:
# - Reads numpy files containing hurricane images and the wind speed they are associated with
# - Shows 5 random satellite images and their wind speeds, one figure per image

images = np.load('images.npy')
labels = np.load('labels.npy')

for _ in range(5):
    # randrange excludes the upper bound; random.randint(0, n) could return n and raise IndexError
    i = random.randrange(images.shape[0])
    # Named sample_image so the `image` module imported from keras.preprocessing is not overwritten
    sample_image = np.reshape(images[i], (images[i].shape[0], images[i].shape[1]))
    plt.imshow(sample_image, cmap='binary')
    plt.title('Image #' + str(i) + '   ' + str(labels[i]) + ' knots')
    plt.show()

In [None]:
if __name__ == "__main__":
    # Specify whether the script should use Keras's ImageDataGenerator to augment the training dataset. Assigning
    # this variable to True will improve accuracy, but will also increase execution time.
    AUGMENT = True

    # Specify how many folds in the k-fold validation process. Can be any integer greater than or equal to 2. Larger
    # integers will increase execution time.
    NUM_FOLDS = 5

    # read_and_prepare_data loads whatever image_path/label_path currently name, which at this point is
    # still the NI test set from the previous sections; point them at the arrays the assemble cell of
    # this section saved.
    image_path = 'images.npy'
    label_path = 'labels.npy'

    # NOTE(review): the test sections above redefine read_and_prepare_data and standardize_data with
    # one-argument signatures, so on a top-to-bottom run this three-argument call raises TypeError
    # until the training versions of both functions are defined again.
    train_images, train_labels, test_images, test_labels = read_and_prepare_data('k_fold', NUM_FOLDS, augment=AUGMENT)

    # NOTE(review): the same model object is carried through every fold, so unless train_model
    # re-initialises the weights, later folds are scored on images seen while training earlier folds.
    model = build_model()

    # DataFrame.append was removed in pandas 2.0, so collect the per-fold frames and join them once.
    fold_predictions = []
    for i in range(NUM_FOLDS):
        print('\n\nTraining Fold ' + str(i + 1) + ' of ' + str(NUM_FOLDS) + '\n')
        model = train_model(model, train_images[i], train_labels[i], test_images[i], test_labels[i])
        kth_fold_predictions = generate_predictions(model, test_images[i], test_labels[i])
        fold_predictions.append(kth_fold_predictions)
    predictions = pd.concat(fold_predictions, ignore_index=True)
    show_validation_results(predictions)