In [1]:
import pandas as pd
import numpy as np
import os
import netCDF4

In [4]:
# Build a labelled data set for the intensity model: one centre-cropped IR satellite frame per image file, paired
# with the wind speed (`max_sus_wind_speed` column) of the best-track row that matches the file's storm name, date
# and time. Images without a matching best-track row are reported and skipped.

# Root of the project checkout; every input path below is derived from it so it only has to be changed in one place.
project_dir = ('/Users/opopligher1996/workspace/master/BusinessIntelligenceTechniquesAndApplications_DSME6756/'
               'project/DSME6756_Group_Project/Section_2/cyclone_intensity')
best_track_path = os.path.join(project_dir, 'dataset', 'besttrack.csv')
imagery_dir = os.path.join(project_dir, 'code', 'Satellite Imagery')

best_track_data = pd.read_csv(best_track_path)

# Edge length, in pixels, of the square window cut from the centre of every image
side_length = 50

images = []
labels = []

# Gets list of names of files, each file containing a satellite image
files = os.listdir(imagery_dir)
num_files = len(files)

for file_number, image_file in enumerate(files, start=1):
    # Get storm name, date, and time of the hurricane from the image's file name. The name is dot-separated: field 1
    # is the storm name, fields 2-4 are concatenated into the date, and field 5 is the time.
    name_fields = image_file.split('.')
    storm_name = name_fields[1]
    date = int(name_fields[2] + name_fields[3] + name_fields[4])
    time = int(name_fields[5])

    # Filter the best track dataset to find the row that matches the name, date, and time of this hurricane image
    matching_best_track_data = best_track_data.loc[
        (best_track_data.storm_name == storm_name) &
        (best_track_data.fulldate == date) &
        (best_track_data.time == time)
    ]

    # Look the label up BEFORE touching the image file: an image without a wind speed is skipped without ever being
    # opened, so no dataset handle can be left open by the early `continue`.
    if matching_best_track_data.empty:
        print('\rCould not find label for image of ' + storm_name + ' at date ' + str(date) + ' and time ' + str(time), end='\n')
        continue

    # Use the first matching row as the label for this image
    wind_speed = matching_best_track_data.max_sus_wind_speed.iloc[0]

    # Get IR satellite image from the file. The context manager closes the dataset even if reading raises.
    with netCDF4.Dataset(os.path.join(imagery_dir, image_file)) as raw_data:
        ir_data = raw_data.variables['IRWIN'][0]

    # 'Crop' the image to the central side_length x side_length window by dropping equal margins from the north/south
    # and east/west edges. The copy lets the full-size frame be freed instead of being kept alive by a view.
    south_bound = (ir_data.shape[0] - side_length) // 2
    west_bound = (ir_data.shape[1] - side_length) // 2
    cropped_ir_data = ir_data[
        south_bound:south_bound + side_length,
        west_bound:west_bound + side_length
    ].copy()

    # Add the image and wind speed to these lists. This way, the lists of images and labels always line up. The first
    # hurricane image in the images list is associated with the first wind speed in the labels list.
    images.append(cropped_ir_data)
    labels.append(wind_speed)

    print('\rProcessing Samples... ' + str(round((file_number / num_files) * 100, 1)) + '% (' + str(file_number)
          + ' of ' + str(num_files) + ')', end='')

print('\nSaving NumPy arrays...')

# Turn the list of images and labels into NumPy arrays
images = np.array(images)
labels = np.array(labels)

# Add a fourth dimension to the images array. This is one since we only have one color channel: grayscale. The fourth
# dimension would typically be 3 if we were working with color images
images = images.reshape((images.shape[0], side_length, side_length, 1))

# Save the NumPy arrays for use in model.py, where the neural network is trained and validated on this data
np.save('images.npy', images)
np.save('labels.npy', labels)

print("\nNumPy files saved. Processing complete.")

Could not find label for image of ONE at date 20160608 and time 1200
Could not find label for image of PALI at date 20160115 and time 600
Could not find label for image of PALI at date 20160115 and time 1200
Could not find label for image of PALI at date 20160115 and time 0
Processing Samples... 100.0% (994 of 994)
Saving NumPy arrays...

NumPy files saved. Processing complete.
