In [1]:
import os

import h5py
import numpy as np

The data is available as one HDF5 file per year; this notebook works with the 2005 file (`climo_2005.h5`).

An HDF5 file is made up of two kinds of objects:

- Datasets (array-like collections of data)
- Groups (folder-like containers that hold datasets and other groups)

In short: groups work like Python dicts, and datasets work like NumPy arrays.

In [2]:
# Location of the 2005 HDF5 file. Set the CLIMO_DATA_PATH environment variable
# to point at a different copy; the original local path is kept as the fallback
# so existing setups keep working unchanged.
data_path = os.environ.get("CLIMO_DATA_PATH", "/Users/normankong/Desktop/climo_2005.h5")
# Open read-only. The handle is left open because later cells index datasets from it.
h5f = h5py.File(data_path, 'r')

In [3]:
# h5f can be indexed by name like a Python dict. Each lookup returns an HDF5
# dataset handle (see the printed summaries below), not an in-memory NumPy array.

images = h5f["images"] # HDF5 dataset; shape (1456, 16, 768, 1152), float32 in this file
boxes = h5f["boxes"] # HDF5 dataset; shape (1456, 15, 5), int32 in this file

In [4]:
# List the top-level names stored in the file.
h5f.keys()

<KeysViewHDF5 ['boxes', 'images']>

In [5]:
# Show each dataset's summary (name, shape, dtype), one per line.
print(images, boxes, sep="\n")

<HDF5 dataset "images": shape (1456, 16, 768, 1152), type "<f4">
<HDF5 dataset "boxes": shape (1456, 15, 5), type "<i4">


In [6]:
# Element type of the images dataset.
images.dtype

dtype('<f4')

`f4` is a 4-byte (32-bit) float.
`<` indicates little-endian byte order: https://docs.scipy.org/doc/numpy/reference/generated/numpy.dtype.byteorder.html

“HDF” stands for “Hierarchical Data Format”. Every object in an HDF5 file has a name, and they’re arranged in a POSIX-style hierarchy with `/`-separators

In [8]:
# Keep only column 4 of every box row (the class label, per the original note).
# Indexing the HDF5 dataset reads the values into a NumPy array.
box_classes = boxes[:, :, 4]

# Keep every 4th time step. The step slice spans the whole first axis, so it
# no longer depends on the file having exactly 1456 time steps.
targets = box_classes[::4, :]
targets.shape

(364, 15)

In [9]:
# Flag every entry whose class is 1 or -1 (hurricanes and normal days, per the
# original note). The mask has the same shape as `targets`.
matches_one = targets == 1
matches_minus_one = targets == -1
targets_mask = matches_one | matches_minus_one
targets_mask.shape

(364, 15)

In [10]:
# Indexing the 2-D `targets` with a boolean mask of the same shape returns a
# flat 1-D array of the selected entries, so the per-row grouping is lost here.
# NOTE(review): this overwrites `targets`, so re-running the cell after it has
# run once would index the 1-D result with the 2-D mask — confirm that is acceptable.
targets = targets[targets_mask]
targets.shape

(4352,)

In [11]:
# Inspect the selected labels.
targets

array([ 1, -1, -1, ..., -1, -1, -1], dtype=int32)

In [None]:
# Keep only channel 0 (temperature, per the original note). The width-1 slice
# keeps the channel axis in the result.
# NOTE(review): this reads every time step of that channel into memory
# (1456 x 768 x 1152 float32 values, roughly 5 GB for this file).

samples = images[:, :1, ...]
samples.shape

In [None]:
# Cutting it to 1 image a day: keep every 4th time step of channel 0.
# Reading straight from the `images` dataset (instead of re-slicing `samples`)
# makes this cell safe to re-run and avoids hard-coding the dataset dimensions.
samples = images[::4, 0:1, :, :]
samples.shape

In [None]:
# Restricting only to -1 and 1.
# `targets_mask` has one column per box slot (shape (364, 15)), whereas `samples`
# has a single channel on its second axis, so the mask cannot index `samples`
# directly (NumPy raises an IndexError on the mismatched axis). Collapse the
# mask to one flag per image first.
# NOTE(review): "image has at least one entry labelled 1 or -1" is an assumed
# reading of the intent — confirm. `targets` still holds one value per selected
# box entry, so it must be reduced to one label per image before it can be
# paired with `samples`.
image_mask = targets_mask.any(axis=1)
samples = samples[image_mask]

In [None]:
# Inspect the selected image data.
samples

In [None]:
# Flattening image dimensions into 1 array.
# reshape (unlike np.resize) raises if the element count does not fit instead of
# silently repeating or truncating data. The row count is taken from the array
# rather than hard-coded, and -1 lets NumPy infer the flattened pixel count.

samples = samples.reshape(samples.shape[0], 1, -1)

In [None]:
# Flattening temp: drop the channel axis so each image becomes one feature row.
# reshape keeps exactly the existing values; np.resize would silently repeat or
# truncate them if the hard-coded size were wrong.
samples = samples.reshape(samples.shape[0], -1)

In [None]:
# Shape of the flattened sample array.
samples.shape

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for testing; the fixed seed makes the shuffled split
# repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    samples,
    targets,
    test_size=0.3,
    random_state=42,
    shuffle=True,
)

In [None]:
# Create a nearest-neighbor classifier (it is fitted in a later cell)
from sklearn.neighbors import KNeighborsClassifier

knn = KNeighborsClassifier(n_neighbors=5)  # 5 is scikit-learn's default, spelled out

In [None]:
# Train the classifier on the training split.
knn.fit(X_train, y_train)

In [None]:
# Predict labels for the held-out test split.
y_pred = knn.predict(X_test)

In [None]:
from sklearn.metrics import accuracy_score

# Fraction of test rows whose predicted label equals the true label.
scores = accuracy_score(y_true=y_test, y_pred=y_pred)
print(scores)