In [1]:
import h5py
import numpy as np

In [5]:
# Load the memory_profiler IPython extension; it provides the %memit magic used later in this notebook.
%load_ext memory_profiler

The data is available as one HDF5 file per year

An HDF5 file is made up of two kinds of objects:

- Datasets (array-like collections of data)
- Groups (folder-like containers that hold datasets and other groups)

-> Groups are like dicts, and datasets are like NumPy arrays

In [2]:
import os

# Path to the HDF5 file holding one year of data. Set the CLIMO_DATA_PATH
# environment variable to point at your own copy; the original hard-coded
# location is kept as the fallback so existing setups keep working.
data_path = os.environ.get("CLIMO_DATA_PATH", "/Users/normankong/Desktop/climo_2005.h5")

# Open read-only. The handle is deliberately left open because later cells
# keep reading from the datasets it exposes.
h5f = h5py.File(data_path, 'r')

In [3]:
# The open file is indexed by name, the same way a Python dict is.
# Both objects are array-like datasets; the shapes below are the ones
# noted by the notebook's author.
#   images: (1456,16,768,1152)
#   boxes:  (1456,15,5)
images, boxes = h5f["images"], h5f["boxes"]

In [None]:
# List the names stored at the top level of the file.
h5f.keys()

In [None]:
# Inspect the boxes entry stored at index 764.
boxes[764]

In [None]:
# Viewing temperatures (K)
# Reads the entry at index 8 on the second axis of sample 563 and prints
# it one row at a time.

for row in images[563, 8]:
    print(row)

In [None]:
# Print the summary of each dataset on its own line.
print(images, boxes, sep="\n")

In [None]:
# Element type of the images dataset.
images.dtype

In the dtype string, `f` denotes a floating-point type.
`<` indicates the byte order (little-endian): https://docs.scipy.org/doc/numpy/reference/generated/numpy.dtype.byteorder.html

“HDF” stands for “Hierarchical Data Format”. Every object in an HDF5 file has a name, and they’re arranged in a POSIX-style hierarchy with `/`-separators

In [6]:
# Keeping only temperature
# NOTE(review): this selects index 0 on the second axis, whereas the earlier
# "Viewing temperatures (K)" cell reads index 8 - confirm which index is temperature.
# The 0:1 slice (rather than a bare 0) keeps that axis with length 1.
# %memit reports the memory used while the assignment runs.

%memit samples = images[:,0:1,:,:]
samples.shape

peak memory: 5037.34 MiB, increment: 4977.28 MiB


(1456, 1, 768, 1152)

In [7]:
# boxes is (1456,15,5); each row of 5 holds (ymin, xmin, ymax, xmax, class).
# Taking the last column (index 4) keeps just the class of every box.
targets = boxes[..., 4]
targets.shape

(1456, 15)

In [None]:
# Number of class entries stored per sample (length of the second axis).
targets.shape[1]

In [8]:
# Keep only the class of the 2nd box row (index 1) out of the 15 box rows
# per sample. This was picked for dimensionality reduction: the classes are
# ranked somewhat in ascending order, so the second row almost always holds
# a hurricane (1) and occasionally a normal class (-1).
#
# The selection is taken straight from `boxes` (column 4 is the class)
# instead of re-slicing `targets`, so re-running this cell always gives the
# same (n_samples, 1) result rather than an empty array on the second run.

targets = boxes[:, 1:2, 4]
targets.shape

(1456, 1)

In [None]:
# Show the selected class labels.
print(targets)

In [9]:
# Boolean mask that is True where the class is 1 or -1
# (labelled "hurricane" and "normal" by the notebook's author).
targets_mask = (targets == 1) | (targets == -1)
targets_mask.shape

(1456, 1)

In [19]:
# Display the boolean mask.
targets_mask

array([[False],
       [False],
       [ True],
       ...,
       [False],
       [ True],
       [ True]])

In [10]:
# Apply the boolean mask; the result is 1-D and holds only the labels equal to 1 or -1.
# NOTE: this overwrites `targets`, so the cell is meant to be run once per kernel session.
targets = targets[targets_mask]
targets.shape

(772,)

In [20]:
# Display the filtered labels.
targets

array([ 1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
        1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
        1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
        1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
        1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
        1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
        1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1, -1,
        1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
        1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1, -1, -1, -1, -1,
       -1, -1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
        1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
        1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
        1,  1,  1,  1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
       -1, -1, -1, -1, -1

In [11]:
# Keep only the samples whose label is -1 or 1.
# The mask spans the first two axes of `samples`, so those two axes are
# merged into one axis of selected entries.
samples = samples[targets_mask]
print(samples.shape, samples, sep="\n")

(772, 768, 1152)
[[[0.0000000e+00 0.0000000e+00 0.0000000e+00 ... 0.0000000e+00
   0.0000000e+00 0.0000000e+00]
  [0.0000000e+00 0.0000000e+00 0.0000000e+00 ... 0.0000000e+00
   0.0000000e+00 0.0000000e+00]
  [0.0000000e+00 0.0000000e+00 0.0000000e+00 ... 0.0000000e+00
   0.0000000e+00 0.0000000e+00]
  ...
  [3.0606830e-11 3.1990036e-11 3.3502492e-11 ... 2.7483831e-11
   2.8352607e-11 2.9389536e-11]
  [5.1833249e-11 5.3090449e-11 5.4543033e-11 ... 4.8652582e-11
   4.9655089e-11 5.0706366e-11]
  [1.1481748e-11 1.1481748e-11 1.1481748e-11 ... 1.1481748e-11
   1.1481748e-11 1.1481748e-11]]

 [[2.3633938e-17 2.3633938e-17 2.3633938e-17 ... 2.3633938e-17
   2.3633938e-17 2.3633938e-17]
  [0.0000000e+00 0.0000000e+00 0.0000000e+00 ... 0.0000000e+00
   0.0000000e+00 0.0000000e+00]
  [0.0000000e+00 0.0000000e+00 0.0000000e+00 ... 0.0000000e+00
   0.0000000e+00 0.0000000e+00]
  ...
  [7.2563090e-14 7.1567494e-14 7.0690266e-14 ... 7.6558192e-14
   7.5063592e-14 7.3719873e-14]
  [3.3591768e-14 3.

In [21]:
# Shape check. NOTE: the execution count (21) and the recorded output show
# this cell was run after the flattening cells below.
samples.shape

(772, 884736)

In [12]:
# Flattening image dimensions into 1 array, keeping a length-1 middle axis.
# reshape only re-interprets the existing values and raises if the sizes do
# not match, whereas np.resize silently repeats or truncates data. The
# sample count is read from the array and the flattened length is inferred
# (-1), so nothing is hard-coded.
samples = samples.reshape(samples.shape[0], 1, -1)
print(samples.shape)
print(samples)

(772, 1, 884736)
[[[0.0000000e+00 0.0000000e+00 0.0000000e+00 ... 1.1481748e-11
   1.1481748e-11 1.1481748e-11]]

 [[2.3633938e-17 2.3633938e-17 2.3633938e-17 ... 8.7380324e-18
   8.7380324e-18 8.7380324e-18]]

 [[0.0000000e+00 0.0000000e+00 0.0000000e+00 ... 3.1844322e-10
   3.1844322e-10 3.1844322e-10]]

 ...

 [[4.2212719e-11 4.2212719e-11 4.2212719e-11 ... 4.1322118e-10
   4.1322118e-10 4.1322118e-10]]

 [[0.0000000e+00 0.0000000e+00 0.0000000e+00 ... 4.6566853e-09
   4.6566853e-09 4.6566844e-09]]

 [[0.0000000e+00 0.0000000e+00 0.0000000e+00 ... 4.5711218e-08
   4.5711218e-08 4.5711218e-08]]]


In [13]:
# Flattening to one feature vector per sample: (n_samples, n_features).
# reshape only re-interprets the existing values and raises if the sizes do
# not match, whereas np.resize silently repeats or truncates data. The
# sample count is read from the array and the feature length is inferred (-1).
samples = samples.reshape(samples.shape[0], -1)
print(samples.shape)
print(samples)

(772, 884736)
[[0.0000000e+00 0.0000000e+00 0.0000000e+00 ... 1.1481748e-11
  1.1481748e-11 1.1481748e-11]
 [2.3633938e-17 2.3633938e-17 2.3633938e-17 ... 8.7380324e-18
  8.7380324e-18 8.7380324e-18]
 [0.0000000e+00 0.0000000e+00 0.0000000e+00 ... 3.1844322e-10
  3.1844322e-10 3.1844322e-10]
 ...
 [4.2212719e-11 4.2212719e-11 4.2212719e-11 ... 4.1322118e-10
  4.1322118e-10 4.1322118e-10]
 [0.0000000e+00 0.0000000e+00 0.0000000e+00 ... 4.6566853e-09
  4.6566853e-09 4.6566844e-09]
 [0.0000000e+00 0.0000000e+00 0.0000000e+00 ... 4.5711218e-08
  4.5711218e-08 4.5711218e-08]]


In [14]:
from sklearn.model_selection import train_test_split

# Shuffle, then hold out 30% of the samples for testing; the fixed seed
# keeps the split the same on every run.
# NOTE(review): the labels shown earlier are dominated by class 1 - consider
# whether a stratified split would be more appropriate.
X_train, X_test, y_train, y_test = train_test_split(
    samples,
    targets,
    test_size=0.3,
    random_state=42,
    shuffle=True,
)

In [15]:
# Create a nearest-neighbor classifier (it is trained in the next cell).
# n_jobs=-1 is the only non-default setting passed here.
from sklearn.neighbors import KNeighborsClassifier

knn = KNeighborsClassifier(n_jobs=-1)

In [16]:
# Train the classifier on the training split.
knn.fit(X_train, y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=-1, n_neighbors=5, p=2,
                     weights='uniform')

In [17]:
# Predict labels for the held-out test split.
y_pred = knn.predict(X_test)

In [18]:
from sklearn.metrics import accuracy_score

# Fraction of test labels that the nearest-neighbor predictions match.
scores = accuracy_score(y_true=y_test, y_pred=y_pred)
print(scores)

0.9051724137931034


In [22]:
from sklearn.linear_model import LogisticRegression

# Logistic regression with default settings. fit() returns the estimator
# itself, so construction and training are chained.
logreg = LogisticRegression().fit(X_train, y_train)

# Predict labels for the held-out test split.
y_pred = logreg.predict(X_test)



In [23]:
from sklearn.metrics import accuracy_score

# Fraction of test labels that the logistic-regression predictions match.
scores = accuracy_score(y_true=y_test, y_pred=y_pred)
print(scores)

0.8836206896551724


In [24]:
from sklearn.svm import SVC

# Support-vector classifier with default settings. fit() returns the
# estimator itself, so construction and training are chained.
svm = SVC().fit(X_train, y_train)

# Predict labels for the held-out test split.
y_pred = svm.predict(X_test)



In [25]:
from sklearn.metrics import accuracy_score

# Fraction of test labels that the SVM predictions match.
scores = accuracy_score(y_true=y_test, y_pred=y_pred)
print(scores)

0.8836206896551724


In [30]:
from sklearn.neural_network import MLPClassifier

# Small multi-layer perceptron with a single hidden layer of 5 units.
# Training is stochastic, so the seed is fixed (same value as the
# train/test split) to make the score reproducible across runs.
nn = MLPClassifier(hidden_layer_sizes=(5,), random_state=42)

nn.fit(X_train, y_train)

# Predict labels for the held-out test split.
y_pred = nn.predict(X_test)



In [31]:
from sklearn.metrics import accuracy_score

# Fraction of test labels that the neural-network predictions match.
scores = accuracy_score(y_true=y_test, y_pred=y_pred)
print(scores)

0.8836206896551724
