In [1]:
import h5py
import numpy as np

The data is available as one HDF5 file per year

HDF5: Made of two different objects:

- Datasets (array-like collections of data)
- Groups (folder-like containers that hold datasets and other groups)

-> Groups are like dicts, and datasets are like NumPy arrays

In [5]:
data_path = "/Users/Oliver/Desktop/CCAI Hackathon/climo_2005.h5"
h5f = h5py.File(data_path, 'r')

In [6]:
# h5f acts like a Python dict

images = h5f["images"] # (1456,16,768,1152) numpy array
boxes = h5f["boxes"] # (1456,15,5) numpy array

In [7]:
h5f.keys()

<KeysViewHDF5 ['boxes', 'images']>

In [8]:
boxes[764]

array([[ 224,  246,  304,  326,    1],
       [ 441, 1005,  485, 1049,    1],
       [  66,    9,  146,   89,    2],
       [ 143,  601,  223,  681,    2],
       [ 122,  685,  202,  765,    2],
       [  -1,   -1,   -1,   -1,   -1],
       [  -1,   -1,   -1,   -1,   -1],
       [  -1,   -1,   -1,   -1,   -1],
       [  -1,   -1,   -1,   -1,   -1],
       [  -1,   -1,   -1,   -1,   -1],
       [  -1,   -1,   -1,   -1,   -1],
       [  -1,   -1,   -1,   -1,   -1],
       [  -1,   -1,   -1,   -1,   -1],
       [  -1,   -1,   -1,   -1,   -1],
       [  -1,   -1,   -1,   -1,   -1]])

In [9]:
# Viewing temperatures (K)

for x in images[563,8]:
    print(x)

[217.23242 217.23251 217.23322 ... 217.23322 217.23193 217.23245]
[218.16585 218.19518 218.18735 ... 218.22514 218.17683 218.18214]
[217.23804 217.24225 217.2722  ... 217.1987  217.21407 217.21585]
[214.11479 214.14964 214.16493 ... 214.08148 214.07487 214.09491]
[214.2488  214.30737 214.37027 ... 214.0838  214.14209 214.18692]
[213.48666 213.50746 213.5493  ... 213.38326 213.41652 213.45244]
[213.07707 213.08438 213.08926 ... 213.06837 213.07101 213.07335]
[213.06447 213.06772 213.07013 ... 213.06403 213.06152 213.05893]
[214.68909 214.70967 214.73781 ... 214.6077  214.63374 214.66084]
[215.65869 215.69028 215.73164 ... 215.5234  215.5664  215.61151]
[216.05489 216.16873 216.27496 ... 215.70482 215.82405 215.94046]
[215.9146  216.0542  216.19215 ... 215.47209 215.63963 215.78175]
[215.13168 215.20995 215.30087 ... 214.66534 214.93533 215.06291]
[214.60193 214.88231 214.90016 ... 214.45819 214.70663 214.55257]
[214.34508 214.31845 214.30965 ... 214.4616  214.41806 214.37222]
[214.61612

[299.93246 299.87442 298.2547  ... 299.7637  299.70496 299.41946]
[300.11377 300.46558 298.5462  ... 299.81268 299.94952 299.9349 ]
[300.22253 299.98102 298.8448  ... 300.40414 300.1688  300.11887]
[300.0554  300.37015 299.49286 ... 300.92242 300.82324 299.9333 ]
[300.12207 300.24042 300.13174 ... 300.28967 300.33258 299.96497]
[300.40414 300.25427 300.14264 ... 300.4859  300.37677 300.34943]
[300.49936 300.38644 299.95926 ... 300.78024 300.78534 300.69952]
[300.65363 300.54684 300.79034 ... 300.76532 300.8771  300.65845]
[300.5464  300.67462 300.85916 ... 300.8333  300.84607 300.6056 ]
[301.10138 300.99292 300.60553 ... 300.70697 301.29324 300.6978 ]
[301.48416 301.57584 301.7989  ... 300.7254  301.5488  301.59958]
[302.32767 302.41556 302.34198 ... 302.2553  301.97452 302.15558]
[303.40277 303.07837 302.56882 ... 302.82254 303.24097 303.2406 ]
[303.90314 303.54263 303.34732 ... 303.31528 304.20093 304.1709 ]
[304.776   304.7854  303.81152 ... 303.97748 304.7782  304.96664]
[305.14014

In [10]:
print(images)
print(boxes)

<HDF5 dataset "images": shape (1456, 16, 768, 1152), type "<f4">
<HDF5 dataset "boxes": shape (1456, 15, 5), type "<i4">


In [11]:
images.dtype

dtype('<f4')

`f` is short for a float
`<` indicates the byte-order: https://docs.scipy.org/doc/numpy/reference/generated/numpy.dtype.byteorder.html

“HDF” stands for “Hierarchical Data Format”. Every object in an HDF5 file has a name, and they’re arranged in a POSIX-style hierarchy with `/`-separators

In [14]:
# Keeping only temperature

samples = images[:,0:1,:,:]
samples.shape

(1456, 1, 768, 1152)

In [15]:
# Keeping only the classes from the boxes, which are (1460,15,5)
# The 5 elements per row are (ymin, xmin, ymax, xmax, class)
targets = boxes[:,:,4]
targets.shape

(1456, 15)

In [16]:
len(targets[1])

15

In [17]:
# Keep only 2nd channel. We have 15 channels, and so for dimensionality
# reduction, we picked the second row, because the classes are ranked somewhat in ascending order,
# keeping the second row would assure we would almost always have a hurricane (1) and occasionally a normal class (-1)

targets = targets[:,1:2]
targets.shape

(1456, 1)

In [18]:
print(targets)

[[ 3]
 [ 3]
 [ 1]
 ...
 [ 2]
 [-1]
 [-1]]


In [19]:
# Taking only hurricanes and normal days
targets_mask = np.isin(targets,[1, -1])
targets_mask.shape

(1456, 1)

In [20]:
targets_mask

array([[False],
       [False],
       [ True],
       ...,
       [False],
       [ True],
       [ True]])

In [21]:
targets = targets[targets_mask]
targets.shape

(772,)

In [22]:
targets

array([ 1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
        1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
        1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
        1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
        1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
        1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
        1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1, -1,
        1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
        1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1, -1, -1, -1, -1,
       -1, -1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
        1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
        1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
        1,  1,  1,  1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
       -1, -1, -1, -1, -1

In [23]:
# Restricting only to -1 and 1
samples = samples[targets_mask]
print(samples.shape)
print(samples)

(772, 768, 1152)
[[[0.0000000e+00 0.0000000e+00 0.0000000e+00 ... 0.0000000e+00
   0.0000000e+00 0.0000000e+00]
  [0.0000000e+00 0.0000000e+00 0.0000000e+00 ... 0.0000000e+00
   0.0000000e+00 0.0000000e+00]
  [0.0000000e+00 0.0000000e+00 0.0000000e+00 ... 0.0000000e+00
   0.0000000e+00 0.0000000e+00]
  ...
  [3.0606830e-11 3.1990036e-11 3.3502492e-11 ... 2.7483831e-11
   2.8352607e-11 2.9389536e-11]
  [5.1833249e-11 5.3090449e-11 5.4543033e-11 ... 4.8652582e-11
   4.9655089e-11 5.0706366e-11]
  [1.1481748e-11 1.1481748e-11 1.1481748e-11 ... 1.1481748e-11
   1.1481748e-11 1.1481748e-11]]

 [[2.3633938e-17 2.3633938e-17 2.3633938e-17 ... 2.3633938e-17
   2.3633938e-17 2.3633938e-17]
  [0.0000000e+00 0.0000000e+00 0.0000000e+00 ... 0.0000000e+00
   0.0000000e+00 0.0000000e+00]
  [0.0000000e+00 0.0000000e+00 0.0000000e+00 ... 0.0000000e+00
   0.0000000e+00 0.0000000e+00]
  ...
  [7.2563090e-14 7.1567494e-14 7.0690266e-14 ... 7.6558192e-14
   7.5063592e-14 7.3719873e-14]
  [3.3591768e-14 3.

In [24]:
samples.shape

(772, 768, 1152)

In [25]:
# Flattening image dimensions into 1 array
samples = np.resize(samples,(772,1,884736))
print(samples.shape)
print(samples)

(772, 1, 884736)
[[[0.0000000e+00 0.0000000e+00 0.0000000e+00 ... 1.1481748e-11
   1.1481748e-11 1.1481748e-11]]

 [[2.3633938e-17 2.3633938e-17 2.3633938e-17 ... 8.7380324e-18
   8.7380324e-18 8.7380324e-18]]

 [[0.0000000e+00 0.0000000e+00 0.0000000e+00 ... 3.1844322e-10
   3.1844322e-10 3.1844322e-10]]

 ...

 [[4.2212719e-11 4.2212719e-11 4.2212719e-11 ... 4.1322118e-10
   4.1322118e-10 4.1322118e-10]]

 [[0.0000000e+00 0.0000000e+00 0.0000000e+00 ... 4.6566853e-09
   4.6566853e-09 4.6566844e-09]]

 [[0.0000000e+00 0.0000000e+00 0.0000000e+00 ... 4.5711218e-08
   4.5711218e-08 4.5711218e-08]]]


In [26]:
# Flattening temp 
samples = np.resize(samples,(772,884736))
print(samples.shape)
print(samples)

(772, 884736)
[[0.0000000e+00 0.0000000e+00 0.0000000e+00 ... 1.1481748e-11
  1.1481748e-11 1.1481748e-11]
 [2.3633938e-17 2.3633938e-17 2.3633938e-17 ... 8.7380324e-18
  8.7380324e-18 8.7380324e-18]
 [0.0000000e+00 0.0000000e+00 0.0000000e+00 ... 3.1844322e-10
  3.1844322e-10 3.1844322e-10]
 ...
 [4.2212719e-11 4.2212719e-11 4.2212719e-11 ... 4.1322118e-10
  4.1322118e-10 4.1322118e-10]
 [0.0000000e+00 0.0000000e+00 0.0000000e+00 ... 4.6566853e-09
  4.6566853e-09 4.6566844e-09]
 [0.0000000e+00 0.0000000e+00 0.0000000e+00 ... 4.5711218e-08
  4.5711218e-08 4.5711218e-08]]


In [27]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(samples, targets, test_size=0.3, random_state=42, shuffle=True)

In [28]:
# Create and fit (train) a nearest-neighbor classifier
from sklearn.neighbors import KNeighborsClassifier

knn = KNeighborsClassifier(n_jobs=-1)

In [29]:
knn.fit(X_train, y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=-1, n_neighbors=5, p=2,
                     weights='uniform')

In [30]:
y_pred = knn.predict(X_test)

In [31]:
from sklearn.metrics import accuracy_score

scores = accuracy_score(y_test, y_pred)

print(scores)

0.9051724137931034


In [32]:
from sklearn.linear_model import LogisticRegression

logreg = LogisticRegression()

logreg.fit(X_train, y_train)

y_pred = logreg.predict(X_test)



In [33]:
from sklearn.metrics import accuracy_score

scores = accuracy_score(y_test, y_pred)

print(scores)

0.8836206896551724


In [34]:
from sklearn.svm import SVC

svm = SVC()

svm.fit(X_train, y_train)

y_pred = svm.predict(X_test)



In [35]:
from sklearn.metrics import accuracy_score

scores = accuracy_score(y_test, y_pred)

print(scores)

0.8836206896551724


In [36]:
from sklearn.neural_network import MLPClassifier

nn = MLPClassifier(hidden_layer_sizes=(5,))

nn.fit(X_train, y_train)

y_pred = nn.predict(X_test)



In [37]:
from sklearn.metrics import accuracy_score

scores = accuracy_score(y_test, y_pred)

print(scores)

0.8836206896551724
