In [3]:
import xarray as xr
import numpy as np
import pandas as pd

import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, cross_validate
from sklearn.metrics import classification_report
from sklearn.linear_model import SGDClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import make_pipeline

from tqdm.notebook import tqdm

sns.set_theme('notebook')

In [4]:
data = xr.open_dataset('data/oaflux_air_sea_fluxes_train.nc', engine='netcdf4')
labels = xr.open_dataset('data/marine_heatwave_labels_train.nc', engine='netcdf4')

In [5]:
df = None
variables = list(data.variables)
for var in tqdm(variables[3:]): 
    
    stacked = data[var].stack(dim=["lon", "lat", "time"]).to_pandas().T
    if df is None:
        df = stacked
    else:
        df = pd.concat([df, stacked], axis=1)

# df = df.dropna(0, how='all')
df.reset_index(inplace=True)
df.columns = variables

  0%|          | 0/10 [00:00<?, ?it/s]

In [6]:
# sort variables and remove rows where ALL variables are NaN
df = df.sort_values(by=["lon", "lat", "time"], ascending=[True] * 3)

In [7]:
df["month"] = df.time.dt.month
df["day"] = df.time.dt.day
df = df.drop(["time"], axis=1)
df = df.fillna(-1)

In [8]:
targets = labels.mhw_label \
    .stack(dim=["lon", "lat", "time"]) \
    .to_pandas().T.reset_index() 

targets = targets \
    .sort_values(by=["lon", "lat", "time"]) \
    .drop(["lon", "lat", "time"], axis=1)

In [9]:
targets

Unnamed: 0,0
0,0.0
1,0.0
2,0.0
3,0.0
4,0.0
...,...
26697595,0.0
26697596,0.0
26697597,0.0
26697598,0.0


In [10]:
X_train, X_test, y_train, y_test = \
    train_test_split(df, targets, test_size=0.3, shuffle=True, stratify=targets, random_state=1)

In [11]:
scoring = ['accuracy', 'precision', 'recall']
pipe = make_pipeline(StandardScaler(), SGDClassifier(loss='hinge', verbose=10))
scores = cross_validate(pipe, X_train, y_train, scoring=scoring, verbose=10, n_jobs=6)

pipe.fit(X_train, y_train)
y_pred = pipe.predict(X_test)
report = classification_report(y_test, y_pred)

print(scores)
print(report)

[Parallel(n_jobs=6)]: Using backend LokyBackend with 6 concurrent workers.
[Parallel(n_jobs=6)]: Done   2 out of   5 | elapsed:  2.7min remaining:  4.1min
[Parallel(n_jobs=6)]: Done   3 out of   5 | elapsed:  2.8min remaining:  1.8min
[Parallel(n_jobs=6)]: Done   5 out of   5 | elapsed:  2.9min remaining:    0.0s
[Parallel(n_jobs=6)]: Done   5 out of   5 | elapsed:  2.9min finished
  return f(*args, **kwargs)


-- Epoch 1
Norm: 0.03, NNZs: 13, Bias: -1.004561, T: 18688320, Avg. loss: 0.114722
Total training time: 4.22 seconds.
-- Epoch 2
Norm: 0.02, NNZs: 13, Bias: -1.003394, T: 37376640, Avg. loss: 0.110402
Total training time: 8.37 seconds.
-- Epoch 3
Norm: 0.01, NNZs: 13, Bias: -1.002222, T: 56064960, Avg. loss: 0.110264
Total training time: 12.53 seconds.
-- Epoch 4
Norm: 0.01, NNZs: 13, Bias: -1.001584, T: 74753280, Avg. loss: 0.110207
Total training time: 16.76 seconds.
-- Epoch 5
Norm: 0.01, NNZs: 13, Bias: -1.001121, T: 93441600, Avg. loss: 0.110176
Total training time: 20.91 seconds.
-- Epoch 6
Norm: 0.01, NNZs: 13, Bias: -1.001036, T: 112129920, Avg. loss: 0.110156
Total training time: 25.06 seconds.
-- Epoch 7
Norm: 0.01, NNZs: 13, Bias: -1.000874, T: 130818240, Avg. loss: 0.110143
Total training time: 29.20 seconds.
Convergence after 7 epochs took 29.20 seconds


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


{'fit_time': array([158.27038169, 152.63811278, 154.05543113, 163.49093199,
       152.2410233 ]), 'score_time': array([4.48801208, 5.33720255, 4.90810657, 4.26860619, 5.38021255]), 'test_accuracy': array([0.94496589, 0.94496589, 0.94496589, 0.94496563, 0.94496563]), 'test_precision': array([0., 0., 0., 0., 0.]), 'test_recall': array([0., 0., 0., 0., 0.])}
              precision    recall  f1-score   support

         0.0       0.94      1.00      0.97   7568495
         1.0       0.00      0.00      0.00    440785

    accuracy                           0.94   8009280
   macro avg       0.47      0.50      0.49   8009280
weighted avg       0.89      0.94      0.92   8009280



  _warn_prf(average, modifier, msg_start, len(result))


In [12]:
pipe = make_pipeline(StandardScaler(), SGDClassifier(loss='log', verbose=10))
scores = cross_validate(pipe, X_train, y_train, scoring=scoring, verbose=10, n_jobs=6)

pipe.fit(X_train, y_train)
y_pred = pipe.predict(X_test)
report = classification_report(y_test, y_pred)

print(scores)
print(report)

[Parallel(n_jobs=6)]: Using backend LokyBackend with 6 concurrent workers.
[Parallel(n_jobs=6)]: Done   2 out of   5 | elapsed:  2.5min remaining:  3.8min
[Parallel(n_jobs=6)]: Done   3 out of   5 | elapsed:  2.5min remaining:  1.7min
[Parallel(n_jobs=6)]: Done   5 out of   5 | elapsed:  2.9min remaining:    0.0s
[Parallel(n_jobs=6)]: Done   5 out of   5 | elapsed:  2.9min finished
  return f(*args, **kwargs)


-- Epoch 1
Norm: 1.84, NNZs: 13, Bias: -3.038197, T: 18688320, Avg. loss: 0.203004
Total training time: 5.63 seconds.
-- Epoch 2
Norm: 1.84, NNZs: 13, Bias: -3.021909, T: 37376640, Avg. loss: 0.201134
Total training time: 11.25 seconds.
-- Epoch 3
Norm: 1.84, NNZs: 13, Bias: -3.037041, T: 56064960, Avg. loss: 0.201104
Total training time: 16.84 seconds.
-- Epoch 4
Norm: 1.84, NNZs: 13, Bias: -3.074667, T: 74753280, Avg. loss: 0.201092
Total training time: 22.44 seconds.
-- Epoch 5
Norm: 1.84, NNZs: 13, Bias: -3.049936, T: 93441600, Avg. loss: 0.201084
Total training time: 28.01 seconds.
-- Epoch 6
Norm: 1.84, NNZs: 13, Bias: -3.048616, T: 112129920, Avg. loss: 0.201081
Total training time: 33.63 seconds.
-- Epoch 7
Norm: 1.84, NNZs: 13, Bias: -3.053702, T: 130818240, Avg. loss: 0.201077
Total training time: 39.22 seconds.
Convergence after 7 epochs took 39.22 seconds
{'fit_time': array([161.83682585, 163.35950518, 139.76409268, 133.17167187,
       141.89087915]), 'score_time': array([

In [None]:
# Takes too long on this data
# pipe = make_pipeline(StandardScaler(), KNeighborsClassifier(n_neighbors=10))
# scores = cross_validate(pipe, X_train, y_train, scoring=scoring, verbose=10, n_jobs=6)

# pipe.fit(X_train, y_train)
# y_pred = pipe.predict(X_test)
# report = classification_report(y_test, y_pred)

# print(scores)
# print(report)

[Parallel(n_jobs=6)]: Using backend LokyBackend with 6 concurrent workers.
