In [1]:
from pathlib import Path
import pandas as pd
import numpy as np
import xarray as xr
from bs4 import BeautifulSoup
from smount_predictors import SeamountScorer, SeamountTransformer, SeamountHelp, SeamountCVSplitter
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC, SVC
from sklearn.calibration import CalibratedClassifierCV
import plotly.express as px
from sklearn.model_selection import GridSearchCV, train_test_split

In [2]:
# Seamount centre coordinates from the training-zone KML, reduced to the
# 'lat' and 'lon' columns and converted to a NumPy array.
training_zone_kml = Path('data/seamount_training_zone.kml')
centers_table = SeamountHelp.read_seamount_centers(training_zone_kml)
seamount_centers = centers_table[['lat', 'lon']].to_numpy()

In [3]:
# Model: the project's SeamountTransformer followed by a linear-kernel SVM.
# Class 1 is given ten times the weight of class 0.
svm_classifier = SVC(kernel='linear', class_weight={0: 1, 1: 10})
pipe = Pipeline(steps=[
    ('trans', SeamountTransformer()),
    ('predictor', svm_classifier),
])

# Candidate regularisation strengths: 10 log-spaced values from 1e1 to 1e5.
c_candidates = np.logspace(1, 5, 10)
param_grid = {'predictor__C': c_candidates}

# NOTE(review): `scorer` is built from the seamount centres but is not passed
# to the search below, which scores with plain 'recall' — confirm whether it
# was meant to be used as `scoring`.
scorer = SeamountScorer(seamount_centers)

# Exhaustive search over `param_grid`, cross-validated with the project's
# splitter and run on all available cores.
# (Pass error_score='raise' here when debugging failing fits.)
grid = GridSearchCV(
    estimator=pipe,
    param_grid=param_grid,
    scoring='recall',
    cv=SeamountCVSplitter(5),
    n_jobs=-1,
    verbose=3,
)

In [4]:
# Bounds of the training zone, read from the KML file.
points = SeamountHelp.readKMLbounds(Path('data/seamount_training_zone.kml'))

# The grid file is filtered using the first two and the remaining bound values.
# NOTE(review): presumably these are two coordinate ranges (e.g. lat / lon) —
# confirm the order expected by readAndFilterGRD.
leading_bounds = points[:2]
trailing_bounds = points[2:]
swot_grid_path = Path('data') / 'vgg_swot.grd'
data = SeamountHelp.readAndFilterGRD(swot_grid_path, leading_bounds, trailing_bounds)

In [5]:
# Labelled grid, read without any bounds arguments.
y = SeamountHelp.readAndFilterGRD(Path('data') / 'vgg_seamounts_labled.nc')

# Left-join the labels onto the gridded data by coordinate: every row of
# `data` is kept, and rows with no matching (lat, lon) in `y` get NaN labels.
grid_table = data.to_dataframe().reset_index()
label_table = y.to_dataframe().reset_index()
X = grid_table.merge(label_table, on=['lat', 'lon'], how='left')

# 'z_x' carries pandas' default left-hand suffix, i.e. it is the `z` column
# that came from `data` (presumably `y` also has a `z` column — verify).
feature_columns = ['lat', 'lon', 'z_x']

# Use the first fold produced by the project's splitter as the train/test split.
splitter = SeamountCVSplitter(5)
train_idx, test_idx = next(splitter.split(X[feature_columns], X['Labels']))

train_rows = X.iloc[train_idx]
test_rows = X.iloc[test_idx]

X_train = train_rows[feature_columns].to_numpy()
y_train = train_rows['Labels'].to_numpy()
X_test = test_rows[feature_columns].to_numpy()
y_test = test_rows['Labels'].to_numpy()

In [6]:
# Run the cross-validated search over C on the training split only;
# the held-out split is reserved for the evaluation cells that follow.
grid.fit(X=X_train, y=y_train)

Fitting 5 folds for each of 10 candidates, totalling 50 fits
[CV 4/5] END .................predictor__C=10.0;, score=0.243 total time=   2.2s
[CV 1/5] END .................predictor__C=10.0;, score=0.000 total time=   2.4s
[CV 2/5] END .................predictor__C=10.0;, score=0.561 total time=   2.4s
[CV 5/5] END .................predictor__C=10.0;, score=0.120 total time=   2.4s
[CV 3/5] END .................predictor__C=10.0;, score=0.577 total time=   3.1s
[CV 5/5] END ...predictor__C=27.825594022071243;, score=0.120 total time=   3.6s
[CV 2/5] END ...predictor__C=27.825594022071243;, score=0.561 total time=   3.7s
[CV 1/5] END ...predictor__C=27.825594022071243;, score=0.000 total time=   3.9s
[CV 4/5] END ...predictor__C=27.825594022071243;, score=0.243 total time=   4.0s
[CV 3/5] END ...predictor__C=27.825594022071243;, score=0.577 total time=   5.7s
[CV 2/5] END .....predictor__C=77.4263682681127;, score=0.561 total time=   7.0s
[CV 1/5] END .....predictor__C=77.4263682681127;

In [7]:
# Mean cross-validated score of the best candidate and the C value that achieved it.
# NOTE(review): the recorded output picks C = 1e5, the upper edge of the
# searched range (logspace(1, 5)), so the optimum may lie outside the grid —
# consider extending the range.
grid.best_score_, grid.best_params_

(np.float64(0.4887473123082879), {'predictor__C': np.float64(100000.0)})

In [8]:
# Score the refitted best estimator on the held-out split. The search was
# configured with scoring='recall', so the value reported here is recall.
grid.score(X_test, y_test)

np.float64(0.75)

In [9]:
# Inspect the held-out feature array; its columns follow the selection order
# used when it was built: lat, lon, z_x.
X_test

array([[ -19.09166667, -112.875     ,    0.35392523],
       [ -19.075     , -112.875     ,    0.35948601],
       [ -18.99166667, -112.875     ,    4.834445  ],
       ...,
       [ -18.54166667, -112.125     ,   -0.95524698],
       [ -16.925     , -111.925     ,   -2.97528911],
       [ -18.175     , -112.34166667,   -0.52246022]])

In [10]:
# Wrap the held-out features in a DataFrame so the plots can refer to columns by name.
# NOTE(review): this rebinds `X_test` (array -> DataFrame), so earlier cells
# re-run after this one will receive a DataFrame instead of an array.
plot_columns = ['lat', 'lon', 'z']
X_test = pd.DataFrame(data=X_test, columns=plot_columns)

In [11]:
# Map of the held-out points, coloured by the label predicted by the fitted search.
predicted_labels = grid.predict(X_test)
fig = px.scatter(X_test, x='lon', y='lat', color=predicted_labels)

# Tall, narrow canvas; only the y-axis is configured explicitly.
layout_options = {
    'width': 300,
    'height': 800,
    'yaxis': {'type': 'linear', 'autorange': True},
}
fig.update_layout(**layout_options)
fig.show()

In [14]:
# Reference map to compare against the prediction plot: the same held-out
# points, coloured by their true labels (`y_test`). The original coloured by
# the `z` feature, which did not match the figure's `y_true` name.
# Requires `X_test` to already be the DataFrame with 'lat' / 'lon' columns;
# `y_test` is row-aligned with it because both come from the same test split.
y_true = px.scatter(
    X_test,
    x='lon',
    y='lat',
    color=y_test,
    title='True labels',
)
y_true.update_layout(
        width=320,
        height=800,
        xaxis=dict(type='linear', autorange=True),  # Adjust x-axis properties
        yaxis=dict(type='linear', autorange=True),  # Adjust y-axis properties
    )