In [1]:
from pathlib import Path
import pandas as pd
import numpy as np
import xarray as xr
from bs4 import BeautifulSoup
from smount_predictors import SeamountScorer, SeamountTransformer, SeamountHelp, SeamountCVSplitter
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC, SVC
from sklearn.calibration import CalibratedClassifierCV
import plotly.express as px
from sklearn.model_selection import GridSearchCV, train_test_split

In [2]:
# Read the seamount centres from the training-zone KML and keep only the
# 'lat' and 'lon' columns, as a plain NumPy array.
training_zone_kml = Path('data') / 'seamount_training_zone.kml'
centers_frame = SeamountHelp.read_seamount_centers(training_zone_kml)
seamount_centers = centers_frame[['lat', 'lon']].to_numpy()

In [3]:
# Two-stage pipeline: the project transformer feeds a linear-kernel SVC.
pipe = Pipeline(steps=[
    ('trans', SeamountTransformer()),
    ('predictor', SVC(kernel='linear')),
])

# Ten log-spaced candidates for the SVC's C, from 1e-10 up to 1e5.
param_grid = dict(predictor__C=np.logspace(-10, 5, num=10))

# Built from the seamount centres loaded earlier; it is NOT passed to the
# search below.
scorer = SeamountScorer(seamount_centers)

# `scoring` is left unset (the `scoring=scorer` argument remains disabled),
# so GridSearchCV ranks candidates with the pipeline's own `score` method.
grid = GridSearchCV(
    estimator=pipe,
    param_grid=param_grid,
    cv=SeamountCVSplitter(5),
    n_jobs=-1,            # use all available processors
    error_score='raise',  # let a failing fit propagate instead of scoring it NaN
    verbose=2,
)

In [4]:
# Bounds of the training zone as returned by readKMLbounds.
# NOTE(review): the ordering/meaning of the entries in `points` is not visible
# here — presumably two corner coordinates; confirm against SeamountHelp.
points = SeamountHelp.readKMLbounds(Path('data') / 'seamount_training_zone.kml')

# Load the SWOT grid, handing the first two bound values and the remaining
# ones to the reader as two separate arguments.
leading_bounds, trailing_bounds = points[:2], points[2:]
data = SeamountHelp.readAndFilterGRD(Path('data', 'vgg_swot.grd'), leading_bounds, trailing_bounds)

In [5]:
# Labelled grid; read without any bounds arguments.
y = SeamountHelp.readAndFilterGRD(Path('data') / 'vgg_seamounts_labled.nc')

# Flatten both grids to long-form frames and attach the labels to every
# feature row by exact (lat, lon) match. The left join keeps all feature rows.
features_frame = data.to_dataframe().reset_index()
labels_frame = y.to_dataframe().reset_index()
X = features_frame.merge(labels_frame, on=['lat', 'lon'], how='left')

# Longitude that separates the held-out test strip from the training region.
SPLIT_LON = -112.5583
# 'z_x' is the left-hand (feature) frame's `z` after pandas' merge suffixing.
FEATURE_COLUMNS = ['lat', 'lon', 'z_x']

test_frame = X.loc[X['lon'].lt(SPLIT_LON)]
train_frame = X.loc[X['lon'].ge(SPLIT_LON)]

# Targets first, then the feature matrices, all as NumPy arrays.
y_train = train_frame['Labels'].to_numpy()
y_test = test_frame['Labels'].to_numpy()
X_train = train_frame[FEATURE_COLUMNS].to_numpy()
X_test = test_frame[FEATURE_COLUMNS].to_numpy()

In [6]:
# Run the cross-validated grid search on the training split.
# NOTE(review): the saved output of this cell ends in
# `ValueError: Found input variables with inconsistent numbers of samples: [43, 3601]`.
# The cause is not visible from this notebook — presumably the transformer or
# the CV splitter changes the number of samples relative to `y`; confirm.
grid.fit(X_train, y_train)

Fitting 5 folds for each of 10 candidates, totalling 50 fits
[CV] END .................................predictor__C=1e-10; total time=   0.4s
[CV] END .................................predictor__C=1e-10; total time=   0.4s
[CV] END .................................predictor__C=1e-10; total time=   0.4s
[CV] END .................predictor__C=4.641588833612773e-09; total time=   0.3s
[CV] END .................predictor__C=4.641588833612773e-09; total time=   0.4s
[CV] END .................................predictor__C=1e-10; total time=   0.5s
[CV] END .................................predictor__C=1e-10; total time=   0.4s
[CV] END .................predictor__C=4.641588833612773e-09; total time=   0.3s
[CV] END ................predictor__C=2.1544346900318867e-07; total time=   0.3s
[CV] END ................predictor__C=2.1544346900318867e-07; total time=   0.3s
[CV] END .................predictor__C=4.641588833612773e-09; total time=   0.4s
[CV] END .................predictor__C=4.6415888

ValueError: Found input variables with inconsistent numbers of samples: [43, 3601]

In [None]:
# Show the best mean cross-validation score and the parameter setting that achieved it.
grid.best_score_, grid.best_params_

In [None]:
# Score the fitted search on the held-out split (rows with lon < -112.5583).
grid.score(X_test, y_test)

In [None]:
# Display the held-out feature array (columns selected upstream: lat, lon, z_x).
X_test

In [None]:
# Wrap the held-out features in a DataFrame with named columns so the plotting
# cells can address 'lon' and 'lat' by name.
# NOTE(review): this rebinds `X_test` from a NumPy array to a DataFrame, and
# the third column is called 'z' here although it was selected as 'z_x' upstream.
X_test = pd.DataFrame(X_test, columns=['lat', 'lon', 'z'])

In [None]:
# Scatter the held-out points in lon/lat space, coloured by the label the
# fitted search predicts for each row.
predicted_labels = grid.predict(X_test.to_numpy())
fig = px.scatter(X_test, x='lon', y='lat', color=predicted_labels)

# Tall, narrow canvas; only the y-axis is overridden (linear, auto-ranged),
# the x-axis keeps plotly's defaults.
fig.update_layout(
    width=300,
    height=800,
    yaxis={'type': 'linear', 'autorange': True},
)
fig.show()

In [None]:
# Same lon/lat scatter as the prediction plot, but coloured by the true labels.
# Note that `y_true` holds the plotly figure, not a label array.
y_true = px.scatter(X_test, x='lon', y='lat', color=y_test)

# Tall, narrow canvas with both axes forced to linear, auto-ranged scales.
y_true.update_layout(
    width=300,
    height=800,
    xaxis={'type': 'linear', 'autorange': True},
    yaxis={'type': 'linear', 'autorange': True},
)