In [7]:
from pathlib import Path
import pandas as pd
import numpy as np
from smount_predictors import SeamountScorer, SeamountTransformer, SeamountHelp, SeamountCVSplitter
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC, SVC
import plotly.express as px
from sklearn.model_selection import GridSearchCV

In [8]:
# Read the seamount centers from the training-zone KML and keep only the
# 'lat' and 'lon' columns, converted to a NumPy array.
training_zone_centers = SeamountHelp.read_seamount_centers(
    Path('data/seamount_training_zone.kml')
)
seamount_centers = training_zone_centers[['lat', 'lon']].to_numpy()

In [9]:
# Two-stage estimator: the project transformer followed by a linear-kernel
# SVM with balanced class weights.
pipe = Pipeline(steps=[
    ('trans', SeamountTransformer()),
    ('predictor', SVC(kernel='linear', class_weight='balanced')),
])

# Search space: only the SVM's C is tuned, over 10 log-spaced values
# between 1e1 and 1e5. A sweep over 'trans__sigma'
# (np.linspace(0.1, 1, 10)) was tried earlier and is currently disabled.
param_grid = dict(predictor__C=np.logspace(1, 5, 10))

# NOTE(review): `scorer` is constructed here but is not handed to the grid
# search below (no `scoring=` argument), so the search falls back to the
# estimator's own `score` method. Confirm whether that is intended.
scorer = SeamountScorer(seamount_centers)

# error_score='raise' makes a failing fit abort the search instead of being
# recorded as a NaN score; a `scoring='recall'` variant is currently disabled.
grid = GridSearchCV(
    estimator=pipe,
    param_grid=param_grid,
    cv=SeamountCVSplitter(5),
    n_jobs=-1,
    error_score='raise',
    verbose=3,
)

In [10]:
# Input files: the KML describing the training zone and the labelled grid.
training_zone_kml = Path('data/seamount_training_zone.kml')
labelled_grid_path = Path('data') / 'swot_seamounts_labled.nc'

# `points` is not referenced again in the cells visible here.
points = SeamountHelp.readKMLbounds(training_zone_kml)
data = SeamountHelp.readAndFilterGRD(labelled_grid_path)

# Flatten the loaded dataset into a plain table with a fresh integer index.
X = data.to_dataframe().reset_index()

In [11]:
feature_cols = ['lat', 'lon', 'z']

# Only the first fold produced by the splitter is used: its two index sets
# become the train and held-out partitions.
splitter = SeamountCVSplitter(5)
train_idx, test_idx = next(splitter.split(X[feature_cols], X['Labels']))

train_rows = X.iloc[train_idx]
test_rows = X.iloc[test_idx]

# Features and labels are handed on as NumPy arrays.
X_train, y_train = train_rows[feature_cols].to_numpy(), train_rows['Labels'].to_numpy()
X_test, y_test = test_rows[feature_cols].to_numpy(), test_rows['Labels'].to_numpy()

In [12]:
# Run the cross-validated search over `param_grid` on the training partition.
grid.fit(X=X_train, y=y_train)

Fitting 5 folds for each of 10 candidates, totalling 50 fits
[CV 5/5] END .................predictor__C=10.0;, score=0.962 total time=   9.9s
[CV 4/5] END .................predictor__C=10.0;, score=0.950 total time=  10.2s
[CV 2/5] END .................predictor__C=10.0;, score=0.961 total time=  10.7s
[CV 3/5] END .................predictor__C=10.0;, score=0.970 total time=  13.0s
[CV 5/5] END ...predictor__C=27.825594022071243;, score=0.962 total time=  13.2s
[CV 4/5] END ...predictor__C=27.825594022071243;, score=0.950 total time=  13.4s
[CV 2/5] END ...predictor__C=27.825594022071243;, score=0.961 total time=  13.7s
[CV 1/5] END .................predictor__C=10.0;, score=0.931 total time=  14.0s
[CV 1/5] END ...predictor__C=27.825594022071243;, score=0.931 total time=  16.3s
[CV 2/5] END .....predictor__C=77.4263682681127;, score=0.961 total time=  16.7s
[CV 3/5] END ...predictor__C=27.825594022071243;, score=0.970 total time=  17.1s
[CV 1/5] END .....predictor__C=77.4263682681127;

In [13]:
import pickle
from sklearn.cluster import HDBSCAN

class PipelinePredictor:
    """Chain a classifier with a clusterer that runs on its predictions.

    The classifier labels every row; the clusterer is then fitted on the
    'lon', 'lat' and predicted 'class' columns to group the rows.
    """

    def __init__(self, model, clusterer):
        """Store the two stages.

        Args:
            model: Object exposing ``predict(data)``.
            clusterer: Object exposing ``fit_predict(features)`` and, once
                fitted, a ``labels_`` attribute.
        """
        self.model = model
        self.clusterer = clusterer

    def predict(self, data):
        """Classify and cluster ``data``.

        Args:
            data: DataFrame accepted by ``model.predict`` that contains at
                least the 'lon' and 'lat' columns.

        Returns:
            A copy of ``data`` with two extra columns: 'class' (the model's
            predictions) and 'cluster' (the clusterer's ``labels_``).
        """
        predictions = self.model.predict(data)
        # Work on a copy so the caller's frame is left untouched. Mutating it
        # in place would make a second call feed the added 'class'/'cluster'
        # columns back into the model.
        labelled = data.copy()
        labelled['class'] = predictions
        self.clusterer.fit_predict(labelled[['lon', 'lat', 'class']])
        labelled['cluster'] = self.clusterer.labels_
        return labelled
    
full_pipeline = PipelinePredictor(grid, HDBSCAN())

# Persist the fitted search together with the clusterer.
# pickle stores classes by reference, so whoever loads this file needs a
# `PipelinePredictor` class importable under the same module name.
model_path = Path('out/script_accuracy_balenced_model.pkl')
# Create the output directory so a fresh checkout can run this cell.
model_path.parent.mkdir(parents=True, exist_ok=True)
# The context manager guarantees the file is flushed and closed.
with open(model_path, 'wb') as model_file:
    pickle.dump(full_pipeline, model_file)

In [14]:
# Best cross-validated score of the search and the parameters that produced it.
(grid.best_score_, grid.best_params_)

(np.float64(0.9549068919457164),
 {'predictor__C': np.float64(4641.588833612777)})

In [15]:
# Score the refit search on the held-out partition.
grid.score(X=X_test, y=y_test)

0.9288642392090668

In [16]:
# Show the held-out feature array (columns: lat, lon, z) as a sanity check.
X_test

array([[ -19.19166667, -112.025     ,    0.69804347],
       [ -19.19166667, -112.125     ,   -3.73248291],
       [ -19.19166667, -112.00833333,    1.27841341],
       ...,
       [ -17.44166667, -111.74166667,   -2.88247156],
       [ -18.39166667, -112.59166667,   -0.32931018],
       [ -17.79166667, -112.55833333,   -5.46643734]])

In [17]:
# Wrap the held-out feature array in a DataFrame with named columns so the
# plotting cells can refer to 'lat', 'lon' and 'z' by name.
# NOTE(review): this rebinds `X_test` from an array to a DataFrame; earlier
# cells re-run after this one will see the DataFrame.
X_test = pd.DataFrame(data=X_test, columns=['lat', 'lon', 'z'])

In [18]:
# Scatter of the held-out points on lon/lat, coloured by the predicted class.
test_predictions = grid.predict(X_test)
fig = px.scatter(X_test, x='lon', y='lat', color=test_predictions)

# Tall, narrow canvas; only the y-axis is explicitly configured (the matching
# x-axis setting is intentionally left at its default here).
fig.update_layout(
    width=300,
    height=800,
    yaxis={'type': 'linear', 'autorange': True},
)
fig.show()

In [19]:
# Scatter of the held-out points on lon/lat, coloured by the 'z' column.
# NOTE(review): despite the name `y_true`, the colour comes from the 'z'
# feature rather than from the labels in `y_test` — confirm the intent.
y_true = px.scatter(X_test, x='lon', y='lat', color='z')

# Both axes are linear with automatic ranging. The call is left as the
# cell's final expression so the figure is displayed.
y_true.update_layout(
    width=320,
    height=800,
    xaxis={'type': 'linear', 'autorange': True},
    yaxis={'type': 'linear', 'autorange': True},
)