In [1]:
import contextlib
import io
from pathlib import Path
import pandas as pd
from smount_predictors import SeamountTransformer, SeamountHelp, SeamountCVSplitter
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
import plotly.express as px
import numpy as np

In [2]:
# Model under search: the project's SeamountTransformer followed by a
# linear-kernel SVC with balanced class weights.
# Note from earlier experiments: previous grid searches have found an
# optimal C of 1.
pipe = Pipeline(steps=[
    ('trans', SeamountTransformer()),
    ('predictor', SVC(kernel='linear', class_weight='balanced')),
])

# Candidate values for the SVC regularisation strength: 100 evenly spaced
# points from 0.1 to 10 (both ends included).
param_grid = dict(predictor__C=np.linspace(0.1, 10, 100))

# Exhaustive search over param_grid, scored by recall, using the project's
# custom CV splitter. n_jobs=-1 uses every available core and verbose=3
# makes scikit-learn print a score/time line for each fold.
grid = GridSearchCV(
    estimator=pipe,
    param_grid=param_grid,
    scoring='recall',
    cv=SeamountCVSplitter(),
    n_jobs=-1,
    verbose=3,
)

In [3]:
# Training-zone bounds read from the KML file. `points` is not referenced
# again in the visible cells; it is kept because it is a notebook global.
points = SeamountHelp.readKMLbounds(Path('data/seamount_training_zone.kml'))

# Labelled training grid loaded (and filtered) by the project helper.
# NOTE(review): `data` exposes `.to_dataframe()`, so it is presumably an
# xarray object — confirm against SeamountHelp.readAndFilterGRD.
data = SeamountHelp.readAndFilterGRD(Path('data') / 'training_data_new.nc')

# Feature matrix: flatten the grid to one row per record and keep only the
# coordinate and 'z' columns.
feature_columns = ['lat', 'lon', 'z']
X = data.to_dataframe().reset_index().loc[:, feature_columns]

In [4]:
# Target vector: the 'Labels' column of the same flattened table that X was
# built from.
y = data.to_dataframe().reset_index()['Labels']

# Create the output directory *before* the search. Individual fits take
# minutes each in the recorded output, so a missing folder must not be able
# to raise FileNotFoundError only after all of that work has finished.
out_dir = Path('out')
out_dir.mkdir(parents=True, exist_ok=True)

# Capture whatever the fit prints to this process's stdout and save it.
grid_output = io.StringIO()
with contextlib.redirect_stdout(grid_output):
    grid.fit(X, y)
with open(out_dir / 'gridsearch_output.txt', 'w') as f:
    f.write(grid_output.getvalue())

# redirect_stdout only swaps sys.stdout inside the kernel process. With
# n_jobs=-1 the per-fold "[CV ...]" lines are printed by worker processes
# and are not captured above, so the text log is incomplete. Persist the
# structured per-candidate results as well so the search is fully recorded.
pd.DataFrame(grid.cv_results_).to_csv(out_dir / 'gridsearch_cv_results.csv', index=False)

# Summary of the refit best model. The score is computed on the training
# data itself with the search's scoring metric (recall).
print(f'train score: {grid.score(X, y)}')
print(f'best params: {grid.best_params_}')
print(f'decision function intercept: {grid.best_estimator_.named_steps["predictor"].intercept_}')
print(f'decision function coef: {grid.best_estimator_.named_steps["predictor"].coef_}')

[CV 1/4] END ..................predictor__C=0.2;, score=0.770 total time= 2.4min
[CV 1/4] END ..................predictor__C=0.1;, score=0.770 total time= 2.4min
[CV 1/4] END ..predictor__C=0.30000000000000004;, score=0.770 total time= 2.5min
[CV 2/4] END ..................predictor__C=0.2;, score=0.856 total time= 2.6min
[CV 2/4] END ..................predictor__C=0.1;, score=0.856 total time= 2.6min
[CV 3/4] END ..................predictor__C=0.1;, score=0.805 total time= 2.6min
[CV 4/4] END ..................predictor__C=0.2;, score=0.740 total time= 2.6min
[CV 3/4] END ..predictor__C=0.30000000000000004;, score=0.805 total time= 2.7min
[CV 2/4] END ..predictor__C=0.30000000000000004;, score=0.856 total time= 2.7min
[CV 3/4] END ..................predictor__C=0.2;, score=0.805 total time= 3.4min
[CV 4/4] END ..................predictor__C=0.1;, score=0.738 total time= 3.4min
[CV 4/4] END ..predictor__C=0.30000000000000004;, score=0.741 total time= 3.5min
[CV 1/4] END ...............



[CV 1/4] END ..................predictor__C=2.6;, score=0.770 total time= 3.4min
[CV 2/4] END ..................predictor__C=2.6;, score=0.856 total time= 3.6min
[CV 3/4] END ..................predictor__C=2.6;, score=0.805 total time= 3.5min
[CV 1/4] END ..................predictor__C=2.7;, score=0.770 total time= 3.3min
[CV 2/4] END ..................predictor__C=2.7;, score=0.856 total time= 3.4min
[CV 3/4] END ..................predictor__C=2.7;, score=0.805 total time= 3.4min
[CV 4/4] END ..................predictor__C=2.7;, score=0.741 total time= 3.5min
[CV 1/4] END ...predictor__C=2.8000000000000003;, score=0.770 total time= 3.2min
[CV 4/4] END ..................predictor__C=2.6;, score=0.741 total time= 4.3min
[CV 2/4] END ...predictor__C=2.8000000000000003;, score=0.856 total time= 3.4min
[CV 3/4] END ...predictor__C=2.8000000000000003;, score=0.805 total time= 3.2min
[CV 4/4] END ...predictor__C=2.8000000000000003;, score=0.741 total time= 3.3min
[CV 1/4] END ...predictor__C

In [5]:
import pickle
from sklearn.cluster import DBSCAN
from smount_predictors.src.SeamountHelp import PipelinePredictor

# Combine the fitted grid search with a DBSCAN clustering step via the
# project's PipelinePredictor (how the two are chained is defined there).
# scikit-learn's haversine metric works in radians, so eps is an angular
# distance. NOTE(review): the value is roughly one arcminute
# (pi / 10800 ~= 2.909e-4 rad) — confirm that this is the intended radius.
full_pipeline = PipelinePredictor(grid, DBSCAN(eps=0.00029088820866630336, min_samples=4, metric='haversine'))

# Persist the combined model. The `with` block guarantees the file handle is
# flushed and closed (the previous bare open() was never closed), and the
# parent directory is created first so the dump cannot fail on a missing folder.
model_path = Path('out/cluster_tuned_model.pkl')
model_path.parent.mkdir(parents=True, exist_ok=True)
with open(model_path, 'wb') as model_file:
    pickle.dump(full_pipeline, model_file)