In [1]:
from pathlib import Path
import pandas as pd
import numpy as np
from smount_predictors import SeamountScorer, SeamountTransformer, SeamountHelp, SeamountCVSplitter
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC, SVC
import plotly.express as px
from sklearn.model_selection import GridSearchCV

In [2]:
seamount_centers = SeamountHelp.read_seamount_centers(Path('data/seamount_training_zone.kml'))[['lat', 'lon']].to_numpy()

In [3]:
pipe = Pipeline([
    ('trans', SeamountTransformer()),
    ('predictor', SVC(kernel='linear', class_weight='balanced', C=1.0))  # previous grid searches have found optimal C of 1
])

In [4]:
points = SeamountHelp.readKMLbounds(Path('data/seamount_training_zone.kml'))
data = SeamountHelp.readAndFilterGRD(Path('data') / 'swot_seamounts_labled.nc')
X = data.to_dataframe().reset_index()

In [30]:
from sklearn.metrics.pairwise import haversine_distances
import numpy as np

dists = haversine_distances(np.radians(np.array([data['lat'].values, data['lon'].values]).T))
# np.array([data['lat'].values, data['lon'].values]).T
px.histogram(dists.flatten()).show()

In [36]:
print(dists)

[[0.         0.00040006 0.00080014 ... 0.05658979 0.05699239 0.05739499]
 [0.00040006 0.         0.00040008 ... 0.05618975 0.05659234 0.05699495]
 [0.00080014 0.00040008 0.         ... 0.05578969 0.05619228 0.05659489]
 ...
 [0.05658979 0.05618975 0.05578969 ... 0.         0.00040261 0.00080524]
 [0.05699239 0.05659234 0.05619228 ... 0.00040261 0.         0.00040263]
 [0.05739499 0.05699495 0.05659489 ... 0.00080524 0.00040263 0.        ]]


In [6]:
splitter = SeamountCVSplitter(5)
X_train, X_test = next(splitter.split(X[['lat', 'lon', 'z']], X['Labels']))
X_train = X.iloc[X_train]
y_train = X_train['Labels'].to_numpy()
X_train = X_train[['lat', 'lon', 'z']].to_numpy()
X_test = X.iloc[X_test]
y_test = X_test['Labels'].to_numpy()
X_test = X_test[['lat', 'lon', 'z']].to_numpy()

In [7]:
pipe.fit(X_train, y_train)

In [53]:
import pickle
from sklearn.cluster import DBSCAN, HDBSCAN

class PipelinePredictor:
    def __init__(self, model, clusterer):
        self.model = model
        self.clusterer = clusterer

    def predict(self, data):
        predictions = self.model.predict(data)
        data['class'] = predictions
        data['cluster'] = -1
        data['lat'] = np.radians(data['lat'])
        data['lon'] = np.radians(data['lon'])
        self.clusterer.fit_predict(data[['lon', 'lat']][data['class'] == 1])
        data['cluster'][data['class'] == 1] = self.clusterer.labels_
        return data
    
full_pipeline = PipelinePredictor(pipe, DBSCAN(eps=0.00029088820866630336, min_samples=4, metric='haversine'))
pickle.dump(full_pipeline, open('out/cluster_tuned_model.pkl', 'wb'))

In [48]:
predict = pipe.predict(X_test)
X_test['class'] = predict
clust = DBSCAN(eps=0.00029088820866630336, min_samples=4, metric='haversine')
clust.fit_predict(X_test[['lon', 'lat']][X_test['class'] == 1])
X_test['cluster'] = -1
X_test['cluster'][X_test['class'] == 1] = clust.labels_
px.scatter(X_test[X_test['class'] == 1], x='lon', y='lat', color='cluster').update_layout(
        width=320,
        height=800,
        xaxis=dict(type='linear', autorange=True),  # Adjust x-axis properties
        yaxis=dict(type='linear', autorange=True),  # Adjust y-axis properties
    ).show()


ChainedAssignmentError: behaviour will change in pandas 3.0!
You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0) this will never work to update the original DataFrame or Series, because the intermediate object on which we are setting values will behave as a copy.
A typical example is when you are setting values in a column of a DataFrame, like:

df["col"][row_indexer] = value

Use `df.loc[row_indexer, "col"] = values` instead, to perform the assignment in a single step and ensure this keeps updating the original `df`.

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy




A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [37]:
clust.

{'algorithm': 'auto',
 'eps': 0.0255,
 'leaf_size': 30,
 'metric': 'haversine',
 'metric_params': None,
 'min_samples': 5,
 'n_jobs': None,
 'p': None}

In [10]:
X_test = pd.DataFrame(X_test, columns=['lat', 'lon', 'z'])

In [11]:
y_true = px.scatter(X_test, x='lon', y='lat', color='z')
y_true.update_layout(
        width=320,
        height=800,
        xaxis=dict(type='linear', autorange=True),  # Adjust x-axis properties
        yaxis=dict(type='linear', autorange=True),  # Adjust y-axis properties
    )