In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.cluster import DBSCAN
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import silhouette_score
import plotly.express as px
from DBSCANSupport import *
from LocalPath import LOCALPATH

Load gravity data

In [2]:
# Read the gravity CSV and drop its "old_ind" column before any further use.
grav = (
    pd.read_csv(LOCALPATH + "data/test_grav.csv")
    .drop(labels=["old_ind"], axis="columns")
)


In [3]:
# Pull each column of interest out of the gravity frame as a plain list.
lat, longg, inten = (
    list(grav[column]) for column in ("Latitude", "Longitude", "Intensity")
)

In [4]:
# Feature matrix handed to the grid search: one row per grid point, with the
# columns in the order Latitude, Longitude, Intensity.
X = grav.loc[:, ["Latitude", "Longitude", "Intensity"]].to_numpy()
X

array([[ -1.50833333, -97.99166667,  -5.5733223 ],
       [ -1.50833333, -97.975     ,  -5.16694736],
       [ -1.50833333, -97.95833333,  -4.97332668],
       ...,
       [ -5.99166667, -90.04166667,  -2.35047531],
       [ -5.99166667, -90.025     ,  -0.47824201],
       [ -5.99166667, -90.00833333,  -0.67400765]])

In [5]:
# Candidate values swept by the grid search in the following cell.
# NOTE(review): the saved output of that cell reports first-parameter values
# starting at 0.1 (step ~0.0474) and second-parameter values 20-39, and the
# saved best params are (0.4789..., 20). None of those can come from the grids
# below, so this cell appears to have been edited after its last run --
# restart the kernel and re-run, or restore the grids, before trusting outputs.
test_eps = np.linspace(start=0.35, stop=1, num=20)
test_samp = np.arange(5, 30)

# Helper object built from the mask workbook and a 4-tuple test zone.
# NOTE(review): a later cell filters with this tuple as
# (lat_min, lat_max, lon_min, lon_max) -- confirm DBSCANSupport reads it the same way.
DBModel_test = DBSCANSupport(
    LOCALPATH + "data/sample_mask.txt.xlsx",
    test_zone=(-6, -1.5, -98, -90),
)

In [6]:
# Run the grid search over both candidate grids on X, using the helper's own
# outlierDeviation method as the scoring callable. Three values come back:
# a score, a parameter pair and an output array.
score, params, data_out = DBModel_test.gridSearch(
    test_eps,
    test_samp,
    X,
    DBModel_test.outlierDeviation,
    verbose=True,  # prints one line per parameter pair, as in the saved output
)

Score for 0.1 and 20 is -761.1324503311258
Score for 0.1 and 21 is -762.0728476821192
Score for 0.1 and 22 is -762.5165562913908
Score for 0.1 and 23 is -762.8079470198676
0.1 and 24 produced 1 (too few) clusters
0.1 and 25 produced 1 (too few) clusters
0.1 and 26 produced 0 (too few) clusters
0.1 and 27 produced 0 (too few) clusters
0.1 and 28 produced 0 (too few) clusters
0.1 and 29 produced 0 (too few) clusters
0.1 and 30 produced 0 (too few) clusters
0.1 and 31 produced 0 (too few) clusters
0.1 and 32 produced 0 (too few) clusters
0.1 and 33 produced 0 (too few) clusters
0.1 and 34 produced 0 (too few) clusters
0.1 and 35 produced 0 (too few) clusters
0.1 and 36 produced 0 (too few) clusters
0.1 and 37 produced 0 (too few) clusters
0.1 and 38 produced 0 (too few) clusters
0.1 and 39 produced 0 (too few) clusters
Score for 0.1473684210526316 and 20 is -610.0993377483444
Score for 0.1473684210526316 and 21 is -633.635761589404
Score for 0.1473684210526316 and 22 is -655.4635761589404

In [7]:
# Inspect the array returned by the grid search. The following cell names its
# four columns Latitude, Longitude, Label, Intensity.
data_out

array([[ -1.50833333, -97.99166667,   0.        ,  -5.5733223 ],
       [ -1.50833333, -97.975     ,   0.        ,  -5.16694736],
       [ -1.50833333, -97.95833333,   0.        ,  -4.97332668],
       ...,
       [ -5.99166667, -90.04166667,  -1.        ,  -2.35047531],
       [ -5.99166667, -90.025     ,  -1.        ,  -0.47824201],
       [ -5.99166667, -90.00833333,  -1.        ,  -0.67400765]])

In [8]:
# Give the grid-search array named columns so it can be filtered and plotted.
dfout = pd.DataFrame(
    data=data_out,
    columns=["Latitude", "Longitude", "Label", "Intensity"],
)

# NOTE(review): the return value is ignored, so matchPoints presumably
# annotates dfout in place. A later cell colours by a "True_Seamount" column
# that is not created anywhere in this notebook -- confirm against DBSCANSupport.
DBModel_test.matchPoints(dfout)

# Keep only the rows whose Label equals -1.
df_labeled = dfout.loc[dfout["Label"].eq(-1)]

In [9]:

# Scatter the Label == -1 rows in map orientation: Longitude on x, Latitude on y.
fig = px.scatter(data_frame=df_labeled, x="Longitude", y="Latitude")
fig.show()

In [10]:
# Bounding box used for the filter below, read as
# (lat_min, lat_max, lon_min, lon_max). The same literal is passed to
# DBSCANSupport earlier in the notebook; keep the two in sync.
test_zone = (-6, -1.5, -98, -90)

# Load the "new mask" sheet, discard the columns not needed here, and keep
# only rows inside the bounding box (both bounds inclusive). Building this as
# one chain avoids rebinding `seamounts` to several different objects.
seamounts_df = (
    pd.read_excel(LOCALPATH + "data/sample_mask.txt.xlsx", sheet_name="new mask")
    .drop(columns=["VGG Height", "Radius", "base_depth", "-",
                   "Name", "Charted", "surface_depth"])
    .loc[lambda frame: frame["Latitude"].between(test_zone[0], test_zone[1])
         & frame["Longitude"].between(test_zone[2], test_zone[3])]
)

# Later cells index this array positionally, so it keeps the same contents and
# column order as the filtered frame.
seamounts = seamounts_df.to_numpy()

# Plot by column name rather than by array position, so the axes do not depend
# on the sheet's column order, match the other figures (Longitude on x,
# Latitude on y) and are labelled with the column names.
fig3 = px.scatter(
    seamounts_df,
    x="Longitude",
    y="Latitude",
    title="Seamount mask points inside test_zone",
)
fig3.show()

In [11]:
# Overlay the mask points on the Label == -1 scatter, coloured by the
# "True_Seamount" column, with the x axis locked 1:1 to the y axis.
# NOTE(review): the overlay takes x and y from array columns 0 and 1, which is
# only correct if those columns are Longitude and Latitude respectively --
# verify against the sheet's column order.
fig = (
    px.scatter(df_labeled, x="Longitude", y="Latitude", color="True_Seamount")
    .add_trace(px.scatter(x=seamounts[:, 0], y=seamounts[:, 1]).data[0])
    .update_xaxes(scaleanchor="y", scaleratio=1)
)
fig.show()


In [12]:
# Show the score returned by the grid search (presumably the best one found --
# confirm against DBSCANSupport.gridSearch).
score

2.9205298013245033

In [13]:
# Show the parameter pair returned by the grid search (presumably the
# best-scoring combination from the two grids -- confirm).
params

(0.4789473684210527, 20)

In [14]:
# Write the labelled frame to CSV without the index column.
dfout.to_csv(
    path_or_buf=LOCALPATH + "data/DBSCAN_test.csv",
    index=False,
)

In [15]:
# NOTE(review): displays `params` a second time; an earlier cell already shows
# the same value, so this cell looks redundant.
params

(0.4789473684210527, 20)