In [1]:
# %pip install beacon_api --upgrade
# %pip install contextily

from beacon_api import * # Import the Beacon API client

In [None]:
import os

# Read the access token from the environment so a real token never has to be
# saved inside the notebook. BEACON_TOKEN is the variable name chosen here;
# when it is unset the token stays empty, exactly like the previous placeholder.
TOKEN = os.environ.get("BEACON_TOKEN", "")

# One client per Beacon node, all created with the same token.
emodnet_client = Client('https://beacon-emod-chem.maris.nl', jwt_token=TOKEN)
cmems_client = Client('https://beacon-cmems.maris.nl', jwt_token=TOKEN)
wod_client = Client('https://beacon-wod.maris.nl', jwt_token=TOKEN)
merged_client = Client('https://beacon-wb2-eutrophication.maris.nl', jwt_token=TOKEN)

Connected to: https://beacon-emod-chem.maris.nl/ server successfully
Connected to: https://beacon-cmems.maris.nl/ server successfully
Connected to: https://beacon-wod.maris.nl/ server successfully
Connected to: https://beacon-wb2-eutrophication.maris.nl/ server successfully


#### List the available columns and their data types (e.g., string, integer) that can be queried.

In [3]:
# List the EMODnet Chemistry columns whose name contains the search term.
# Both sides are lowercased, so the match is case-insensitive.
search_term = "wmo".lower()
emodnet_columns = emodnet_client.available_columns_with_data_type()
[column for column in emodnet_columns if search_term in column.name.lower()]

[]

In [None]:
# Parameters to retrieve, in the order they appear in the result.
# Each one is selected together with its "<name>_qc" companion column.
emodnet_parameters = [
    "Water body nitrate plus nitrite",
    "Water body nitrate",
    "Water body chlorophyll-a",
    "Water body dissolved oxygen concentration",
    "Water body phosphate",
    "Water body silicate",
    "Water body ammonium",
]

emodnet_query = emodnet_client.query()

# Coordinates get short lowercase aliases; the range and polygon filters below use those aliases.
for source_name, short_name in [
    ("yyyy-mm-ddThh:mm:ss.sss", "time"),
    ("Latitude", "latitude"),
    ("Longitude", "longitude"),
    ("Depth", "depth"),
]:
    emodnet_query = emodnet_query.add_select_column(source_name, alias=short_name)

# Parameter and "_qc" columns keep their original names as alias.
for parameter in emodnet_parameters:
    for column_name in (parameter, f"{parameter}_qc"):
        emodnet_query = emodnet_query.add_select_column(column_name, alias=column_name)

# A row is kept when at least one of these parameters is not null.
non_null_candidates = [
    "Water body chlorophyll-a",
    "Water body dissolved oxygen concentration",
    "Water body phosphate",
    "Water body nitrate",
    "Water body silicate",
    "Water body ammonium",
    "Water body nitrate plus nitrite",
]

emodnet_query = (
    emodnet_query
    # Adjust the ranges as needed; dates use the ISO 8601 format.
    .add_range_filter("time", "2016-01-01T00:00:00", "2019-12-31T23:59:59")
    .add_range_filter("latitude", -90, 90)
    .add_range_filter("longitude", -180, 180)
    .add_range_filter("depth", 0, 100)
    .add_polygon_filter("longitude", "latitude", [[-42, 24.30], [-42, 48], [-0.5, 48], [-0.5, 41], [-5,37], [-5, 24.30], [-42, 24.30]])
    .add_filter(OrFilter([IsNotNullFilter(name) for name in non_null_candidates]))
)

# Convert the results to a pandas DataFrame.
# (.to_geo_pandas_dataframe("longitude", "latitude") is the alternative the original cell left commented out.)
EMODNET_df = emodnet_query.to_pandas_dataframe()
EMODNET_df.describe()

In [None]:
# Count the non-null observations of every parameter per year and show them as grouped bars.
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd

plotted_parameters = [
    'Water body nitrate plus nitrite',
    'Water body nitrate',
    'Water body chlorophyll-a',
    'Water body dissolved oxygen concentration',
    'Water body phosphate',
    'Water body silicate',
    'Water body ammonium'
]

# Adds a 'year' column to EMODNET_df itself (later cells see it too).
EMODNET_df['year'] = pd.DatetimeIndex(EMODNET_df['time']).year

# Long format: one row per (year, parameter, value); rows without a value are dropped.
EMODNET_df_melted = EMODNET_df.melt(
    id_vars=['year'],
    value_vars=plotted_parameters,
    var_name='parameter',
    value_name='value',
).dropna(subset=['value'])

fig, ax = plt.subplots(figsize=(12, 6))
sns.countplot(data=EMODNET_df_melted, x='year', hue='parameter', ax=ax)
ax.set_title('Number of Observations per Parameter per Year')
ax.set_xlabel('Year')
ax.set_ylabel('Number of Observations')
# Legend is placed outside the axes so it does not cover the bars.
ax.legend(title='Parameter', bbox_to_anchor=(1.05, 1), loc='upper left')
fig.tight_layout()
plt.show()

In [None]:
# Percentage of rows in which each column of EMODNET_df is not null
total_count = len(EMODNET_df)
non_null_counts = EMODNET_df.notnull().sum()
availability_percentage = (non_null_counts / total_count) * 100
print(availability_percentage)

In [4]:
# List the CMEMS columns whose name contains the search term.
# Both sides are lowercased, so the match is case-insensitive.
search_term = "wmo".lower()
cmems_columns = cmems_client.available_columns_with_data_type()
[column for column in cmems_columns if search_term in column.name.lower()]

[pyarrow.Field<.wmo_platform_code: string>,
 pyarrow.Field<.wmo_instrument_type: string>]

In [None]:
# Parameter codes to retrieve, in the order they appear in the result.
# Each one is selected together with its "<CODE>_QC" companion column.
cmems_parameters = [
    "CPHL", "CHLT", "DOXY", "DOX1", "DOX2", "AMON",
    "NTRA", "NTAW", "PHOS", "PHOW", "SLCA", "SLCW",
]

cmems_query = cmems_client.query()

# Coordinates get lowercase aliases; the range and polygon filters below use those aliases.
for source_name, short_name in [
    ("TIME", "time"),
    ("LATITUDE", "latitude"),
    ("LONGITUDE", "longitude"),
    ("DEPTH", "depth"),
]:
    cmems_query = cmems_query.add_select_column(source_name, alias=short_name)

# Parameter and "_QC" columns keep their original names as alias.
for code in cmems_parameters:
    for column_name in (code, f"{code}_QC"):
        cmems_query = cmems_query.add_select_column(column_name, alias=column_name)

cmems_query = cmems_query.add_select_column("DC_REFERENCE", alias="DC_REFERENCE")

cmems_query = (
    cmems_query
    # Adjust the ranges as needed; dates use the ISO 8601 format.
    .add_range_filter("time", "2016-01-01T00:00:00", "2019-12-31T23:59:59")
    .add_range_filter("latitude", -90, 90)
    .add_range_filter("longitude", -180, 180)
    .add_range_filter("depth", 0, 100)
    .add_polygon_filter("longitude", "latitude", [[-42, 24.30], [-42, 48], [-0.5, 48], [-0.5, 41], [-5,37], [-5, 24.30], [-42, 24.30]])
    # A row is kept when at least one of the parameters is not null.
    .add_filter(OrFilter([IsNotNullFilter(code) for code in cmems_parameters]))
)

# Convert the results to a pandas DataFrame.
# (.to_geo_pandas_dataframe("longitude", "latitude") is the alternative the original cell left commented out.)
CMEMS_df = cmems_query.to_pandas_dataframe()
CMEMS_df.describe()

In [None]:
# Number of rows per distinct DC_REFERENCE value
dc_reference_counts = CMEMS_df['DC_REFERENCE'].value_counts()
dc_reference_counts

In [None]:
# Distinct DC_REFERENCE values present in the CMEMS result
unique_dc_references = CMEMS_df['DC_REFERENCE'].unique()
print(unique_dc_references)

In [None]:
# Count the non-null observations of every CMEMS parameter per year and show them as grouped bars.
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd

plotted_codes = [
    'CPHL', 'CHLT', 'DOXY', 'DOX1', 'DOX2', 'AMON',
    'NTRA', 'NTAW', 'PHOS', 'PHOW', 'SLCA', 'SLCW',
]

# Adds a 'year' column to CMEMS_df itself (later cells see it too).
CMEMS_df['year'] = pd.DatetimeIndex(CMEMS_df['time']).year

# Long format: one row per (year, parameter, value); rows without a value are dropped.
CMEMS_df_melted = CMEMS_df.melt(
    id_vars=['year'],
    value_vars=plotted_codes,
    var_name='parameter',
    value_name='value',
).dropna(subset=['value'])

fig, ax = plt.subplots(figsize=(12, 6))
sns.countplot(data=CMEMS_df_melted, x='year', hue='parameter', ax=ax)
ax.set_title('Number of Observations per Parameter per Year')
ax.set_xlabel('Year')
ax.set_ylabel('Number of Observations')
# Legend is placed outside the axes so it does not cover the bars.
ax.legend(title='Parameter', bbox_to_anchor=(1.05, 1), loc='upper left')
fig.tight_layout()
plt.show()

In [5]:
# List the WOD columns whose name contains the search term.
# search_term is simply reassigned here; no separate reset is needed first.
wod_columns = wod_client.available_columns_with_data_type()
search_term = "wmo".lower()  # lowercase so the comparison below is case-insensitive
[field for field in wod_columns if search_term in field.name.lower()]

[pyarrow.Field<WMO_ID: int32>, pyarrow.Field<WMO_ID.long_name: string>]

In [7]:
# List the columns of the merged collection whose name contains the search term
# (case-insensitive, because both sides are lowercased).
# NOTE(review): the saved output of this cell is a ConnectionError, so the
# server was not reachable on the last recorded run.
search_term = "platform".lower()
columns = merged_client.available_columns_with_data_type()
[column for column in columns if search_term in column.name.lower()]

ConnectionError: ('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))

#### Using the Query Builder to dynamically create queries


In [None]:
import ipywidgets as widgets

# (label shown in the dropdown, value returned by source_bdi_widget.value)
source_bdi_options = [
    ("WOD", "BEACON_WOD"),
    ("EMODNET Chemistry", "BEACON_EMODNET_CHEMISTRY"),
    ("CMEMS BGC", "BEACON_CMEMS_BGC"),
]

# Dropdown for choosing the data source; "BEACON_WOD" is preselected.
source_bdi_widget = widgets.Dropdown(
    options=source_bdi_options,
    value="BEACON_WOD",
    description="Source BDI:",
)

display(source_bdi_widget)

In [None]:
# Show the value currently selected in the source dropdown.
# (source_bdi_widget.options lists every available choice.)
print(f"Current selection: {source_bdi_widget.value}")

In [None]:
query = merged_client.query()  # Create a new query builder instance

# Almost every column is selected as "COMMON_<NAME>" under the alias "<NAME>".
# The lists below spell out <NAME>; columns are added in exactly this order.

# Companion columns selected for every measured value:
# the value itself plus its _QC, _UNITS, _P01 and _P06 columns.
value_suffixes = ["", "_QC", "_UNITS", "_P01", "_P06"]
# Companion columns carrying the L05 / L22 / L33 codes of a parameter.
code_suffixes = ["_L05", "_L22", "_L33"]

# (value column, stem of its L05/L22/L33 columns).
# A row is kept only when at least one of these value columns is not null.
required_parameters = [
    ("CHLOROPHYLL_PER_VOLUME", "CHLOROPHYLL"),
    ("OXYGEN_PER_VOLUME", "OXYGEN"),
    ("NITRATE_PER_VOLUME", "NITRATE"),
    ("NITRATE_NITRITE_PER_VOLUME", "NITRATE_NITRITE"),
    ("AMMONIUM_PER_VOLUME", "AMMONIUM"),
    ("PHOSPHATE_PER_VOLUME", "PHOSPHATE"),
    ("SILICATE_PER_VOLUME", "SILICATE"),
]
# Selected as well, but deliberately NOT part of the "at least one non-null" filter.
# Move an entry to required_parameters to make it count.
optional_parameters = [
    ("SALINITY", "SALINITY"),
    ("TEMPERATURE", "TEMPERATURE"),
]

# Coordinates first, then depth (which has no L05/L22/L33 columns), then the parameters.
aliases = ["TIME", "LATITUDE", "LONGITUDE"]
aliases += [f"DEPTH{suffix}" for suffix in value_suffixes]
for value_name, code_stem in required_parameters + optional_parameters:
    aliases += [f"{value_name}{suffix}" for suffix in value_suffixes]
    aliases += [f"{code_stem}{suffix}" for suffix in code_suffixes]

# Metadata columns (add or remove as needed)
aliases += ["PLATFORM_L06", "PLATFORM_C17"]

for alias in aliases:
    query.add_select_column(f"COMMON_{alias}", alias=alias)

# The source identifiers carry no COMMON_ prefix and are selected without an alias.
query.add_select_column("SOURCE_BDI")
query.add_select_column("SOURCE_BDI_DATASET_ID")

# ODV_TAG is important to generate the ODV format.
for alias in ["EDMO_CODE", "FEATURE_TYPE", "CSR", "ODV_TAG"]:
    query.add_select_column(f"COMMON_{alias}", alias=alias)

# Apply filters to the query (they refer to the aliases defined above)
query.add_filter(
    OrFilter([IsNotNullFilter(value_name) for value_name, code_stem in required_parameters])
)
query.add_range_filter("TIME", "2016-01-01T00:00:00", "2019-12-31T23:59:59")  # Adjust the date range as needed. The format is ISO 8601.
query.add_range_filter("LATITUDE", -90, 90)  # Full latitude range (adjust as needed)
query.add_range_filter("LONGITUDE", -180, 180)  # Full longitude range (adjust as needed)
query.add_range_filter("DEPTH", 0, 100)  # Depth range from 0 to 100 meters (adjust as needed)
# Restrict to the source currently selected in the dropdown; the value is read
# when this cell runs, so re-run the cell after changing the selection.
query.add_equals_filter("SOURCE_BDI", source_bdi_widget.value)
# The polygon filter is added in addition to the range filters above (not instead of them).
query.add_polygon_filter("LONGITUDE", "LATITUDE", [[-42, 24.30], [-42, 48], [-0.5, 48], [-0.5, 41], [-5,37], [-5, 24.30], [-42, 24.30]])


In [None]:
# Run the query built above and load the result into a pandas DataFrame.
df = query.to_pandas_dataframe()


In [None]:
import ipywidgets as widgets
from IPython.display import display

# Export formats offered in the dropdown; "odv" is preselected.
output_formats = ["odv", "netcdf", "parquet", "zarr"]
output_widget = widgets.Dropdown(
    options=output_formats,
    value="odv",
    description="Output type:",
)
display(output_widget)

# Snapshot of the selection at the moment this cell runs; it does not follow
# later changes made in the dropdown.
output_select = output_widget.value

In [None]:
# Re-read the dropdown so the printed value reflects the current selection,
# not the snapshot taken when the widget was first displayed.
output_select = output_widget.value
print(output_select)

In [None]:
# Export the query result in the format picked in the "Output type" dropdown.
# The file is named after the selected source and gets an extension that
# matches the format.
output = output_widget.value
export_name = source_bdi_widget.value

if output == "odv":
    # Measured parameters; each one is paired with its "<NAME>_QC" quality-flag column.
    measured_columns = [
        "CHLOROPHYLL_PER_VOLUME",
        "OXYGEN_PER_VOLUME",
        "NITRATE_PER_VOLUME",
        "NITRATE_NITRITE_PER_VOLUME",
        "AMMONIUM_PER_VOLUME",
        "PHOSPHATE_PER_VOLUME",
        "SILICATE_PER_VOLUME",
        "SALINITY",
        "TEMPERATURE",
    ]
    # Stems of the "<STEM>_L05" / "_L22" / "_L33" columns exported as metadata.
    code_stems = [
        "CHLOROPHYLL",
        "OXYGEN",
        "NITRATE",
        "NITRATE_NITRITE",
        "AMMONIUM",
        "PHOSPHATE",
        "SILICATE",
        "SALINITY",
        "TEMPERATURE",
    ]
    other_metadata = [
        "PLATFORM_L06", "PLATFORM_C17",
        "SOURCE_BDI", "SOURCE_BDI_DATASET_ID",
        "EDMO_CODE", "FEATURE_TYPE", "CSR",
    ]

    odv_output = Odv(
        longitude_column=OdvDataColumn("LONGITUDE"),
        latitude_column=OdvDataColumn("LATITUDE"),
        depth_column=OdvDataColumn("DEPTH"),
        time_column=OdvDataColumn("TIME"),
        data_columns=[
            OdvDataColumn(name, qf_column=f"{name}_QC") for name in measured_columns
        ],
        metadata_columns=(
            [OdvDataColumn(f"{stem}_{code}") for stem in code_stems for code in ("L05", "L22", "L33")]
            + [OdvDataColumn(name) for name in other_metadata]
        ),
        key_column="ODV_TAG",  # This column should uniquely identify a dataset
        qf_schema="SEADATANET",
        feature_type_column="FEATURE_TYPE"
    )

    query.to_odv(odv_output, f"{export_name}.zip")
elif output == "netcdf":
    query.to_netcdf(f"{export_name}.nc")
elif output == "parquet":
    query.to_parquet(f"{export_name}.parquet")
elif output == "zarr":
    query.to_zarr(f"{export_name}.zarr")
else:
    # Cannot happen with the dropdown above, but fail loudly rather than export nothing.
    raise ValueError(f"Unsupported output type: {output!r}")

## Statistics per parameter
Select the columns of interest. Keep in mind that values with any quality flag (QF) are included here; no QF filtering has been applied.

In [None]:
# statistics per parameter, select the columns of interest
df[[
	"CHLOROPHYLL_PER_VOLUME",
	"OXYGEN_PER_VOLUME",
	"NITRATE_PER_VOLUME",
	"NITRATE_NITRITE_PER_VOLUME",
	"AMMONIUM_PER_VOLUME",
	"PHOSPHATE_PER_VOLUME",
	"SILICATE_PER_VOLUME",
	"SALINITY",
	"TEMPERATURE",
]].describe()

In [None]:
# Number of rows per FEATURE_TYPE (table rows) and SOURCE_BDI (table columns);
# combinations that do not occur are shown as 0.
rows_per_group = df.groupby(['FEATURE_TYPE', 'SOURCE_BDI']).size()
grouped = rows_per_group.unstack(fill_value=0)
print(grouped)

In [None]:
# Availability per parameter: percentage of rows in which the column is not null
availability_columns = [
    "TIME",
    "LATITUDE",
    "LONGITUDE",
    "CHLOROPHYLL_PER_VOLUME",
    "OXYGEN_PER_VOLUME",
    "NITRATE_PER_VOLUME",
    "NITRATE_NITRITE_PER_VOLUME",
    "AMMONIUM_PER_VOLUME",
    "PHOSPHATE_PER_VOLUME",
    "SILICATE_PER_VOLUME",
    "SALINITY",
    "TEMPERATURE",
]
total_count = len(df)
non_null_counts = df[availability_columns].notnull().sum()
availability_percentage = (non_null_counts / total_count) * 100
print(availability_percentage)

# Bar chart of the same percentages, with the y axis fixed to 0-100 %
import matplotlib.pyplot as plt
ax = availability_percentage.plot(kind='bar', figsize=(10, 6), color='skyblue')
ax.set_title('Availability Percentage per Parameter')
ax.set_ylabel('Percentage (%)')
ax.set_xlabel('Parameters')
ax.set_ylim(0, 100)
ax.grid(axis='y')
plt.show()


In [None]:
# Count the non-null observations of every parameter per year and show them as grouped bars.
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd

yearly_parameters = [
    "CHLOROPHYLL_PER_VOLUME",
    "OXYGEN_PER_VOLUME",
    "NITRATE_PER_VOLUME",
    "NITRATE_NITRITE_PER_VOLUME",
    "AMMONIUM_PER_VOLUME",
    "PHOSPHATE_PER_VOLUME",
    "SILICATE_PER_VOLUME",
    "SALINITY",
    "TEMPERATURE",
]

# Adds a 'year' column to df itself (later cells see it too).
df['year'] = pd.DatetimeIndex(df['TIME']).year

# Long format: one row per (year, parameter, value); rows without a value are dropped.
df_melted = df.melt(
    id_vars=['year'],
    value_vars=yearly_parameters,
    var_name='parameter',
    value_name='value',
).dropna(subset=['value'])

fig, ax = plt.subplots(figsize=(12, 6))
sns.countplot(data=df_melted, x='year', hue='parameter', ax=ax)
ax.set_title('Number of Observations per Parameter per Year')
ax.set_xlabel('Year')
ax.set_ylabel('Number of Observations')
# Legend is placed outside the axes so it does not cover the bars.
ax.legend(title='Parameter', bbox_to_anchor=(1.05, 1), loc='upper left')
fig.tight_layout()
plt.show()


In [None]:
# Plot the amount of data available per parameter and per depth range

import matplotlib.pyplot as plt
import pandas as pd  # pd.cut is used below; imported here so the cell does not rely on another cell's import
import seaborn as sns

# Bin depth into ranges: 10 m steps down to 50 m, then 25 m steps down to 100 m.
depth_bins = [0, 10, 20, 30, 40, 50, 75, 100]
depth_labels = [f"{depth_bins[i]}-{depth_bins[i+1]}m" for i in range(len(depth_bins)-1)]
# include_lowest=True puts a depth of exactly 0 into the first bin.
# Adds a 'depth_range' column to df itself.
df['depth_range'] = pd.cut(df['DEPTH'], bins=depth_bins, labels=depth_labels, include_lowest=True)

# Long format: one row per (depth range, parameter, value); rows without a value are dropped.
df_melted = df.melt(
    id_vars=['depth_range'],
    value_vars=[
        "CHLOROPHYLL_PER_VOLUME",
        "OXYGEN_PER_VOLUME",
        "NITRATE_PER_VOLUME",
        "NITRATE_NITRITE_PER_VOLUME",
        "AMMONIUM_PER_VOLUME",
        "PHOSPHATE_PER_VOLUME",
        "SILICATE_PER_VOLUME",
        "SALINITY",
        "TEMPERATURE"
    ],
    var_name='parameter',
    value_name='value'
)
df_melted = df_melted.dropna(subset=['value'])

plt.figure(figsize=(14, 6))
sns.countplot(data=df_melted, x='depth_range', hue='parameter')
plt.title('Number of Observations per Parameter per Depth Range')
plt.xlabel('Depth Range (m)')
plt.ylabel('Number of Observations')
# Legend is placed outside the axes so it does not cover the bars.
plt.legend(title='Parameter', bbox_to_anchor=(1.05, 1), loc='upper left')
plt.tight_layout()
plt.show()