# OPEN PARQUET FILES

In [1]:
import pandas as pd
import ipywidgets as widgets
import os
import matplotlib.pyplot as plt
import seaborn as sns

# List all parquet files in the current working directory.
# os.listdir() returns names in arbitrary order, so sort them: the dropdown
# then shows the same option order (and the same default selection) on every run.
parquet_files = sorted(f for f in os.listdir('.') if f.endswith('.parquet'))
file_dropdown = widgets.Dropdown(
    options=parquet_files,
    description='Select Parquet File:',
)
display(file_dropdown)

Dropdown(description='Select Parquet File:', options=('BEACON_CMEMS_BGC.parquet', 'BEACON_EMODNET_CHEMISTRY.pa…

In [None]:
# Echo the current dropdown choice so the file used by the following cells is visible.
print(f"Selected file: {file_dropdown.value}")

In [None]:
# Load the selected file into a pandas DataFrame.
# NOTE(review): the dropdown value is presumably None when the folder contains
# no .parquet files; stop with a clear message rather than letting
# read_parquet fail on a missing path.
selected_file = file_dropdown.value
if selected_file is None:
    raise FileNotFoundError(
        "No parquet file selected: no .parquet files were found in the current folder."
    )
df = pd.read_parquet(selected_file)
df.head()  # Display the first few rows of the DataFrame


In [None]:
# Descriptive statistics restricted to TIME and the measured parameters;
# every other column of df is left out of the summary.
summary_columns = [
    "TIME",
    "CHLOROPHYLL_PER_VOLUME",
    "OXYGEN_PER_VOLUME",
    "NITRATE_PER_VOLUME",
    "NITRATE_NITRITE_PER_VOLUME",
    "AMMONIUM_PER_VOLUME",
    "PHOSPHATE_PER_VOLUME",
    "SILICATE_PER_VOLUME",
    "SALINITY",
    "TEMPERATURE",
]
df[summary_columns].describe()

In [None]:
# Row counts per FEATURE_TYPE (table rows) and SOURCE_BDI (table columns);
# combinations that never occur are filled with 0.
pair_counts = df.groupby(['FEATURE_TYPE', 'SOURCE_BDI']).size()
grouped = pair_counts.unstack(fill_value=0)
print(grouped)

In [None]:
# Distinct values found in the CHLOROPHYLL_PER_VOLUME_UNITS column.
chlorophyll_units = df['CHLOROPHYLL_PER_VOLUME_UNITS'].unique()
print(chlorophyll_units)


In [None]:
# Bar chart: percentage of rows for each PLATFORM_C17 value.
# value_counts(normalize=True) yields fractions, so scale them to percent.
platform_fractions = df['PLATFORM_C17'].value_counts(normalize=True)
platform_counts = platform_fractions * 100

fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(x=platform_counts.index, y=platform_counts.values, ax=ax)
ax.set_title('Percentage of Each PLATFORM_C17 Type')
ax.set_xlabel('PLATFORM_C17 Type')
ax.set_ylabel('Percentage (%)')
plt.xticks(rotation=45)
plt.show()

 

In [None]:
# One bar chart per *_L22 column, showing the percentage of rows for each
# distinct value of that column.
parameters = [
    'OXYGEN_L22',
    'CHLOROPHYLL_L22',
    'NITRATE_L22',
    'SILICATE_L22',
    'PHOSPHATE_L22',
    'AMMONIUM_L22',
]

for l22_column in parameters:
    percentages = df[l22_column].value_counts(normalize=True).mul(100)
    fig, ax = plt.subplots(figsize=(10, 6))
    sns.barplot(x=percentages.index, y=percentages.values, ax=ax)
    ax.set_title(f'Percentage of Each {l22_column} Type')
    ax.set_xlabel(f'{l22_column} Type')
    ax.set_ylabel('Percentage (%)')
    plt.xticks(rotation=45)
    plt.show()


In [None]:
def report_parameter_coverage(frame, column, heading):
    """Print and return the non-null record counts for one parameter.

    Rows of ``frame`` in which ``column`` is not null are counted for every
    (FEATURE_TYPE, SOURCE_BDI) pair. The result has one row per FEATURE_TYPE
    and one column per SOURCE_BDI; pairs without any record are filled with 0.

    Parameters
    ----------
    frame : pandas.DataFrame
        Data containing ``column``, ``FEATURE_TYPE`` and ``SOURCE_BDI``.
    column : str
        Name of the parameter column whose non-null records are counted.
    heading : str
        Line printed immediately before the table.

    Returns
    -------
    pandas.DataFrame
        The FEATURE_TYPE x SOURCE_BDI count table that was printed.
    """
    has_value = frame[column].notna()
    table = (
        frame[has_value]
        .groupby(['FEATURE_TYPE', 'SOURCE_BDI'])
        .size()
        .unstack(fill_value=0)
    )
    print(heading)
    print(table)
    return table


# For each parameter: how many non-null records exist in each FEATURE_TYPE (per source).
# The same computation used to be copy-pasted once per parameter; it now lives in
# report_parameter_coverage(), and the grouped_* result names are unchanged.
grouped_chl = report_parameter_coverage(
    df, 'CHLOROPHYLL_PER_VOLUME', 'CHL PER FEATURE TYPE:')
grouped_oxy = report_parameter_coverage(
    df, 'OXYGEN_PER_VOLUME', 'OXY PER FEATURE TYPE:')
grouped_nitrate = report_parameter_coverage(
    df, 'NITRATE_PER_VOLUME', 'NITRATE PER VOLUME PER FEATURE TYPE:')
grouped_nitrate_nitrite = report_parameter_coverage(
    df, 'NITRATE_NITRITE_PER_VOLUME', 'NITRATE NITRITE PER VOLUME PER FEATURE TYPE:')
grouped_ammonium = report_parameter_coverage(
    df, 'AMMONIUM_PER_VOLUME', 'AMMONIUM PER VOLUME PER FEATURE TYPE:')
grouped_phosphate = report_parameter_coverage(
    df, 'PHOSPHATE_PER_VOLUME', 'PHOSPHATE PER VOLUME PER FEATURE TYPE:')
grouped_silicate = report_parameter_coverage(
    df, 'SILICATE_PER_VOLUME', 'SILICATE PER VOLUME PER FEATURE TYPE:')
grouped_salinity = report_parameter_coverage(
    df, 'SALINITY', 'SALINITY PER FEATURE TYPE:')
grouped_temperature = report_parameter_coverage(
    df, 'TEMPERATURE', 'TEMPERATURE PER FEATURE TYPE:')


In [None]:
# Parameter columns whose availability is compared across feature types.
parameters = [
    "CHLOROPHYLL_PER_VOLUME",
    "OXYGEN_PER_VOLUME",
    "NITRATE_PER_VOLUME",
    "NITRATE_NITRITE_PER_VOLUME",
    "AMMONIUM_PER_VOLUME",
    "PHOSPHATE_PER_VOLUME",
    "SILICATE_PER_VOLUME",
    "SALINITY",
    "TEMPERATURE"
]

# Reuse the row order of the `grouped` table built in an earlier cell so the
# bars appear in the same FEATURE_TYPE order as that table.
feature_order = grouped.index
counts = pd.DataFrame(index=feature_order)

# One column per parameter: number of rows with a non-null value in each
# FEATURE_TYPE; feature types without any such row get 0.
for column in parameters:
    rows_with_value = df.loc[df[column].notna()]
    per_feature = rows_with_value.groupby('FEATURE_TYPE').size()
    counts[column] = per_feature.reindex(feature_order).fillna(0).astype(int)

# Grouped bar chart: FEATURE_TYPE on the x axis, one bar per parameter.
ax = counts.plot(kind='bar', figsize=(12,6))
ax.set(
    xlabel='FEATURE_TYPE',
    ylabel='Count of non-null records',
    title='Parameter availability per FEATURE_TYPE',
)
plt.setp(ax.get_xticklabels(), rotation=45, ha='right')
# Anchor the legend outside the axes (to the right of the plot area).
ax.legend(title='Parameter', bbox_to_anchor=(1.02, 1), loc='upper left')
plt.tight_layout()
plt.show()



In [None]:
# Number of non-null observations per parameter for each year of TIME.
# Note: this writes a 'year' column into df, so later cells see it as well.
df['year'] = pd.DatetimeIndex(df['TIME']).year

yearly_value_columns = [
    "CHLOROPHYLL_PER_VOLUME",
    "OXYGEN_PER_VOLUME",
    "NITRATE_PER_VOLUME",
    "NITRATE_NITRITE_PER_VOLUME",
    "AMMONIUM_PER_VOLUME",
    "PHOSPHATE_PER_VOLUME",
    "SILICATE_PER_VOLUME",
    "SALINITY",
    "TEMPERATURE"
]

# Long format: one row per (year, parameter, value); rows without a value are
# dropped so that countplot counts actual observations only.
df_melted = df.melt(
    id_vars=['year'],
    value_vars=yearly_value_columns,
    var_name='parameter',
    value_name='value',
).dropna(subset=['value'])

fig, ax = plt.subplots(figsize=(12, 6))
sns.countplot(data=df_melted, x='year', hue='parameter', ax=ax)
ax.set_title('Number of Observations per Parameter per Year')
ax.set_xlabel('Year')
ax.set_ylabel('Number of Observations')
ax.legend(title='Parameter', bbox_to_anchor=(1.05, 1), loc='upper left')
fig.tight_layout()
plt.show()

In [None]:
# Number of non-null observations per parameter for each depth range.
# DEPTH is binned into the fixed, unevenly sized ranges listed in depth_bins;
# values outside the first/last edge fall in no bin and get a missing depth_range.
# NOTE(review): the 'm' suffix in the labels assumes DEPTH is in metres - confirm.
depth_bins = [0, 100, 500, 1000, 2000, 3000,4000, 5000, 10000]
depth_labels = [
    f"{lower}-{upper}m"
    for lower, upper in zip(depth_bins[:-1], depth_bins[1:])
]
# Note: this writes a 'depth_range' column into df, so later cells see it as well.
df['depth_range'] = pd.cut(
    df['DEPTH'],
    bins=depth_bins,
    labels=depth_labels,
    include_lowest=True,
)

depth_value_columns = [
    "CHLOROPHYLL_PER_VOLUME",
    "OXYGEN_PER_VOLUME",
    "NITRATE_PER_VOLUME",
    "NITRATE_NITRITE_PER_VOLUME",
    "AMMONIUM_PER_VOLUME",
    "PHOSPHATE_PER_VOLUME",
    "SILICATE_PER_VOLUME",
    "SALINITY",
    "TEMPERATURE"
]

# Long format: one row per (depth_range, parameter, value), keeping only rows
# that actually carry a value.
df_melted = df.melt(
    id_vars=['depth_range'],
    value_vars=depth_value_columns,
    var_name='parameter',
    value_name='value',
).dropna(subset=['value'])

fig, ax = plt.subplots(figsize=(14, 6))
sns.countplot(data=df_melted, x='depth_range', hue='parameter', ax=ax)
ax.set_title('Number of Observations per Parameter per Depth Range')
ax.set_xlabel('Depth Range (m)')
ax.set_ylabel('Number of Observations')
ax.legend(title='Parameter', bbox_to_anchor=(1.05, 1), loc='upper left')
fig.tight_layout()
plt.show()

In [None]:
# Number of non-null observations per parameter for each calendar month of TIME.
# Note: this writes a 'month' column into df, so later cells see it as well.
df['month'] = pd.DatetimeIndex(df['TIME']).month

monthly_value_columns = [
    "CHLOROPHYLL_PER_VOLUME",
    "OXYGEN_PER_VOLUME",
    "NITRATE_PER_VOLUME",
    "NITRATE_NITRITE_PER_VOLUME",
    "AMMONIUM_PER_VOLUME",
    "PHOSPHATE_PER_VOLUME",
    "SILICATE_PER_VOLUME",
    "SALINITY",
    "TEMPERATURE"
]

# Long format: one row per (month, parameter, value), keeping only rows that
# actually carry a value.
df_melted = df.melt(
    id_vars=['month'],
    value_vars=monthly_value_columns,
    var_name='parameter',
    value_name='value',
).dropna(subset=['value'])

fig, ax = plt.subplots(figsize=(12, 6))
sns.countplot(data=df_melted, x='month', hue='parameter', ax=ax)
ax.set_title('Number of Observations per Parameter per Month')
# NOTE(review): the label hard-codes 2011, but nothing in this cell filters
# TIME to that year - confirm the loaded file only covers 2011.
ax.set_xlabel('Month in 2011')
ax.set_ylabel('Number of Observations')
ax.legend(title='Parameter', bbox_to_anchor=(1.05, 1), loc='upper left')
fig.tight_layout()
plt.show()

In [None]:
# convert data in geopandas for geospatial analysis
# %matplotlib inline
import pandas as pd
import geopandas as gpd
import contextily as cx
import matplotlib.pyplot as plt

# Build one point geometry per row from the LONGITUDE (x) and LATITUDE (y)
# columns, then wrap df in a GeoDataFrame declared as WGS84 lon/lat.
observation_points = gpd.points_from_xy(df['LONGITUDE'], df['LATITUDE'])
gdf = gpd.GeoDataFrame(df, geometry=observation_points, crs="EPSG:4326")
gdf.head()

In [None]:
# pip install "folium>=0.12" matplotlib mapclassify

In [None]:
# Plot every observation on top of a tile basemap.
# gdf is already in EPSG:4326, so no reprojection is needed before plotting.
ax = gdf.plot(figsize=(10, 10))
# add_basemap() assumes Web Mercator (EPSG:3857) axes unless told otherwise;
# pass the data CRS so the tiles are warped to line up with the lon/lat points.
cx.add_basemap(ax, crs=gdf.crs.to_string())
plt.show()

In [None]:
# Plot the geospatial distribution of CHLOROPHYLL_PER_VOLUME on a tile basemap.
import contextily as ctx

# Keep only the points that actually carry a chlorophyll value.
chl_gdf = gdf[gdf['CHLOROPHYLL_PER_VOLUME'].notna()]
ax = chl_gdf.plot(
    figsize=(10, 10),
    column='CHLOROPHYLL_PER_VOLUME',
    cmap='viridis',
    markersize=5,
    legend=True,
    alpha=0.7
)
# add_basemap() assumes Web Mercator (EPSG:3857) axes unless told otherwise;
# pass the data CRS so the tiles are warped to line up with the plotted points.
ctx.add_basemap(ax, crs=chl_gdf.crs.to_string())
ax.set_title('Geospatial Distribution of CHLOROPHYLL_PER_VOLUME')
plt.show()

In [None]:
# Plot all points, colored by their SOURCE_BDI value, on a tile basemap.
ax = gdf.plot(
    figsize=(10, 10),
    column='SOURCE_BDI',
    cmap='tab10',
    markersize=5,
    legend=True,
    alpha=0.7
)
# add_basemap() assumes Web Mercator (EPSG:3857) axes unless told otherwise;
# pass the data CRS so the tiles are warped to line up with the plotted points.
ctx.add_basemap(ax, crs=gdf.crs.to_string())
ax.set_title('Data Points Colored by SOURCE_BDI')
plt.show()