<a href="https://colab.research.google.com/github/marinebon/HackingLimno2025/blob/main/02_Sample_environmental_data_at_each_occurrence.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Fetching Occurrence Data from pyobis



In [3]:
!pip install pyobis  rasterio pandas geopandas matplotlib  requests

Collecting pyobis
  Downloading pyobis-1.4.1-py3-none-any.whl.metadata (5.5 kB)
Downloading pyobis-1.4.1-py3-none-any.whl (23 kB)
Installing collected packages: pyobis
Successfully installed pyobis-1.4.1


Importing libraries

In [4]:
from pyobis import occurrences
import pandas as pd
from typing import List, Dict, Optional, Tuple
from datetime import datetime

Matplotlib for colab

In [5]:
%matplotlib inline

mount google drive for saving result

In [6]:
from google.colab import drive
drive.mount('/content/drive')


import os
project_dir = '/content/drive/MyDrive/GSoC_SDM_Project'
if not os.path.exists(project_dir):
    os.makedirs(project_dir)

Mounted at /content/drive


fetching occurrence data

In [7]:
def fetch_occurrences(
    scientific_name: str,
    start_date: Optional[str] = None,
    end_date: Optional[str] = None,
    geometry: Optional[str] = None,
    limit: Optional[int] = None
) -> pd.DataFrame:

    query_params = {"scientificname": scientific_name}

    if start_date:
        query_params["startdate"] = start_date
    if end_date:
        query_params["enddate"] = end_date
    if geometry:
        query_params["geometry"] = geometry

    query = occurrences.search(**query_params)
    query.execute()


    df = pd.DataFrame(query.data['results'])

    if limit and len(df) > limit:
        df = df.head(limit)

    return df

In [8]:
def get_occurrence_basics(df):
    """
    Extract just the basic columns needed for SDM from OBIS data.
    """
    key_cols = ['scientificName', 'decimalLatitude', 'decimalLongitude',
                'eventDate', 'year', 'month', 'depth']

    available = [col for col in key_cols if col in df.columns]

    df_basic = df[available].copy()


    df_basic['occurrence_id'] = range(len(df_basic))

    return df_basic


# df_simple = get_occurrence_basics(df)
# print(df_simple.shape)
# df_simple.head()

Clean Occurrence Data Function

In [9]:
def clean_occurrence_data(
    df: pd.DataFrame,
    required_columns: Optional[List[str]] = None
) -> pd.DataFrame:

    if required_columns is None:
        required_columns = ['decimalLatitude', 'decimalLongitude', 'eventDate']


    initial_count = len(df)


    df_clean = df.dropna(subset=required_columns)


    if 'decimalLatitude' in df_clean.columns:
        df_clean['decimalLatitude'] = pd.to_numeric(df_clean['decimalLatitude'], errors='coerce')
    if 'decimalLongitude' in df_clean.columns:
        df_clean['decimalLongitude'] = pd.to_numeric(df_clean['decimalLongitude'], errors='coerce')


    if 'eventDate' in df_clean.columns:
        df_clean['eventDate'] = pd.to_datetime(df_clean['eventDate'], errors='coerce')

    df_clean = df_clean.dropna(subset=['decimalLatitude', 'decimalLongitude', 'eventDate'])


    if 'decimalLatitude' in required_columns:
        df_clean = df_clean[
            (df_clean['decimalLatitude'] >= -90) &
            (df_clean['decimalLatitude'] <= 90)
        ]

    if 'decimalLongitude' in required_columns:
        df_clean = df_clean[
            (df_clean['decimalLongitude'] >= -180) &
            (df_clean['decimalLongitude'] <= 180)
        ]


    removed = initial_count - len(df_clean)
    print(f"Removed {removed} invalid records ({removed/initial_count*100:.1f}%)")

    return df_clean

Data exploration


In [10]:
def explore_occurrence_data(df: pd.DataFrame) -> Dict:

    summary = {
        'total_records': len(df),
        'columns': list(df.columns),
        'species_count': df['scientificName'].nunique() if 'scientificName' in df.columns else 0
    }

    if all(col in df.columns for col in ['decimalLatitude', 'decimalLongitude']):

        lat_values = pd.to_numeric(df['decimalLatitude'], errors='coerce')
        lon_values = pd.to_numeric(df['decimalLongitude'], errors='coerce')

        valid_lats = lat_values.dropna()
        valid_lons = lon_values.dropna()

        if len(valid_lats) > 0 and len(valid_lons) > 0:
            summary['spatial'] = {
                'lat_range': (float(valid_lats.min()), float(valid_lats.max())),
                'lon_range': (float(valid_lons.min()), float(valid_lons.max())),
                'unique_locations': len(df[['decimalLatitude', 'decimalLongitude']].drop_duplicates())
            }

    if 'eventDate' in df.columns:
        dates = pd.to_datetime(df['eventDate'], errors='coerce')
        valid_dates = dates.dropna()

        if len(valid_dates) > 0:
            summary['temporal'] = {
                'date_range': (str(valid_dates.min()), str(valid_dates.max()))
            }

    if 'year' in df.columns:
        summary['temporal'] = summary.get('temporal', {})
        years = pd.to_numeric(df['year'], errors='coerce').dropna()
        if len(years) > 0:
            summary['temporal']['year_range'] = (int(years.min()), int(years.max()))

    return summary

Function to prepare data for environmental sampling

In [11]:
def prepare_for_environmental_sampling(
    df: pd.DataFrame,
    species_column: str = 'scientificName',
    lat_column: str = 'decimalLatitude',
    lon_column: str = 'decimalLongitude',
    additional_columns: Optional[List[str]] = None
) -> pd.DataFrame:

    columns = [species_column, lat_column, lon_column]

    if additional_columns:
        columns.extend([col for col in additional_columns if col in df.columns])

    prepared_df = df[columns].copy()

    prepared_df['occurrence_id'] = range(len(prepared_df))

    cols = ['occurrence_id'] + [col for col in prepared_df.columns if col != 'occurrence_id']
    prepared_df = prepared_df[cols]

    return prepared_df

This is just and example for testing the general functions


In [12]:
species_name = "Mola mola"
df = fetch_occurrences(species_name)
df.head()

Unnamed: 0,associatedReferences,basisOfRecord,bibliographicCitation,brackish,catalogNumber,class,classid,collectionCode,coordinatePrecision,coordinateUncertaintyInMeters,...,identificationID,verbatimSRS,otherCatalogNumbers,typeStatus,nameAccordingTo,nameAccordingToID,recordedByID,verbatimElevation,unaccepted,fieldNotes
0,"[{""crossref"":{""citeinfo"":{""origin"":""APEM and N...",HumanObservation,"[{""crossref"":{""citeinfo"":{""origin"":""APEM and N...",False,1817_34381,Teleostei,293496,1817,0.0001,11.13,...,,,,,,,,,,
1,,PreservedSpecimen,,False,I.25630-001,Teleostei,293496,Ichthyology,,100000.0,...,,,,,,,,,,
2,"[{""crossref"":{""citeinfo"":{""origin"":""Van Canney...",HumanObservation,"[{""crossref"":{""citeinfo"":{""origin"":""Van Canney...",False,1404_76313,Teleostei,293496,1404,1.0000000000000002e-06,,...,,,,,,,,,,
3,"[{""crossref"":{""citeinfo"":{""origin"":""Cetacean a...",HumanObservation,"[{""crossref"":{""citeinfo"":{""origin"":""Cetacean a...",False,283_18309,Teleostei,293496,283,0.001,111.32,...,,,,,,,,,,
4,"[{""crossref"":{""citeinfo"":{""origin"":""Gatzke J, ...",HumanObservation,"[{""crossref"":{""citeinfo"":{""origin"":""Gatzke J, ...",False,513_89049,Teleostei,293496,513,1e-05,1.11,...,,,,,,,,,,


In [13]:
df = get_occurrence_basics(df)
df.head()

Unnamed: 0,scientificName,decimalLatitude,decimalLongitude,eventDate,year,month,depth,occurrence_id
0,Mola mola,39.331,-72.4104,2016-11-17T14:20:44,,,,0
1,Mola mola,-31.0,153.0,,1985.0,9.0,,1
2,Mola mola,43.1007,5.48887,2012-05-24,,,,2
3,Mola mola,40.167,-70.033,1981-08-02T11:14:00,,,,3
4,Mola mola,42.38549,-67.09194,2016-06-16T11:09:56,2016.0,6.0,,4


In [14]:
df_clean = clean_occurrence_data(df)


summary = explore_occurrence_data(df_clean)
summary

Removed 10974 invalid records (47.0%)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_clean['decimalLatitude'] = pd.to_numeric(df_clean['decimalLatitude'], errors='coerce')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_clean['decimalLongitude'] = pd.to_numeric(df_clean['decimalLongitude'], errors='coerce')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_clean['eventDate'] =

{'total_records': 12374,
 'columns': ['scientificName',
  'decimalLatitude',
  'decimalLongitude',
  'eventDate',
  'year',
  'month',
  'depth',
  'occurrence_id'],
 'species_count': 1,
 'spatial': {'lat_range': (-35.14, 64.18537),
  'lon_range': (-127.67253, 150.728),
  'unique_locations': 11866},
 'temporal': {'date_range': ('1974-05-08 07:10:00', '2024-09-18 15:06:56'),
  'year_range': (1987, 2024)}}

In [15]:
occurrence_data = prepare_for_environmental_sampling(df_clean)
occurrence_data.head()

Unnamed: 0,occurrence_id,scientificName,decimalLatitude,decimalLongitude
0,0,Mola mola,39.331,-72.4104
3,1,Mola mola,40.167,-70.033
4,2,Mola mola,42.38549,-67.09194
6,3,Mola mola,39.733,-71.533
8,4,Mola mola,40.9687,-70.4354


save occurrence data

In [16]:
def save_occurrence_data(
    df: pd.DataFrame,
    filename: str,
    format: str = 'csv'
) -> None:

    if format == 'csv':
        df.to_csv(f"{filename}.csv", index=False)
        print(f"Data saved to {filename}.csv")
    elif format == 'parquet':
        df.to_parquet(f"{filename}.parquet", index=False)
        print(f"Data saved to {filename}.parquet")
    else:
        raise ValueError(f"Unsupported format: {format}")


bringing in environmental data. Functions for Environmental Data Sampling

In [2]:
!pip install rasterio
!pip install xarray
!pip install netCDF4

Collecting rasterio
  Downloading rasterio-1.4.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (9.1 kB)
Collecting affine (from rasterio)
  Downloading affine-2.4.0-py3-none-any.whl.metadata (4.0 kB)
Collecting cligj>=0.5 (from rasterio)
  Downloading cligj-0.7.2-py3-none-any.whl.metadata (5.0 kB)
Collecting click-plugins (from rasterio)
  Downloading click_plugins-1.1.1-py2.py3-none-any.whl.metadata (6.4 kB)
Downloading rasterio-1.4.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (22.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m22.2/22.2 MB[0m [31m38.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading cligj-0.7.2-py3-none-any.whl (7.1 kB)
Downloading affine-2.4.0-py3-none-any.whl (15 kB)
Downloading click_plugins-1.1.1-py2.py3-none-any.whl (7.5 kB)
Installing collected packages: cligj, click-plugins, affine, rasterio
Successfully installed affine-2.4.0 click-plugins-1.1.1 cligj-0.7.2 rasterio-1.4.3
Collecting netCDF4
  Download

In [17]:
import rasterio
import xarray as xr
import numpy as np
from typing import Union, List, Tuple, Optional, Dict

In [19]:
def add_environmental_data(
    df: pd.DataFrame,
    raster_path: str,
    column_name: str = 'temperature',
    lat_col: str = 'decimalLatitude',
    lon_col: str = 'decimalLongitude'
) -> pd.DataFrame:

    df_result = df.copy()

    with rasterio.open(raster_path) as src:
        coords = [(row[lon_col], row[lat_col]) for _, row in df.iterrows()]
        sampled_values = list(src.sample(coords))
        values = [val[0] if val[0] != src.nodata else np.nan for val in sampled_values]
        df_result[column_name] = values

    print(f"Added {column_name}: {df_result[column_name].notna().sum()}/{len(df_result)} valid values")
    return df_result

RASTER_PATH = f'{project_dir}/cmems_salinity_44d2_bc4b_70b2_U1749495233099.nc'

df_with_temp = add_environmental_data(df_clean, RASTER_PATH, 'temperature')

print(df_with_temp[['scientificName', 'decimalLatitude', 'decimalLongitude', 'eventDate', 'temperature']].head())

Added temperature: 12306/12374 valid values
  scientificName  decimalLatitude  decimalLongitude           eventDate  \
0      Mola mola         39.33100         -72.41040 2016-11-17 14:20:44   
3      Mola mola         40.16700         -70.03300 1981-08-02 11:14:00   
4      Mola mola         42.38549         -67.09194 2016-06-16 11:09:56   
6      Mola mola         39.73300         -71.53300 1979-07-18 11:44:00   
8      Mola mola         40.96870         -70.43540 2020-07-21 14:22:45   

   temperature  
0    34.304287  
3    33.463039  
4    32.325291  
6    34.602718  
8    32.050797  
