# **Interactive Map with PACE and insitu data**<br>
Author:Gulce Kurtay<br>
ORCID: 0000-0002-5169-6314<br>
Sources:<br>
hypercoast, https://hypercoast.org/<br>
Woods Hole Dashboard, https://ifcb-data.whoi.edu/dashboard <br>

This notebook is designed to combine PACE chl-a data (via hypercoast) with in-situ flow cytometry (IFCB) group distributions and then build an interactive map.<br>
Classified IFCB data is limited, so search the database first and find a group distribution that matches your interest. 



STEP 1   Download the phytoplankton group distribution and metadata from the IFCB dashboard; only the autoclass files are downloaded, not the images<br>
STEP 2.1 Organize the CSV files into a group-distribution format that aligns with the lat and long information<br>
STEP 2.2 Summarize the information into a daily format to match up with the PACE data<br>
STEP 3   Download the PACE data with hypercoast<br>
STEP 4   Build an interactive map with spatial chl-a and the in-situ group distribution


In [1]:
#STEP 1.1 
#Functions that needed for the downloading the csv files
#STEP-1 Download the csv files and lat and long
import os
import requests
import csv
from concurrent.futures import ThreadPoolExecutor

def collect_bin_ids(start_bin_id, end_bin_id, base_url, dataset, instrument, prefix):
    """
    Walks the chain of bins on the dashboard and collects the IDs matching a prefix.

    Starting at ``start_bin_id``, each bin is requested from
    ``{base_url}/api/bin/{bin_id}`` and the ``next_bin_id`` field of the JSON
    reply is followed. The walk stops when ``end_bin_id`` is reached, when
    there is no next bin or the next bin does not start with ``prefix``, when
    a bin ID repeats, or when a request fails. In every case the IDs collected
    so far are returned, so a late failure does not discard earlier work.

    :param start_bin_id: The starting bin ID to begin the search.
    :param end_bin_id: The bin ID at which to stop collecting.
    :param base_url: The base URL of the API.
    :param dataset: The dataset name to filter.
    :param instrument: The instrument name to filter.
    :param prefix: The prefix to match bin IDs against (e.g., "D2024").
    :return: A list of matching bin IDs, in the order they were visited.
    """
    bin_ids = []
    visited = set()
    current_bin_id = start_bin_id
    # The query parameters never change, so build them once.
    params = {
        "dataset": dataset,
        "instrument": instrument,
    }

    while True:
        # The chain is driven by remote data; a repeated ID would loop forever.
        if current_bin_id in visited:
            print(f"Bin ID repeated, stopping to avoid an endless loop: {current_bin_id}")
            break
        visited.add(current_bin_id)

        url = f"{base_url}/api/bin/{current_bin_id}"
        try:
            response = requests.get(url, params=params, timeout=10)
        except requests.exceptions.RequestException as err:
            # Treat a network failure like a bad status code: stop, keep what we have.
            print(f"Failed to retrieve data for bin: {current_bin_id}, Error: {err}")
            break
        if response.status_code != 200:
            print(f"Failed to retrieve data for bin: {current_bin_id}, Status Code: {response.status_code}")
            break

        try:
            data = response.json()
        except ValueError as err:
            print(f"Invalid JSON received for bin: {current_bin_id}, Error: {err}")
            break

        if current_bin_id.startswith(prefix):
            bin_ids.append(current_bin_id)
            print(f"Collected bin ID: {current_bin_id}")

        # Check if we've reached the end bin ID
        if current_bin_id == end_bin_id:
            print(f"Reached the end bin ID: {end_bin_id}")
            break

        next_bin_id = data.get('next_bin_id')
        if not next_bin_id or not next_bin_id.startswith(prefix):
            break

        current_bin_id = next_bin_id

    print(f"Total bin IDs collected: {len(bin_ids)}")
    return bin_ids

def download_file(file_url, output_dir):
    """
    Fetches one file over HTTP and stores it under the basename of its URL.

    :param file_url: The full URL to the file.
    :param output_dir: The directory where the file will be saved; it is
        created when it does not exist yet.
    :return: True when the file was written, False when the request failed.
    """
    destination = os.path.join(output_dir, os.path.basename(file_url))
    os.makedirs(output_dir, exist_ok=True)

    try:
        reply = requests.get(file_url, timeout=10)
        reply.raise_for_status()
        with open(destination, 'wb') as sink:
            sink.write(reply.content)
        print(f"Downloaded: {destination}")
        return True
    except requests.exceptions.RequestException as err:
        # Checked in a fixed order: HTTP errors, then connection errors,
        # then timeouts, then any other request error.
        if isinstance(err, requests.exceptions.HTTPError):
            if reply.status_code == 404:
                message = f"File not found (404): {file_url}"
            else:
                message = f"HTTP error occurred: {err} - URL: {file_url}"
        elif isinstance(err, requests.exceptions.ConnectionError):
            message = f"Connection error occurred: {err}"
        elif isinstance(err, requests.exceptions.Timeout):
            message = f"Timeout error occurred: {err}"
        else:
            message = f"An error occurred: {err}"
        print(message)
        return False

def fetch_lat_lon(bin_id, base_url, dataset):
    """
    Fetches latitude and longitude for a specific bin ID.

    The bin is requested from ``{base_url}/api/bin/{bin_id}`` and the ``lat``
    and ``lng`` fields of the JSON reply are returned. Any failure (network
    error, non-200 status, unparsable reply) is reported with a message and
    signalled by returning None, so one bad bin does not abort a whole batch.

    :param bin_id: The bin ID to fetch lat/lon for.
    :param base_url: The base URL where the dataset is located.
    :param dataset: The dataset name.
    :return: A dictionary with keys ``bin_id``, ``latitude`` and ``longitude``,
        or None when the lookup failed.
    """
    url = f"{base_url}/api/bin/{bin_id}"
    params = {
        "dataset": dataset,
    }
    try:
        response = requests.get(url, params=params, timeout=10)
    except requests.exceptions.RequestException as err:
        print(f"Failed to fetch lat/lon for bin: {bin_id}, Error: {err}")
        return None

    if response.status_code != 200:
        print(f"Failed to fetch lat/lon for bin: {bin_id}, Status Code: {response.status_code}")
        return None

    try:
        data = response.json()
    except ValueError as err:
        print(f"Failed to fetch lat/lon for bin: {bin_id}, Invalid JSON: {err}")
        return None

    return {
        "bin_id": bin_id,
        "latitude": data.get('lat'),
        "longitude": data.get('lng')
    }

def download_autoclass_csvs_and_lat_lon(start_bin_id, end_bin_id, base_url, dataset, instrument, prefix, output_dir, max_workers=5):
    """
    Collects bin IDs, downloads all _class_scores.csv files, and fetches latitude and longitude for each bin.

    Both the CSV downloads and the lat/lon lookups run on the same thread
    pool, so ``max_workers`` bounds the number of simultaneous requests. The
    lat/lon rows are written to ``<output_dir>/<YYYYMMDD>.csv``, where the
    date is taken from ``start_bin_id``; bins whose lookup failed are left
    out of that file.

    :param start_bin_id: The starting bin ID to begin the search.
    :param end_bin_id: The bin ID at which to stop collecting.
    :param base_url: The base URL where the dataset is located.
    :param dataset: The dataset name to filter.
    :param instrument: The instrument name to filter.
    :param prefix: The prefix to match bin IDs against (e.g., "D2024").
    :param output_dir: The directory where the files will be saved.
    :param max_workers: The maximum number of parallel requests.
    """
    bin_ids = collect_bin_ids(start_bin_id, end_bin_id, base_url, dataset, instrument, prefix)
    if not bin_ids:
        print("No bin IDs collected; nothing to download.")
        return

    # The lat/lon file is written here, so do not rely on a download creating the directory.
    os.makedirs(output_dir, exist_ok=True)

    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        download_futures = [
            executor.submit(download_file, f"{base_url}/{dataset}/{bin_id}_class_scores.csv", output_dir)
            for bin_id in bin_ids
        ]
        lat_lon_futures = [
            executor.submit(fetch_lat_lon, bin_id, base_url, dataset)
            for bin_id in bin_ids
        ]

        # Collect results in submission order so the rows follow the bin order.
        download_ok = [future.result() for future in download_futures]
        lat_lon_results = [future.result() for future in lat_lon_futures]

    failed_downloads = sum(1 for ok in download_ok if not ok)
    if failed_downloads:
        print(f"{failed_downloads} of {len(bin_ids)} class score files could not be downloaded")

    # Extract date from start_bin_id for the CSV filename
    date_str = start_bin_id.split('T')[0][1:]  # Takes "DYYYYMMDD" and removes the "D"
    lat_lon_file = os.path.join(output_dir, f"{date_str}.csv")

    # Write lat/lon to CSV, skipping bins whose lookup returned None
    with open(lat_lon_file, 'w', newline='') as csvfile:
        fieldnames = ['bin_id', 'latitude', 'longitude']
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)

        writer.writeheader()
        writer.writerows(result for result in lat_lon_results if result)
    print(f"Latitude and Longitude data saved to {lat_lon_file}")




In [2]:
# STEP 1.2: apply the download helpers
# Pulling the community composition from the dashboard needs a few
# dataset-specific settings. The start and end bin IDs bound the crawl,
# because walking through a whole dataset takes a long time.
base_url = "https://ifcb-data.whoi.edu"
dataset = "NESLTER_broadscale"  # Dataset name, spelled exactly as in the dashboard URL
instrument = "IFCB102"  # Instrument number
start_bin_id = "D20240526T000035_IFCB102"  # A known valid bin ID to start from
end_bin_id = "D20240526T235208_IFCB102"  # The crawl stops at this bin ID
prefix = "D2024"  # Only bin IDs with this prefix are collected
output_dir = r"C:\Users\kurta\OneDrive - UW\Desktop\deneme\2024_NELSTER"  # Directory to save the files

# Download every _class_scores.csv file plus the lat/lon table for the range above
download_autoclass_csvs_and_lat_lon(
    start_bin_id=start_bin_id,
    end_bin_id=end_bin_id,
    base_url=base_url,
    dataset=dataset,
    instrument=instrument,
    prefix=prefix,
    output_dir=output_dir,
)

Collected bin ID: D20240526T000035_IFCB102
Collected bin ID: D20240526T002420_IFCB102
Collected bin ID: D20240526T004805_IFCB102
Collected bin ID: D20240526T011147_IFCB102
Collected bin ID: D20240526T013531_IFCB102
Collected bin ID: D20240526T015914_IFCB102
Collected bin ID: D20240526T022258_IFCB102
Collected bin ID: D20240526T024641_IFCB102
Collected bin ID: D20240526T031026_IFCB102
Collected bin ID: D20240526T033410_IFCB102
Collected bin ID: D20240526T035754_IFCB102
Collected bin ID: D20240526T042139_IFCB102
Collected bin ID: D20240526T044524_IFCB102
Collected bin ID: D20240526T050908_IFCB102
Collected bin ID: D20240526T053253_IFCB102
Collected bin ID: D20240526T062719_IFCB102
Collected bin ID: D20240526T065104_IFCB102
Collected bin ID: D20240526T071448_IFCB102
Collected bin ID: D20240526T073832_IFCB102
Collected bin ID: D20240526T080217_IFCB102
Collected bin ID: D20240526T082602_IFCB102
Collected bin ID: D20240526T084946_IFCB102
Collected bin ID: D20240526T091331_IFCB102
Collected b

In [7]:
# STEP 2.1: spatial grouping for one day
# For every image (one row per 'pid'), keep the class with the highest score.
import os
import pandas as pd

# Define the directory containing the CSV files
if_files_root = r"C:\Users\kurta\OneDrive - UW\Desktop\deneme\2024_NELSTER"

# List to store summary data for each file
all_summaries = []

# sorted() keeps the row order reproducible; os.listdir order is arbitrary
for file_name in sorted(os.listdir(if_files_root)):
    if not (file_name.startswith('D2024') and file_name.endswith('.csv')):
        continue

    file_path = os.path.join(if_files_root, file_name)

    # Read the CSV file into a DataFrame
    df = pd.read_csv(file_path)

    # Every column except the 'pid' identifier is treated as a class score.
    # Missing scores become -inf so they can never win the per-row maximum.
    numeric_df = df.drop(columns=['pid']).fillna(-float('inf'))

    # Winning class name and its score for each image
    summary_df = pd.DataFrame({
        'pid': df['pid'],
        'max_value_column': numeric_df.idxmax(axis=1),
        'max_value': numeric_df.max(axis=1),
    })

    # Extract the date from the file name and remove the leading "D"
    summary_df['date'] = file_name.split('T')[0][1:]

    all_summaries.append(summary_df)

# Fail with a clear message instead of pandas' "No objects to concatenate"
if not all_summaries:
    raise ValueError(
        f"No CSV files starting with 'D2024' were found in {if_files_root}"
    )

# Concatenate all summary DataFrames into one
final_summary = pd.concat(all_summaries, ignore_index=True)

# Print the final summary (or save it to a file if needed)
print(final_summary)

# Optionally, save the final summary to a CSV file
# final_summary.to_csv('final_summary.csv', index=False)


                                   pid      max_value_column  max_value  \
0       D20240526T000035_IFCB102_00001           Dinophyceae     0.5740   
1       D20240526T000035_IFCB102_00002      Pseudo-nitzschia     0.5370   
2       D20240526T000035_IFCB102_00003        Leptocylindrus     0.5815   
3       D20240526T000035_IFCB102_00004      nanoplankton_mix     0.9995   
4       D20240526T000035_IFCB102_00005              detritus     1.0000   
...                                ...                   ...        ...   
306511  D20240526T235208_IFCB102_04761      nanoplankton_mix     0.9970   
306512  D20240526T235208_IFCB102_04762      nanoplankton_mix     0.9720   
306513  D20240526T235208_IFCB102_04763  detritus_transparent     0.5425   
306514  D20240526T235208_IFCB102_04765  detritus_transparent     0.5040   
306515  D20240526T235208_IFCB102_04766           Dinophyceae     0.6640   

            date  
0       20240526  
1       20240526  
2       20240526  
3       20240526  
4   

In [9]:
# Load the lat/lon CSV that was downloaded in STEP 1.2
latlong_df = pd.read_csv(
    r"C:\Users\kurta\OneDrive - UW\Desktop\deneme\2024_NELSTER\20240526.csv"
)
latlong_df.head()

Unnamed: 0,bin_id,latitude,longitude
0,D20240526T000035_IFCB102,41.16341,-70.957489
1,D20240526T002420_IFCB102,41.136379,-70.982971
2,D20240526T004805_IFCB102,41.139118,-70.938995
3,D20240526T011147_IFCB102,41.14257,-70.841156
4,D20240526T013531_IFCB102,41.101849,-70.758606


In [11]:
# STEP 2.2: merge the sampling locations with the group distribution
import pandas as pd

# The text before the first underscore of 'pid' and 'bin_id' is the bin's
# date-time stamp; it is the key the two tables are joined on.
final_summary['pid_date'] = [value.split('_')[0] for value in final_summary['pid']]
latlong_df['bin_date'] = [value.split('_')[0] for value in latlong_df['bin_id']]

# Inner join: only images whose bin has a lat/lon row are kept
merged_df = final_summary.merge(
    latlong_df,
    how='inner',
    left_on='pid_date',
    right_on='bin_date',
)

# Write the merged table next to the downloaded files
output_csv_path = r'C:\Users\kurta\OneDrive - UW\Desktop\deneme\2024_NELSTER\merged_D20240526.csv'
merged_df.to_csv(output_csv_path, index=False)

print("Merging completed and saved to:", output_csv_path)

# Preview the merged table
merged_df.head()


Merging completed and saved to: C:\Users\kurta\OneDrive - UW\Desktop\deneme\2024_NELSTER\merged_D20240526.csv


Unnamed: 0,pid,max_value_column,max_value,date,pid_date,bin_id,latitude,longitude,bin_date
0,D20240526T000035_IFCB102_00001,Dinophyceae,0.574,20240526,D20240526T000035,D20240526T000035_IFCB102,41.16341,-70.957489,D20240526T000035
1,D20240526T000035_IFCB102_00002,Pseudo-nitzschia,0.537,20240526,D20240526T000035,D20240526T000035_IFCB102,41.16341,-70.957489,D20240526T000035
2,D20240526T000035_IFCB102_00003,Leptocylindrus,0.5815,20240526,D20240526T000035,D20240526T000035_IFCB102,41.16341,-70.957489,D20240526T000035
3,D20240526T000035_IFCB102_00004,nanoplankton_mix,0.9995,20240526,D20240526T000035,D20240526T000035_IFCB102,41.16341,-70.957489,D20240526T000035
4,D20240526T000035_IFCB102_00005,detritus,1.0,20240526,D20240526T000035,D20240526T000035_IFCB102,41.16341,-70.957489,D20240526T000035


## STEP 3: Download the PACE data using the HyperCoast package

In [None]:
# Install hypercoast together with its optional "extra" dependencies.
# This is an IPython magic command, so it only works inside a notebook session.
%pip install "hypercoast[extra]"

In [12]:
import pandas as pd
import matplotlib.pyplot as plt
from io import BytesIO
import base64
import hypercoast
from hypercoast import Map

In [13]:
# Load the merged table that STEP 2.2 saved to disk
difcb = pd.read_csv(
    r"C:\Users\kurta\OneDrive - UW\Desktop\deneme\2024_NELSTER\merged_D20240526.csv"
)
# Inspect the column types before converting the date column
difcb.dtypes

pid                  object
max_value_column     object
max_value           float64
date                  int64
pid_date             object
bin_id               object
latitude            float64
longitude           float64
bin_date             object
dtype: object

In [14]:
# Parse the YYYYMMDD values in 'date' (read back from the CSV as integers)
# into pandas datetime objects
difcb['date'] = pd.to_datetime(difcb['date'], format='%Y%m%d')
difcb.head()

Unnamed: 0,pid,max_value_column,max_value,date,pid_date,bin_id,latitude,longitude,bin_date
0,D20240526T000035_IFCB102_00001,Dinophyceae,0.574,2024-05-26,D20240526T000035,D20240526T000035_IFCB102,41.16341,-70.957489,D20240526T000035
1,D20240526T000035_IFCB102_00002,Pseudo-nitzschia,0.537,2024-05-26,D20240526T000035,D20240526T000035_IFCB102,41.16341,-70.957489,D20240526T000035
2,D20240526T000035_IFCB102_00003,Leptocylindrus,0.5815,2024-05-26,D20240526T000035,D20240526T000035_IFCB102,41.16341,-70.957489,D20240526T000035
3,D20240526T000035_IFCB102_00004,nanoplankton_mix,0.9995,2024-05-26,D20240526T000035,D20240526T000035_IFCB102,41.16341,-70.957489,D20240526T000035
4,D20240526T000035_IFCB102_00005,detritus,1.0,2024-05-26,D20240526T000035,D20240526T000035_IFCB102,41.16341,-70.957489,D20240526T000035


In [15]:
import hypercoast

# Step 1: Log in to NASA Earthdata.
# This uses your NASA Earthdata username and password; an account is easy to create.
hypercoast.nasa_earth_login()


In [17]:
# Step 2: Search for Chl-a data within a specific temporal range
temporal = ("2024-05-01", "2024-06-21")  # Adjust date range as needed
results = hypercoast.search_pace_chla(temporal=temporal)

# Step 3: Download the Chl-a data into the "chla" folder.
# NOTE(review): the printed results name the collection PACE_OCI_L3M_CHL_NRT,
# i.e. this looks like the Level-3 processed chl-a product - confirm.
hypercoast.download_nasa_data(results, "chla")

# Print the search results to see what was found
print(results)


QUEUEING TASKS | :   0%|          | 0/47 [00:00<?, ?it/s]

PROCESSING TASKS | :   0%|          | 0/47 [00:00<?, ?it/s]

COLLECTING RESULTS | :   0%|          | 0/47 [00:00<?, ?it/s]

[Collection: {'ShortName': 'PACE_OCI_L3M_CHL_NRT', 'Version': '2.0'}
Spatial coverage: {'HorizontalSpatialDomain': {'Geometry': {'BoundingRectangles': [{'SouthBoundingCoordinate': -90, 'WestBoundingCoordinate': -180, 'NorthBoundingCoordinate': 90, 'EastBoundingCoordinate': 180}]}}}
Temporal coverage: {'RangeDateTime': {'BeginningDateTime': '2024-05-01T00:00:00Z', 'EndingDateTime': '2024-05-01T23:59:59Z'}}
Size(MB): 2.894341468811035
Data: ['https://obdaac-tea.earthdatacloud.nasa.gov/ob-cumulus-prod-public/PACE_OCI.20240501.L3m.DAY.CHL.V2_0.chlor_a.0p1deg.NRT.nc'], Collection: {'Version': '2.0', 'ShortName': 'PACE_OCI_L3M_CHL_NRT'}
Spatial coverage: {'HorizontalSpatialDomain': {'Geometry': {'BoundingRectangles': [{'NorthBoundingCoordinate': 90, 'WestBoundingCoordinate': -180, 'SouthBoundingCoordinate': -90, 'EastBoundingCoordinate': 180}]}}}
Temporal coverage: {'RangeDateTime': {'EndingDateTime': '2024-05-02T23:59:59Z', 'BeginningDateTime': '2024-05-02T00:00:00Z'}}
Size(MB): 2.658063888

In [18]:

from hypercoast import read_pace_chla, pace_chla_to_image

# Step 4: Read the downloaded data files into an array
files = "chla/*.nc"  # Adjust file path to where your data is saved
array = read_pace_chla(files)

# Step 5: Select the single day that matches the IFCB sampling date.
# The date is selected by label: slice(flow_date) would mean
# slice(None, flow_date) and keep every date from the start of the record up
# to flow_date instead of one day.
flow_date = "2024-05-26"  # must match the day taken from the IFCB dashboard
selected_array = array.sel(date=flow_date)

# To get more spatial coverage, average several days instead, for example:
# selected_array = array.sel(date=slice("2024-05-24", "2024-05-28")).mean(dim="date")

# Step 6: Convert the selected array to an image format
single_image = pace_chla_to_image(selected_array)


In [23]:
#PLOT GROUP DISTRIBUTION WITH HYPERCOAST
import matplotlib.pyplot as plt
from io import BytesIO
import base64
import ipywidgets as widgets
from hypercoast import Map

# Function to create genus plot for a given bin_date and location (with log-transformed Y-axis)
def create_genus_plot(species_data):
    """
    Draws a log-scale bar chart and returns it as an inline HTML image tag.

    :param species_data: Table with a 'genus' column (bar labels) and a
        'max_value' column (bar heights).
    :return: An ``<img>`` tag whose source is the chart as a base64 PNG.
    """
    figure, axes = plt.subplots(figsize=(15, 10))
    axes.bar(species_data['genus'], species_data['max_value'])
    axes.set_yscale('log')
    axes.set_title("Genus Distribution (Log Scale)", fontsize=20)
    axes.set_xlabel('Genus', fontsize=18)
    axes.set_ylabel('Max Value (log scale)', fontsize=18)
    axes.tick_params(axis='x', labelrotation=90, labelsize=12)

    # Render into memory and close the figure so figures do not pile up
    png_buffer = BytesIO()
    figure.savefig(png_buffer, format='png', bbox_inches='tight')
    plt.close(figure)

    # Embed the PNG bytes directly in the tag as a data URI
    encoded_png = base64.b64encode(png_buffer.getvalue()).decode('utf-8')
    return f'<img src="data:image/png;base64,{encoded_png}" style="width:100%;">'

# The merged IFCB table stores the winning class name in 'max_value_column'.
# Use a dedicated 'genus' column when one exists, otherwise fall back to it,
# so the cell does not fail with KeyError: 'genus'.
genus_column = 'genus' if 'genus' in difcb.columns else 'max_value_column'

# Group data by latitude, longitude, and bin_date
locations = difcb.groupby(['latitude', 'longitude', 'bin_date'])

# Create the map centered on your stations
m = Map(center=[41.16341, -70.957489], zoom=8)
m.add_basemap("Hybrid")

# Add the Chl-a raster layer to the map
m.add_raster(
    single_image,
    cmap="jet",
    vmin=-1,
    vmax=2,
    layer_name="Chlorophyll a",
    zoom_to_layer=False,
)

# Add genus distribution markers to the map for each unique bin_date and location
for (lat, lon, bin_date), location_data in locations:

    # Sum 'max_value' per genus for this bin_date and location; the column is
    # renamed to 'genus' because create_genus_plot reads that name.
    genus_data = (
        location_data.groupby(genus_column, as_index=False)['max_value']
        .sum()
        .rename(columns={genus_column: 'genus'})
    )

    # Create genus distribution plot for the bin_date and location
    genus_plot_html = create_genus_plot(genus_data)

    # Construct the HTML for the popup
    popup_html = f"<strong>Station Info</strong><br>Location: {lat:.6f}, {lon:.6f}<br>Bin Date: {bin_date}<br>"
    popup_html += "<div>Genus Distribution:</div>"

    # Add the genus distribution plot directly
    popup_html += f'<div id="species_img">{genus_plot_html}</div>'

    # Create an HTML widget from the string and set the size of the popup
    popup_widget = widgets.HTML(value=popup_html)
    popup_container = widgets.Box([popup_widget], layout=widgets.Layout(width='600px', height='500px'))

    # Add the marker to the map
    location = (lat, lon)
    m.add_marker(location=location, popup=popup_container)

# Display the map with Chl-a raster and species markers
m


Map(center=[41.16341, -70.957489], controls=(ZoomControl(options=['position', 'zoom_in_text', 'zoom_in_title',…