# Setup

## Load packages

In [None]:
import pandas as pd
import requests
from google.colab import drive
import json
import time
import re
import os

## Setup Google Drive

Mount Google Drive in Colab. Add path to text file containing API key.

In [None]:
# Make Google Drive available inside the Colab runtime
drive.mount('/content/drive')

# Location on Drive of the text file that holds the API key
api_key_path = "/content/drive/My Drive/Outlaw Ocean/Global Fishing Watch/gfw_key2.txt"

# Load the key, dropping surrounding whitespace such as a trailing newline
with open(api_key_path, "r") as key_file:
    API_TOKEN = key_file.read().strip()

Mounted at /content/drive


## Setup Vessels API

In [None]:
# API settings: v3 vessel search endpoint on the Global Fishing Watch gateway
url = "https://gateway.api.globalfishingwatch.org/v3/vessels/search"

In [None]:
# Request headers: the API key read from Drive is sent as a Bearer token
headers = {
    "Authorization": f"Bearer {API_TOKEN}"
}

In [None]:
# Rate limits this notebook imposes on itself when calling the API
# NOTE(review): presumably chosen to match the API's quota - confirm against the GFW terms
DAILY_LIMIT = 40000
PER_MINUTE_LIMIT = 60
REQUEST_DELAY = 60 / PER_MINUTE_LIMIT  # Seconds per request if calls were evenly paced; not referenced by the cells shown in this notebook

## Load list of vessels to search

In [None]:
# Path on Google Drive of the CSV listing the vessels to search for
file_path = "/content/drive/My Drive/Outlaw Ocean/Global Fishing Watch/Sheet 1-master_2025-02-26_14-28-24.csv"

In [None]:
# Load the vessel list into a DataFrame: try pandas' default utf-8 decoding
# first and fall back to ISO-8859-1 (latin-1) if the bytes are not valid utf-8.
# Each attempt is (extra read_csv arguments, success message, failure message).
read_attempts = (
    ({}, 'File read successfully with default utf-8 encoding', 'Failed to read with utf-8 encoding'),
    ({'encoding': 'ISO-8859-1'}, 'File read successfully with ISO-8859-1 encoding', 'Failed to read with ISO-8859-1 encoding'),
)

for read_kwargs, success_message, failure_message in read_attempts:
    try:
        vessels_df = pd.read_csv(file_path, **read_kwargs)
    except UnicodeDecodeError:
        print(failure_message)
    else:
        print(success_message)
        break
else:
    # Every attempt failed to decode the file
    vessels_df = None

  vessels_df = pd.read_csv(file_path)


File read successfully with default utf-8 encoding


In [None]:
# Sanity check: number of vessel rows loaded from the CSV
print("Total number of records ", len(vessels_df))

Total number of records  133461


In [None]:
# Preview the first rows to check that the columns were parsed as expected
vessels_df.head(12)

Unnamed: 0,Vessel Name,IMO Number,Flag,MMSI,Call Sign,National Registration Number,Port of Registry,External Marking,Owner Name,Owner Address,...,PERU_VesselMonitoring_South_20Feb2025,PERU_QuotaAssigned_NorthCenter_20Feb2025,PERU_QuotaAssigned_South_20Feb2025,PERU_FishingAgreement_NorthCenter_20Feb2025,PERU_FishingAgreement_South_20Feb2025,PERU_Fleet_NorthCenter_20Feb2025,PERU_Fleet_South_20Feb2025,PERU_On IUU List_24Feb2025,PERU_Decree 1392 IUU Infraction,Unnamed: 144
0,00Me02609,,Italy,,,ITA000005691,Messina,00Me02609,,,...,,,,,,,,,,
1,00Rg,,Croatia,,,HRV000002339,Latvia,00-Rg,,,...,,,,,,,,,,
2,1 De Abril,,Portugal,,,PRT000006814,Caminha,C-940-L,,,...,,,,,,,,,,
3,1 De Maio,,Portugal,,,PRT000022238,Aveiro,Ptave-117195-L,,,...,,,,,,,,,,
4,10 De Noviembre,9071301.0,Ar,,,,,,,,...,,,,,,,,,,
5,100,8331560.0,,,,,,,,,...,,,,,,,,,,
6,100022Ti,,Croatia,238861840.0,9A8947,HRV000001193,Tisno,100022-Ti,,,...,,,,,,,,,,
7,100029No,,Croatia,,,HRV000000579,Novi Vinodolski,100029-No,,,...,,,,,,,,,,
8,100049Kb,,Croatia,,,HRV000003083,Karlobag,100049-Kb,,,...,,,,,,,,,,
9,100100Su,,Croatia,238358840.0,9A4276,HRV000001464,Supetar,100100-Su,,,...,,,,,,,,,,


## Create/read file for saving results

In [None]:
# Location on Google Drive of the CSV that accumulates the results
save_path = "/content/drive/My Drive/Outlaw Ocean/Global Fishing Watch/vessel_api_results.csv"

# First run only: create the results file with just a header row
# (all input columns followed by the columns filled in from the API)
results_file_missing = not os.path.exists(save_path)
if results_file_missing:
    api_columns = ["api_shipname", "api_imo", "api_mmsi", "api_geartype", "api_flag"]
    df_columns = [*vessels_df.columns, *api_columns]
    header_only = pd.DataFrame(columns=df_columns)
    header_only.to_csv(save_path, index=False)

## Helper functions for working with the Vessels API

In [None]:
def search_vessel_api(url, headers, vessel_name, timeout=30):
    """
    Searches for vessel information based on the vessel name.

    :param url: The API endpoint URL.
    :param headers: The headers required for the API request.
    :param vessel_name: String containing the 'Vessel Name'.
    :param timeout: Seconds to wait for the server before giving up (default 30).
                    Without a timeout a stalled connection would block forever.
    :return: The JSON response data if successful, or None if the request
             fails (network error, timeout, non-200 status, or a body that is
             not valid JSON).
    """
    # Define query parameters
    params = {
        "query": vessel_name,
        "datasets[0]": "public-global-vessel-identity:latest",
        "includes[0]": "MATCH_CRITERIA",
        "includes[1]": "OWNERSHIP",
        "includes[2]": "AUTHORIZATIONS",
        "limit": 25
    }

    # Make the request; connection problems and timeouts are reported and
    # turned into None so one bad call does not abort a long batch run
    try:
        response = requests.get(url, headers=headers, params=params, timeout=timeout)
    except requests.RequestException as exc:
        print(f"Error: request failed for {vessel_name!r}: {exc}")
        return None

    # Check response status
    if response.status_code != 200:
        print(f"Error: {response.status_code}, {response.text}")
        return None  # Return None if the request fails

    # A 200 response can still carry a body that is not JSON
    try:
        data = response.json()
    except ValueError as exc:
        print(f"Error: could not decode JSON response for {vessel_name!r}: {exc}")
        return None

    print(f"Total search results: {data.get('total', 'Unknown')}, Downloaded: {len(data.get('entries', []))}")
    return data  # Return the response JSON for further use

In [None]:
def is_valid_vessel_name(name):
    """
    Checks if a vessel name is worth sending to the search API.

    Criteria:
    - Must be a string (missing values such as None or NaN are rejected)
    - Must be longer than 3 characters once surrounding whitespace is removed
    - Must contain at least one letter or digit, so strings made only of
      whitespace or punctuation (e.g. "----") are rejected
    - Allowed characters: ASCII letters, digits, whitespace, hyphens, commas,
      dots, and ampersands

    :param name: Candidate vessel name (any type).
    :return: True if the name meets all criteria, otherwise False.
    """
    if not isinstance(name, str):
        return False

    # Padding whitespace must not count towards the minimum length
    trimmed = name.strip()
    if len(trimmed) <= 3:
        return False

    # Require some alphanumeric content, not just separators
    if re.search(r'[a-zA-Z0-9]', trimmed) is None:
        return False

    # Every character must belong to the allowed set
    return re.fullmatch(r'[a-zA-Z0-9\s\-.,&]+', trimmed) is not None

# Quick demonstration of the validator on a mix of good and bad names
vessel_names = [
    "Titanic",
    "00Me02609",
    "Aboubak Saddik Ex Rais Abdelwahid Ex Hadj Besadoune",
    "!@#$%^&*",
    "A",
    "MV Kingfisher",
    "El Mar Azul",
    "123",
    "D#estroyer",
]

# Keep only the names the validator accepts, preserving their order
valid_names = list(filter(is_valid_vessel_name, vessel_names))
print("Valid Vessel Names:", valid_names)

Valid Vessel Names: ['Titanic', '00Me02609', 'Aboubak Saddik Ex Rais Abdelwahid Ex Hadj Besadoune', 'MV Kingfisher', 'El Mar Azul']


In [None]:
def get_vessel_info(record, flag):
    """
    Extract vessel information from a record.

    :param record: The dictionary object containing vessel details. The keys
                   read are "shipname", "imo", "ssvid" and "geartypes"; any of
                   them may be absent.
    :param flag: Label stored as "api_flag" in the result (callers in this
                 notebook pass "registry" or "ais" to record the data source).
    :return: Dictionary with keys 'api_shipname', 'api_imo', 'api_mmsi',
             'api_geartype', 'api_flag'. Missing text fields are None and a
             missing or null gear type list yields an empty string.
    """
    # `or []` also covers a key that is present but explicitly null, which
    # `.get("geartypes", [])` would pass straight to join() and crash on
    geartypes = record.get("geartypes") or []

    return {
        "api_shipname": record.get("shipname"),
        "api_imo": record.get("imo"),
        "api_mmsi": record.get("ssvid"),
        "api_geartype": ", ".join(geartypes),
        "api_flag": flag,
    }

In [None]:
def format_shipname(name):
  """
  Format the vessel name by stripping whitespace and converting to uppercase.

  :param name: The vessel name. Normally a string; None, NaN (the marker
               pandas uses for a missing cell) and other empty values are
               accepted, and non-string values are converted with str().
  :return: Formatted vessel name in uppercase, or an empty string if the
           input is missing or empty.
  """
  if isinstance(name, str):
    return name.strip().upper()

  # NaN is a truthy float, so a plain truthiness test would let it through
  # and crash on .strip(); treat every missing or empty value alike
  if name is None or pd.isna(name) or not name:
    return ''

  return str(name).strip().upper()

In [None]:
def format_imo(imo):
  """
  Format the IMO (International Maritime Organization) number.

  :param imo: The IMO number, which may be a string, float, None, or NaN.
  :return: The IMO number as an integer if it can be parsed as a number,
           otherwise an empty string (missing, empty, zero, non-numeric or
           non-finite input).
  """
  if not imo or pd.isna(imo):
    return ''

  # Going through float() accepts values such as 9071301.0 or "9071301.0",
  # which is how pandas represents integers in a column that has gaps
  try:
    return int(float(imo))
  except (TypeError, ValueError, OverflowError):
    # Non-numeric text or infinity: treat as "no IMO" instead of crashing
    return ''

In [None]:
def find_vessel_match(scraped_record, data):
    """
    Searches for vessel information in API response data using vessel name or IMO number.

    For every entry, the "registryInfo" records are checked first and the
    "selfReportedInfo" (AIS) records second. A record matches when the
    formatted scraped name equals its "shipname" or "nShipname", or when the
    scraped IMO number is present and equals its "imo". The first match wins.

    :param scraped_record: Dictionary containing "Vessel Name", "IMO Number" & other values.
    :param data: API response data containing vessel entries.
    :return: Dictionary with matched vessel information with keys 'api_shipname',
             'api_imo', 'api_mmsi', 'api_geartype', 'api_flag', or None if no
             record matches.
    """
    # Normalize the scraped values for comparison
    scraped_vessel_name = format_shipname(scraped_record["Vessel Name"])
    scraped_imo_number = format_imo(scraped_record["IMO Number"])

    # (key in the API entry, label used in the log line, flag stored in the result)
    sources = (
        ("registryInfo", "Registry Info", "registry"),
        ("selfReportedInfo", "Self-Reported Info", "ais"),
    )

    # `or []` tolerates keys that are present but null
    for entry in data.get("entries") or []:
        for source_key, source_label, source_flag in sources:
            for candidate in entry.get(source_key) or []:
                api_vessel_name = format_shipname(candidate.get("shipname"))
                api_nshipname = format_shipname(candidate.get("nShipname"))
                api_imo = format_imo(candidate.get("imo"))

                # A missing name is formatted as '' on both sides; requiring a
                # non-empty scraped name stops '' from "matching" API records
                # that simply have no name
                name_matches = bool(scraped_vessel_name) and scraped_vessel_name in {
                    api_vessel_name,
                    api_nshipname,
                }
                imo_matches = bool(scraped_imo_number) and scraped_imo_number == api_imo

                if name_matches or imo_matches:
                    print(f"Match found in {source_label}:", api_vessel_name, api_nshipname, api_imo)
                    return get_vessel_info(candidate, source_flag)  # First match wins

    # No record matched
    return None

In [None]:
# Function to determine last processed index
def get_last_processed_index():
    """
    Work out the row of vessels_df at which processing should resume.

    Reads the "Vessel Name" column of the results CSV at save_path, takes the
    name in its last row and looks that name up in vessels_df.

    NOTE(review): the lookup is by name only. When a name occurs more than
    once in vessels_df the highest matching index is used, so rows between
    the true last processed row and that index would be skipped - confirm
    this is acceptable for the data.

    :return: Index of the next row to process as an int, or 0 when there is
             no results file, the file has no data rows yet, the last saved
             name is not found in vessels_df, or the file cannot be read.
    """
    if not (os.path.exists(save_path) and os.path.getsize(save_path) > 0):
        return 0  # If no saved file, start from beginning

    try:
        saved_df = pd.read_csv(save_path, usecols=["Vessel Name"])
        if saved_df.empty:
            return 0  # Header-only file: nothing has been processed yet

        last_vessel = saved_df.iloc[-1]["Vessel Name"]  # Get last vessel name
        matching_rows = vessels_df.index[vessels_df["Vessel Name"] == last_vessel]

        # .max() of an empty index is NaN rather than None, so test for "no
        # match" explicitly instead of returning NaN to the caller
        if len(matching_rows) == 0:
            print(f"Last saved vessel {last_vessel!r} not found in vessel list; starting from row 0")
            return 0

        return int(matching_rows.max()) + 1  # Start from next row
    except Exception as e:
        print(f"Error reading last processed index: {e}")
        return 0  # Start from beginning if any issue

In [None]:
# Show the row index at which the batch search below would resume
get_last_processed_index()

60066

# Search for a single vessel name

In [None]:
# Pick one row of the vessel list to try the search on
# (alternative example row: vessels_df.iloc[767])
scraped_record = vessels_df.iloc[2]
scraped_record

Unnamed: 0,2
Vessel Name,1 De Abril
IMO Number,
Flag,Portugal
MMSI,
Call Sign,
...,...
PERU_Fleet_NorthCenter_20Feb2025,
PERU_Fleet_South_20Feb2025,
PERU_On IUU List_24Feb2025,
PERU_Decree 1392 IUU Infraction,


In [None]:
# Query the API with this record's vessel name; `data` is None if the request fails
data = search_vessel_api(url, headers, scraped_record.get("Vessel Name"))

Total search results: 3719, Downloaded: 25


In [None]:
# Look for a name or IMO match among the downloaded entries
# (the result is None, and the cell shows no output, when nothing matches)
find_vessel_match(scraped_record, data)

# Search a list of vessel names

The cell below searches for all vessel names in the dataframe set up earlier. The code limits the number of searches per minute and stops executing when it reaches the daily limit. All results are written out to a CSV file on your Google Drive.

In [None]:
# Resume after the last vessel already written to the results CSV
start_index = get_last_processed_index()
print(f"Resuming from row {start_index}...")

# API columns appended to every saved row, in the order used by the header
# of the results CSV
api_fields = ("api_shipname", "api_imo", "api_mmsi", "api_geartype", "api_flag")

# Number of API requests actually sent during this run. Rows whose name is
# not searchable cost no request and must not count towards the limits.
requests_today = 0

for i in range(start_index, len(vessels_df)):
    print("**********************")
    vessel_row = vessels_df.iloc[i]  # Full original row
    vessel_name = vessel_row["Vessel Name"]

    # Default API columns (used for an invalid name or when nothing matches)
    vessel_info = dict.fromkeys(api_fields, "")
    vessel_info["api_flag"] = "not found"

    # Query the API only for names that pass validation
    request_sent = False
    if is_valid_vessel_name(vessel_name):
        results = search_vessel_api(url, headers, vessel_name)
        request_sent = True
        requests_today += 1

        if results is None:
            # The request itself failed. Record that explicitly so the row can
            # be told apart from a genuine "not found" and retried later.
            vessel_info["api_flag"] = "api error"
        else:
            vessel_match = find_vessel_match(vessel_row, results)
            if vessel_match:
                vessel_info = vessel_match

    # Combine original row with the API columns, in header order
    combined_data = list(vessel_row) + [vessel_info.get(field, "") for field in api_fields]

    # Append immediately (no header) so progress survives an interrupted run
    result_row = pd.DataFrame([combined_data])
    result_row.to_csv(save_path, mode="a", header=False, index=False, encoding="utf-8")

    print(f"Saved row {i}: {vessel_name}")

    # The limits only need checking after a request was actually made
    if not request_sent:
        continue

    # Respect daily limit
    if requests_today >= DAILY_LIMIT:
        print(f"Reached daily limit of {DAILY_LIMIT} requests. Pausing until next day.")
        break  # Stop for today

    # Respect per-minute limit: pause for a minute after every
    # PER_MINUTE_LIMIT requests
    if requests_today % PER_MINUTE_LIMIT == 0:
        print(f"Waiting to avoid API rate limits... ({requests_today} requests sent)")
        time.sleep(60)


Resuming from row 60066...
**********************
Total search results: 2467, Downloaded: 25
Saved row 60066: Kasih Hati 3
**********************
Total search results: 832, Downloaded: 25
Saved row 60067: Kasih Hati 4
**********************
Total search results: 1319, Downloaded: 25
Saved row 60068: Kasih Hati 5
**********************
Total search results: 33, Downloaded: 25
Saved row 60069: Kasih Hati Ii
**********************
Total search results: 6, Downloaded: 6
Match found in Self-Reported Info: KASIH JAYA KASIHJAYA 
Saved row 60070: Kasih Jaya
**********************
Total search results: 6, Downloaded: 6
Match found in Self-Reported Info: KASIH JAYA KASIHJAYA 
Saved row 60071: Kasih Jaya
**********************
Total search results: 0, Downloaded: 0
Saved row 60072: Kasih Sejahtera
**********************
Total search results: 4, Downloaded: 4
Match found in Self-Reported Info: KASIH SETIA 77 KASIHSETIA77 
Saved row 60073: Kasih Setia 77
**********************
Total search results:

## Note: Higher Confidence Matches
GFW [recommends](https://globalfishingwatch.org/our-apis/assets/2024_Vessel_Viewer_and_APIs_behind_It.pdf) selecting vessels that have `INFO SOURCE = Registry and self-reported (AIS)` from the search results. This indicates that GFW is able to match registry data and AIS. This ensures that GFW has successfully matched registry data with AIS signals, providing the highest confidence that the two associated vessel_ids correspond to the same physical vessel based on publicly available registry information.