# Vessel Scraping

#### Globals

In [None]:
!apt-get update
!apt-get install -y chromium-chromedriver
!pip install selenium
!pip install beautifulsoup4

In [None]:
import pandas as pd
from bs4 import BeautifulSoup
from tabulate import tabulate
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from datetime import date

import time
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import Select
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

options = Options()


In [None]:
# Output schema shared by the scrapers in this notebook: it is used below as
# the column list for new DataFrames and as a per-row template (copied, then
# filled in). Every field starts out as None.
final_fields = dict.fromkeys(
    [
        "Vessel Name",
        "IMO Number",
        "Flag",
        "MMSI",
        "Callsign",
        "Vessel Type / Fishing Method",
        "Registration Number",
        "Port of Registry",
        "Dates of Authorization",
        "Owner Name",
        "Owner Address",
        "Operator Name",
        "Operator Address",
        "Source of Information",
        "Source Link",
        "Date of Information",
    ]
)

## Equasis

In [None]:
# Install required libraries
!pip install selenium beautifulsoup4 pandas lxml requests

### Scrape full vessel list from backend

In [None]:


import requests
from bs4 import BeautifulSoup
import pandas as pd
from io import StringIO

# Scrape the paginated Equasis advanced-search result table into one CSV.
#
# Each page is requested by POSTing the same search form with a different
# P_PAGE_SHIP value; the first <table> in each response is parsed with pandas.

# Step 1: Set the URL and headers
url = "https://www.equasis.org/EquasisWeb/restricted/Search?fs=Search"
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
}

# Step 2: Add the active JSESSIONID cookie
cookies = {
    "JSESSIONID": ""  # Replace with your actual cookie
}

MAX_PAGES = 15

# Step 3: Define the form data template
# NOTE(review): the meaning of the individual P_* values (e.g. ship category
# "14", status "S") is not visible here — confirm against the search form.
form_data = {
    "P_PAGE": 1,
    "P_PAGE_COMP": 1,
    "P_PAGE_SHIP": 1,
    "ongletActifSC": "ship",
    "P_ENTREE_HOME_HIDDEN": "",
    "P_IMO": "",
    "P_CALLSIGN": "",
    "P_NAME": "",
    "P_NAME_cu": "on",
    "P_MMSI": "",
    "P_GT_GT": "",
    "P_GT_LT": "",
    "P_DW_GT": "",
    "P_DW_LT": "",
    "P_YB_GT": "",
    "P_YB_LT": "",
    "P_CLASS_rb": "HC",
    "P_CLASS_ST_rb": "CM",
    "P_FLAG_rb": "HC",
    "P_CatTypeShip_rb": "CM",
    "P_CatTypeShip": "14",
    "P_CatTypeShip_p2": "14",
    "P_STATUS": "S",
    "buttonAdvancedSearch": "advancedOk",
}

all_dataframes = []

for page in range(1, MAX_PAGES + 1):  # Adjust the range for the desired number of pages
    print(f"Scraping page {page}...")
    form_data["P_PAGE_SHIP"] = page
    response = requests.post(url, headers=headers, cookies=cookies, data=form_data)
    if response.status_code != 200:
        print(f"Failed to fetch page {page}. Status code: {response.status_code}")
        continue

    soup = BeautifulSoup(response.text, "lxml")

    # Locate the table
    table = soup.find("table")  # Adjust this to target the specific table if there are multiple
    if table is None:
        print(f"Table not found on page {page}")
        continue

    # Convert the HTML table to a pandas DataFrame
    all_dataframes.append(pd.read_html(StringIO(str(table)))[0])

# pd.concat raises an unhelpful "No objects to concatenate" on an empty list,
# which is what happens when every page failed (for example because the
# JSESSIONID cookie above is blank or expired). Fail with an actionable message.
if not all_dataframes:
    raise ValueError(
        "No result tables were scraped from Equasis; "
        "check that the JSESSIONID cookie is set and still valid."
    )

final_dataframe = pd.concat(all_dataframes, ignore_index=True)

final_dataframe.to_csv("scraped_data.csv", index=False)
print("Scraping completed. Data saved to scraped_data.csv.")


In [None]:
# Repair rows whose cells were merged into the "IMO number" column.
#
# A row counts as merged when both "Gross tonnage" and "Flag" are missing.
# In that case the "IMO number" cell holds, in order:
#   - the actual IMO number (first space-separated token),
#   - the ship name (everything up to the first "Fish"),
#   - the type of ship (from "Fish" onwards).
testing_df = final_dataframe.copy()
# Column layout of the scraped table (kept for reference; not used below).
columns = ["IMO number", "Name of ship", "Gross tonnage", "Type of ship", "Year of build", "Flag"]
for idx, row in testing_df.iterrows():
    if not (pd.isna(row["Gross tonnage"]) and pd.isna(row["Flag"])):
        continue

    merged_cell = row["IMO number"]
    if not isinstance(merged_cell, str):
        # Nothing to split (e.g. the IMO cell itself is empty or numeric).
        continue

    imo_num, _, rest = merged_cell.partition(" ")
    testing_df.at[idx, "IMO number"] = imo_num

    # partition() never raises when the marker is missing and keeps everything
    # after the first "Fish", even if the word occurs again later in the text.
    name, marker, type_tail = rest.partition("Fish")
    testing_df.at[idx, "Name of ship"] = name
    if marker:
        testing_df.at[idx, "Type of ship"] = marker + type_tail
    else:
        print(f"Row {idx}: no 'Fish' marker found; left 'Type of ship' unchanged")

testing_df.to_csv("scraped_data_fixed.csv", index=False)

### Convert dataframe to desired format

In [None]:
# Re-shape the cleaned Equasis table into the shared output schema.
#
# The frame is created with the source index and all target columns up front,
# then filled one column at a time. This avoids growing the frame row by row
# with .loc, which reallocates on every new index label.
new_df = pd.DataFrame(index=testing_df.index, columns=list(final_fields))

# Copy over matching columns
new_df["Vessel Name"] = testing_df["Name of ship"]
new_df["IMO Number"] = testing_df["IMO number"]
new_df["Flag"] = testing_df["Flag"]
new_df["Vessel Type / Fishing Method"] = testing_df["Type of ship"]

# Provenance is the same for every row of this source.
new_df["Source of Information"] = "Equasis"
new_df["Source Link"] = "https://www.equasis.org/EquasisWeb/restricted/Search?fs=Search"
new_df["Date of Information"] = date.today()
new_df.head()

### Import from existing csv

In [None]:
# Reload the checkpoint CSV and force the detail columns to object dtype.
# (Presumably so string values can be assigned into columns that read_csv
# inferred as numeric — confirm.)
new_df = pd.read_csv("equasis-checkpoint.csv")
for detail_column in (
    "Callsign",
    "MMSI",
    "Owner Address",
    "Owner Name",
    "Operator Name",
    "Operator Address",
):
    new_df[detail_column] = new_df[detail_column].astype(object)


### Add data from individual vessel search

In [None]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

# Base URL and headers
# Request settings for the per-ship lookup further down in this cell, which
# POSTs one IMO number at a time (as P_IMO) to this URL.
# These rebind the `url`, `headers` and `cookies` names used by the list scrape.
url = "https://www.equasis.org/EquasisWeb/restricted/ShipInfo?fs=Search"
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
}
# Session cookie sent with every request; left blank here on purpose.
cookies = {
    "JSESSIONID": "",  # Replace with your actual cookie
}

def extract_value(soup, label):
    """Return the text of the value shown next to a bold *label*, or None.

    Looks for the first ``<b>`` whose string contains *label*, walks up to its
    enclosing ``<div>`` and reads the text of that div's next sibling ``<div>``.

    Args:
        soup: Parsed page (BeautifulSoup object or any tag with ``find``).
        label: Substring to look for inside the ``<b>`` text, e.g. "MMSI".

    Returns:
        The stripped value text, or None when the label, its enclosing div or
        the sibling value div is missing.
    """
    label_b = soup.find("b", string=lambda s: s and label in s)
    if label_b is None:
        return None

    # A <b> that is not inside any <div> has no parent to walk from.
    label_container = label_b.find_parent("div")
    if label_container is None:
        return None

    value_div = label_container.find_next_sibling("div")
    if value_div is None:
        return None
    return value_div.get_text(strip=True)

def extract_owner_operator(soup):
    """Collect owner and operator details from the table rows of a ship page.

    Every ``<tr>`` with at least four ``<td>`` cells is read as
    (…, role, company, address). The "Registered owner" row fills the Owner
    fields and the "Ship manager/Commercial manager" row fills the Operator
    fields; all other roles are ignored. If a role appears more than once the
    last occurrence wins.

    NOTE: an earlier version of this function stored the two roles the other
    way round, so rows scraped with it have Owner and Operator swapped.

    Args:
        soup: Parsed page (BeautifulSoup object or any tag with ``find_all``).

    Returns:
        Dict with the keys "Owner Name", "Owner Address", "Operator Name" and
        "Operator Address"; values are empty strings when a role is absent.
    """
    ship_info = {"Owner Name": '', 'Owner Address': '', "Operator Name": '', 'Operator Address': ''}
    # Role text on the page -> prefix of the output keys it fills.
    role_to_prefix = {
        "Registered owner": "Owner",
        "Ship manager/Commercial manager": "Operator",
    }

    for row in soup.find_all("tr"):
        cells = row.find_all("td")
        if len(cells) < 4:  # Ensure there are enough columns
            continue

        prefix = role_to_prefix.get(cells[1].get_text(strip=True))
        if prefix is None:
            continue

        ship_info[f"{prefix} Name"] = cells[2].get_text(strip=True)
        ship_info[f"{prefix} Address"] = cells[3].get_text(strip=True)
    return ship_info

def saveCheckpoint(df):
    """Write *df* to equasis-checkpoint.csv without its index and report it."""
    checkpoint_path = "equasis-checkpoint.csv"
    df.to_csv(checkpoint_path, index=False)
    print("Saved progress to equasis-checkpoint.csv")

# Enrich the checkpointed vessel list with per-ship details from Equasis.
#
# Rows up to LAST_IDX are assumed to be done already. For each later row with
# an IMO number, the ship page is fetched and Callsign, MMSI and
# owner/operator fields are written into full_df. At most MAX_RUN requests are
# sent per run.
MAX_RUN = 100  # requests per run; must stay under 500 a day
WITH_PAUSES = True
LAST_IDX = 1420  # last checkpoint index populated with details

# Setup dataframe from most recent run
new_df = pd.read_csv("equasis-checkpoint.csv")
for detail_column in (
    "Callsign",
    "MMSI",
    "Owner Address",
    "Owner Name",
    "Operator Name",
    "Operator Address",
):
    new_df[detail_column] = new_df[detail_column].astype(object)

full_df = new_df.copy()

# Iterate over each IMO number in the DataFrame
count = 0  # requests sent so far in this run
last_requested_index = LAST_IDX
for index, row in full_df.iterrows():
    if index <= LAST_IDX:
        continue  # Skip already processed rows

    # Skip this row if lacking IMO. This is checked before the progress step
    # so that a run of IMO-less rows cannot trigger the save-and-sleep
    # repeatedly while `count` stays on a multiple of 10.
    imo_number = row["IMO Number"]
    if pd.isna(imo_number):
        continue

    # Enforce the budget before sending, so no more than MAX_RUN requests go
    # out regardless of whether earlier responses succeeded.
    if count >= MAX_RUN:
        print("Hit max processing. Exiting. Ended at index: " + str(last_requested_index))
        break

    # After every 10 requests: save progress and optionally pause.
    if MAX_RUN >= 10 and count % 10 == 0 and count != 0:
        full_df.to_csv("equasis-progress.csv", index=False)
        print("Saved progress to equasis-progress.csv")
        print (f"Percent complete: {(count / MAX_RUN) * 100}%")
        if WITH_PAUSES:
            print("Sleeping for 32 seconds...")
            time.sleep(32)

    # A column read back from CSV with gaps comes in as float; send 1234567
    # rather than "1234567.0".
    if isinstance(imo_number, float) and imo_number.is_integer():
        imo_number = int(imo_number)

    form_data = {
        "P_IMO": imo_number,  # Query parameter with IMO number
    }

    # Make the POST request
    response = requests.post(url, headers=headers, cookies=cookies, data=form_data)
    count += 1
    last_requested_index = index

    if response.status_code != 200:
        print(f"Failed to fetch data for IMO number {imo_number}. Status code: {response.status_code}")
        continue

    soup = BeautifulSoup(response.text, "lxml")

    # Scrape Callsign and MMSI (left untouched when not found on the page)
    call_sign = extract_value(soup, "Call Sign")
    mmsi = extract_value(soup, "MMSI")
    if call_sign:
        full_df.at[index, "Callsign"] = call_sign
    if mmsi:
        full_df.at[index, "MMSI"] = mmsi

    # Scrape owner/operator details
    owner_operator = extract_owner_operator(soup)
    for key, val in owner_operator.items():
        full_df.at[index, key] = val

full_df.to_csv("final.csv", index=False)
print("Scraping completed. Data saved to final.csv.")


## IOTC

### Initialize IOTC Mapping Data
e.g. map the codes they use (`T-KHM`) to the data we care about (`Cambodia`, `KHM`)

In [None]:
# Populate tenant ref map
import requests
import json

# Build iotc_tenant_mapping: tenant id -> {"code": ..., "name": <English name>}.

# Endpoint URL
url = "https://rav.iotc.org/domain/tenantrefs"

headers = {
    "Content-Type": "application/json"
}

# Make the GET request
response = requests.get(url, headers=headers)

# Stop here on failure: continuing would either hit an undefined `data` or
# silently reuse a `data` left over from another cell.
if response.status_code != 200:
    raise RuntimeError(f"Failed to fetch data. Status code: {response.status_code}")

# Parse the JSON response
data = response.json()
print(f"Total entries fetched: {len(data)}")

# Convert to map of `id` : {code: '', name: ''}
iotc_tenant_mapping = {
    tenant["id"]: {"code": tenant["code"], "name": tenant["name"]["en"]}
    for tenant in data
}

In [None]:
# Populate tag conversions
# Build iotc_tag_mapping: tag id -> English tag name.

# Endpoint URL
url = "https://rav.iotc.org/domain/tag"

headers = {
    "Content-Type": "application/json"
}

# Make the GET request
response = requests.get(url, headers=headers)

# Stop here on failure: continuing would either hit an undefined `data` or
# silently reuse a `data` left over from another cell.
if response.status_code != 200:
    raise RuntimeError(f"Failed to fetch data. Status code: {response.status_code}")

# Parse the JSON response
data = response.json()
print(f"Total entries fetched: {len(data)}")

# Convert to map of `id` : English name
iotc_tag_mapping = {tag["id"]: tag["name"]["en"] for tag in data}

# Spot check: display one known tag as the cell output.
iotc_tag_mapping["TG-lb-type"]

### Loop and collect data from their backend

In [None]:
def extract_fields(entry):
    """Map one IOTC record onto the notebook's output fields.

    Reads ``entry["details"]`` (name, identifiers, flagstate, vesselType,
    port, contacts) and ``entry["authorization"]`` (from, to). Flag and vessel
    type ids are translated through ``iotc_tenant_mapping`` and
    ``iotc_tag_mapping``.

    Args:
        entry: One record dict from the IOTC search results.

    Returns:
        Dict of output fields. A value is None when the record lacks it or
        when its id is not present in the mapping tables. "Dates of
        Authorization" is always the string "<from> to <to>".
    """
    details = entry.get("details") or {}
    authorization = entry.get("authorization") or {}
    identifiers = details.get("identifiers") or []
    contacts = details.get("contacts") or []

    def identifier(scheme):
        """Return the part after *scheme* of the first identifier containing it."""
        return next((ident.split(scheme)[1] for ident in identifiers if scheme in ident), None)

    def contact_field(contact_type, field):
        """Return *field* of the first contact whose type is *contact_type*."""
        return next((c.get(field) for c in contacts if c.get("type") == contact_type), None)

    # .get() instead of [] so one unknown or missing id does not abort the
    # conversion of the whole result set.
    flag_info = iotc_tenant_mapping.get(details.get("flagstate"))
    # `port` may be present but null, in which case there is no name to read.
    port = details.get("port") or {}

    return {
        "Vessel Name": details.get("name"),
        "IMO Number": identifier("TG-imo-scheme://"),
        "Flag": flag_info["code"] if flag_info else None,
        "MMSI": None,
        "Callsign": identifier("TG-ircs-scheme://"),
        "Vessel Type / Fishing Method": iotc_tag_mapping.get(details.get("vesselType")),
        "Registration Number": identifier("TG-regno-scheme://"),
        "Port of Registry": port.get("name"),
        "Dates of Authorization": f"{authorization.get('from')} to {authorization.get('to')}",
        "Owner Name": contact_field("TG-owner-contact", "name"),
        "Owner Address": contact_field("TG-owner-contact", "address"),
        # NOTE(review): "TG-operatorcontact" has no hyphen before "contact",
        # unlike "TG-owner-contact" — confirm against the API's tag list.
        "Operator Name": contact_field("TG-operatorcontact", "name"),
        "Operator Address": contact_field("TG-operatorcontact", "address"),
    }

In [None]:
import requests
import json

# Page through the IOTC record search and export the extracted fields.

# Endpoint URL
url = "https://rav.iotc.org/domain/record/search"

# Base payload
payload_template = {
    "mode": "current",
    "sort": [
        {
            "field": "timestamp",
            "mode": "desc"
        }
    ],
    "conditions": [],
    "includeDelisted": False,
    "cursor": {
        "page": 1, # replaced per request below
        "pageSize": 100
    },
    "language": "en"
}

headers = {
    "Content-Type": "application/json"
}

# Maximum page to fetch
MAX_PAGES = 53

# To collect all results
all_results = []

# Loop through pages
for page in range(1, MAX_PAGES + 1):
    # Build a fresh payload with its own cursor dict. A plain .copy() is
    # shallow, so setting the page on it would also rewrite the template.
    payload = {
        **payload_template,
        "cursor": {**payload_template["cursor"], "page": page},
    }

    # Make the POST request
    response = requests.post(url, headers=headers, json=payload)

    # Check the response status
    if response.status_code != 200:
        print(f"Failed to fetch page {page}. Status code: {response.status_code}")
        break

    data = response.json()
    page_results = data.get("results", [])  # Assuming results are in a "results" key
    if not page_results:
        # Past the last page of data: no point requesting the remaining pages.
        print(f"Page {page} returned no results; stopping.")
        break

    all_results.extend(page_results)
    print(f"Page {page} fetched successfully.")

# Display the total number of results fetched
print(f"Total results fetched: {len(all_results)}")

print("Converting to dataframe...")
df = pd.DataFrame([extract_fields(entry) for entry in all_results])
df.to_csv("iotc-data.csv", index=False)


## GFCM

In [None]:
# Position of each GFCM grid cell -> output field name.
# None marks a column that is not copied into the output.
index_to_fields = dict(
    enumerate(
        [
            "Flag",
            "Vessel Name",
            "Registration Number",
            "MMSI",
            "IMO Number",
            "Callsign",  # assumes "IRCS" maps to "callsign"
            None,  # licence indicator (skipped)
            None,  # operational status (skipped)
            None,  # LOA (skipped)
            None,  # GT (skipped)
            None,  # Engine Power (skipped)
            "Vessel Type / Fishing Method",
        ]
    )
)

def processVesselRow(row):
    """Convert one GFCM grid row into a dict shaped like ``final_fields``.

    Args:
        row: Selenium element for a grid row; its ``div[role='gridcell']``
            children are read in document order.

    Returns:
        A copy of ``final_fields`` filled from the row's cells via
        ``index_to_fields`` plus the GFCM source details, or None when the
        row has no grid cells or its operational status cell reads "No".
    """
    cells = row.find_elements(By.CSS_SELECTOR, "div[role='gridcell']")
    if not cells:
        print("Empty row")
        return None

    # Extract text from each cell
    cell_texts = [c.text.strip() for c in cells]

    # The first cell may be the "Select Row" control rather than data;
    # drop it so the positions line up with index_to_fields.
    if "Select Row" in cell_texts[0]:
        cell_texts = cell_texts[1:]

    # Skip vessels with operational status "No". The check is done on the
    # cell *text* and after the "Select Row" cell is removed, so that index 7
    # is the same column index_to_fields calls operational status.
    operational_status_index = 7
    if len(cell_texts) > operational_status_index and cell_texts[operational_status_index] == "No":
        return None

    # Now we map them into a new dictionary for this vessel
    row_fields = final_fields.copy()
    for i, val in enumerate(cell_texts):
        dict_key = index_to_fields.get(i)
        if dict_key:
            row_fields[dict_key] = val or None  # if val = '', store None

    row_fields["Source of Information"] = "GFCM"
    row_fields["Source Link"] = "https://www.fao.org/gfcm/data/fleet/avl"
    row_fields["Date of Information"] = date.today()

    return row_fields

In [None]:
# Scrape the GFCM vessel list, which is rendered inside an embedded Power BI
# iframe: scroll the grid until no more rows appear, then convert every row.
URL = "https://www.fao.org/gfcm/data/fleet/avl"
MAX_WAIT = 15
MAX_ENTRIES = 30  # NOTE(review): not referenced anywhere in this cell

options.add_argument('--headless')
options.add_argument('--no-sandbox')
options.add_argument('--disable-dev-shm-usage')
driver = webdriver.Chrome(options=options)

wait = WebDriverWait(driver, MAX_WAIT)

try:
    driver.get(URL)

    iframe_elem = wait.until(
        EC.presence_of_element_located((By.CSS_SELECTOR, "iframe[src*='app.powerbi.com/view']")))

    driver.switch_to.frame(iframe_elem)

    # wait.until returns the located element, so no second lookup is needed.
    scroll_container = wait.until(
        EC.presence_of_element_located((By.CLASS_NAME, "scrollable-cells-container")))

    last_count = 0
    while True:
        # Find all rows so far
        rows = driver.find_elements(By.CSS_SELECTOR, "div[role='row']")
        current_count = len(rows)
        print(f"Currently have {current_count} rows.")

        # Scroll down to trigger loading the next set of rows
        driver.execute_script("arguments[0].scrollTop = arguments[0].scrollHeight", scroll_container)
        # Give the page a moment to load new rows if any
        time.sleep(8)

        # Check if new rows were added
        if current_count == last_count:
            # No change in row count => probably at the end
            print("No new rows loaded. Finished scrolling.")
            break

        last_count = current_count

    # Re-read the rows after the final scroll. This must use the row selector:
    # querying the scroll container class would hand processVesselRow the
    # whole grid as if it were a single row.
    rows = driver.find_elements(By.CSS_SELECTOR, "div[role='row']")

    all_vessels = []
    n = len(rows)

    count = 0
    for row in rows:
        count += 1
        vessel_data = processVesselRow(row)
        if vessel_data:
            all_vessels.append(vessel_data)
        # Report on every 10th row (not on every row that isn't a 10th).
        if count % 10 == 0:
            print (f"Percent complete: {(count / n) * 100}%")
    df = pd.DataFrame(all_vessels)
finally:
    driver.quit()

df.head()
df.to_csv("gfcm-data.csv", index=False)


## CCAMLR


In [None]:
BASE_URL = "https://www.ccamlr.org"

def extractTables(soup):
    """Pull ship name, ship page URL and licence period out of table rows.

    Every ``<tr>`` that has at least one ``<td>`` is inspected. The ship name
    and link come from the ``<a href>`` inside ``td.views-field-title-1``; the
    period comes from the ``date-display-start`` / ``date-display-end`` spans
    inside ``td.views-field-field-licence-period``. Rows that yield neither a
    name nor a period are dropped.

    Args:
        soup: Parsed page (BeautifulSoup object or any tag with ``find_all``).

    Returns:
        DataFrame with the columns "Ship Name", "Ship URL" and "Date Range"
        ("<start> to <end>"); missing parts are empty strings.
    """
    from urllib.parse import urljoin

    data_list = []

    for row in soup.find_all('tr'):
        if not row.find_all('td'):
            continue

        # --- Extract Ship Name and URL ---
        ship_name = ""
        full_url = ""
        ship_td = row.find('td', class_='views-field-title-1')
        link = ship_td.find('a', href=True) if ship_td is not None else None
        if link is not None:
            ship_name = link.get_text(strip=True)
            # urljoin handles site-relative, slash-less and already-absolute
            # hrefs; plain concatenation only works for the first kind.
            full_url = urljoin(BASE_URL, link['href'])

        # --- Extract Date Range ---
        date_range = ""
        period_td = row.find('td', class_='views-field-field-licence-period')
        if period_td is not None:
            start_span = period_td.find('span', class_='date-display-start')
            end_span = period_td.find('span', class_='date-display-end')
            if start_span is not None and end_span is not None:
                start_date = start_span.get_text(strip=True)
                end_date = end_span.get_text(strip=True)
                date_range = f"{start_date} to {end_date}"

        if ship_name or date_range:
            data_list.append({
                "Ship Name": ship_name,
                "Ship URL": full_url,
                "Date Range": date_range
            })

    return pd.DataFrame(data_list, columns=["Ship Name", "Ship URL", "Date Range"])

In [None]:
import time
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import Select
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# Load the CCAMLR vessel authorisation list in headless Chrome and parse its
# table rows into `all_data`.
URL = "https://www.ccamlr.org/en/compliance/list-vessel-authorisations"
MAX_WAIT = 15
MAX_TABLES = 1

for chrome_flag in ('--headless', '--no-sandbox', '--disable-dev-shm-usage'):
    options.add_argument(chrome_flag)
driver = webdriver.Chrome(options=options)

wait = WebDriverWait(driver, MAX_WAIT)

try:
    driver.get(URL)

    # The page counts as loaded once this element is visible.
    season_locator = (By.ID, "edit-ccamlr-season-ccamlr-season-override")
    element = wait.until(EC.visibility_of_element_located(season_locator))

    page_html = driver.page_source
    soup = BeautifulSoup(page_html, "html.parser")
    all_data = extractTables(soup)
finally:
    # Always release the browser, even when loading or parsing failed.
    driver.quit()

In [None]:
# Persist the table built by extractTables (ship name, URL, date range).
all_data.to_csv("ships.csv", index=False)

In [None]:
# Prune to latest entry: keep, for each ship, the row with the latest end date
# (ties broken by the latest start date).

# Work on a copy so the helper columns added below never end up in all_data.
df = all_data.copy()

# 2) Parse 'Date Range' into separate start/end columns
#    Splitting on " to " gives two strings: e.g. "01 Dec 2024" and "30 Nov 2025".
#    reindex guarantees both columns exist even when no row contains " to ".
date_parts = df['Date Range'].str.split(' to ', n=1, expand=True).reindex(columns=[0, 1])
df['start_str'] = date_parts[0]
df['end_str'] = date_parts[1]

# 3) Convert to datetime objects
#    Adjust the format string (%d %b %Y) if the date format is different on your site.
df['start_date'] = pd.to_datetime(df['start_str'], format='%d %b %Y')
df['end_date'] = pd.to_datetime(df['end_str'], format='%d %b %Y')

# 4) Sort by end_date descending, then start_date descending
df_sorted = df.sort_values(by=['end_date', 'start_date'], ascending=[False, False])

# 5) For each ship, keep only the first row of its group, which is the latest
#    one after the descending sort. The helper columns are dropped with a
#    non-inplace call so the result is a new frame rather than a modified slice.
df_unique = (
    df_sorted.groupby('Ship Name', as_index=False)
    .head(1)
    .drop(columns=['start_str', 'end_str', 'start_date', 'end_date'])
)

# 6) Output the pruned data to CSV
df_unique.to_csv('ships_pruned.csv', index=False)

In [None]:
import datetime
import re

def _parse_labelled_lines(vessel_text):
    """Turn "Label:" lines followed by value lines into a {label: value} dict.

    A non-empty line ending in ":" starts a new label; the following lines, up
    to the next label, are joined with single spaces to form its value
    (e.g. "16 Nov 2021", "to", "28 Feb 2022" -> "16 Nov 2021 to 28 Feb 2022").
    Lines that appear before the first label are ignored.
    """
    parsed = {}
    label = None
    value_lines = []

    for raw_line in vessel_text.splitlines():
        line = raw_line.strip()
        if not line:
            continue
        if len(line) > 1 and line.endswith(":"):
            if label is not None:
                parsed[label] = ' '.join(value_lines).strip()
            label = line[:-1].strip()
            value_lines = []
        else:
            value_lines.append(line)

    # Store the last accumulated label/value
    if label is not None:
        parsed[label] = ' '.join(value_lines).strip()
    return parsed


def scrapeVesselPage(shipName, shipUrl):
    """Scrape one CCAMLR vessel page into the notebook's output fields.

    Opens *shipUrl* in a new Chrome driver, waits up to 10 seconds for
    ``div.vessel-content``, reads the "last modified" date when present, and
    parses the labelled lines of the vessel block.

    Args:
        shipName: Name stored as "Vessel Name" in the result.
        shipUrl: Vessel page URL; also stored as "Source Link".

    Returns:
        Dict of output fields. Fields the page does not provide are None.

    Raises:
        Whatever Selenium raises when the page cannot be loaded or the vessel
        block does not appear in time; the driver is closed in every case.
    """
    driver = webdriver.Chrome(options=options)
    try:
        # Inside the try so the browser is released even if loading fails.
        driver.get(shipUrl)

        # Wait until vessel content present
        WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.CSS_SELECTOR, "div.vessel-content"))
        )

        # Get last modified from the element carrying both the 'meta' and
        # 'submitted' classes. find_elements returns an empty list when it is
        # absent, whereas find_element would raise.
        dateOfInfo = None
        pattern = r"This page was last modified on (\d{1,2} [A-Za-z]{3} \d{4})"
        for lastModField in driver.find_elements(By.CSS_SELECTOR, ".meta.submitted")[:1]:
            matchText = re.search(pattern, lastModField.text)
            if matchText:
                dateOfInfo = matchText.group(1)

        html = driver.page_source
    finally:
        driver.quit()

    soup = BeautifulSoup(html, "html.parser")

    vessel_div = soup.select_one("div.vessel-content")
    if not vessel_div:
        print("Could not find the vessel-content div.")
        vessel_text = ""
    else:
        vessel_text = vessel_div.get_text("\n", strip=True)

    fields_raw = _parse_labelled_lines(vessel_text)

    vessel_data = {
        "Vessel Name": shipName,
        "IMO Number": fields_raw.get("IMO Number", None),
        "Flag": fields_raw.get("Flag", None),
        "MMSI": None,
        "Callsign": fields_raw.get("Callsign", None),
        "Vessel Type / Fishing Method": None,
        "Registration Number": fields_raw.get("Registration Number", None),
        "Port of Registry": fields_raw.get("Port of Registry", None),
        "Dates of Authorization": fields_raw.get("Effective Date", None),
        "Owner Name": fields_raw.get("Owner", None),
        "Owner Address": None,
        "Operator Name": fields_raw.get("Operator", None),
        "Operator Address": None,
        "Source of Information": "CCAMLR",
        "Source Link": shipUrl,
        "Date of Information": dateOfInfo
    }

    return vessel_data

In [None]:
# Scrape the detail page of every pruned ship and collect the results.
data_list = [
    scrapeVesselPage(vessel["Ship Name"], vessel["Ship URL"])
    for _, vessel in df_unique.iterrows()
]

ships_enhanced = pd.DataFrame(data_list)

In [None]:
from datetime import date

In [None]:
# Stamp every row with today's date and export.
# NOTE(review): this replaces the "Date of Information" value set by
# scrapeVesselPage (the page's "last modified" date, when one was found)
# for every row — confirm that is intended.
ships_enhanced['Date of Information'] = date.today()
ships_enhanced.to_csv('ships_enhanced.csv', index=False)

## WCPFC
- [AX]: `Vessel Authorization Link for The Western and Central Pacific Fisheries Commission`
  - Use the specific vessel page, found after clicking the search result, e.g. https://vessels.wcpfc.int/vessel/11327
- [AY]: `Date Source Info Last Updated for The Western and Central Pacific Fisheries Commission`
  - Use the *Last Update* field on the specific vessel page, https://vessels.wcpfc.int/vessel/11327
- [AZ]: `Date Source Info Acquired for The Western and Central Pacific Fisheries Commission`



In [None]:
# First get the auth link via the main page's table
# Set it per name & ircs
import requests
from bs4 import BeautifulSoup
import csv

# Walk the WCPFC vessel listing and write (name, IRCS, VID) rows to a CSV.
MAX_PAGE = 62
base_url = "https://vessels.wcpfc.int/browse-rfv"
page_range = range(1, MAX_PAGE)  # Pages 1 to 61

# Output file
output_file = "wcpfc_phase2.csv"

# CSS classes of the three table cells that are exported, in column order.
wanted_cell_classes = (
    "views-field-vsl-vessel-name",
    "views-field-vsl-ircs",
    "views-field-vsl-vslo-vessel-id",
)

with open(output_file, "w", newline="", encoding="utf-8") as file:
    writer = csv.writer(file)
    writer.writerow(["Vessel Name", "IRCS", "VID"])  # Header row

    for page in page_range:
        print(f"Scraping page {page}...")

        # Page 1 is the bare URL; later pages pass ?page=<n>.
        # NOTE(review): if the site numbers its pages from 0, "?page=1" is
        # never requested and the second page is skipped — verify in a browser.
        url = base_url if page <= 1 else f"{base_url}?page={page}"

        response = requests.get(url)
        response.raise_for_status()  # Raise error if request fails

        soup = BeautifulSoup(response.text, "html.parser")

        for row in soup.find_all("tr"):
            cells = [row.find("td", class_=css_class) for css_class in wanted_cell_classes]
            # Only rows that carry all three cells are data rows.
            if all(cell is not None for cell in cells):
                writer.writerow([cell.text.strip() for cell in cells])

print(f"Data scraping completed. Saved to {output_file}.")


#### Visit Each Page and Nab Last Updated Date

In [None]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time

# Visit each vessel page listed in the phase-2 CSV and record the value of the
# "datetime" attribute of its last-changed <time> tag.
input_csv = "wcpfc_phase2.csv"
output_csv = "vessel_data_with_dates.csv"
df = pd.read_csv(input_csv)

# Base URL for vessel pages
base_url = "https://vessels.wcpfc.int/vessel/"

# Column that receives the scraped value
df["Last Updated"] = None

for index, row in df.iterrows():
    vid = row["VID"]
    vessel_url = f"{base_url}{vid}"

    if index % 10 == 0:
        print(f"Percent complete: {(index / len(df)) * 100}%" )

    try:
        response = requests.get(vessel_url)
        response.raise_for_status()  # Raise error if request fails

        soup = BeautifulSoup(response.text, "html.parser")

        # Empty string when the page has no last-changed block.
        last_updated_div = soup.find("div", class_="vessel-version__vsl-changed")
        last_updated_date = last_updated_div.find("time")["datetime"] if last_updated_div else ""
    except Exception as e:
        # Best effort: mark the row and carry on with the next vessel.
        print(f"Error scraping VID {vid}: {e}")
        last_updated_date = "Error"

    df.at[index, "Last Updated"] = last_updated_date

    # Optional: Delay to avoid overwhelming the server
    # time.sleep(.2)

# Save the updated dataframe to a new CSV file
df.to_csv(output_csv, index=False)

print(f"Scraping completed. Updated data saved to {output_csv}.")


Percent complete: 0.0%
Percent complete: 0.3304692663582287%
Percent complete: 0.6609385327164574%
Percent complete: 0.991407799074686%
Percent complete: 1.3218770654329148%
Percent complete: 1.6523463317911435%
Percent complete: 1.982815598149372%
Percent complete: 2.313284864507601%
Percent complete: 2.6437541308658297%
Percent complete: 2.9742233972240584%
Percent complete: 3.304692663582287%
Percent complete: 3.6351619299405153%
Percent complete: 3.965631196298744%
Percent complete: 4.296100462656973%
Percent complete: 4.626569729015202%
Percent complete: 4.95703899537343%
Percent complete: 5.287508261731659%
Percent complete: 5.617977528089887%
Percent complete: 5.948446794448117%
Percent complete: 6.278916060806345%
Percent complete: 6.609385327164574%
Percent complete: 6.939854593522803%
Percent complete: 7.270323859881031%
Percent complete: 7.600793126239259%
Percent complete: 7.931262392597488%
Percent complete: 8.261731658955718%
Percent complete: 8.592200925313946%
Percent c

## SPRFMO
- [BG]: `Authorization Number for the South Pacific Regional Fisheries Management Organisation`
  - Use the *Registration Number* column
- [BH]: `Authorization Start Date for the South Pacific Regional Fisheries Management Organisation`
  - Use the *Authorised Date (Start)* field from the specific vessel page, e.g. https://sprfmo.org/vessels/2738
- [BI]: `Authorization End Date for the South Pacific Regional Fisheries Management Organisation`
  - Use the *Authorised Date (End)* column
-  [BJ]: `Vessel Authorization Link for the South Pacific Regional Fisheries Management Organisation`
  - Use the specific vessel page, found after clicking the search result, e.g. https://sprfmo.org/vessels/2738

So for scraping purposes, we need to scrape the following from the base page results:
- For lookup in `master`
  - `Vessel Name`, `IMO Number`
- For new data
  - `Date Included in SPRFMO Record`, aka the "Auth Start" date when you visit an individual vessel page
  - the `href` behind the `Vessel Name` link, aka the authorization link

In [None]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time

# URL of the website
# NOTE(review): neither name is referenced in the visible part of this cell
# (the page is opened through `url` further down) — confirm they are still needed.
base_url = "https://sprfmo.org/vessels"
MAX_QUERY = 2

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup
import pandas as pd
import time

# Start a headless Chrome session and open the SPRFMO vessel list.
MAX_WAIT = 15
MAX_REQUESTS = 1000

for chrome_flag in ('--headless', '--no-sandbox', '--disable-dev-shm-usage'):
    options.add_argument(chrome_flag)
driver = webdriver.Chrome(options=options)

wait = WebDriverWait(driver, MAX_WAIT)


# URL of the website
url = "https://sprfmo.org/vessels"

# Open the page
driver.get(url)

# Function to scrape data from the current page
def scrape_page(soup):
    """Collect one record per vessel row from the parsed SPRFMO listing.

    Args:
        soup: Parsed page (a ``BeautifulSoup`` object or tag) whose
            ``div.row`` elements are inspected.

    Returns:
        A list of dicts with the keys ``"Vessel Name"``, ``"IMO Number"``,
        ``"Vessel Link"`` and ``"Date Included"``, in that order. Rows that
        lack the ``imo-number-column`` or ``date-column`` cell (or the
        ``<div>`` inside it) are skipped. Rows without a vessel-name anchor
        are kept with ``"N/A"`` as both name and link.
    """
    site_root = "https://sprfmo.org"
    missing = "N/A"

    def column_text(row, css_class, caption):
        # Text of the inner <div> of the given column, or None when the
        # column is not present in this row.
        column = row.find("div", class_=css_class)
        if column is None or column.div is None:
            return None
        text = column.div.text.strip()
        if caption in text:
            # Drop the caption when it is part of the cell text and strip
            # again so the value carries no leftover separator whitespace.
            text = text.split(caption)[1].strip()
        return text

    vessels = []
    for row in soup.find_all("div", class_="row"):
        imo_number = column_text(row, "imo-number-column", "IMO Number")
        date_included = column_text(
            row, "date-column", "Date Included in SPRFMO Record"
        )
        if imo_number is None or date_included is None:
            # Not a vessel row (layout/container div): skip it.
            continue

        name_label = row.find("label", string="Vessel Name")
        name_cell = (
            name_label.find_next_sibling("div") if name_label is not None else None
        )
        anchor = name_cell.find("a") if name_cell is not None else None

        if anchor is None:
            vessel_name = missing
            vessel_link = missing
        else:
            vessel_name = anchor.text.strip()
            # .get() avoids a KeyError on an anchor without href; only build
            # a URL when there is a path to append.
            href = anchor.get("href")
            vessel_link = f"{site_root}{href}" if href else missing

        vessels.append({
            "Vessel Name": vessel_name,
            "IMO Number": imo_number,
            "Vessel Link": vessel_link,
            "Date Included": date_included,
        })

    return vessels

# Expand the listing by clicking "View more" until the page stops growing or
# the request budget (MAX_REQUESTS) is spent, then parse the expanded page
# once and export it. The browser is always closed, even if something fails
# or the cell is interrupted part-way through.
prev_row_count = 0
num_requests = 0
data = []

# A single waiter is reused for every click attempt.
wait = WebDriverWait(driver, 10)

try:
    try:
        while True:
            try:
                # Wait for the "View more" button to be clickable, then click it
                view_more_button = wait.until(
                    EC.element_to_be_clickable((By.XPATH, '//button[text()="View more"]'))
                )
                ActionChains(driver).move_to_element(view_more_button).click(view_more_button).perform()
            except Exception as e:
                # Best effort: a failed click is reported but not fatal. If
                # nothing new was loaded, the row-count check below ends the
                # loop.
                print(f"Error: {e}")

            num_requests += 1

            # Give the new rows time to load
            time.sleep(2)  # Adjust the delay if needed

            # Re-read the page to see how many rows are present now
            soup = BeautifulSoup(driver.page_source, "html.parser")
            rows = soup.find_all("div", class_="row")

            # Stop if no new rows are loaded
            if prev_row_count == len(rows):
                print("No new rows detected, exiting.")
                break

            if num_requests >= MAX_REQUESTS:
                print("Max requests reached, exiting.")
                break

            print (f"Rows: {len(rows)}, request #: {num_requests}")
            prev_row_count = len(rows)
    except Exception as e:
        # Any other failure ends the expansion; whatever has been loaded so
        # far is still scraped below.
        print(f"Error: {e}")

    # Extract the desired data from the fully expanded page
    soup = BeautifulSoup(driver.page_source, "html.parser")
    data.extend(scrape_page(soup))
finally:
    # Close the browser no matter how the steps above ended
    driver.quit()

# Convert the data to a DataFrame and save it to a CSV file
df = pd.DataFrame(data)
df.to_csv("sprfmo_vessels-new.csv", index=False)



In [None]:
def scrape_page(soup):
    """Extract vessel records from the parsed SPRFMO listing via field labels.

    Each ``div.row`` is searched for ``<label>`` elements whose text is
    exactly "Vessel Name", "IMO Number" and "Date Included in SPRFMO Record";
    the value is read from the ``<div>`` that follows the label.

    Args:
        soup: Parsed page (a ``BeautifulSoup`` object or tag).

    Returns:
        A list of dicts with the keys ``"Vessel Name"``, ``"IMO Number"``,
        ``"Vessel Link"`` and ``"Date Included"``, in that order. A field
        whose label is absent from the row is reported as ``"N/A"`` (it is
        never carried over from a previous row). A row in which an IMO or
        date label is present but has no following ``<div>`` is skipped.
    """
    site_root = "https://sprfmo.org"
    missing = "N/A"

    def labelled_text(row, caption):
        # Value next to the label with the given caption; `missing` when the
        # row has no such label. Raises AttributeError when the label exists
        # but no <div> follows it, which the caller treats as a malformed row.
        label = row.find("label", string=caption)
        if label is None:
            return missing
        return label.find_next_sibling("div").text.strip()

    vessels = []
    for row in soup.find_all("div", class_="row"):
        name_label = row.find("label", string="Vessel Name")
        name_cell = (
            name_label.find_next_sibling("div") if name_label is not None else None
        )
        anchor = name_cell.find("a") if name_cell is not None else None

        if anchor is None:
            vessel_name = missing
            vessel_link = missing
        else:
            vessel_name = anchor.text.strip()
            # .get() avoids a KeyError on an anchor without href; only build
            # a URL when there is a path to append.
            href = anchor.get("href")
            vessel_link = f"{site_root}{href}" if href else missing

        try:
            imo_number = labelled_text(row, "IMO Number")
            date_included = labelled_text(row, "Date Included in SPRFMO Record")
        except AttributeError:
            # Skip rows that don't match the expected format
            continue

        vessels.append({
            "Vessel Name": vessel_name,
            "IMO Number": imo_number,
            "Vessel Link": vessel_link,
            "Date Included": date_included,
        })

    return vessels

# Run the label-based scraper over the page snapshot already held in `soup`
# and write the resulting table to disk.
data = [*scrape_page(soup)]

df = pd.DataFrame(data)
df.to_csv("plz-work.csv", index=False)

In [None]:
# Keep the first occurrence of every (IMO Number, Vessel Name) pair ...
unique_vessels = df.drop_duplicates(subset=['IMO Number', 'Vessel Name'])
# ... and discard placeholder rows whose vessel name is 'N/A'.
has_real_name = unique_vessels['Vessel Name'].ne('N/A')
df_deduped = unique_vessels[has_real_name]
df_deduped.to_csv("plz-work-duped.csv", index=False)