In [1]:
!pip install requests beautifulsoup4




In [2]:
import requests

# First page of the estimated-arrivals listing (pid=313 is the port id in the query).
url_page1 = "https://www.myshiptracking.com/estimate?sort=TIME&page=1&pid=313"

# A timeout keeps the cell from hanging indefinitely if the server never answers.
response = requests.get(url_page1, headers={"User-Agent": "Mozilla/5.0"}, timeout=30)
# Fail loudly on 4xx/5xx so an error page is never mistaken for real data downstream.
response.raise_for_status()
html_text = response.text

# Show 1000 characters from the middle of the page (helps detect JS or blocking)
mid = len(html_text) // 2
print(html_text[mid:mid + 1000])


/cma-cgm-innovation-mmsi-256592000-imo-9951525">CMA CGM INNOVATION</a> [MT]</span></td>
												<td style="text-align: left;"><a class="pflag" href="/ports/port-of-new-york-in-usa-id-313"><img class="flag_line" src="/icons/flags2/24/US.png" alt="Flag" title="USA"/> NEW YORK</a></td>
												<td><span class='tooltip-delay' title='
			<div class="cs-table">	<div class="table-row">
					<div class="col">LT</div>
					<div class="col">2025-12-08 <b>01:44</b></div>
				</div>
				<div class="table-row">
					<div class="col">UTC</div>
					<div class="col">2025-12-08 <b>06:44</b></div>
				</div>
			</div>
			'>2025-12-08 <b>06:44</b></span></td>
											</tr>
																			<tr>
												<td class="d-none d-sm-table-cell">367165450</td>
												<td style="text-align: left;"><img style="width: 22px;padding-right: 0.5rem;" src="/icons/icon3_511.png"><span class="padding_18"><a href="/vessels/treasure-coast-mmsi-367165450-imo-9448267">TREASURE COAST</a> [


In [3]:
from bs4 import BeautifulSoup

# Parse the HTML fetched for page 1 and pull one record per vessel row.
soup = BeautifulSoup(html_text, "html.parser")

rows = soup.select("table.table tbody tr")  # selects each vessel row

sample_data = []

for row in rows:
    cols = row.find_all("td")
    if len(cols) < 4:
        continue  # skip malformed rows

    # Vessel name (link text) and the bracketed flag code share the 2nd <td>
    vessel_cell = cols[1]
    vessel_link = vessel_cell.find("a")
    if vessel_link is None:
        # find() returns None when there is no <a>; skip instead of crashing on .text
        continue
    vessel_name = vessel_link.text.strip()

    # Country code appears inside brackets, e.g. "[US]".
    # Search for "]" only after the "[" so a stray earlier "]" cannot
    # produce an empty or wrong slice.
    country = vessel_cell.text
    start = country.find("[")
    end = country.find("]", start + 1) if start != -1 else -1
    country_of_origin = country[start + 1:end] if end != -1 else None

    # Arrival estimate is the 4th <td>; join the stripped text fragments with
    # single spaces, the same way the multi-page scrape does.
    arrival_cell = cols[3]
    arrival_estimate = " ".join(arrival_cell.stripped_strings)

    sample_data.append({
        "vessel": vessel_name,
        "arrival_estimate": arrival_estimate,
        "country_of_origin": country_of_origin
    })

sample_data[:5]  # preview first 5 rows


[{'vessel': 'RAINBOW CRUISE',
  'arrival_estimate': '2025-12-07 19:44',
  'country_of_origin': 'US'},
 {'vessel': 'BARU',
  'arrival_estimate': '2025-12-07 20:22',
  'country_of_origin': 'NO'},
 {'vessel': 'JOSEPHINE',
  'arrival_estimate': '2025-12-07 20:30',
  'country_of_origin': 'US'},
 {'vessel': 'SAINT EMILION',
  'arrival_estimate': '2025-12-07 21:54',
  'country_of_origin': 'US'},
 {'vessel': 'LONG ISLAND',
  'arrival_estimate': '2025-12-07 22:35',
  'country_of_origin': 'US'}]

In [4]:
# Scrape the first two result pages into a single list of vessel records.
all_data = []

for page in [1, 2]:
    print(f"Scraping page {page}...")

    url = f"https://www.myshiptracking.com/estimate?sort=TIME&page={page}&pid=313"
    # A timeout prevents an unresponsive server from stalling the whole loop.
    response = requests.get(url, headers={"User-Agent": "Mozilla/5.0"}, timeout=30)
    # Stop on 4xx/5xx rather than silently parsing an error page into zero rows.
    response.raise_for_status()
    soup = BeautifulSoup(response.text, "html.parser")

    rows = soup.select("table.table tbody tr")

    for row in rows:
        cols = row.find_all("td")
        if len(cols) < 4:
            continue  # skip malformed rows

        vessel_cell = cols[1]
        vessel_link = vessel_cell.find("a")
        if vessel_link is None:
            # find() returns None when there is no <a>; skip instead of crashing on .text
            continue
        vessel_name = vessel_link.text.strip()

        # Extract country code inside [XX]; look for "]" only after the "["
        # so a stray earlier "]" cannot produce an empty or wrong slice.
        text = vessel_cell.get_text(" ", strip=True)
        start = text.find("[")
        end = text.find("]", start + 1) if start != -1 else -1
        country = text[start + 1:end] if end != -1 else None

        # Arrival estimate: join the cell's stripped text fragments with single spaces
        arrival = " ".join(cols[3].stripped_strings)

        all_data.append({
            "vessel": vessel_name,
            "arrival_estimate": arrival,
            "country_of_origin": country
        })

print(f"Total rows scraped: {len(all_data)}")
all_data[:5]


Scraping page 1...
Scraping page 2...
Total rows scraped: 58


[{'vessel': 'RAINBOW CRUISE',
  'arrival_estimate': '2025-12-07 19:44',
  'country_of_origin': 'US'},
 {'vessel': 'BARU',
  'arrival_estimate': '2025-12-07 20:22',
  'country_of_origin': 'NO'},
 {'vessel': 'JOSEPHINE',
  'arrival_estimate': '2025-12-07 20:30',
  'country_of_origin': 'US'},
 {'vessel': 'SAINT EMILION',
  'arrival_estimate': '2025-12-07 21:54',
  'country_of_origin': 'US'},
 {'vessel': 'LONG ISLAND',
  'arrival_estimate': '2025-12-07 22:35',
  'country_of_origin': 'US'}]

In [7]:
import csv

# Destination for the complete (unfiltered) scrape.
output_file = "ships_estimates_fixed.csv"

# Column order of the CSV; also the keys copied out of every scraped record.
fieldnames = ["vessel", "arrival_estimate", "country_of_origin"]

# Mode "w" truncates any previous file, so the CSV is rebuilt from scratch.
with open(output_file, "w", newline="", encoding="utf-8") as csv_file:
    writer = csv.DictWriter(csv_file, fieldnames=fieldnames)
    writer.writeheader()
    # Project each record onto the three known columns before writing it.
    writer.writerows(
        {key: record[key] for key in fieldnames}
        for record in all_data
    )

print("Saved:", output_file)


Saved: ships_estimates_fixed.csv


### Now that I know the scraper works, I'm going to filter for flags of convenience (FOCs).
## Most FOC-registered ships fly the flags of Panama, the Marshall Islands, the Bahamas, Liberia, Malta, Antigua and Barbuda, or Cyprus, so I'm choosing to filter for only these top 7.

In [8]:
# Country names kept by the filter below; a vessel is retained only when the
# name looked up from its flag code appears in this list.
allowed_countries = [
    "Panama", "Marshall Islands", "The Bahamas", "Liberia",
    "Malta", "Antigua and Barbuda", "Cyprus",
]


In [9]:
# Two-letter flag code (as scraped from the bracketed text) -> country name.
code_to_name = dict(
    PA="Panama",
    MH="Marshall Islands",
    BS="The Bahamas",
    LR="Liberia",
    MT="Malta",
    AG="Antigua and Barbuda",
    CY="Cyprus",
)

In [10]:
# Keep only the vessels whose flag code maps to an allowed country, and store
# the full country name in place of the two-letter code.
filtered_data = []

for row in all_data:
    # .get() yields None for codes missing from the mapping (or a missing code).
    country_name = code_to_name.get(row["country_of_origin"])

    if country_name not in allowed_countries:
        continue

    # Build a fresh dict so the records in all_data keep their original code.
    filtered_data.append({**row, "country_of_origin": country_name})

print("Total filtered rows:", len(filtered_data))
filtered_data[:5]

Total filtered rows: 31


[{'vessel': 'CLOVER ACE',
  'arrival_estimate': '2025-12-08 06:32',
  'country_of_origin': 'Liberia'},
 {'vessel': 'CMA CGM INNOVATION',
  'arrival_estimate': '2025-12-08 06:44',
  'country_of_origin': 'Malta'},
 {'vessel': 'OCEAN GLSR',
  'arrival_estimate': '2025-12-08 09:26',
  'country_of_origin': 'Malta'},
 {'vessel': 'EUROPA 2',
  'arrival_estimate': '2025-12-09 00:52',
  'country_of_origin': 'Malta'},
 {'vessel': 'RDO FORTUNE',
  'arrival_estimate': '2025-12-09 07:08',
  'country_of_origin': 'Liberia'}]

In [11]:
import csv

# Destination for the filtered subset of vessels.
output_file = "filtered_ships.csv"

# Column order of the CSV; every filtered record carries exactly these keys.
fieldnames = ["vessel", "arrival_estimate", "country_of_origin"]

with open(output_file, "w", newline="", encoding="utf-8") as csv_file:
    writer = csv.DictWriter(csv_file, fieldnames=fieldnames)
    writer.writeheader()
    # Records are already flat dicts keyed by the column names, so write them as-is.
    writer.writerows(filtered_data)

print("Saved:", output_file)


Saved: filtered_ships.csv
