In [1]:
import json
from collections import Counter

from bs4 import BeautifulSoup

In [2]:
# Read the locally saved page. The encoding is pinned so the result does not
# depend on the platform's default locale encoding.
# NOTE(review): assumes the file was saved as UTF-8 — confirm.
with open("./static/visa_requirements_by_citizenship.html", "r", encoding="utf-8") as file:
    html = file.read()

# Parse the markup with the lxml backend so the anchors can be queried below.
soup = BeautifulSoup(html, 'lxml')

In [3]:
# Keep only anchors whose href points at a "Visa requirements ..." wiki page.
a_tags = [
    anchor
    for anchor in soup.find_all('a')
    if anchor.get('href', '').startswith('/wiki/Visa_requirements')
]

# One record per anchor: the anchor text (a demonym or territory name) and its
# site-relative href.
links = [{'demonym': anchor.text, 'link': anchor.get('href')} for anchor in a_tags]

In [4]:
# Pin the encoding so non-ASCII country names decode the same way on every
# platform instead of depending on the default locale encoding.
# NOTE(review): assumes the JSON file is UTF-8 encoded — confirm.
with open("../data/un-sovereign-countries.json", "r", encoding="utf-8") as file:
    data = json.load(file)

# One record per country; `link` starts empty and is filled in by later cells.
countries = [{'id': country['alpha2'], 'name': country['name'], 'link': None} for country in data]

In [5]:
countries[:3]

[{'id': 'af', 'name': 'Afghanistan', 'link': None},
 {'id': 'al', 'name': 'Albania', 'link': None},
 {'id': 'dz', 'name': 'Algeria', 'link': None}]

The HTML table was scraped from the "Visa requirements for Brazilian citizens" Wikipedia page.

Brazil's own link has to be added manually: the page does not link to itself, so no matching anchor for Brazil appears in `a_tags`.

In [6]:
def find_country_index(countries, country_id):
    """Return the position of the first entry in `countries` whose 'id' equals `country_id`.

    Args:
        countries: Sequence of dict-like records, each exposing an 'id' key.
        country_id: Identifier to look for.

    Returns:
        The zero-based index of the first matching record, or -1 when no
        record matches.
    """
    matching_positions = (
        position
        for position, entry in enumerate(countries)
        if entry['id'] == country_id
    )
    # -1 is the "not found" sentinel; callers must check for it before indexing.
    return next(matching_positions, -1)

In [7]:
idx = find_country_index(countries, "br")
# find_country_index returns -1 when nothing matches; using that as an index
# would silently write the link onto the *last* country, so fail loudly instead.
if idx == -1:
    raise LookupError("Country code 'br' not found in countries")
countries[idx]['link'] = "https://en.wikipedia.org/wiki/Visa_requirements_for_Brazilian_citizens"

In [8]:
# Display the Brazil record to confirm the manual link assignment above.
countries[idx]

{'id': 'br',
 'name': 'Brazil',
 'link': 'https://en.wikipedia.org/wiki/Visa_requirements_for_Brazilian_citizens'}

### Use OpenAI's API to match each link with a sovereign country

In [9]:
# Prefix for the site-relative hrefs ("/wiki/...") collected in `links`;
# it is prepended to each href when the absolute link is stored on a country.
BASE_URL = 'https://en.wikipedia.org'

In [10]:
from openai import OpenAI
from dotenv import load_dotenv

# Presumably loads variables from a local .env file into the process
# environment — confirm against the python-dotenv documentation.
load_dotenv()

# No credentials are passed explicitly; the client is presumably configured
# from the environment (e.g. an API key variable) — TODO confirm.
client = OpenAI()

In [11]:
def match_country(country: str):
    """Ask the chat model which UN member state a country name or demonym refers to.

    The system prompt lists every id in the module-level `countries` list and
    instructs the model to answer with a JSON object containing the keys
    'is_sovereign' and 'country'.

    Args:
        country: Name of a country or territory, or a demonym.

    Returns:
        The completion object returned by `client.chat.completions.create`.
        Callers read the JSON text from `choices[0].message.content`.
    """
    prompt = "You must only answer in JSON objects. You will receive the name of a country, territory or demonym. You must return a JSON object with this structure: { 'is_sovereign': boolean; 'country': 'XX' | null } where 'XX' is the alpha2 code of the country. If the territory provided is not a sovereign member of the United Nations (i.e.: Puerto Rico, Hong Kong, Taiwan), you must return null for the 'country' key.\n\nThese are the only possible country codes for UN Recognized Countries: " + ','.join(c['id'] for c in countries)
    prompt += "\n\nDo not include three backticks in your response to specify the return format. Instead, just return raw JSON content."

    completion = client.chat.completions.create(
        model="gpt-4o-mini",
        # JSON mode: have the API constrain the reply to a JSON object instead
        # of relying on prompt wording alone (the prompt mentions "JSON", which
        # JSON mode requires).
        response_format={"type": "json_object"},
        # Lowest sampling temperature so repeated runs map a demonym as
        # consistently as possible.
        temperature=0,
        messages=[
            {"role": "system", "content": prompt},
            {
                "role": "user",
                "content": f"Country/demonym: {country}"
            }
        ]
    )

    return completion

In [15]:
# Ask the model to map every scraped demonym to a country id and store the
# absolute link on the matching record in `countries`. Anything that cannot be
# matched is reported and skipped; when several demonyms map to the same
# country the last one wins, and the overwrite is reported.
for link_dict in links:
    demonym = link_dict['demonym']
    link = link_dict['link']

    completion = match_country(demonym)
    raw_content = completion.choices[0].message.content

    try:
        # TypeError covers a reply with no text content (None).
        completion_content = json.loads(raw_content)
    except (json.JSONDecodeError, TypeError):
        print(f"Failed to decode completion. Content: {raw_content}")
        print("Skipping...")
        continue

    # The reply parsed as JSON but is not the object shape the prompt asks for.
    if not isinstance(completion_content, dict) or 'is_sovereign' not in completion_content:
        print(f"Unexpected completion structure. Content: {raw_content}")
        print("Skipping...")
        continue

    if not completion_content['is_sovereign']:
        print(f"[Not sovereign] Skipping country: {demonym}")
        continue

    country_code = completion_content.get('country')
    # Also reject non-string codes, which would otherwise crash on .lower().
    if not country_code or not isinstance(country_code, str):
        print(f"[No country code] Skipping country: {demonym}")
        continue

    country_code = country_code.lower()
    idx = find_country_index(countries, country_code)
    if idx == -1:
        print(f"Country code not found: {country_code} - {demonym}")
        continue

    new_link = f"{BASE_URL}{link}"
    previous_link = countries[idx]['link']
    # Surface conflicting matches instead of overwriting silently.
    if previous_link and previous_link != new_link:
        print(f"[Overwriting link] {countries[idx]['name']}: {previous_link} -> {new_link}")

    countries[idx]['link'] = new_link

[Not sovereign] Skipping country: Saint Helena
[Not sovereign] Skipping country: Somaliland
[Not sovereign] Skipping country: Anguillan
[Not sovereign] Skipping country: Bermudian
[Not sovereign] Skipping country: British Virgin Islands
[Not sovereign] Skipping country: Caymanian
[Not sovereign] Skipping country: Greenlandic
[Not sovereign] Skipping country: Montserratian
[Not sovereign] Skipping country: Turks and Caicos Islands
[Not sovereign] Skipping country: Abkhaz
[Not sovereign] Skipping country: Hong Kongese
[Not sovereign] Skipping country: Macanese
[Not sovereign] Skipping country: Northern Cypriot
[Not sovereign] Skipping country: Palestinian
[Not sovereign] Skipping country: South Ossetian
[Not sovereign] Skipping country: Taiwanese
[Not sovereign] Skipping country: Abkhaz
[Not sovereign] Skipping country: BOTC
[Not sovereign] Skipping country: BN(O)
[Not sovereign] Skipping country: BOC
[Not sovereign] Skipping country: Faroese
[Not sovereign] Skipping country: Greenlandic

In [18]:
countries[:3]

[{'id': 'af',
  'name': 'Afghanistan',
  'link': 'https://en.wikipedia.org/wiki/Visa_requirements_for_Afghan_citizens'},
 {'id': 'al',
  'name': 'Albania',
  'link': 'https://en.wikipedia.org/wiki/Visa_requirements_for_Albanian_citizens'},
 {'id': 'dz',
  'name': 'Algeria',
  'link': 'https://en.wikipedia.org/wiki/Visa_requirements_for_Algerian_citizens'}]

In [20]:
# Persist the country records (id, name, link) as a single JSON array.
with open("../data/scraping-links.json", "w") as file:
    json.dump(countries, file)

Check whether any link was assigned to more than one country. Any link listed below must be corrected manually in the output file:

In [24]:
# Count how often each link value occurs across the country records. The
# earlier `links` list (anchor records) is deliberately left untouched so the
# matching cell can still be re-run after this one.
link_counts = Counter(country['link'] for country in countries)

# Link values shared by more than one country.
repeated_links = {link for link, count in link_counts.items() if count > 1}
repeated_links

{'https://en.wikipedia.org/wiki/Visa_requirements_for_Republic_of_the_Congo_citizens'}