In [1]:
import os
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin

# Folder that receives the plain-text dump of every downloaded page.
DATA_DIR = "data/raw"
# exist_ok=True keeps this cell safe to re-run when the folder already exists.
os.makedirs(DATA_DIR, exist_ok=True)

def download_html(url, filename, timeout=10):
    """Fetch a web page and save its visible text inside ``DATA_DIR``.

    The HTML is parsed with BeautifulSoup's ``html.parser``. ``script``,
    ``style``, ``header``, ``footer`` and ``nav`` elements are removed, and
    the remaining stripped text fragments are joined with newlines and
    written as UTF-8.

    Args:
        url: Address of the page to download.
        filename: Name of the output file, created inside ``DATA_DIR``.
        timeout: Timeout in seconds handed to ``requests.get``. Defaults to
            10, the value that used to be hard-coded.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.RequestException: Any other failure raised by
            ``requests.get`` (connection error, timeout, ...).
    """
    response = requests.get(url, timeout=timeout)
    # Fail loudly on 4xx/5xx. Without this check the body of an error page
    # (e.g. "403 Forbidden") would be stripped and saved as if it were the
    # real article.
    response.raise_for_status()
    soup = BeautifulSoup(response.text, "html.parser")
    # Drop elements that carry no article content (code, CSS, page chrome).
    for tag in soup(["script", "style", "header", "footer", "nav"]):
        tag.extract()
    text = soup.get_text(separator="\n", strip=True)
    with open(os.path.join(DATA_DIR, filename), "w", encoding="utf-8") as f:
        f.write(text)

# Maps each output filename to the URL of the page it is scraped from.
# The keys are used verbatim as filenames, so their spelling is kept exactly
# as-is.
sources = {
    # World Health Organization
    "WHO_STD.txt": "https://www.who.int/en/news-room/fact-sheets/detail/sexually-transmitted-infections-(stis)",
    # CDC
    "CDC_Syphilis.txt": "https://www.cdc.gov/syphilis/about/index.html",
    "CDC_Gonorrhea.txt": "https://www.cdc.gov/gonorrhea/about/index.html",
    "CDC_HPV.txt": "https://www.cdc.gov/sti/about/about-genital-hpv-infection.html",
    "CDC_OralSex.txt": "https://www.cdc.gov/sti/about/about-sti-risk-and-oral-sex.html",
    "CDC_HIV1.txt": "https://www.cdc.gov/hiv/about/index.html",
    "CDC_HIV2.txt": "https://www.cdc.gov/hiv/causes/index.html",
    "CDC_HIV3.txt": "https://www.cdc.gov/hiv/prevention/index.html",
    # Cleveland Clinic
    "CleavelandClinic_HIV.txt": "https://my.clevelandclinic.org/health/diseases/4251-hiv-aids",
    "CleavelandClinic_HSV.txt": "https://my.clevelandclinic.org/health/diseases/22855-herpes-simplex",
    "CleavelandClinic_HPV.txt": "https://my.clevelandclinic.org/health/diseases/11901-hpv-human-papilloma-virus",
    "CleavelandClinic_Chlamydia.txt": "https://my.clevelandclinic.org/health/diseases/4023-chlamydia",
    "CleavelandClinic_Syphilis.txt": "https://my.clevelandclinic.org/health/diseases/4622-syphilis",
    "CleavelandClinic_Gonorrhea.txt": "https://my.clevelandclinic.org/health/diseases/4217-gonorrhea",
    # Wikipedia
    "Wikipedia_SexualHealth.txt": "https://en.wikipedia.org/wiki/Sexual_and_reproductive_health",
}

# Fetch every configured page in turn, announcing each URL before the request.
for filename in sources:
    url = sources[filename]
    print(f"Downloading {url}...")
    download_html(url, filename)


Downloading https://www.who.int/en/news-room/fact-sheets/detail/sexually-transmitted-infections-(stis)...
Downloading https://www.cdc.gov/syphilis/about/index.html...
Downloading https://www.cdc.gov/gonorrhea/about/index.html...
Downloading https://www.cdc.gov/sti/about/about-genital-hpv-infection.html...
Downloading https://www.cdc.gov/sti/about/about-sti-risk-and-oral-sex.html...
Downloading https://www.cdc.gov/hiv/about/index.html...
Downloading https://www.cdc.gov/hiv/causes/index.html...
Downloading https://www.cdc.gov/hiv/prevention/index.html...
Downloading https://my.clevelandclinic.org/health/diseases/4251-hiv-aids...
Downloading https://my.clevelandclinic.org/health/diseases/22855-herpes-simplex...
Downloading https://my.clevelandclinic.org/health/diseases/11901-hpv-human-papilloma-virus...
Downloading https://my.clevelandclinic.org/health/diseases/4023-chlamydia...
Downloading https://my.clevelandclinic.org/health/diseases/4622-syphilis...
Downloading https://my.clevelandcli

In [2]:
import json
# Persist the filename -> URL mapping as indented JSON, keeping non-ASCII
# characters unescaped in the UTF-8 file.
with open("sources.json", mode="w", encoding="utf-8") as f:
    f.write(json.dumps(sources, ensure_ascii=False, indent=2))