In [1]:
# Importando Bibliotecas
import requests
import re
import time
import json
from bs4 import BeautifulSoup

In [2]:
def read_json_file(filename):
    """Load a JSON file and return its parsed contents.

    Args:
        filename: Path of the JSON file; it is opened as UTF-8 text.

    Returns:
        The decoded JSON value (for example a dict or a list), or None when
        the file does not exist or does not hold valid JSON. In both failure
        cases an error message is printed before returning.
    """
    try:
        with open(filename, 'r', encoding='utf-8') as handle:
            return json.load(handle)
    except FileNotFoundError:
        message = f"Erro: Arquivo '{filename}' não encontrado."
    except json.JSONDecodeError:
        message = f"Erro: Arquivo '{filename}' não é um JSON válido."

    # Only reached when one of the handlers above ran.
    print(message)
    return None

In [3]:
# Load the disease names to look up. read_json_file returns None (after
# printing an error) when the file is missing or is not valid JSON.
# NOTE(review): presumably a list of names or a dict keyed by name - confirm
# against the JSON file.
disease_names = read_json_file('empty_symptoms_keys_Medlineplus.json')

In [4]:
def clean_disease_name(disease):
    """Normalize a disease name into the slug used to build its page URL.

    The name is lower-cased and every character that is not a word character
    (letter, digit or underscore) is removed, so spaces, hyphens and
    apostrophes disappear: ``"Tick Bites"`` becomes ``"tickbites"``.

    Args:
        disease: Disease name as a string.

    Returns:
        The lower-cased name with all non-word characters stripped.
    """
    # Raw string literal: in a regular string "\w" is an invalid escape
    # sequence, which newer Python versions warn about at compile time.
    return re.sub(r"[^\w]", "", disease.lower())

In [5]:
def fetch_disease_html(link, timeout=30):
    """Download the raw HTML of a disease page.

    Args:
        link: URL of the page to fetch.
        timeout: Seconds to wait for the server before giving up; forwarded
            to ``requests.get``. Without a timeout a stalled connection would
            block the notebook indefinitely.

    Returns:
        The response body as bytes. When the request fails (connection error,
        timeout, or an HTTP error status) the error is printed and empty
        bytes (``b''``) are returned, so the caller can keep processing the
        remaining pages instead of the whole session being terminated.
    """
    try:
        response = requests.get(link, timeout=timeout)
        # Turn 4xx/5xx responses into an exception handled below.
        response.raise_for_status()
    except requests.exceptions.RequestException as e:
        print('Erro ao fazer a requisição:', e)
        # Keep the return type consistent (bytes) rather than calling exit(),
        # which would shut down the kernel on a single failed page.
        return b''
    return response.content

In [6]:
def get_url(html_content):
    """Extract the first link inside the page's ``<div id="section95">``.

    Args:
        html_content: HTML markup of the page (str or bytes). A falsy value
            (None or empty) is treated as a page without any link.

    Returns:
        The ``href`` of the first ``<a>`` element that has one inside the
        ``section95`` div, or an empty string when the content is empty, the
        div is absent, or the div holds no anchor with an ``href``.
    """
    if not html_content:
        return ''

    soup = BeautifulSoup(html_content, 'html.parser')
    section95 = soup.find('div', id='section95')
    if section95 is None:
        return ''

    # href=True restricts the match to anchors that actually carry an href,
    # avoiding a TypeError (no <a> at all) or KeyError (<a> without href).
    anchor = section95.find('a', href=True)
    if anchor is None:
        return ''
    return anchor['href']

In [7]:
def scrape_disease_url(disease_names):
    """Collect, for each disease, the link extracted from its MedlinePlus page.

    For every name the page ``https://medlineplus.gov/<slug>.html`` is
    downloaded (``<slug>`` comes from ``clean_disease_name``) and passed to
    ``get_url``.

    Args:
        disease_names: Iterable of disease names. ``None`` (what
            ``read_json_file`` returns when loading fails) yields an empty
            result instead of raising.

    Returns:
        A dict mapping each disease name to the extracted URL, or to an empty
        string when the page could not be downloaded or has no such link.
    """
    if disease_names is None:
        return {}

    disease_url = {}
    for disease in disease_names:
        link = f'https://medlineplus.gov/{clean_disease_name(disease)}.html'
        html_content = fetch_disease_html(link)
        # A failed download yields no usable content; record an empty URL
        # rather than handing None/empty markup to the parser.
        disease_url[disease] = get_url(html_content) if html_content else ''

    return disease_url

In [8]:
# Build the disease -> URL mapping. This issues one HTTP request to
# medlineplus.gov per entry in disease_names, so the cell needs network
# access.
disease_url = scrape_disease_url(disease_names)


In [9]:
# Keep only the entries whose URL contains the substring "cdc"
disease_url_cdc = dict(
    (name, url) for name, url in disease_url.items() if 'cdc' in url
)

# Display the filtered dictionary
disease_url_cdc

{'Aspergillosis': 'https://www.cdc.gov/fungal/diseases/aspergillosis/symptoms.html',
 'Anthrax': 'https://www.cdc.gov/anthrax/symptoms/index.html',
 'Tick Bites': 'https://www.cdc.gov/ticks/symptoms.html',
 'Plague': 'https://www.cdc.gov/plague/symptoms/',
 'Chickenpox': 'https://www.cdc.gov/chickenpox/about/symptoms.html',
 'Valley Fever': 'https://www.cdc.gov/fungal/diseases/coccidioidomycosis/symptoms.html',
 'Haemophilus Infections': 'https://www.cdc.gov/hi-disease/about/symptoms.html',
 'Rabies': 'https://www.cdc.gov/rabies/symptoms/',
 'Listeria Infections': 'https://www.cdc.gov/listeria/symptoms.html',
 'Tetanus': 'https://www.cdc.gov/tetanus/about/symptoms-complications.html',
 'Measles': 'https://www.cdc.gov/measles/symptoms/signs-symptoms.html',
 'Mumps': 'https://www.cdc.gov/mumps/about/signs-symptoms.html',
 'Norovirus Infections': 'https://www.cdc.gov/norovirus/about/symptoms.html',
 'Pneumococcal Infections': 'https://www.cdc.gov/pneumococcal/about/symptoms-complications.