In [1]:
# Importando Bibliotecas
import requests
import re
import time
import json
from bs4 import BeautifulSoup

In [2]:
# URL of the MedlinePlus "Infections" index page used as the scraper's starting point
infectiousdiseases_url = "https://medlineplus.gov/infections.html"

In [3]:
def fetch_disease_html(disease_url, timeout=30):
    """Download a page and return its raw HTML bytes.

    Args:
        disease_url: Absolute URL of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout, ``requests.get`` may block indefinitely on a stalled
            connection.

    Returns:
        The response body as bytes (``response.content``).

    Raises:
        SystemExit: With exit status 1 when the request fails with any
            ``requests.exceptions.RequestException`` (connection error,
            timeout, non-success HTTP status, ...). The error is printed
            before the exception is raised.
    """
    try:
        response = requests.get(disease_url, timeout=timeout)
        response.raise_for_status()
        return response.content
    except requests.exceptions.RequestException as e:
        print('Erro ao fazer a requisição:', e)
        # Raise SystemExit explicitly rather than calling ``exit()``: that
        # helper is injected by the ``site`` module for interactive sessions,
        # is not guaranteed to exist, and reported status 0 (success) here.
        raise SystemExit(1) from e

In [4]:
def get_disease_names(infectiousdiseases_url):
    """Return the link texts found on the infectious-diseases index page.

    The page is fetched and parsed, then the stripped text of every ``<a>``
    inside an element with class ``item`` is collected in document order.
    The entry 'Animal Diseases and Your Health' is left out.
    """
    excluded = 'Animal Diseases and Your Health'
    page = BeautifulSoup(fetch_disease_html(infectiousdiseases_url), 'html.parser')
    link_texts = (
        anchor.text.strip()
        for block in page.find_all(class_='item')
        for anchor in block.find_all('a')
    )
    return [name for name in link_texts if name != excluded]

In [5]:
def clean_disease_name(disease):
    """Turn a disease name into the slug used to build its page URL.

    The name is lower-cased and every character that is not a regex word
    character (letter, digit or underscore) is removed, so spaces and
    punctuation disappear.

    Args:
        disease: Disease name as a string.

    Returns:
        The lower-cased name with all non-word characters stripped.
    """
    # Raw string: in a plain literal "\w" is an invalid escape sequence that
    # Python warns about and intends to reject in a future version.
    return re.sub(r"[^\w]", "", disease.lower())

In [6]:
def extract_symptoms(soup):
    """Collect symptom bullet points from a parsed disease page.

    Locates the first ``<h3>`` whose text contains "What are the symptoms of"
    and gathers the text of every ``<li>`` found in the sibling elements that
    follow it. Collection stops at the next sibling ``<h3>``, or at the last
    sibling when no further heading exists.

    Args:
        soup: Parsed page (BeautifulSoup object) to search.

    Returns:
        A list of non-empty, whitespace-stripped text lines taken from the
        list items, in document order. Empty when the page has no symptoms
        heading.
    """
    symptoms_header = soup.find('h3', string=re.compile(r'What are the symptoms of'))
    symptoms = []

    if symptoms_header is None:
        return symptoms

    # Iterate the following siblings directly so the walk ends cleanly when
    # the symptoms section is the last one; stepping manually towards a
    # "next <h3>" that does not exist ran past the final sibling and crashed.
    for sibling in symptoms_header.find_next_siblings():
        if sibling.name == 'h3':
            # The next heading starts a different section.
            break
        for element in sibling.find_all('li'):
            symptoms.extend(line.strip() for line in element.text.splitlines() if line.strip())

    return symptoms

In [7]:
def scrape_disease_symptoms(infectiousdiseases_url):
    """Build a mapping from disease name to its extracted symptom list.

    The disease names are read from the index page at
    ``infectiousdiseases_url``. For each name, the page
    ``https://medlineplus.gov/<cleaned name>.html`` is fetched, parsed and
    passed to ``extract_symptoms``.

    Returns:
        A dict keyed by disease name whose values are the lists returned by
        ``extract_symptoms`` (possibly empty).
    """
    def symptoms_for(name):
        # The page URL is derived from the cleaned disease name.
        page_url = f'https://medlineplus.gov/{clean_disease_name(name)}.html'
        page = BeautifulSoup(fetch_disease_html(page_url), 'html.parser')
        return extract_symptoms(page)

    return {
        name: symptoms_for(name)
        for name in get_disease_names(infectiousdiseases_url)
    }

In [8]:
# Run the scraper starting from the index page URL and keep the resulting name -> symptoms mapping
disease_symptoms = scrape_disease_symptoms(infectiousdiseases_url)

In [9]:
# Collect the diseases for which no symptoms were extracted (empty lists)
empty_symptoms_keys = [name for name in disease_symptoms if not disease_symptoms[name]]

# Show how many diseases ended up without any symptoms
print(len(empty_symptoms_keys))


90


In [13]:
# Keep only the diseases that have at least one extracted symptom
filtered_diseases_symptoms = dict(
    entry for entry in disease_symptoms.items() if entry[1]
)

In [14]:
# Count the entries that remain in the filtered dictionary and report the total
num_elements = len(filtered_diseases_symptoms)
print("Número de elementos no dicionário filtrado:", num_elements)

Número de elementos no dicionário filtrado: 35


In [15]:
# Serialize the filtered dictionary as indented JSON and write it to disk
with open("filtered_diseases_symptoms_Medlineplus.json", "w") as json_file:
    json_file.write(json.dumps(filtered_diseases_symptoms, indent=4))