In [9]:
import pandas as pd
import time
import requests
from bs4 import BeautifulSoup
import json

In [10]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.edge.service import Service
from selenium.webdriver.edge.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import logging
logging.basicConfig(level=logging.INFO)

In [11]:
from dotenv import load_dotenv
import google.generativeai as genai
from google.api_core.exceptions import ResourceExhausted
import google.generativeai as genai
import os
from dotenv import load_dotenv
import google.generativeai as genai

In [12]:
def initialize_edge_driver():
    """Initializes and returns an Edge WebDriver instance with specified options."""
    edge_options = Options()
    edge_options.add_argument("--no-sandbox")
    edge_options.add_argument("--disable-dev-shm-usage")
    edge_options.add_argument("--headless")
    service = Service(r'C:/Users/PedroLustosa/OneDrive - Grupo Portfolio/Documentos/edgedriver_win64/msedgedriver.exe')
    driver = webdriver.Edge(service=service, options=edge_options)
    return driver

def get_soup_xpath(url, xpath):
    """
    Fetches a web page and returns a BeautifulSoup object parsed from the page's HTML.
    
    Args:
        url (str): The URL of the web page to fetch.
        xpath (str): The XPath of the element to wait for before parsing the HTML.

    Returns:
        BeautifulSoup: A BeautifulSoup object containing the parsed HTML.
    """
    driver = None
    soup = None
    
    try:
        driver = initialize_edge_driver()
        driver.get(url)
        
        WebDriverWait(driver, 20).until(
            EC.presence_of_element_located((By.XPATH, xpath))
        )
        
        html = driver.page_source
        soup = BeautifulSoup(html, "html.parser")
    
    except Exception as e:
        logging.error(f"An error occurred: {e}")
    
    finally:
        if driver:
            driver.quit()
    
    return soup

def extract_text_by_class(soup, class_name):
    """
    Extracts and concatenates text from specified HTML elements within a given class in a BeautifulSoup object.

    Args:
        soup (BeautifulSoup): The BeautifulSoup object containing the parsed HTML.
        class_name (str): The class name of the container element to extract text from.

    Returns:
        str: A string containing concatenated text from the specified HTML elements.
    """
    if soup:
        container = soup.find(class_=class_name)
        if container:
            elements = container.find_all(['h1', 'h2', 'h3', 'p', 'li', 'ul'])
            text = '\n'.join(element.get_text(strip=True) for element in elements)
        else:
            text = ""
    else:
        text = ""
    return text

def generate_text(prompt, model):
    """
    Generates text based on a given prompt using a specified model.

    Args:
        prompt (str): The prompt to generate text from.
        model: The model to use for generating the text.

    Returns:
        dict: A dictionary parsed from the JSON response generated by the model.
    """
    json_doencas = {}
    
    if prompt:
        try:
            response = model.generate_content(prompt)
            response_text = response.text.replace("```", "").replace("json\n", "").strip()
            json_doencas = json.loads(response_text)
        except json.JSONDecodeError as e:
            logging.error(f"Error decoding JSON: {e}")
        except Exception as e:
            logging.error(f"An error occurred: {e}")
    
    return json_doencas


In [13]:
soup = get_soup_xpath('https://www.cdc.gov/health-topics.html#A', '//*[@id="cdc-atozlist"]/div/div[3]/div/ul/li[1]/a')
link_hrefs  = soup.select('#cdc-atozlist div div ul li a')

In [14]:
# Carrega as variáveis de ambiente do arquivo .env
load_dotenv()

# Acessa a chave de API do Gemini
GEMINI_API_KEY  = os.getenv("GEMINI_API_KEY")
genai.configure(api_key=GEMINI_API_KEY)

# Define o modelo
generation_config = {
  "temperature": 0.4,
}

safety_settings={
    'HATE': 'BLOCK_NONE',
    'HARASSMENT': 'BLOCK_NONE',
    'SEXUAL' : 'BLOCK_NONE',
    'DANGEROUS' : 'BLOCK_NONE'
    }

instruction = """Você está atuando como um cientista de dados sênior. Sua tarefa é extrair informações sobre doenças e sintomas em  português Brasil a partir do texto fornecido. Identifique e liste cada doença junto com seus sintomas associados conforme descrito no texto. Formate o resultado como um objeto JSON.

Especificações:
Identificação de Doenças: Enumere cada doença mencionada no texto.
Identificação de Sintomas: Para cada doença, liste os sintomas associados conforme descrito no texto.
Formato JSON: Organize as informações no formato JSON conforme o exemplo abaixo.

Exemplo de Formatação JSON:
{
  "HIV": [
    "Febre",
    "Calafrios",
    "Rash",
    "Suores noturnos (sudoração intensa durante o sono)",
    "Dores musculares",
    "Dor de garganta",
    "Fadiga",
    "Inchaço dos gânglios linfáticos",
    "Úlceras na boca"
  ],
  "Mielite Flácida Aguda": [
    "Fraqueza nos braços ou pernas",
    "Perda de tônus muscular e reflexos",
    "Queda ou fraqueza facial",
    "Dificuldade para mover os olhos",
    "Pálpebras caídas",
    "Dificuldade para engolir",
    "Dificuldade para falar",
    "Dor nos braços, pernas, costas ou pescoço"
  ]
}  
"""

model = genai.GenerativeModel(model_name="gemini-1.5-pro",
                              generation_config=generation_config,
                              safety_settings=safety_settings,
                              system_instruction=instruction)

In [15]:
doencas = {}
for ref in link_hrefs[:10]:
    soup_link = get_soup_xpath(ref.get("href"), '//*[@id="content"]/div[1]')
    texto = extract_text_by_class(soup_link, 'cdc-dfe-body__center')
    json_doenca = generate_text(texto,model)
    doencas.update(json_doenca)
    time.sleep(20)
    

What it is
Aortic aneurysms can dissect or rupture:
The force of blood pumping can split the layers of the artery wall, allowing blood to leak in between them. This process is called adissection.The aneurysm can burst completely, causing bleeding inside the body. This is called arupture.Dissections and ruptures are the cause of most deaths from aortic aneurysms.
The force of blood pumping can split the layers of the artery wall, allowing blood to leak in between them. This process is called adissection.
The aneurysm can burst completely, causing bleeding inside the body. This is called arupture.
Dissections and ruptures are the cause of most deaths from aortic aneurysms.
Facts about aortic aneurysm in the United States
Aortic aneurysms or aortic dissections were the cause of 9,904 deaths in 2019.1In 2019, about 59% of deaths due to aortic aneurysm or aortic dissection happen among men.1A history of smoking accounts for about 75% of all abdominal aortic aneurysms.2The US Preventive Serv

ERROR:root:Error decoding JSON: Extra data: line 4 column 1 (char 5)


What are child abuse and neglect?
Child abuse and neglect includes all types of abuse and neglect of a child under the age of 18 by a parent, caregiver, or another person in a custodial role (e.g., a religious leader, a coach, or a teacher) that results in harm, the potential for harm, or threat of harm to a child.
There are four common types of abuse and neglect:
Physical abuseis the intentional use of physical force that can result in physical injury. Examples include hitting, kicking, shaking, or other shows of force against a child.1Sexual abuseinvolves pressuring or forcing a child to engage in sexual acts. Examples include fondling, penetration, and exposing a child to other sexual activities.1Read more information aboutchild sexual abuse.Emotional abuserefers to behaviors that harm a child's self-worth or emotional well-being. Examples include name-calling, shaming, rejecting, and withholding love.1Neglectis the failure to meet a child's basic physical and emotional needs. These

ERROR:root:An error occurred: Message: invalid argument
  (Session info: msedge-headless-shell=127.0.2651.74)
Stacktrace:
	GetHandleVerifier [0x00007FF682440602+11762]
	Microsoft::Applications::Events::ILogConfiguration::operator* [0x00007FF6823E0949+271417]
	(No symbol) [0x00007FF6821FFCD5]
	(No symbol) [0x00007FF6821EB17C]
	(No symbol) [0x00007FF6821E9A41]
	(No symbol) [0x00007FF6821E9DCC]
	(No symbol) [0x00007FF682202911]
	(No symbol) [0x00007FF68228003E]
	(No symbol) [0x00007FF682265F3A]
	(No symbol) [0x00007FF68223C2FD]
	(No symbol) [0x00007FF68227FA97]
	(No symbol) [0x00007FF682265BA3]
	(No symbol) [0x00007FF68223B80C]
	(No symbol) [0x00007FF68223ACED]
	(No symbol) [0x00007FF68223B3D1]
	Microsoft::Applications::Events::EventProperty::empty [0x00007FF6825BB584+879700]
	Microsoft::Applications::Events::EventProperty::~EventProperty [0x00007FF682375D4F+56159]
	Microsoft::Applications::Events::EventProperty::~EventProperty [0x00007FF6823686E3+1267]
	Microsoft::Applications::Events::E




ERROR:root:An error occurred: Message: 
Stacktrace:
	GetHandleVerifier [0x00007FF682440602+11762]
	Microsoft::Applications::Events::ILogConfiguration::operator* [0x00007FF6823E0949+271417]
	(No symbol) [0x00007FF6821FFE6E]
	(No symbol) [0x00007FF682246964]
	(No symbol) [0x00007FF6822469EB]
	(No symbol) [0x00007FF682282107]
	(No symbol) [0x00007FF682265F7F]
	(No symbol) [0x00007FF68223C2FD]
	(No symbol) [0x00007FF68227FA97]
	(No symbol) [0x00007FF682265BA3]
	(No symbol) [0x00007FF68223B80C]
	(No symbol) [0x00007FF68223ACED]
	(No symbol) [0x00007FF68223B3D1]
	Microsoft::Applications::Events::EventProperty::empty [0x00007FF6825BB584+879700]
	Microsoft::Applications::Events::EventProperty::~EventProperty [0x00007FF682375D4F+56159]
	Microsoft::Applications::Events::EventProperty::~EventProperty [0x00007FF6823686E3+1267]
	Microsoft::Applications::Events::EventProperty::empty [0x00007FF6825BA2DD+874925]
	Microsoft::Applications::Events::ILogConfiguration::operator* [0x00007FF6823E8541+303153]


Overview
ThePublic Health Accreditation Board(PHAB), a nonprofit 501(c)3 entity, serves as the independent accrediting body. PHAB led the development and testing activities, with significant participation from local, tribal, state, and national leaders and launched the national voluntary accreditation program on September 14, 2011. PHAB began accrediting health departments in 2013.
Accredited Health Departments
Resources
Public Health Accreditation Board WebsiteStandards and measures, accreditation policies, and other resources to help health departments with national accreditation efforts.
Assessing Outcomes from Public Health AccreditationEvaluation findings that describe the range of benefits of accreditation reported by health departments  (NORC at the University of Chicago, June 2023).
Impact of Public Health AccreditationJournal issue featuring scientific articles, case studies, and commentaries about the impact of public health accreditation (Journal of Public Health Management

ERROR:root:Error decoding JSON: Expecting value: line 1 column 1 (char 0)



What it is
Acinetobacteris a group of bacteria (germs) commonly found in the environment, like in soil and water. Infections caused byAcinetobacterrarely occur outside of healthcare settings.
Types
Germs in theAcinetobacterbaumanniifamily account for mostAcinetobacterinfections in humans. This type is sometimes referred to as "Acinetobacter baumanniicomplex."
Signs and symptoms
Acinetobacterbaumanniican cause infections in the blood, urinary tract, lungs (pneumonia) or wounds. In some cases, people can carry the bacteria without being infected, known as colonization.
At-risk populations
In the United States, patients in healthcare settings are at highest risk, especially those who:
Are on breathing machines (ventilators).Have devices such as catheters.Have open wounds from surgery.Are in intensive care units.Have prolonged hospital stays.
Are on breathing machines (ventilators).
Have devices such as catheters.
Have open wounds from surgery.
Are in intensive care units.
Have prolonged 

ERROR:root:An error occurred: Message: session not created: DevToolsActivePort file doesn't exist
Stacktrace:
	GetHandleVerifier [0x00007FF682440602+11762]
	Microsoft::Applications::Events::ILogConfiguration::operator* [0x00007FF6823E0949+271417]
	(No symbol) [0x00007FF6821FFE6E]
	(No symbol) [0x00007FF682236C2D]
	(No symbol) [0x00007FF682233F1E]
	(No symbol) [0x00007FF68222E38C]
	(No symbol) [0x00007FF68227017F]
	(No symbol) [0x00007FF682265BA3]
	(No symbol) [0x00007FF68223B80C]
	(No symbol) [0x00007FF68223ACED]
	(No symbol) [0x00007FF68223B3D1]
	Microsoft::Applications::Events::EventProperty::empty [0x00007FF6825BB584+879700]
	Microsoft::Applications::Events::EventProperty::~EventProperty [0x00007FF682375D4F+56159]
	Microsoft::Applications::Events::EventProperty::~EventProperty [0x00007FF6823686E3+1267]
	Microsoft::Applications::Events::EventProperty::empty [0x00007FF6825BA2DD+874925]
	Microsoft::Applications::Events::ILogConfiguration::operator* [0x00007FF6823E8541+303153]
	Microsof




In [16]:
doencas

{'Aneurisma da Aorta Torácica': ['Dor aguda e repentina no peito ou parte superior das costas',
  'Falta de ar',
  'Dificuldade para respirar ou engolir'],
 'Aneurisma da Aorta Abdominal': ['Pulsação ou dor profunda nas costas ou lado',
  'Dor nas nádegas, virilha ou pernas'],
 'Aneurisma Cerebral': ['Pode causar um derrame'],
 'Aneurisma Periférico': ['Podem formar coágulos sanguíneos que podem se soltar e bloquear o fluxo sanguíneo pela artéria'],
 'Abuso Físico': ['Uso intencional de força física que pode resultar em lesão física',
  'Bater',
  'Chutar',
  'Sacudir',
  'Outras demonstrações de força contra uma criança'],
 'Abuso Sexual': ['Pressionar ou forçar uma criança a se envolver em atos sexuais',
  'Acariciar',
  'Penetração',
  'Expor uma criança a outras atividades sexuais'],
 'Abuso Emocional': ['Comportamentos que prejudicam a autoestima ou o bem-estar emocional de uma criança',
  'Insultar',
  'Envergonhar',
  'Rejeitar',
  'Negar amor'],
 'Negligência': ['Falha em atend