In [None]:
import csv
import os
import time
from urllib.parse import urljoin

from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import Select
from selenium.webdriver.support.ui import WebDriverWait

# Entry page of the dictionary; the browser is pointed here on start-up.
base_url = "https://www.ines.gov.br/dicionario-de-libras/"
# Site root, used as the base when building absolute image URLs.
image_base_url = "https://www.ines.gov.br/"

# Output CSV. Whether it already exists is recorded up front so the header
# row is only written when a brand-new file is being created.
csv_filename = "INES_Metadata.csv"
file_exists = os.path.isfile(csv_filename)

# Launch Chrome with the "detach" option so the browser window stays open,
# then load the dictionary page.
options = webdriver.ChromeOptions()
options.add_experimental_option("detach", True)
driver = webdriver.Chrome(options=options)
driver.get(base_url)

# Shared explicit wait: each wait.until(...) call gives up after 15 seconds.
wait = WebDriverWait(driver, timeout=15)

# CSS selector matching the letter links the operator is asked to click.
LETTER_LINK_SELECTOR = "ul.list-inline li a"


def get_metadata(field_id):
    """Return the visible text of the page element with the given id.

    Args:
        field_id: DOM id of the element to read.

    Returns:
        The element's text with surrounding whitespace stripped, or
        "Not Available" when no element with that id is on the page.
    """
    try:
        return driver.find_element(By.ID, field_id).text.strip()
    except NoSuchElementException:
        # Only a missing element is treated as "no data"; any other failure
        # propagates to the per-word handler instead of being hidden.
        return "Not Available"


def _get_image_url():
    """Return the absolute URL of the image inside ``#input-mao``.

    Returns:
        The image URL, or "No Image Available" when the image element is
        absent or has no ``src``.
    """
    try:
        image_element = driver.find_element(By.CSS_SELECTOR, "#input-mao img")
    except NoSuchElementException:
        return "No Image Available"

    src = image_element.get_attribute("src")
    if not src:
        return "No Image Available"

    # Selenium normally reports ``src`` already resolved to an absolute URL,
    # so plain concatenation with the base would duplicate the host.
    # urljoin leaves absolute URLs untouched and only resolves relative ones.
    return urljoin(image_base_url, src)


# Append to the CSV so rows from earlier runs are preserved.
with open(csv_filename, "a", newline="", encoding="utf-8") as file:
    writer = csv.writer(file)

    # A brand-new file needs the header row first.
    if not file_exists:
        writer.writerow(["Letter", "Word", "Video URL", "Image URL",
                         "Assuntos", "Acepção", "Exemplo", "Exemplo Libras",
                         "Classe Gramatical", "Origem"])

    # Read every letter label now, before anything is clicked: if a click
    # re-renders the list, the original elements go stale and reading
    # ``.text`` from them would raise.
    letters = driver.find_elements(By.CSS_SELECTOR, LETTER_LINK_SELECTOR)
    letter_names = [letter.text for letter in letters]

    for letter_index, letter_name in enumerate(letter_names):
        try:
            # Re-locate the links so the highlight targets a live element
            # even if the page changed since the previous letter.
            current_links = driver.find_elements(By.CSS_SELECTOR, LETTER_LINK_SELECTOR)
            if letter_index < len(current_links):
                driver.execute_script("arguments[0].style.border='3px solid red'",
                                      current_links[letter_index])
                print(f"Highlighted letter: {letter_name}")

            # The click itself is done manually by the operator.
            input(f"Click '{letter_name}' in the browser, then press Enter here to continue...")

            # Wait for the dropdown holding this letter's word list.
            select_element = wait.until(EC.presence_of_element_located((By.ID, "input-palavras")))
            select = Select(select_element)

            # Read each option's text once (every .text is a WebDriver
            # round-trip) and drop blank entries.
            option_texts = (option.text.strip() for option in select.options)
            words = [text for text in option_texts if text]

            for word in words:
                try:
                    select.select_by_visible_text(word)

                    # NOTE(review): presence_of_element_located succeeds as
                    # soon as any matching element is in the DOM. If the page
                    # keeps the previous word's <source> while the new entry
                    # loads, this could read the old URL - confirm against
                    # the live page.
                    video_element = wait.until(EC.presence_of_element_located(
                        (By.CSS_SELECTOR, "#input-video video source[type='video/mp4']")))
                    video_url = video_element.get_attribute("src")

                    image_url = _get_image_url()

                    assuntos = get_metadata("input-assunto")
                    acepcao = get_metadata("input-acepcao")
                    exemplo = get_metadata("input-exemplo")
                    exemplo_libras = get_metadata("input-libras")
                    classe_gramatical = get_metadata("input-classe")
                    origem = get_metadata("input-origem")

                    writer.writerow([letter_name, word, video_url, image_url,
                                     assuntos, acepcao, exemplo, exemplo_libras,
                                     classe_gramatical, origem])
                    # Flush per row so an interrupted run keeps what it has.
                    file.flush()

                except Exception as e:
                    # One bad word must not stop the rest of the letter.
                    print(f"Error processing word '{word}': {e}")

        except Exception as e:
            # Uses the pre-captured label: touching the element here could
            # itself raise if it has gone stale.
            print(f"Error processing letter '{letter_name}': {e}")

# The scraping loop has finished; report which file the rows were written to.
print(f"Data saved in {csv_filename}")

# Pause so the operator can inspect the page before the browser is closed.
input("Press Enter to close the browser...")

# End the WebDriver session and close the browser.
driver.quit()
