### Code to scrape and list all the categories

In [107]:
import time
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# Base URL of the dictionary page
base_url = "https://sistemas.cead.ufv.br/capes/dicionario/"

# Set up Chrome WebDriver
options = webdriver.ChromeOptions()
options.add_experimental_option("detach", True)  # Keeps the browser open after script ends
driver = webdriver.Chrome(options=options)
driver.get(base_url)  # Open the dictionary website

# WebDriver wait object (max 15 seconds for elements to appear)
wait = WebDriverWait(driver, 15)

# Get all category links
category_elements = wait.until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, ".categories a")))

# Iterate over categories and print their names
for category in category_elements:
    print(category.text.strip())  # Print category name

# Close browser when done
driver.quit()


Alimentos
Animais e Insetos
Comemorações
Comunicação e Eletrônicos
Construção
Cores
Corpo Humano
Cumprimentos
Dinheiro
Disciplina
Escola
Esporte e Diversão
Instrumentos Musicais
Lugares, Cidades e Países
Meios de Transporte
Natureza
Números
Objetos
Pessoas e Família
Profissões
Situações, Cotidiano e Eventos
Tempo e Calendário
Verbos
Vestuário
Acessar histórico
Biologia
Letras
Matemática


### Code to scrape and list all the words in one of the categories. In this example all the words from the first category

In [None]:
import os
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# Open Chrome WebDriver
options = webdriver.ChromeOptions()
options.add_experimental_option("detach", True)  # Keep browser open
driver = webdriver.Chrome(options=options)
driver.get("https://sistemas.cead.ufv.br/capes/dicionario/")  # Open dictionary site

# Wait object (max 15 sec)
wait = WebDriverWait(driver, 15)

# Get all category elements
category_elements = wait.until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, ".categories a")))

# Highlight first category for manual selection
driver.execute_script("arguments[0].style.border='3px solid red'", category_elements[0])

# Ask user to manually select a category
input("Click the highlighted category in the browser, then press Enter to continue...")

# Dictionary to store metadata
word_metadata = {}

words_list = []  # Store words in order

while True:
    try:
        # Extract words from the current carousel page
        word_elements = wait.until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, ".carousel-inner .item.active a")))

        current_page_words = []

        for word in word_elements:
            word_text = word.text.strip()

            # Skip empty words
            if word_text == "":
                continue  

            current_page_words.append(word_text)

        # Check if the same sequence of words has already appeared
        if any(word in words_list for word in current_page_words):
            break  # Stop if carousel repeats

        # Add new words to the overall list
        words_list.extend(current_page_words)

        print("Processing words:", current_page_words)

        # Extract metadata for each word in the current page
        for word in current_page_words:
            word_element = driver.find_element(By.LINK_TEXT, word)
            word_url = word_element.get_attribute("href")

            #time.sleep(2)

            # Open word page
            #driver.get(word_url)

            # Extract Example in Portuguese
            try:
                example_pt = wait.until(EC.presence_of_element_located(
                    (By.XPATH, "//h3[contains(text(),'Exemplo em português')]/following-sibling::h4"))).text.strip()
            except:
                example_pt = "Not Available"

            # Extract Example in Libras
            try:
                example_libras = wait.until(EC.presence_of_element_located(
                    (By.XPATH, "//h3[contains(text(),'Exemplo em libras')]/following-sibling::h4"))).text.strip()
            except:
                example_libras = "Not Available"

            # Extract Video URL
            try:
                video_element = wait.until(EC.presence_of_element_located((By.TAG_NAME, "video")))
                video_url = video_element.get_attribute("src")
            except:
                video_url = "Not Available"

            # Store metadata in dictionary
            word_metadata[word] = {
                "example_pt": example_pt,
                "example_libras": example_libras,
                "video_url": video_url
            }


        # Click "Next" button to load more words
        next_button = wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, ".right[role='button']")))
        next_button.click()
        time.sleep(10)

    except Exception as e:
        print("Error or no more words:", e)
        break  # Stop if any issue occurs

#print("\nFinal Word Metadata Dictionary:\n", word_metadata)
import pandas as pd

# Convert dictionary to DataFrame
df = pd.DataFrame.from_dict(word_metadata, orient='index')

# Save DataFrame to CSV
df.to_csv("Dicionario_Metadata.csv", encoding="utf-8", index_label="Word")

# Close browser
driver.quit()


Processing words: ['Abacate', 'Abacaxi', 'Abóbora', 'Açaí', 'Acerola', 'Achocolatado', 'Açúcar', 'Água de coco', 'Alface', 'Alho', 'Amendoim', 'Amora', 'Arroz', 'Azeite']
Processing words: ['Azeitona', 'Bacon', 'Bala', 'Banana', 'Banquete', 'Batata', 'Batata Frita', 'Batida (Bebida)', 'Berinjela', 'Bife', 'Biscoito água e sal', 'Bolacha', 'Bolo', 'Bombom']
Processing words: ['Brócolis', 'Café', 'Caju', 'Camarão', 'Caqui', 'Carne', 'Cebola', 'Cebolinha', 'Cereja', 'Cerveja', 'Chocolate', 'Churrasco', 'Coca Cola', 'Coco']
Processing words: ['Cogumelo', 'Comida', 'Couve', 'Coxinha', 'Damasco', 'Doce', 'Doce de abóbora', 'Empada', 'Ervilha', 'Espinafre', 'Farinha', 'Farofa', 'Feijão', 'Feijoada']
Processing words: ['Fermento', 'Figo', 'Goiabada', 'Jabuticaba', 'Laranja (Fruta)', 'Massa', 'Melancia', 'Pêra', 'Picolé', 'Sopa', 'Sorvete', 'Tomate', 'Torrada', 'Uva']
Processing words: ['Vagem', 'Vinho (bebida)', 'Vitamina', 'Vitamina (bebida)']


### Code to extract all the metadata for all the words in one category. The metadata is first stored as a dictionary and then saved in a csv file.

In [108]:
import os
import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# Open Chrome WebDriver
options = webdriver.ChromeOptions()
options.add_experimental_option("detach", True)  # Keep browser open
driver = webdriver.Chrome(options=options)
driver.get("https://sistemas.cead.ufv.br/capes/dicionario/")  # Open dictionary site

# Wait object (max 15 sec)
wait = WebDriverWait(driver, 15)

# Get all category elements
category_elements = wait.until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, ".categories a")))

# Highlight first category for manual selection
driver.execute_script("arguments[0].style.border='3px solid red'", category_elements[0])

# Ask user to manually select a category
input("Click the highlighted category in the browser, then press Enter to continue...")

# Dictionary to store metadata
word_metadata = {}

words_list = []  # Store words in order

while True:
    try:
        # Extract words from the current carousel page
        word_elements = wait.until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, ".carousel-inner .item.active a")))

        current_page_words = []

        for word in word_elements:
            word_text = word.text.strip()

            # Skip empty words
            if word_text == "":
                continue  

            current_page_words.append(word_text)

        # Check if the same sequence of words has already appeared (prevents looping)
        if any(word in words_list for word in current_page_words):
            break  # Stop if carousel repeats

        # Add new words to the overall list
        words_list.extend(current_page_words)

        print("Processing words:", current_page_words)

        # Extract metadata for each word in the current page
        for word in current_page_words:
            try:
                word_element = driver.find_element(By.LINK_TEXT, word)
                word_url = word_element.get_attribute("href")

                # Open word page
                driver.get(word_url)

                # Extract Example in Portuguese
                try:
                    example_pt = wait.until(EC.presence_of_element_located(
                        (By.XPATH, "//h3[contains(text(),'Exemplo em português')]/following-sibling::h4"))).text.strip()
                except:
                    example_pt = "Not Available"

                # Extract Example in Libras
                try:
                    example_libras = wait.until(EC.presence_of_element_located(
                        (By.XPATH, "//h3[contains(text(),'Exemplo em libras')]/following-sibling::h4"))).text.strip()
                except:
                    example_libras = "Not Available"

                # Extract Video URL
                try:
                    video_element = wait.until(EC.presence_of_element_located((By.TAG_NAME, "video")))
                    video_url = video_element.get_attribute("src")
                except:
                    video_url = "Not Available"

                # Store metadata in dictionary
                word_metadata[word] = {
                    "example_pt": example_pt,
                    "example_libras": example_libras,
                    "video_url": video_url
                }

                # Go back to category page
                #driver.back()

            except Exception as e:
                print(f"Skipping word '{word}' due to error:", e)
                continue

        # Click "Next" button to load more words
        next_button = wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, ".right[role='button']")))
        next_button.click()

    except Exception as e:
        print("Error or no more words:", e)
        break  # Stop if any issue occurs

# Convert dictionary to DataFrame
df = pd.DataFrame.from_dict(word_metadata, orient='index')

# Save DataFrame to CSV
df.to_csv("Dicionario_Metadata.csv", encoding="utf-8", index_label="Word")

# Close browser
driver.quit()


Processing words: ['Abacate', 'Abacaxi', 'Abóbora', 'Açaí', 'Acerola', 'Achocolatado', 'Açúcar', 'Água de coco', 'Alface', 'Alho', 'Amendoim', 'Amora', 'Arroz', 'Azeite']
