In [2]:
import os
import time
import pandas as pd
import requests

# pip install selenium
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import Select
from selenium.webdriver.common.action_chains import ActionChains

from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options

from bs4 import BeautifulSoup # pip install beautifulsoup4

from datetime import datetime

## 1. Overview

### 1.1 Define Initial URL

Initially, we will use a URL that seems to lead us to a generic search engine for all theses, dissertations, and post-doctoral products sorted by title.

Although the initial requirement was to extract information by sections or research fields (e.g., Political Science), since the page structure is always the same, the idea is to generate code that extracts information based on the page structure rather than the specific filter applied to that page, such as filtering only a certain type of publication by topic.

In [3]:
# Browse listing sorted by title (type=title, order=ASC) at 100 results per page (rpp=100).
# NOTE: `url` is reassigned in the final cell before main() is called, so this
# value is not the one scraped in the run whose output is shown at the end.
url = "http://repositorio2.unb.br/jspui/handle/10482/45731/browse?type=title&sort_by=1&order=ASC&rpp=100&etal=-1&null=&"

### 1.2 Workflow

The page has the following structure (**search page**):

- **Table**: Contains rows where each row represents a publication.
- **Fields**: Each row displays several fields related to the article, including the title and a link to the specific article page (**article page**).
- **Pagination**: You can limit the number of entries shown per page. In this case, we have set it to show the maximum number of results per page, which is **100**.
- **Navigation**: At the top and bottom of the table, there are "Next" and "Previous" buttons to navigate through the pages of search results.

### Extraction Flow

1. **Start on the Initial Search Page**:
   - Begin at the starting search page, which shows articles from 1 to 100 (**start_function**).

2. **Find and Collect Article Links**:
   - Locate all rows/articles (100) on the page.
   - For each row, find the HTML element that references the title and contains a hyperlink to the article's specific page.

3. **Visit Each Article Page**:
   - For each article/link found:
     - Access the hyperlink and go to the corresponding article page (**article page**).
     - Extract the information from the visited article.
     - Return to the initial search page.

4. **Move to the Next Page**:
   - After extracting information from all 100 articles, go to the next page.
   - Repeat steps 2 and 3 for the new page.

5. **End of Extraction**:
   - When the "Next" button no longer appears, it indicates that you have reached the last search page and have "scanned" all relevant articles. This means the task is complete.

## 2. Functions

### 2.1. Start Function

We create a function to initialize the driver with a given URL.

It is important to note that we have configured the driver to block downloads. This is because, in practice, running the code automatically started downloading the documents (PDFs of the publications). 

To prevent this, we configured the options to block automatic downloads.


In [4]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options

def open_url_block_dw(url):
    """
    Open ``url`` in a new Chrome session configured to block file downloads.

    Parameters:
    url (str): The URL to open in the browser.

    Returns:
    webdriver.Chrome: The initialized Chrome WebDriver instance, already
        navigated to ``url``. The caller owns the session and is responsible
        for calling ``quit()`` on it.

    Raises:
    Exception: Any error raised while loading ``url`` is re-raised unchanged,
        after the browser that was just started has been shut down.
    """

    # Configure ChromeOptions to block downloads
    chrome_options = Options()
    chrome_options.add_experimental_option("prefs", {
        "download.prompt_for_download": False,  # Disable the download prompt dialog
        "download_restrictions": 3,  # Block all automatic downloads
    })

    # Initialize the Chrome WebDriver with the specified options
    driver = webdriver.Chrome(service=Service(), options=chrome_options)

    try:
        # Navigate to the specified URL
        driver.get(url)
    except BaseException:
        # The caller never receives the handle when navigation fails, so the
        # browser must be closed here or it would be left running.
        driver.quit()
        raise

    return driver



### 2.2. Movement Function

In [5]:
"""

LO SACAMOS AL MAIN, CREO QUE QUEDA MAS CLARO 

def page_movement(driver):
    
    n = 1
    
    header = driver.find_element(By.CLASS_NAME, "panel-heading.text-center").text
    
    while "Siguiente" in header:
        
        print(f"pagina {n}")
        
        # ejecuto target papers
        target_papers(driver)
        
        driver = next_page(driver)
        
        n +=1
        header = driver.find_element(By.CLASS_NAME, "panel-heading.text-center").text
        
    print("Se han escaneado todas las páginas")
    
"""

'\n\nLO SACAMOS AL MAIN, CREO QUE QUEDA MAS CLARO \n\ndef page_movement(driver):\n    \n    n = 1\n    \n    header = driver.find_element(By.CLASS_NAME, "panel-heading.text-center").text\n    \n    while "Siguiente" in header:\n        \n        print(f"pagina {n}")\n        \n        # ejecuto target papers\n        target_papers(driver)\n        \n        driver = next_page(driver)\n        \n        n +=1\n        header = driver.find_element(By.CLASS_NAME, "panel-heading.text-center").text\n        \n    print("Se han escaneado todas las páginas")\n    \n'

In [6]:
from selenium.webdriver.common.by import By

def next_page(driver):
    """
    Click the 'next' control in the results footer to load the following page.

    Parameters:
    driver (webdriver.Chrome): The Chrome WebDriver instance, currently showing
        a search-results page.

    Returns:
    webdriver.Chrome: The same WebDriver instance, after the click.

    Raises:
    selenium.common.exceptions.NoSuchElementException: If the footer or the
        'pull-right' element inside it is not present on the page.
    """

    # The footer carries two classes ("panel-footer" and "text-center"). A
    # compound value is not a valid single class name, so it is expressed as a
    # CSS selector instead of being passed to By.CLASS_NAME.
    footer = driver.find_element(By.CSS_SELECTOR, ".panel-footer.text-center")

    # Inside the footer, the element with class "pull-right" is the one that is
    # clicked to advance.
    footer.find_element(By.CLASS_NAME, "pull-right").click()

    return driver

    

### 3. Target_papers

In [7]:
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import pandas as pd

def target_papers(driver):
    """
    Visit every paper listed on the current results page and collect its metadata.

    Parameters:
    driver (webdriver.Chrome): The Chrome WebDriver instance, currently showing
        a search-results page.

    Returns:
    tuple: A tuple containing:
        - webdriver.Chrome: The same WebDriver instance, back on the results
          page once all papers have been processed.
        - pd.DataFrame: One row per paper, built from the dictionaries returned
          by ``extract_info``.
    """

    # Find all cells containing titles/links to published articles
    title_cells = driver.find_elements(By.XPATH, '//td[@headers="t3"]')

    # Read every target URL *before* leaving the page. WebElement handles belong
    # to the document they were found in; once the browser navigates away they
    # can become stale, so reusing them after back() is unreliable.
    paper_urls = [
        cell.find_element(By.TAG_NAME, "a").get_attribute("href")
        for cell in title_cells
    ]

    info_list = []

    for paper_url in paper_urls:

        # get() blocks until the paper page has loaded, so the page source
        # parsed by extract_info belongs to the paper and not to the listing
        driver.get(paper_url)

        # Extract relevant information from the paper's page
        info_list.append(extract_info(driver))

        # Navigate back to the search results page so the caller can keep
        # paginating from where it was
        driver.back()

    # Convert the list of information dictionaries to a DataFrame
    df = pd.DataFrame(info_list)

    return driver, df


# 4. Extract Information

## Data Structure:

All the information relevant to an article is stored on a single web page, which we will call the "article_page".

Inside the page, we can find the link to the document (e.g., a PDF document) and some fields related to the article, which are contained in a table:

Table Structure:

 - First Column: Contains the field name (e.g., "Title", "Authors").

 - Second Column: Contains the field content (e.g., "A very cool dissertation", "Author 1, Author 2").

Each row of the table represents a field and its corresponding content. The field name is in the first cell (<td>), and the field content is in the second cell (<td>).
    
**NOTE:** A field can hold either a single value or multiple values. For example:

    - A title is unique (there is only one title).
    - Authors can be multiple people.
    - Topics can include more than one.


## Workflow:

The goal is to extract specific data from an HTML page using Python. The process involves retrieving HTML content, parsing it, and then extracting structured information from a table.

Steps

1. **Retrieve HTML Code**: Obtain the HTML source of the current page from the driver object.

2. **Parse HTML Content**: Use BeautifulSoup to parse the HTML, enabling easy navigation and manipulation of the document.

3. **Locate the table** within the parsed HTML that contains the desired information.

4. **Iterate Through Table Rows** Extract data from each row of the table, focusing on fields and their corresponding content.
    
Each row in the table has two main elements (stored in columns) 
    Field Name: Located in the first cell (<td> element).
    Field Content: Located in the second cell (<td> element).
    
Field names and contents are to be processed and stored in a dictionary.
        
5. **Extract and Format Data:**

    Field Name: Clean and format the text to use as a dictionary key.
    Field Content: Extract text from the field, including handling multiple values (e.g., links).

In [8]:
from bs4 import BeautifulSoup

def extract_info(driver):
    """
    Extract the metadata shown on the paper page currently loaded in ``driver``.

    Parameters:
    driver (webdriver.Chrome): The Chrome WebDriver instance, currently showing
        a paper (article) page.

    Returns:
    dict: A dictionary with:
        - "Document": a link built as ``<current URL>/1/<text of the first
          td[headers="t1"] cell>``, or the message
          "No files associated with this item" when no such cell exists.
        - one entry per two-cell row of the "table itemDisplayTable" table,
          keyed by the first cell's text (colons removed, stripped). The value
          is a list of link texts when the second cell contains links,
          otherwise the cell's stripped text.
        When the metadata table is absent, only "Document" is returned.
    """

    # Get the HTML code from the current page (article page)
    page = driver.page_source

    # Parse the HTML content using BeautifulSoup for easy manipulation
    info = BeautifulSoup(page, "html.parser")

    # Create an empty dictionary to store all extracted values
    data_dictionary = {}

    # Store the document link. The cell is looked up explicitly instead of
    # relying on a bare `except`, so unrelated errors are no longer hidden.
    file_cell = info.find("td", headers="t1")
    if file_cell is not None:
        # Construct the document link from the current URL and the cell text
        data_dictionary["Document"] = f"{driver.current_url}/1/{file_cell.text}"
    else:
        # If the cell is not found, set a default message
        data_dictionary["Document"] = "No files associated with this item"

    # IMPORTANT NOTE: only the first file cell is used. The link seems to be
    # composed of the article link + /1/ + the file name, so an item with more
    # than one file would need one link per cell - TODO confirm and extend.

    # Select the table where all the information is contained
    table = info.find("table", class_="table itemDisplayTable")
    if table is None:
        # No metadata table on this page: return what was collected instead of
        # failing on None and aborting the whole scrape.
        return data_dictionary

    # Iterate through all rows to get field names and field contents
    for row in table.find_all("tr"):

        # Each row is expected to hold two cells: field name and field content
        cells = row.find_all("td")
        if len(cells) < 2:
            continue  # Skip rows that do not have the expected structure

        name_cell, content_cell = cells[0], cells[1]

        # Field name (key) with formatting
        key = name_cell.text.replace(":", "").strip()

        # A field whose content holds links may carry several values; collect
        # each link text. Otherwise keep the plain text of the cell.
        links = content_cell.find_all("a")
        if links:
            values = [link.text for link in links]
        else:
            values = content_cell.text.strip()

        data_dictionary[key] = values

    return data_dictionary


# main():

In [13]:
import pandas as pd
from selenium.webdriver.common.by import By

def main(url, name, next_label="Siguiente"):
    """
    Scrape every results page starting at ``url`` and save the data to a CSV file.

    Parameters:
    url (str): The URL of the initial results page to start scraping from.
    name (str): Base name of the output file; the data is written to
        ``<name>.csv`` using ';' as separator.
    next_label (str): Text that appears in the pagination header while a next
        page exists. Defaults to "Siguiente".
        NOTE(review): this label presumably depends on the language the site
        is rendered in - confirm before running under another locale.

    Returns:
    pd.DataFrame: All extracted rows, with a single continuous 0..N-1 index
        (per-page indices are not kept, so index labels are unique).
    """

    # Start the WebDriver with download restrictions
    driver = open_url_block_dw(url)

    try:
        # One DataFrame per results page; concatenated once at the end instead
        # of growing a frame inside the loop
        page_frames = []

        # Counter used only for progress messages
        n = 1

        while True:
            print(f"Page {n}")

            # Extract the information of every paper on the current page
            driver, page_df = target_papers(driver)
            page_frames.append(page_df)

            # The pagination header mentions the 'next' label only while there
            # is another page. The element carries two classes, hence the CSS
            # selector rather than By.CLASS_NAME.
            header = driver.find_element(
                By.CSS_SELECTOR, ".panel-heading.text-center"
            ).text
            if next_label not in header:
                break

            # Navigate to the next page
            driver = next_page(driver)
            n += 1

        print("All pages have been scanned.")

        # ignore_index avoids repeating each page's 0..k labels in the result
        df = pd.concat(page_frames, axis=0, ignore_index=True)

        # Save the DataFrame to a CSV file
        df.to_csv(f"{name}.csv", sep=';', index=True)

        print(f"DataFrame {name} successfully exported.")
    finally:
        # Always close the browser, including when scraping fails midway
        driver.quit()

    return df


In [12]:


# Browse listing filtered by graduate programme (type=ppg) at 20 results per page
# (rpp=20). The URL-encoded `value` parameter selects "Programa de Pós-Graduação em
# Ciências Sociais - Estudos Comparados sobre as Américas".
# This overwrites the `url` defined near the top of the notebook.
url = "http://repositorio2.unb.br/jspui/handle/10482/45731/browse?type=ppg&order=ASC&rpp=20&value=Programa+de+P%C3%B3s-Gradua%C3%A7%C3%A3o+em+Ci%C3%AAncias+Sociais+-+Estudos+Comparados+sobre+as+Am%C3%A9ricas"

# Base name of the output file; main() writes "<name>.csv".
name = "probando"

# Run the scrape: opens a browser, walks the result pages, writes the CSV and
# returns the collected rows as a DataFrame.
df = main(url, name)



Page 1
Page 2
All pages have been scanned.
DataFrame probando successfully exported.
