In [None]:
import time
from urllib.parse import urlencode, urljoin, urlsplit, urlunsplit

import pandas as pd
import requests
from bs4 import BeautifulSoup

In [None]:
# Shared accumulator: the scraping helpers append one dict per journal row to this list
all_journals = list()

def get_all_categories(timeout=30):
    """Fetch the subject categories listed on the SciELO "journals by subject" page.

    Args:
        timeout: Seconds to wait for the HTTP response before giving up.

    Returns:
        dict mapping each category name (the option text) to its URL, resolved
        against the listing page URL. An empty dict is returned when the page
        cannot be retrieved.
    """
    # Base URL where all categories are listed
    base_url = "https://scielo.org/es/revistas/listar-por-tema"
    try:
        # Without a timeout a stalled connection would block the notebook indefinitely
        response = requests.get(base_url, timeout=timeout)
    except requests.RequestException as exc:
        print(f"Failed to retrieve categories: {exc}")
        return {}

    if response.status_code != 200:
        print("Failed to retrieve categories.")
        return {}

    soup = BeautifulSoup(response.content, "html.parser")

    categories = {}

    # Categories are the <option> entries of the #subject_area <select> element
    for option in soup.select("#subject_area option"):
        category_name = option.text.strip()
        category_url = option.get("value")

        # Keep only options whose value points at a category listing
        if category_url and "listar-por-tema" in category_url:
            # Relative values are resolved against the page URL; absolute ones pass through unchanged
            categories[category_name] = urljoin(base_url, category_url)

    return categories

def get_journals_from_letter_page(url, category_name, timeout=30):
    """Scrape one letter-filtered listing page and append its journals to ``all_journals``.

    Each journal row found in ``#journalsTable`` becomes a dict with the keys
    ``journal_name``, ``category``, ``type`` ("Active"/"Inactive") and ``URL``.

    Args:
        url: Address of the listing page to scrape.
        category_name: Category label stored with every journal found on the page.
        timeout: Seconds to wait for the HTTP response before giving up.

    Returns:
        None. Results are appended to the module-level ``all_journals`` list;
        nothing is appended when the page cannot be retrieved.
    """
    try:
        # Without a timeout a stalled connection would block the notebook indefinitely
        response = requests.get(url, timeout=timeout)
    except requests.RequestException as exc:
        print(f"Failed to retrieve journals from {url}: {exc}")
        return

    if response.status_code != 200:
        print(f"Failed to retrieve journals from {url}")
        return

    soup = BeautifulSoup(response.content, 'html.parser')

    for row in soup.select('#journalsTable tbody tr'):
        # Rows that only separate the table by letter carry no journal
        if "separator-by-letter" in row.get("class", []):
            continue

        journal_link = row.find("a")
        if not journal_link:
            continue

        # A 'disabled' class on the link marks a journal that appears inactive
        is_inactive = 'disabled' in journal_link.get("class", [])
        all_journals.append({
            "journal_name": journal_link.text.strip(),
            "category": category_name,
            "type": "Inactive" if is_inactive else "Active",
            # .get() records None for a link without href instead of raising KeyError mid-scrape
            "URL": journal_link.get("href"),
        })

def get_all_journals_from_category(category_url, category_name, timeout=30, delay=1):
    """Scrape every letter-filtered page of one category.

    The category page is fetched to read its alphabetic filter buttons; for each
    non-empty button label a ``letter`` query parameter is added to the category
    URL and the resulting page is handed to ``get_journals_from_letter_page``.

    Args:
        category_url: Address of the category listing page.
        category_name: Category label passed on for every journal found.
        timeout: Seconds to wait for the category page before giving up.
        delay: Seconds to pause after each letter page (0 disables the pause).

    Returns:
        None. Nothing is scraped when the category page cannot be retrieved.
    """
    try:
        # Without a timeout a stalled connection would block the notebook indefinitely
        response = requests.get(category_url, timeout=timeout)
    except requests.RequestException as exc:
        print(f"Failed to retrieve category page {category_url}: {exc}")
        return

    if response.status_code != 200:
        print(f"Failed to retrieve category page {category_url}")
        return

    soup = BeautifulSoup(response.content, 'html.parser')

    # Labels of the alphabetic filter buttons, ignoring buttons without text
    button_labels = (btn.text.strip() for btn in soup.select('.btn-group-vertical .btn'))
    letters = [label for label in button_labels if label]

    url_parts = urlsplit(category_url)
    for letter in letters:
        # Append to an existing query string with '&' rather than a second '?',
        # and percent-encode the label so characters such as '#' stay inside the query
        letter_param = urlencode({"letter": letter})
        query = f"{url_parts.query}&{letter_param}" if url_parts.query else letter_param
        letter_url = urlunsplit(url_parts._replace(query=query))

        print(f"Scraping {category_name} - Letter {letter}")
        get_journals_from_letter_page(letter_url, category_name)

        # To prevent overwhelming the server, pause between requests
        if delay:
            time.sleep(delay)


In [None]:

# The helpers append to the shared all_journals list, so empty it first:
# otherwise re-running this cell would duplicate every row
all_journals.clear()

# Dynamically extract categories
categories = get_all_categories()

# Scrape every category; results accumulate in all_journals
for category, category_url in categories.items():
    print(f"Scraping category: {category}")
    get_all_journals_from_category(category_url, category)

# Build the table with explicit columns so an empty scrape still yields a CSV
# with a header row that can be read back
df = pd.DataFrame(all_journals, columns=["journal_name", "category", "type", "URL"])

# Save df to csv
output_path = "./scielo_all_journals.csv" # TODO: add your output path
df.to_csv(output_path, index=False)
print(f"Data saved to {output_path}")

# POSTPROCESSING: a journal can be listed under several categories (example: 'Todas las categorías',
# 'Humanidades' and 'Ciencias Biológicas'), so this CSV holds one row per journal/category pair

In [None]:
import unicodedata

import pandas as pd

# Load the CSV written by the scraping cell
input_path = "./scielo_all_journals.csv" # TODO: this is the output path of the previous cell
df = pd.read_csv(input_path)


def _category_column_name(category):
    """Return a lowercase, accent-free, underscore-separated column name for a category."""
    # NFD splits an accented letter into its base letter plus combining marks;
    # dropping the marks strips every diacritic (á, ñ, ç, ü, ã, ...) instead of a fixed subset
    decomposed = unicodedata.normalize("NFD", category.lower())
    without_marks = "".join(ch for ch in decomposed if not unicodedata.combining(ch))
    return f"category_{without_marks.replace(' ', '_')}"


# Set of all unique category names, used to generate the column headers
unique_categories = set(df['category'].unique())

# Map each category name to its column name; sorting makes the column order
# reproducible, since set iteration order can differ between runs
category_columns = {
    category: _category_column_name(category)
    for category in sorted(unique_categories)
}

# One merged record per journal, keyed by journal name
journal_dict = {}

# Fold the one-row-per-(journal, category) table into one record per journal
for record in df.to_dict("records"):
    journal_name = record['journal_name']
    category = record['category']

    if journal_name not in journal_dict:
        # First occurrence: keep its type and URL and initialize all category columns to 0
        journal_dict[journal_name] = {
            "journal_name": journal_name,
            "type": record['type'],
            "URL": record['URL'],
            **{col_name: 0 for col_name in category_columns.values()}
        }

    # Mark membership by storing the category name in that category's column
    journal_dict[journal_name][category_columns[category]] = category

# Prepare new rows for the cleaned df
cleaned_data = list(journal_dict.values())

# Create a new df from the cleaned data
cleaned_df = pd.DataFrame(cleaned_data)

# Save cleaned df to a new csv file
output_path = "./scielo_cleaned_journals.csv" # TODO: new output path for the filtered/cleaned data
cleaned_df.to_csv(output_path, index=False)

print(f"Cleaned data saved to {output_path}")


In [None]:
# Number of rows in the cleaned table
cleaned_df.shape[0]