In [4]:
import random
import sqlite3
import time
from urllib.parse import quote_plus

import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.common.exceptions import WebDriverException
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from webdriver_manager.chrome import ChromeDriverManager

# Initialize SQLite database and create table with dynamic table name
def initialize_database(db_file, table_name):
    """Open the SQLite database and make sure the journal table exists.

    Args:
        db_file: Database path handed to ``sqlite3.connect``.
        table_name: Name of the table to create if it is missing. It is
            interpolated into the SQL text as-is, so it must be a trusted,
            valid SQLite identifier.

    Returns:
        The open ``sqlite3.Connection``. The caller is responsible for
        closing it.

    Raises:
        sqlite3.Error: If the table cannot be created. The connection is
            closed before the error propagates so it is not leaked.
    """
    conn = sqlite3.connect(db_file)

    # The first column is called 'selected' rather than 'select' because
    # 'select' is an SQL reserved keyword.
    create_table_query = f'''
    CREATE TABLE IF NOT EXISTS {table_name} (
        selected INTEGER DEFAULT 0,
        journal_name TEXT,
        frequency INTEGER,
        impact_factor TEXT,
        cite_score TEXT,
        time_to_first_decision TEXT,
        time_to_acceptance TEXT,
        acceptance_rate TEXT,
        type TEXT,
        journal_link TEXT
    )
    '''
    try:
        conn.cursor().execute(create_table_query)
        conn.commit()
    except Exception:
        # Nobody else holds a reference to the connection yet, so release it
        # here instead of leaking it when table creation fails.
        conn.close()
        raise
    return conn

# Insert the initial journal data into the database
def insert_initial_data(conn, journal_df, table_name):
    """Seed the journal table with one row per DataFrame record.

    Only the ``journal_name`` and ``frequency`` columns are written, taken
    from the 'Journal Name' and 'Frequency' columns of ``journal_df``.
    Rows are appended unconditionally, so calling this twice with the same
    data stores every journal twice.

    Args:
        conn: Open ``sqlite3`` connection.
        journal_df: DataFrame providing 'Journal Name' and 'Frequency'.
        table_name: Target table; interpolated into the SQL text as-is.
    """
    seed_sql = f'''
        INSERT INTO {table_name} (journal_name, frequency)
        VALUES (?, ?)
    '''
    # Lazily yield the (name, frequency) pair of each DataFrame row.
    seed_rows = (
        (record['Journal Name'], record['Frequency'])
        for _, record in journal_df.iterrows()
    )
    conn.cursor().executemany(seed_sql, seed_rows)

    # A single commit covers all inserted rows.
    conn.commit()

# Function to update the database row-by-row as scraping progresses
def update_journal_info(conn, table_name, journal_name, impact_factor, cite_score, time_to_1st_decision, time_to_acceptance, acceptance_rate, journal_type, journal_link):
    """Write the scraped details onto every row whose journal_name matches.

    Args:
        conn: Open ``sqlite3`` connection.
        table_name: Table to update; interpolated into the SQL text as-is.
        journal_name: Value matched against the ``journal_name`` column.
        impact_factor: New ``impact_factor`` value.
        cite_score: New ``cite_score`` value.
        time_to_1st_decision: New ``time_to_first_decision`` value.
        time_to_acceptance: New ``time_to_acceptance`` value.
        acceptance_rate: New ``acceptance_rate`` value.
        journal_type: New ``type`` value.
        journal_link: New ``journal_link`` value.
    """
    # Column -> new value, in the order the placeholders appear in the SET
    # clause (dicts preserve insertion order).
    new_values = {
        'impact_factor': impact_factor,
        'cite_score': cite_score,
        'time_to_first_decision': time_to_1st_decision,
        'time_to_acceptance': time_to_acceptance,
        'acceptance_rate': acceptance_rate,
        'type': journal_type,
        'journal_link': journal_link,
    }
    assignments = ', '.join(f'{column} = ?' for column in new_values)
    statement = f'''
    UPDATE {table_name}
    SET {assignments}
    WHERE journal_name = ?
    '''

    # The journal name comes last because its placeholder is in the WHERE clause.
    conn.cursor().execute(statement, (*new_values.values(), journal_name))
    conn.commit()

# Scraping function that will scrape the journal info and update SQLite database
def scrape_and_update_database(journal_df, db_file, table_name):
    """Scrape Elsevier JournalFinder for every journal and persist the results.

    The journals in ``journal_df`` are first inserted into ``table_name``
    (created if it does not exist). Each journal is then searched on
    JournalFinder in a Chrome session and its row is updated with the
    scraped metrics, the access type ("Hybrid", "Open Access",
    "Subscription" or "Unknown") and the journal link.

    When no result card has exactly the journal's title (compared
    case-insensitively), the first result card is used for the type, the
    link and the metrics alike.

    Args:
        journal_df: DataFrame with 'Journal Name' and 'Frequency' columns.
        db_file: SQLite database file to write to.
        table_name: Table that receives the rows.

    Returns:
        None. The browser and the database connection are always closed
        before returning, including when an error propagates.
    """
    base_url = 'https://journalfinder.elsevier.com/results?impactFactorGte=0&impactFactorLt=11&timeToFirstDecisionLt=366&timeToAcceptanceLt=366&goldOpenAccess=true&subscription=true&elsevierOnly=true&sortBy=default&sortOrder=desc&query={}&mode=search&subjectAreaCodes='

    conn = initialize_database(db_file, table_name)
    try:
        insert_initial_data(conn, journal_df, table_name)

        # Set up the WebDriver (Chrome in this case).
        service = Service(ChromeDriverManager().install())
        driver = webdriver.Chrome(service=service)
        try:
            for _, row in journal_df.iterrows():
                journal_name = row['Journal Name']

                # quote_plus() encodes spaces as '+' and escapes characters
                # such as '&', which would otherwise cut the query parameter
                # short (e.g. "... Electrical Power & Energy Systems").
                driver.get(base_url.format(quote_plus(journal_name)))

                # Random 5-25 s pause before reading the page (presumably
                # gives the results time to render and throttles requests).
                time.sleep(random.uniform(5, 25))

                soup = BeautifulSoup(driver.page_source, 'html.parser')
                journal_cards = soup.select('div[id^="journal-card-"]')

                card_number = _match_card_index(journal_cards, journal_name)
                if card_number is None:
                    print(f"No exact match found for {journal_name}. Using the first journal card.")
                    card_number = 0

                # Metrics must come from the same card as the type and link;
                # None when the page showed no result cards at all.
                journal_card = (
                    journal_cards[card_number]
                    if card_number < len(journal_cards)
                    else None
                )

                open_access_defined = _element_exists(
                    driver, f'//*[@id="po-goldOpenAccess-{card_number}"]'
                )
                subscription_defined = _element_exists(
                    driver, f'//*[@id="po-subscription-{card_number}"]'
                )
                if open_access_defined and subscription_defined:
                    journal_type = "Hybrid"
                elif open_access_defined:
                    journal_type = "Open Access"
                elif subscription_defined:
                    journal_type = "Subscription"
                else:
                    journal_type = "Unknown"

                journal_link = _element_href(
                    driver, f'//*[@id="journal-link-cta-{card_number}"]'
                )

                # NOTE(review): '.jkGWvc' looks like a build-generated CSS
                # class and may change when the site is redeployed - verify.
                journal_info = (
                    journal_card.select_one('.jkGWvc')
                    if journal_card is not None
                    else None
                )
                if journal_info is not None:
                    metrics = _parse_journal_metrics(journal_info.get_text(strip=True))
                else:
                    metrics = (None, None, None, None, None)
                (impact_factor, cite_score, time_to_1st_decision,
                 time_to_acceptance, acceptance_rate) = metrics

                update_journal_info(conn, table_name, journal_name, impact_factor, cite_score, time_to_1st_decision, time_to_acceptance, acceptance_rate, journal_type, journal_link)
                print(f"Journal information updated for {journal_name} with type {journal_type} and link {journal_link}.")
        finally:
            driver.quit()
    finally:
        # Runs even if starting or quitting the browser raised.
        conn.close()


def _match_card_index(journal_cards, journal_name):
    """Return the index of the card titled exactly journal_name, or None."""
    wanted = journal_name.strip().lower()
    for card_index, journal_card in enumerate(journal_cards):
        # The title id is looked up by the card's position in the result list.
        title_element = journal_card.select_one(f'#title-{card_index}')
        if title_element is not None and title_element.get_text(strip=True).lower() == wanted:
            return card_index
    return None


def _element_exists(driver, xpath):
    """Return True when the live page contains an element matching xpath."""
    try:
        driver.find_element(By.XPATH, xpath)
    except WebDriverException:
        # Best effort: any Selenium lookup failure counts as "not present",
        # but KeyboardInterrupt and programming errors still propagate.
        return False
    return True


def _element_href(driver, xpath):
    """Return the href of the element matching xpath, or None if unavailable."""
    try:
        return driver.find_element(By.XPATH, xpath).get_attribute("href")
    except WebDriverException:
        return None


def _parse_journal_metrics(journal_text):
    """Split a card's flattened metrics text into its five labelled values.

    Returns a tuple (impact_factor, cite_score, time_to_1st_decision,
    time_to_acceptance, acceptance_rate). An entry is None when its label
    does not occur in journal_text.
    """
    labels = (
        'Impact Factor',
        'CiteScore',
        'Time to 1st decision',
        'Time to acceptance',
        'Acceptance rate',
    )
    positions = {label: journal_text.find(label) for label in labels}

    values = []
    for label in labels:
        label_start = positions[label]
        if label_start == -1:
            values.append(None)
            continue
        value_start = label_start + len(label)
        # A value ends at the nearest label that follows it, whichever label
        # that is, so a missing label cannot let later labels bleed in.
        value_end = min(
            (pos for pos in positions.values() if pos >= value_start),
            default=len(journal_text),
        )
        values.append(journal_text[value_start:value_end].strip())
    return tuple(values)

# Creating the data based on the provided image
# Each journal paired with its frequency count, in the order the rows appear
# in the DataFrame.
journal_frequency_pairs = [
    ("Applied Energy", 248),
    ("Energy", 204),
    ("Energy and Buildings", 113),
    ("Renewable Energy", 74),
    ("Energy Conversion and Management", 65),
    ("Engineering Applications of Artificial Intelligence", 58),
    ("Energy Reports", 56),
    ("Expert Systems with Applications", 54),
    ("Electric Power Systems Research", 51),
    ("Building and Environment", 49),
    ("International Journal of Electrical Power & Energy Systems", 48),
    ("Journal of Building Engineering", 46),
    ("Journal of Energy Storage", 44),
    ("Sustainable Cities and Society", 40),
    ("Journal of Cleaner Production", 39),
    ("Sustainable Energy Technologies and Assessments", 37),
    ("Applied Soft Computing", 33),
    ("Energy and AI", 32),
    ("Procedia Computer Science", 30),
    ("Computers and Electrical Engineering", 29),
    ("Neurocomputing", 29),
    ("Biomedical Signal Processing and Control", 29),
    ("Sustainable Energy, Grids and Networks", 28),
    ("Sustainable Computing: Informatics and Systems", 26),
    ("Heliyon", 24),
]

# Split the pairs into the two parallel columns the DataFrame is built from.
journal_data = {
    "Journal Name": [name for name, _ in journal_frequency_pairs],
    "Frequency": [count for _, count in journal_frequency_pairs],
}

# One DataFrame row per journal.
journal_df = pd.DataFrame(journal_data)

# Run the scrape; results are stored in table 'energy_deeplearning' of elsevier.db.
scrape_and_update_database(journal_df, db_file='elsevier.db', table_name="energy_deeplearning")


Journal information updated for Applied Energy with type Hybrid and link https://www.elsevier.com/locate/issn/0306-2619?adobe_mc=MCORGID%3D4D6368F454EC41940A4C98A6%2540AdobeOrg%7CTS%3D1726453543&dgcid=sd:jf:search.
Journal information updated for Energy with type Hybrid and link https://www.elsevier.com/locate/issn/0360-5442?adobe_mc=MCORGID%3D4D6368F454EC41940A4C98A6%2540AdobeOrg%7CTS%3D1726453560&dgcid=sd:jf:search.
Journal information updated for Energy and Buildings with type Hybrid and link https://www.elsevier.com/locate/issn/0378-7788?adobe_mc=MCORGID%3D4D6368F454EC41940A4C98A6%2540AdobeOrg%7CTS%3D1726453569&dgcid=sd:jf:search.
Journal information updated for Renewable Energy with type Hybrid and link https://www.elsevier.com/locate/issn/0960-1481?adobe_mc=MCORGID%3D4D6368F454EC41940A4C98A6%2540AdobeOrg%7CTS%3D1726453575&dgcid=sd:jf:search.
Journal information updated for Energy Conversion and Management with type Hybrid and link https://www.elsevier.com/locate/issn/0196-8904?ad