# Selenium Scrape

## Load Modules

In [26]:
import requests
from bs4 import BeautifulSoup as bs
import pandas as pd
import time
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import numpy as np
import csv

# Define the URL and user-agent to mimic a real browser
url = 'https://www.sciencedirect.com/science/article/pii/S0927537123000404'
user_agent = (
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.1234.0 Safari/537.36"
)

# Use a non-headless browser with a custom user-agent
options = webdriver.ChromeOptions()
options.add_argument(f"user-agent={user_agent}")
options.add_argument('--headless=new')

## Find the first article of every volume

### Journals to scrape

In [49]:
scrape = [('economics-letters',231),('economic-modelling', 127),
           ('journal-of-econometrics',236),('journal-of-development-economics',165),
           ('journal-of-applied-economics',20),('the-journal-of-socio-economics',48),('journal-of-economic-theory',212)]
n_vol = 45 #No. of volumes to scrape per journal

### Index articles

In [28]:
def index_volumes(journal,vol, n_vol):
    url_base = "https://www.sciencedirect.com/journal/" + str(journal) + "/vol/"
    url_list = []
    for i in range(n_vol): 
        url_pre = url_base + str(vol-i) + "/suppl/C"
        url_list.append(url_pre)
    
    driver = webdriver.Chrome(options=options)

    first_page_url = []
    
    for it, url in enumerate(url_list):
        print(f"Processing URL {it+1}/{len(url_list)}: {url}")
        try:
            driver.get(url)
        
            # Wait for the first link with class "article-content-title" to be clickable
            wait = WebDriverWait(driver, 10)
            link = wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, 'a.article-content-title')))
        
            # Click the link
            link.click()
        
            # Get the current URL of the page
            first_page_url.append(driver.current_url)
        except Exception as e:
            print(f"An error occurred for URL {url}")
            
    return first_page_url

### Scrape journal

In [29]:
def scrape_article(first_page_url):
    journal = []
    for it, url in enumerate(first_page_url):
        print(f"Processing URL {it+1}/{len(first_page_url)}: {url}")

        driver = webdriver.Chrome(options=options)
        driver.get(url)

        i = 0

        page_source = []

        while i < 50:
            time.sleep(5)  # Wait for a longer time between requests

            #print(i)
            page_source.append(driver.page_source)

            try:
                # Wait for the next button to become clickable before clicking it
                wait = WebDriverWait(driver, 10)
                next_button = wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, 'a.button-alternative-icon-right')))
                next_button.click()

                i += 1
            except Exception as e:
                print("Error with article")
                break
    
        journal.append(page_source)
        

    # Close the browser
    driver.quit()
    return journal

## Data Treatment Stage 1 

In [45]:
def create_df(page_source):
    df_data = []

    for j in range(len(page_source)):
        authors_dict = {}  # Create a new dictionary for each row
        journal_name = []
        citations = []
        year = []
        introduction = []
        possible_id = ['sec_0001','sec_001','sec_01','sec_1','sec1']
        
        soup = bs(page_source[j], 'html.parser')
        
        for ids in possible_id:
            element_with_id = soup.select_one('section#' + ids)
            if element_with_id:
                break 
            
        if element_with_id:
            introduction = element_with_id.get_text(strip=True)
        else:
            introduction = None  # Handle case when element is not found
            #print('Intro Error')
        
        # Find all <span> elements with class="given-name"
        author_spans = soup.find_all('span', class_='given-name')

        # Loop through the author spans and populate the dictionary
        for i, author_span in enumerate(author_spans):
            try:
                if i < 5:
                    authors_dict[f'Author {i+1}'] = author_span.get_text().split()[0]
                    if "." in authors_dict[f'Author {i+1}']:
                        author_name = "Unknown"
                        authors_dict[f'Author {i+1}'] = author_name
                else:
                    break
            except AttributeError:
                authors_dict[f'Author {i+1}'] = "None"
                #print('error author')
        
        # Fill in remaining entries with "None" for Author 4 and Author 5
        for i in range(len(author_spans), 5):
            authors_dict[f'Author {i+1}'] = "None"
        
        try:
            j_name = soup.find('a', class_="publication-title-link").get_text()
            journal_name = str(j_name)
        except:
            journal_name = "Error no journal name"
            #print('error journal')
            
        try:
            cit = soup.find('header', id="citing-articles-header").get_text().split()[2]
            #cit = cit[-1:][:-1]
            #citations = int(cit)
            cit = ''.join(filter(str.isdigit, cit))
            citations = int(cit)
            #citations = cit
        except:
            citations = None
            #print('error citations')
            
        try:
            yr = soup.find('div', class_="text-xs").get_text()
            year = yr

        except:
            year = None
            #print('error yr')
        
        
        # Append the data for this row to the list of rows
        df_data.append({'Introduction': introduction, **authors_dict, 'Journal': journal_name, 'Year': year, 'Citations': citations})

    df = pd.DataFrame(df_data)

    return df

In [46]:
### Extract år ###
import re

def extract_year(s):
    year_match = re.search(r'(\d{4})', s)
    if year_match:
        return year_match.group(1)
    else:
        return None

In [47]:
def combine_dataframes(list_of_lists):
    combined_df = pd.DataFrame()  # Initialize an empty DataFrame to store the combined data

    for sublist in list_of_lists:
        # Apply create_df function to the sublist and get a dataframe
        df = create_df(sublist)
        
        # Concatenate the current dataframe with the combined dataframe
        combined_df = pd.concat([combined_df, df], ignore_index=True)

    df['Year'] = df['Year'].apply(extract_year)
    #df['Author 4'] = df['Author 4'].astype(str)
    #df['Author 5'] = df['Author 5'].astype(str)
    
    return combined_df

## Execute Scraping and export

In [48]:
sub_scrape = scrape[1]
print(sub_scrape)
articles = []

for j_1 in scrape:
    name_of_journal, j_vol = j_1
    print(name_of_journal)
    url_index = index_volumes(name_of_journal , j_vol, n_vol)
    print('no of vols ' + str(len(url)))
    
    data = scrape_article(url_index)
    data_treated = combine_dataframes(data)
    
    #articles.append(data)
    
    # Specify the CSV file name
    csv_file = 'articles_out_' + str(name_of_journal) + '.csv'
    data_treated.to_csv(csv_file, index=False)
    
    break
    
    

('economic-modelling', 127)
economics-letters
Processing URL 1/1: https://www.sciencedirect.com/journal/economics-letters/vol/231/suppl/C
no of vols 67
Processing URL 1/1: https://www.sciencedirect.com/science/article/pii/S0165176523002823
Error with article


In [43]:
print(data_treated)

                                        Introduction Author 1  Author 2  \
0  1. IntroductionTheories of imitation, conformi...    Paola   Georgia   
1  1. IntroductionThe all-or-nothing mechanism is...  Timothy  Robertas   
2  1. IntroductionA bilateral exchange rate is th...  Michael       NaN   
3  1. IntroductionPerformance-contingent bonuses ...     Timo       NaN   
4  1. IntroductionSocial media has become an esse...       Ho       Ali   
5  1. IntroductionA series of theoretical models ...  Antoine       NaN   

  Author 3            Journal                              Year  Citations  \
0     Luis  Economics Letters  Volume 231, October 2023, 111257          0   
1      NaN  Economics Letters  Volume 231, October 2023, 111265          0   
2      NaN  Economics Letters  Volume 231, October 2023, 111267          0   
3      NaN  Economics Letters  Volume 231, October 2023, 111266          0   
4   Sascha  Economics Letters  Volume 231, October 2023, 111270          0   
5     