## Web Scraping en el Journal of Macroeconomics

El objetivo es conectarme con la página web del Journal of Macroeconomics para extraer la información de cada volumen: los artículos que contiene, sus autores, sus enlaces, entre otros.

Voy a trabajar con un entorno virtual llamado `env`. El Driver de Chrome se puede descargar en https://sites.google.com/chromium.org/driver/downloads
 

In [10]:
import pandas as pd
from time import sleep
# from numba import njit

from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By

import warnings
warnings.simplefilter("ignore")

Nos conectamos con la sección de artículos dentro del sitio web del Journal of Macroeconomics.

In [11]:
def html_import(url):
    """Open *url* in Chrome, expand the page's sections and return it parsed.

    A Chrome session is started with the driver binary at
    ``./driver/chromedriver.exe``. After the page loads, the function tries
    to click the button inside list items 1..29 of the ordered list addressed
    by the absolute XPath below (presumably the collapsed accordion sections
    of the issue archive -- verify against the live page), pausing one second
    after each successful click so the revealed content can load.

    Parameters
    ----------
    url : str
        Absolute URL of the page to load.

    Returns
    -------
    bs4.BeautifulSoup
        The rendered page source parsed with ``html.parser``.

    Raises
    ------
    selenium.common.exceptions.WebDriverException
        If the browser cannot be started or the page cannot be loaded.
        Failures while clicking an individual button are ignored.
    """
    # Local imports: the file's import cell only brings in `webdriver` and `By`.
    from selenium.common.exceptions import WebDriverException
    from selenium.webdriver.chrome.service import Service

    # Passing the driver path positionally is deprecated in Selenium 4;
    # the path now travels inside a Service object.
    driver = webdriver.Chrome(service=Service("./driver/chromedriver.exe"))

    try:
        driver.get(url)

        # Click the section buttons (best effort).
        for i in range(1, 30):
            try:
                button = driver.find_element(By.XPATH, value=f'/html/body/div[3]/div/div/div/main/div[2]/div/section[2]/div/div/ol/li[{i}]/button')
                button.click()
            except WebDriverException:
                # Pages do not have all 29 buttons, and a button may not be
                # clickable; skip it and try the next index.
                continue
            else:
                sleep(1)

        # Rendered source as parsed HTML.
        soup = BeautifulSoup(driver.page_source, "html.parser")
    finally:
        # quit() ends the whole browser session and the chromedriver process,
        # even when loading or parsing failed; this function is called once
        # per volume and once per article, so leaked sessions pile up.
        driver.quit()

    return soup

Definimos los enlaces para la importación.

In [12]:
%%time

# The issue archive is split across three listing pages; each one is
# downloaded with its sections expanded.
url_1, url_2, url_3 = (
    "https://www.sciencedirect.com/journal/journal-of-macroeconomics/issues?page=1",
    "https://www.sciencedirect.com/journal/journal-of-macroeconomics/issues?page=2",
    "https://www.sciencedirect.com/journal/journal-of-macroeconomics/issues?page=3",
)

# Fetched one after another, in page order.
page_1 = html_import(url_1)
page_2 = html_import(url_2)
page_3 = html_import(url_3)

  driver = webdriver.Chrome("./driver/chromedriver.exe")


CPU times: total: 234 ms
Wall time: 1min 12s


Tras haber importado todos los elementos dentro de los links, es importante definir los elementos que deseamos importar y sus clases

* Secciones: `<li class="accordion-panel js-accordion-panel">`
* Volúmenes: `<div class="issue-item u-margin-s-bottom">`
* Nombre del volumen: `<span class="anchor-text">`
* Link: `<a class="anchor js-issue-item-link text-m anchor-default">`
* Fecha: `<h3 class="js-issue-status text-s">`

In [17]:
def get_volumens(soup):
    """Collect name, link and date of every volume listed in *soup*.

    The page is walked section by section (``li`` accordion panels); inside
    each section every ``div`` issue item contributes its title text, the
    ``href`` of its link and the text of its status heading.

    Parameters
    ----------
    soup
        Parsed listing page exposing ``find_all``; the nodes it yields must
        expose ``find_all`` / ``find``.

    Returns
    -------
    tuple of (list, list, list)
        Volume names, volume hrefs and volume dates, aligned by position and
        in document order.
    """
    panel_spec = {"class": "accordion-panel js-accordion-panel"}
    item_spec = {"class": "issue-item u-margin-s-bottom"}

    rows = []
    for panel in soup.find_all("li", panel_spec):
        for item in panel.find_all("div", item_spec):
            title = item.find("span", {"class": "anchor-text"}).text
            href = item.find("a", {"class": "anchor js-issue-item-link text-m anchor-default"}).get("href")
            status = item.find("h3", {"class": "js-issue-status text-s"}).text
            rows.append((title, href, status))

    # Split the collected rows back into three parallel lists.
    list_names = [title for title, _, _ in rows]
    list_urls = [href for _, href, _ in rows]
    list_date = [status for _, _, status in rows]

    return list_names, list_urls, list_date

In [18]:
# Extract the volume listing from each downloaded page.
names_1, urls_1, dates_1 = get_volumens(page_1)
names_2, urls_2, dates_2 = get_volumens(page_2)
names_3, urls_3, dates_3 = get_volumens(page_3)

# Concatenate the per-page results, keeping page order.
names = [*names_1, *names_2, *names_3]
urls = [*urls_1, *urls_2, *urls_3]
dates = [*dates_1, *dates_2, *dates_3]

# One row per volume.
dta_volumens = pd.DataFrame(
    {
        "volume_name": names,
        "volumen_date": dates,
        "volume_url": urls,
    }
)
dta_volumens

Unnamed: 0,volume_name,volumen_date,volume_url
0,Volume 74,December 2022,/journal/journal-of-macroeconomics/vol/74/suppl/C
1,Volume 73,September 2022,/journal/journal-of-macroeconomics/vol/73/suppl/C
2,Volume 72,June 2022,/journal/journal-of-macroeconomics/vol/72/suppl/C
3,Volume 71,March 2022,/journal/journal-of-macroeconomics/vol/71/suppl/C
4,Volume 70,December 2021,/journal/journal-of-macroeconomics/vol/70/suppl/C
...,...,...,...
175,"Volume 2, Issue 1",Pages 1-102 (Winter 1980),/journal/journal-of-macroeconomics/vol/2/issue/1
176,"Volume 1, Issue 4",Pages 321-426 (Autumn 1979),/journal/journal-of-macroeconomics/vol/1/issue/4
177,"Volume 1, Issue 3",Pages 245-320 (Summer 1979),/journal/journal-of-macroeconomics/vol/1/issue/3
178,"Volume 1, Issue 2",Pages 149-243 (Spring 1979),/journal/journal-of-macroeconomics/vol/1/issue/2


* Article: `<h3 class="text-m u-font-serif u-display-inline">`
* Url: `<a class="anchor article-content-title u-margin-xs-top u-margin-s-bottom anchor-default">`
* Name: `<span class="js-article-title">`

In [None]:
def get_articles(array):
    """List the articles found on each volume page.

    Every element of *array* is appended to ``https://www.sciencedirect.com``
    and downloaded with ``html_import``; each ``h3`` article heading on the
    page then yields the article title text and the ``href`` of its link.

    Parameters
    ----------
    array
        Iterable of site-relative volume paths.

    Returns
    -------
    list of list
        ``[volume_path, article_name, article_href]`` entries, in the order
        the volumes were given and the articles appear on each page.
    """
    heading_spec = {"class": "text-m u-font-serif u-display-inline"}
    link_spec = {"class": "anchor article-content-title u-margin-xs-top u-margin-s-bottom anchor-default"}

    list_articles = []

    for volume_path in array:
        # Download and parse the volume page.
        page = html_import(f"https://www.sciencedirect.com{volume_path}")

        # One entry per article heading on that page.
        list_articles.extend(
            [
                volume_path,
                heading.find("span", {"class": "js-article-title"}).text,
                heading.find("a", link_spec).get("href"),
            ]
            for heading in page.find_all("h3", heading_spec)
        )

    return list_articles

In [None]:
%%time

# Site-relative path of every volume collected in dta_volumens.
urls = dta_volumens["volume_url"]

# Download each volume page and list its articles (one browser session per volume).
articles = get_articles(urls)

In [None]:
# One row per article; each entry of `articles` is [volume path, article title, article href].
dta_articles = pd.DataFrame(articles, columns=["volume_url", "article_name", "article_url"])
dta_articles

* Autores: `<a class="author size-m workspace-trigger">`
* Nombre: `<span class="text given-name">`
* Apellido: `<span class="text surname">`
* Doi: `<a class="doi">`
* Keyword: `<div class="keyword">`

In [None]:
def get_components(array):
    """Extract authors, DOI link and keywords from each article page.

    Every element of *array* is appended to ``https://www.sciencedirect.com``
    and downloaded with ``html_import``. Articles whose page lacks the DOI
    link, or has an author entry without a given name or surname, are
    skipped.

    Parameters
    ----------
    array
        Iterable of site-relative article paths.

    Returns
    -------
    list of list
        ``[article_path, authors, doi, keywords]`` entries, where ``authors``
        is a list of ``"[surname, name]"`` strings, ``doi`` is the ``href``
        of the DOI link and ``keywords`` is a list with the text of each
        keyword element.
    """
    list_components = []

    for i in array:
        # Download and parse the article page.
        soup = html_import(f"https://www.sciencedirect.com{i}")

        try:
            doi = soup.find("a", {"class": "doi"}).get("href")

            # Keep only the keyword text. Storing the Tag objects would keep
            # every page's full parse tree alive and write raw HTML to the
            # DataFrame/CSV, unlike the authors, which are stored as text.
            keywords = [
                keyword.text
                for keyword in soup.find_all("div", {"class": "keyword"})
            ]

            list_authors = []
            for author_tag in soup.find_all("a", {"class": "author size-m workspace-trigger"}):
                name = author_tag.find("span", {"class": "text given-name"}).text
                surname = author_tag.find("span", {"class": "text surname"}).text
                list_authors.append(f"[{surname}, {name}]")
        except AttributeError:
            # find() returned None: an expected element is missing on this
            # page. Skip the article, but let unrelated errors and Ctrl-C
            # propagate.
            continue

        list_components.append([i, list_authors, doi, keywords])

    return list_components

In [None]:
%%time

# Site-relative path of every article collected in dta_articles.
urls = dta_articles["article_url"]

# Download each article page and extract its components (one browser session per article).
components = get_components(urls)

In [None]:
# One row per article that could be parsed: path, authors, DOI link and keywords.
dta_components = pd.DataFrame(components, columns=["article_url", "authors", "doi", "keywords"])
dta_components

Finalmente, se pueden unir todos los resultados en un único DataFrame.

In [None]:
# Join components -> articles -> volumes. With no explicit key, each merge
# uses the columns the two frames share (article_url, then volume_url).
dta = pd.merge(dta_components, dta_articles, how="inner")
dta = pd.merge(dta, dta_volumens, how="inner")

# Turn the site-relative paths into absolute URLs.
dta = dta.assign(
    article_url="https://www.sciencedirect.com" + dta["article_url"].astype(str),
    volume_url="https://www.sciencedirect.com" + dta["volume_url"].astype(str),
)

# Final column order, then preview the first ten rows.
dta_f = dta.loc[
    :,
    [
        "article_name",
        "authors",
        "article_url",
        "doi",
        "keywords",
        "volume_name",
        "volumen_date",
        "volume_url",
    ],
]
dta_f.head(10)

In [None]:
# Save the final table as CSV.
# NOTE(review): this presumably requires the "./Bases de datos/" folder to exist
# already, and the row index is written too since `index` is not set -- confirm both are intended.
dta_f.to_csv("./Bases de datos/dta_f.csv")