# A Scraper for Top Econ Journal Articles

In [1]:
import pandas as pd
import bs4
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
import time
import random
import csv
import winsound

## 1. The Review of Financial Studies

In [2]:
# Path to the local ChromeDriver executable. Written as a raw string so the
# backslash is taken literally: "\c" is not a valid escape sequence, and the
# non-raw form triggers an invalid-escape warning on recent Python versions.
chromedrive_path = r"D:\chromedriver.exe"
# set selenium service
s = Service(chromedrive_path)
# open driver (the global `driver` is what the scraping cells use)
driver = webdriver.Chrome(service=s)

In [None]:
# Listing pages of the journal: "default.htm" followed by
# "default1.htm" ... "default25.htm"
base_url = "https://econpapers.repec.org/article/ouprfinst/default"
pages = [".htm"] + [str(n) + ".htm" for n in range(1, 26)]

# One list per attribute; the lists grow in lockstep, one entry per article.
# NOTE: the name `dict` shadows the builtin, but the next cell reads it, so it is kept.
attrs = ["journal","title","authors","volume","jel","abstract","url","doi"]
dict = {attr: [] for attr in attrs}

# Article hrefs on the listing pages are appended to this prefix
prefix = "https://econpapers.repec.org/article/ouprfinst/"

for page_no, page in enumerate(pages, start=1):
    print("Start: Page", page_no)
    # Load the listing page (get + refresh) and parse it
    driver.get(base_url + page)
    driver.refresh()
    soup = bs4.BeautifulSoup(driver.page_source, "html.parser")
    # Every <dt> that contains an <a> contributes one article link
    links = [
        prefix + dt_tag.a.get("href")
        for dt_tag in soup.find_all("dt")
        if dt_tag.a is not None
    ]
    for article_no, link in enumerate(links, start=1):
        # Load the article page (get + refresh) and parse it
        driver.get(link)
        driver.refresh()
        soup = bs4.BeautifulSoup(driver.page_source, "html.parser")

        # Title is the <h1>; navigation starts from the <h2> when there is one,
        # otherwise from the <h1>
        heading = soup.find("h1")
        title = heading.text
        subheading = soup.find("h2")
        anchor = subheading if subheading is not None else heading

        # Authors: every <i> inside the second sibling after the anchor heading
        authors_tag = anchor.next_sibling.next_sibling
        authors = [i_tag.text for i_tag in authors_tag.find_all("i")]

        # Publication line: first ", "-separated piece is the journal,
        # the remaining pieces re-joined are the volume string
        publish_tag = authors_tag.next_sibling
        journal, *volume_parts = publish_tag.text.strip().split(", ")
        volume = ", ".join(volume_parts)

        # Abstract: text of the next sibling, flattened onto one line
        abstract_tag = publish_tag.next_sibling
        abstract = abstract_tag.text.replace("\n", " ").strip()

        # JEL codes: link texts in the sibling after the abstract whose href
        # points at the JEL search script
        after_abstract = abstract_tag.next_sibling
        jel = [
            a_tag.text
            for a_tag in after_abstract.find_all("a")
            if "/scripts/search.pf?jel=" in a_tag.get("href")
        ]

        # DOI: link text one sibling further on, with the handle resolver removed
        doi = after_abstract.next_sibling.a.text.replace("http://hdl.handle.net/", "")

        # Store the record column by column
        record = {
            "journal": journal,
            "title": title,
            "authors": authors,
            "volume": volume,
            "abstract": abstract,
            "jel": jel,
            "url": link,
            "doi": doi,
        }
        for attr in attrs:
            dict[attr].append(record[attr])

        print("Finished:", article_no)

In [70]:
# Turn the dict of per-attribute lists filled by the scraping cell into a table
# (one row per article, one column per attribute)
df = pd.DataFrame(dict)
# Preview the first rows
df.head()

Unnamed: 0,journal,title,authors,volume,jel,abstract,url,doi
0,Review of Financial Studies,New Perspectives on Insurance,"[Ralph S J Koijen, Motohiro Yogo]","2022, vol. 35, issue 12, 5275-5286","[G22, G52, I13, J65]",Abstract: This special issue originates from a...,https://econpapers.repec.org/article/ouprfinst...,10.1093/rfs/hhac063
1,Review of Financial Studies,Trust and Insurance Contracts,"[Nicola Gennaioli, Rafael La Porta, Florencio ...","2022, vol. 35, issue 12, 5287-5333","[D23, G22, L14, Z13]",Abstract: We assemble homeowner insurance clai...,https://econpapers.repec.org/article/ouprfinst...,10.1093/rfs/hhab112
2,Review of Financial Studies,Conflicting Interests and the Effect of Fiduci...,"[Mark Egan, Shan Ge, Johnny Tang]","2022, vol. 35, issue 12, 5334-5386","[D14, D18, G22, G24, G28]",Abstract: We examine the variable annuity mark...,https://econpapers.repec.org/article/ouprfinst...,10.1093/rfs/hhac047
3,Review of Financial Studies,Can Risk Be Shared across Investor Cohorts? Ev...,"[Johan Hombert, Victor Lyonnet]","2022, vol. 35, issue 12, 5387-5437","[G22, G52]",Abstract: We study how retail savings products...,https://econpapers.repec.org/article/ouprfinst...,10.1093/rfs/hhac054
4,Review of Financial Studies,Regulatory Forbearance in the U.S. Insurance I...,"[Bo Becker, Marcus M Opp, Farzad Saidi]","2022, vol. 35, issue 12, 5438-5482","[G20, G22, G23, G28]",Abstract: We analyze the effects of a reform o...,https://econpapers.repec.org/article/ouprfinst...,10.1093/rfs/hhab102


In [72]:
# Save the scraped table without the index column; "utf-8-sig" writes UTF-8 with a BOM
df.to_csv("data/journals/rfs.csv", index = False, encoding = "utf-8-sig")

In [3]:
# define function to scrape
def _parse_article_page(soup, url):
    """Extract one article record from a parsed EconPapers article page.

    Parameters
    ----------
    soup : bs4.BeautifulSoup
        Parsed HTML of the article page.
    url : str
        Address the page was loaded from; stored unchanged in the record.

    Returns
    -------
    dict
        Keys: journal, title, authors, volume, abstract, jel, url, doi.
        `authors` and `jel` are lists of strings; `doi` is "" when the
        handle found on the page looks like an issue-level handle.

    Raises
    ------
    AttributeError
        If the page lacks the expected heading or sibling structure.
    """
    # Title is the <h1>; navigation starts from the <h2> when there is one,
    # otherwise from the <h1>
    title_tag = soup.find("h1")
    title = title_tag.text
    h2_tag = soup.find("h2")
    h = h2_tag if h2_tag is not None else title_tag

    # Authors: every <i> inside the second sibling after the heading
    authorstag = h.next_sibling.next_sibling
    authors = [itag.text for itag in authorstag.find_all("i")]

    # Publication line: first ", "-separated piece is the journal, the rest the volume
    publishtag = authorstag.next_sibling
    publish_text = publishtag.text.strip().split(", ")
    journal = publish_text[0]
    volume = ", ".join(publish_text[1:])

    # Abstract: text of the next sibling, flattened onto one line
    abstracttag = publishtag.next_sibling
    abstract = abstracttag.text.replace("\n"," ").strip()
    if 'type="main">' in abstract:
        # The text contains leftover markup; use the following sibling as the
        # abstract instead and add the "Abstract: " label by hand
        abstracttag = abstracttag.next_sibling
        abstract = "Abstract: "+abstracttag.text.replace("\n"," ").strip()

    # JEL codes: link texts whose href points at the JEL search script.
    # `or ""` keeps an <a> without href from raising a TypeError.
    jel = [
        atag.text
        for atag in abstracttag.next_sibling.find_all("a")
        if "/scripts/search.pf?jel=" in (atag.get("href") or "")
    ]

    # DOI link: two siblings after the abstract tag when the abstract text
    # contains "Abstract", otherwise one sibling after it
    if "Abstract" in abstract:
        doitag = abstracttag.next_sibling.next_sibling
    else:
        doitag = abstracttag.next_sibling
    doi = doitag.a.text.replace("http://hdl.handle.net/","")
    # A handle with "issue-" in its last 8 characters is blanked
    # (presumably an issue-level handle rather than an article DOI — TODO confirm)
    if "issue-" in doi[-8:]:
        doi = ""

    return {
        "journal": journal,
        "title": title,
        "authors": authors,
        "volume": volume,
        "abstract": abstract,
        "jel": jel,
        "url": url,
        "doi": doi,
    }


def scrape_articles(journalabbr, page_num, output_filename):
    """Scrape article metadata for one journal from EconPapers and save it as CSV.

    Visits the journal's listing pages ("default.htm" plus "default1.htm" ...
    "default<page_num>.htm"), follows every article link found in a <dt> tag,
    and extracts one record per article. Pages are loaded through the global
    Selenium `driver`.

    Parameters
    ----------
    journalabbr : str
        Journal handle used in the EconPapers URL, e.g. "oupqjecon".
    page_num : int
        Number of numbered listing pages to visit in addition to "default.htm".
    output_filename : str
        Path of the CSV file to write (UTF-8 with BOM, no index column).

    Returns
    -------
    pandas.DataFrame
        The scraped table with columns journal, title, authors, volume, jel,
        abstract, url, doi — the same table that is written to `output_filename`.
    """
    # for pages
    base_url = "https://econpapers.repec.org/article/"+journalabbr+"/default"
    pages = [".htm"] + [str(n)+".htm" for n in range(1, page_num+1)]

    # column order of the output table
    attrs = ["journal","title","authors","volume","jel","abstract","url","doi"]

    # prefix for article link
    prefix = "https://econpapers.repec.org/article/"+journalabbr+"/"

    # one dict per article (a list of records, so the builtin `dict` is not shadowed)
    records = []
    for p, page in enumerate(pages):
        driver.get(base_url+page)
        driver.refresh()
        # load into soup
        soup = bs4.BeautifulSoup(driver.page_source, "html.parser")
        links = [prefix+dttag.a.get("href") for dttag in soup.find_all("dt") if dttag.a is not None]
        for i, link in enumerate(links):
            # load page
            driver.get(link)
            driver.refresh()
            # load into soup and parse
            soup = bs4.BeautifulSoup(driver.page_source, "html.parser")
            record = _parse_article_page(soup, link)
            records.append(record)

            print("Finished: Page ", p+1, "-",i+1,record["doi"])

    # save to csv; `columns=attrs` fixes the column order and keeps the header
    # even when nothing was scraped
    df = pd.DataFrame(records, columns=attrs)
    df.to_csv(output_filename, index = False, encoding = "utf-8-sig")
    return df

In [5]:
# Scratch check of the slice used in scrape_articles (doi[-8:]): for a handle
# ending in a two-digit issue number, the last 8 characters are "issue-NN"
t = "casdca.issue-17"
t[-8:]

'issue-17'

## 2. The Quarterly Journal of Economics

In [None]:
# Handle "oupqjecon": default.htm plus default1.htm..default75.htm, saved to data/journals/qje.csv
scrape_articles("oupqjecon",75,"data/journals/qje.csv")

## 3. Journal of Political Economy

In [None]:
# Handle "ucpjpolec": default.htm plus default1.htm..default82.htm, saved to data/journals/jpe.csv
scrape_articles("ucpjpolec", 82, "data/journals/jpe.csv")

## 4. Journal of Finance

In [None]:
# Handle "blajfinan": default.htm plus default1.htm..default68.htm, saved to data/journals/jf.csv
scrape_articles("blajfinan", 68, "data/journals/jf.csv")

## 5. The Review of Economic Studies

In [None]:
# Handle "ouprestud": default.htm plus default1.htm..default52.htm, saved to data/journals/res.csv
scrape_articles("ouprestud", 52, "data/journals/res.csv")

## 6. Econometrica

In [None]:
# Handle "wlyemetrp": default.htm plus default1.htm..default8.htm, saved to data/journals/e.csv
scrape_articles("wlyemetrp", 8, "data/journals/e.csv")

In [None]:
# Reload the table saved by the "wlyemetrp" scrape above and preview it
df_old = pd.read_csv("data/journals/e.csv", encoding="utf-8-sig")
df_old.head()

In [None]:
# Scrape the second handle ("ecmemetrp"); the function saves its table to the
# file "x". Load that file rather than relying on the call's return value, so
# df_new is a DataFrame in the same CSV-round-tripped form as df_old.
scrape_articles("ecmemetrp",43,"x")
df_new = pd.read_csv("x", encoding="utf-8-sig")

# Scrape the author–article linkage and each author's Google Scholar page (saved locally as HTML)

In [52]:
# set selenium service and open driver
# Path to the local ChromeDriver executable (hardcoded, machine-specific)
chromedrive_path = "D:/chromedriver.exe"
s = Service(chromedrive_path)
# Rebinds the global `driver` to a newly created Chrome session
driver = webdriver.Chrome(service = s)

In [76]:
# URL fragments removed from the raw "doi" column, in this order
_DOI_URL_PARTS = (
    "http://dx.doi.org/",
    "https://doi.org/",
    "http://hdl.handle.net/",
    "http://www.mitpressjournals.org/doi/pdf/",
)


def _strip_doi_urls(raw):
    """Trim surrounding whitespace, then delete every occurrence of each known URL fragment."""
    cleaned = raw.strip()
    for url_part in _DOI_URL_PARTS:
        cleaned = cleaned.replace(url_part, "")
    return cleaned


# Load the table with every column cast to str, then clean the DOI column
df = pd.read_csv("data/journals/raw_data/restat.csv", encoding = "utf-8-sig").astype("str")
df["doi"] = df["doi"].apply(_strip_doi_urls)
df.tail()

Unnamed: 0,journal,title,authors,abstract,cite,jel,url,volume,doi,year
4368,The Review of Economics and Statistics,"Comment on the ""H"" Concentration Measure as a ...","['Adelman, M A']",No abstract is available for this item.,"Adelman, M A, 1969. ""Comment on the ""H"" Concen...",N\A,https://ideas.repec.org/a/tpr/restat/v51y1969i...,"51(1), pages 99-101, February.",http://links.jstor.org/sici?sici=0034-6535%281...,1969
4369,The Review of Economics and Statistics,Import Constraints and Development: Causes of ...,"['Bergsman, Joel', 'Morley, Samuel A']",No abstract is available for this item.,"Bergsman, Joel & Morley, Samuel A, 1969. ""Impo...",N\A,https://ideas.repec.org/a/tpr/restat/v51y1969i...,"51(1), pages 101-102, February.",http://links.jstor.org/sici?sici=0034-6535%281...,1969
4370,The Review of Economics and Statistics,Import Constraints and Development: A Reply,"['Leff, Nathaniel H']",No abstract is available for this item.,"Leff, Nathaniel H, 1969. ""Import Constraints a...",N\A,https://ideas.repec.org/a/tpr/restat/v51y1969i...,"51(1), pages 102-104, February.",http://links.jstor.org/sici?sici=0034-6535%281...,1969
4371,The Review of Economics and Statistics,Profitability and Size of Firm: Some Further E...,"['Marcus, Matityahu']",No abstract is available for this item.,"Marcus, Matityahu, 1969. ""Profitability and Si...",N\A,https://ideas.repec.org/a/tpr/restat/v51y1969i...,"51(1), pages 104-107, February.",http://links.jstor.org/sici?sici=0034-6535%281...,1969
4372,The Review of Economics and Statistics,Effect of the Length of the Time Period on Ser...,"['Rayner, A C']",No abstract is available for this item.,"Rayner, A C, 1969. ""Effect of the Length of th...",N\A,https://ideas.repec.org/a/tpr/restat/v51y1969i...,"51(1), pages 107-108, February.",http://links.jstor.org/sici?sici=0034-6535%281...,1969


In [None]:
# Print every cleaned DOI value, one per line, for a visual check
# (this prints the whole column, not a sample)
for d in df["doi"]:
    print(d)

In [81]:
# Parallel lists: authorids[k] is a Scholar user id, articleids[k] the DOI it was found for.
# They stay in the kernel after the loop so a later cell can append them to the CSV.
authorids = []
articleids = []

base_url = "https://scholar.google.com/scholar?hl=zh-CN&q="
# Row of df at which this run starts
start_idx = 1532
# DOIs already present in the linkage file; a set makes the membership test
# below constant-time instead of a scan of the whole list for every DOI
existing_doi = set(pd.read_csv("author_article.csv", encoding = "utf-8-sig")["doi"])

for i, doi in enumerate(df["doi"][start_idx:]):
    # Only query values that start with "10.", are not already linked,
    # and do not contain "issue"
    if ("10." == doi[:3]) and (doi not in existing_doi) and ("issue" not in doi):
        try:
            # Search Scholar for the DOI and read the first "gs_a" div of the result page
            driver.get(base_url+doi)
            driver.refresh()
            soup = bs4.BeautifulSoup(driver.page_source, "html.parser")
            authordiv = soup.find("div", "gs_a")
            # Raises AttributeError when no "gs_a" div was found (handled below)
            authorlinks = ["https://scholar.google.com/"+tag.get("href") for tag in authordiv.find_all("a")]
            if authorlinks != []:
                for link in authorlinks:
                    # get authorid
                    authorid = link.split("user=")[1].split("&")[0]
                    driver.get(link)
                    driver.refresh()
                    soup = bs4.BeautifulSoup(driver.page_source, "html.parser")
                    # save to html (the with-block closes the file)
                    with open("data/authors/html/"+authorid+".html", "w", encoding = "utf-8-sig") as file:
                        file.write(str(soup))
                    # append authorid
                    authorids.append(authorid)
                    # append articleid
                    articleids.append(doi)
                    print(str(i+start_idx)+","+authorid+","+doi)
                    # random pause between requests
                    time.sleep(random.randrange(1,10))
            else:
                time.sleep(random.randrange(1,10))
        # `Exception` rather than a bare except, so that KeyboardInterrupt
        # (kernel interrupt) stops the loop instead of being treated as a page failure
        except Exception:
            try:
                time.sleep(random.randrange(1,10))
                # Probe the most recently parsed page for the "gs_res_ccl_top" div:
                # if it is there, carry on with the next DOI; if the lookup fails,
                # fall through to the "blocked" handling below
                res = soup.find("div", id = "gs_res_ccl_top").text
            except Exception:
                # Persist what has been collected so far, signal, and stop
                with open("author_article.csv", "a", newline="") as file:
                    writer = csv.writer(file)
                    for j in range(len(authorids)):
                        writer.writerow([authorids[j],articleids[j]])
                print("Blocked!!!")
                winsound.Beep(2500,1000)
                break

1532,GpVUFVMAAAAJ,10.1162/003465302320259484
1532,upFspGoAAAAJ,10.1162/003465302320259484
1533,zQAhyTsAAAAJ,10.1162/003465302320259493
1533,8rjowPQAAAAJ,10.1162/003465302320259493
1533,ueDpb64AAAAJ,10.1162/003465302320259493
1536,61QM_esAAAAJ,10.1162/003465302320259529
1538,ok06MuMAAAAJ,10.1162/003465302320259547
1540,KW_synoAAAAJ,10.1162/003465302317411479
1540,TIfa3okAAAAJ,10.1162/003465302317411479
1541,Wo7OPBQAAAAJ,10.1162/003465302317411488
1542,m1-SRZwAAAAJ,10.1162/003465302317411497
1542,RRevF6oAAAAJ,10.1162/003465302317411497
1544,uEhRnHIAAAAJ,10.1162/003465302317411514
1544,5rI7fxYAAAAJ,10.1162/003465302317411514
1546,9Vkps6sAAAAJ,10.1162/003465302317411532
1546,TAj2jqAAAAAJ,10.1162/003465302317411532
1547,NYk0FcgAAAAJ,10.1162/003465302317411541
1547,hZN95ZgAAAAJ,10.1162/003465302317411541
1548,T88Gev4AAAAJ,10.1162/003465302317411550
1549,sY_KmtUAAAAJ,10.1162/003465302317411569
1550,lIV7LhIAAAAJ,10.1162/003465302317411578
1550,DTbePNAAAAAJ,10.1162/003465302317411578
1552,rOxxi

In [80]:
# Absolute row position in df of the last DOI the loop above iterated over
i+start_idx

1532

In [82]:
# Append the collected [author id, article DOI] pairs to the linkage file;
# the with-block closes the file on exit
with open("author_article.csv", "a", newline="") as out_file:
    csv.writer(out_file).writerows(
        [authorids[j], articleids[j]] for j in range(len(authorids))
    )