In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import re
import time

In [2]:
def read_one_page(url):
    '''
    Crawl the paper title and paper url in one page of the journal database.

    Inputs:
        url(str): The url of one page of the journal database.

    Outputs:
        Volume(Dictionary): The dict record the title and url for each paper.
    '''
    
    Volume = {"Title":[],"Link":[]}
    # Read the journal database page and find all the paper links.
    response = requests.get(url)
    soup = BeautifulSoup(response.text,"html")
    papers = soup.find_all("li", class_ = "list-group-item downgate")
    papers.extend(soup.find_all("li", class_ = "list-group-item downfree"))
    papers.extend(soup.find_all("li", class_ = "list-group-item downnone"))
    # Record all the papers title and link in Volume.
    for paper in papers:
        title = paper.a.text
        href = paper.a.get("href")
        Volume["Title"].append(title)
        Volume["Link"].append("https://ideas.repec.org" + href)
    
    return Volume

In [3]:
def find_url_list(start_url, page_number):
    '''
    Get the papers url list for all the journal database/

    Inputs:
        start_url(str): The url of the first page of the journal database.
        page_number(int): The number of pages of the journal database.

    Outputs:
        Volume(DataFrame): A DataFrame record the title and url for all\
            papers of a specified journal.
    '''
    Volume = {}
    Volume = read_one_page(start_url)

    for i in range(2, page_number):
        url = start_url[:-5]
        page = "{}.html".format(i)
        url = url + page
        Volume_one_page = read_one_page(start_url)
        Volume["Title"].extend(Volume_one_page["Title"])
        Volume["Link"].extend(Volume_one_page["Link"])

    Volume = pd.DataFrame(data=Volume)
    return Volume

In [4]:
def obtain_articles_infos(url):
    '''
    Obtain and record the article's journal name, title, authors, abstract,\
        cite, url, vol, publish time and the doi on the page of a single url.
        (This code is designed for The Economic Journal in ideas.repec.org\
        database as an example, we also crawl the other journals in other databases)
    
        
    Inputs:
        url(str): The string of the URL we want to crawl.

    Outputs:
        info(DataFrame): The DataFrame with all the recorded information we\
                         said above.
    '''
    
    # Define the structure of the DataFrame.
    info = {"journal":[], "title":[],"authors":[],"abstract":[], "cite":[],\
            "JEL":[],"url":[],"vol":[], "doi":[], "year":[]}
    journal = "The Economic Journal"

    # Request the url and get the soup.
    response = requests.get(url)
    soup = BeautifulSoup(response.text,"html")

    # Crawl the title of the journal
    title = soup.find("h1").text
    
    # ideas.repec.org have three type of keyword in cite and vol, try each keyword.
    try:
        cite = soup.find("li", class_="list-group-item downgate").text
    except:
        try:
            cite = soup.find("li", class_="list-group-item downfree").text
        except:
            try:
                cite = soup.find("li", class_="list-group-item downnone").text
            except:
                cite = "N\A"
                vol = "N\A"

    # If we crawl the cite successfully, clean and record cite and vol.
    if cite != "N\A":
        vol = re.split(r"vol.", cite)[1]
        cite = re.sub('(\t|\n)'," ",cite)
        cite = re.sub('\s+'," ",cite).strip()
        vol = re.sub('(\t|\n)'," ",vol)
        vol = re.sub('\s+'," ",vol).strip()
        vol.strip(".")

    # Crawl, clean and record the author names.
    try:
        authors = soup.find_all("li", class_="authorname")
        authors = [author.text for author in authors]
    except:
        authors = []        

    # Crawl, clean and record the Abstract.
    try:
        abstract = soup.find("div", id="abstract-body").text
    except:
        abstract = "N\A"

    # Crawl, clean and record the doi.
    try:
        doi = soup.find("div", id="biblio-body").text
        [doi] = re.findall(r"DOI:...+", doi)
        doi = re.sub(r"DOI:","", doi)
    except:
        doi = "N\A"

    # If there no "doi" in ideas.repec.org, record the url of the offical website.
    if doi == "N\A":
        try:
            doi = soup.find("span", style="word-break:break-all").text
        except:
            doi = "N\A"

    # Crawl, clean and record the publish time.
    try:
        year = soup.find("i", style="word-break:break-all").text
        [year] = re.findall(r"y:....", year)
        year = re.sub("y:","",year)
    except:
        year = "N\A"

    # Only journals in AEA(American Economic Association) have JEL.
    jel = "N\A"

    # Make all the information into DataFrame.
    info["journal"].append(journal)
    info["title"].append(title)
    info["authors"].append(authors)
    info["abstract"].append(abstract)
    info["cite"].append(cite)
    info["JEL"].append(jel)
    info["url"].append(url)
    info["vol"].append(vol)
    info["doi"].append(doi)
    info["year"].append(year)
    info = pd.DataFrame(info)

    return info

In [5]:
def crawl_all_url(Volume, start = None, end = None, sleep_time = 0.5):
    '''
    Crawl all the paper informations recorded in the Volume.

    Inputs:
        Volume(DataFrame): A DataFrame record the title and url for all\
            papers of a specified journal.
        start(int): The start Volume index of the paper we want to start to crawl.
        end(int): The end Volume index of the paper we want to end the crawl.
        sleep_time(int): The seelp time for the crawl for each papaer.

    Outputs:
        df(DataFrame): The DataFrame record all the crawl information for all\
            papers in the journal.
    '''

    df = pd.DataFrame({"journal":[], "title":[],"authors":[],"abstract":[],\
                       "cite":[], "JEL":[],"url":[],"vol":[], "doi":[], "year":[]})
    count = start
    # Iterate over all volumes
    for index, row in Volume.iloc[start: end].iterrows():
        title, link = row["Title"], row["Link"]
        print(count, "Currently at", title, "link =", link)

        # Iterate over articles of the volumes
        article = obtain_articles_infos(link)
        df = pd.concat([df, article],ignore_index=True)
        time.sleep(sleep_time)
        count += 1
    
    return df


In [6]:
# Get the url for all the papers in one journal.
Volume = find_url_list("https://ideas.repec.org/s/ecj/econjl.html", 18)
print(Volume)

                                                  Title  \
0     Optimal Monetary Policy in a Model of the Cred...   
1     Gender Differences in Market Competitiveness i...   
2     Identification of Peer Effects with Missing Pe...   
3     Reaction to Public Information in Markets: How...   
4          Financial Contagion and Attention Allocation   
...                                                 ...   
3378  Information Cascades and Revolutionary Regime ...   
3379  Unionisation Triggers Tax Incentives to Attrac...   
3380  A Model of Public Consultation: Why is Binary ...   
3381  Classification, Detection and Consequences of ...   
3382  Are Children Decision‐Makers within the Househ...   

                                                   Link  
0     https://ideas.repec.org/a/ecj/econjl/v123y2013...  
1     https://ideas.repec.org/a/ecj/econjl/v123y2013...  
2     https://ideas.repec.org/a/ecj/econjl/v123y2013...  
3     https://ideas.repec.org/a/ecj/econjl/v123y2013...  
4

In [7]:
# Crawl the information of a certain papers in the journal. (the first 10 papers)
df = crawl_all_url(Volume, start = 0, end = 10, sleep_time = 0.5)
df

0 Currently at Optimal Monetary Policy in a Model of the Credit Channel link = https://ideas.repec.org/a/ecj/econjl/v123y2013i571p906-931.html
1 Currently at Gender Differences in Market Competitiveness in a Real Workplace: Evidence from Performance‐based Pay Tournaments among Teachers link = https://ideas.repec.org/a/ecj/econjl/v123y2013i569p540-573.html
2 Currently at Identification of Peer Effects with Missing Peer Data: Evidence from Project STAR link = https://ideas.repec.org/a/ecj/econjl/v123y2013i569p574-605.html
3 Currently at Reaction to Public Information in Markets: How much does Ambiguity Matter? link = https://ideas.repec.org/a/ecj/econjl/v123y2013i569p699-737.html
4 Currently at Financial Contagion and Attention Allocation link = https://ideas.repec.org/a/ecj/econjl/v123y2013i568p429-454.html
5 Currently at Do Prediction Markets Produce Well‐Calibrated Probability Forecasts?-super- link = https://ideas.repec.org/a/ecj/econjl/v123y2013i568p491-513.html
6 Currently at Optim

Unnamed: 0,journal,title,authors,abstract,cite,JEL,url,vol,doi,year
0,The Economic Journal,Optimal Monetary Policy in a Model of the Cred...,"[Fiorella De Fiore, Oreste Tristani]",We consider a simple extension of the basic ne...,"Fiorella De Fiore & Oreste Tristani, 2013. ""Op...",N\A,https://ideas.repec.org/a/ecj/econjl/v123y2013...,"123(571), pages 906-931, September.",http://hdl.handle.net/10.1111/,2013
1,The Economic Journal,Gender Differences in Market Competitiveness i...,[Victor Lavy],Recent lab and field experiments suggest that ...,"Victor Lavy, 2013. ""Gender Differences in Mark...",N\A,https://ideas.repec.org/a/ecj/econjl/v123y2013...,"123(569), pages 540-573, June.",http://hdl.handle.net/10.1111/,2013
2,The Economic Journal,Identification of Peer Effects with Missing Pe...,[Aaron Sojourner],This paper studies peer effects on student ach...,"Aaron Sojourner, 2013. ""Identification of Peer...",N\A,https://ideas.repec.org/a/ecj/econjl/v123y2013...,"123(569), pages 574-605, June.",http://hdl.handle.net/10.1111/,2013
3,The Economic Journal,Reaction to Public Information in Markets: How...,"[Brice Corgnet, Praveen Kujal, David Porter]","In this article, we experimentally study trade...","Brice Corgnet & Praveen Kujal & David Porter, ...",N\A,https://ideas.repec.org/a/ecj/econjl/v123y2013...,"123(569), pages 699-737, June.",http://hdl.handle.net/10.1111/,2013
4,The Economic Journal,Financial Contagion and Attention Allocation,"[Jordi Mondria, Climent Quintana‐Domeque]",This paper explains financial contagion betwee...,"Jordi Mondria & Climent Quintana‐Domeque, 2013...",N\A,https://ideas.repec.org/a/ecj/econjl/v123y2013...,"123(568), pages 429-454, May.",http://hdl.handle.net/10.1111/,2013
5,The Economic Journal,Do Prediction Markets Produce Well‐Calibrated ...,"[Lionel Page, Robert T. Clemen]",No abstract is available for this item.,"Lionel Page & Robert T. Clemen, 2013. ""Do Pred...",N\A,https://ideas.repec.org/a/ecj/econjl/v123y2013...,"123(568), pages 491-513, May.",http://hdl.handle.net/10.1111/,2013
6,The Economic Journal,Optimal Bank Capital,"[David Miles, Jing Yang, Gilberto Marcheggiano]",This paper reports estimates of the long-run c...,David Miles & Jing Yang & Gilberto Marcheggian...,N\A,https://ideas.repec.org/a/ecj/econjl/v123y2013...,"123(567), pages 1-37, March.",http://hdl.handle.net/10.1111/,2013
7,The Economic Journal,Social Learning and Monetary Policy Rules,"[Jasmina Arifovic, James Bullard, Olena Kostys...",We analyze the effects of social learning in a...,Jasmina Arifovic & James Bullard & Olena Kosty...,N\A,https://ideas.repec.org/a/ecj/econjl/v123y2013...,"123(567), pages 38-76, March.",http://hdl.handle.net/10.1111/,2013
8,The Economic Journal,Testing for Asymmetric Information in Private ...,"[Pau Olivella, Marcos Vera‐Hernández]",No abstract is available for this item.,"Pau Olivella & Marcos Vera‐Hernández, 2013. ""T...",N\A,https://ideas.repec.org/a/ecj/econjl/v123y2013...,"123(567), pages 96-130, March.",http://hdl.handle.net/10.1111/,2013
9,The Economic Journal,Market Power in the Global Economy: The Exhaus...,[Kamal Saggi],This paper analyzes economic linkages between ...,"Kamal Saggi, 2013. ""Market Power in the Global...",N\A,https://ideas.repec.org/a/ecj/econjl/v123y2013...,"123(567), pages 131-161, March.",http://hdl.handle.net/10.1111/,2013


In [8]:
# Write the DataFrame to .csv
df.to_csv("The Economic Journal_test.csv", encoding="utf-8")

In [9]:
# If we crawl a journal more than one times and get multiple csv,\
# merge them into one file and write
csv_1 = pd.read_csv("The Economic Journal23_19.csv")
csv_2 = pd.read_csv("The Economic Journal18_08.csv")
csv_3 = pd.read_csv("The Economic Journal13_1.csv")
csv_4 = pd.read_csv("The Economic Journal13_2.csv")
csv = pd.concat([csv_1, csv_2, csv_3, csv_4])
csv = csv.reset_index()
csv = csv.drop(columns = "index")
csv = csv.drop(columns = "Unnamed: 0")
csv.to_csv("The Economic Journal.csv", encoding="utf-8")