In [None]:
## Name: Nikhil Kumar
## Task: AEA Webscraping

### Using the Python Requests package, this notebook makes a call to the American Economic Review page. Using Python BeautifulSoup or re, and Pandas, it extracts the volume number, issue, and hyperlink of all issues available and creates an excel spreadsheet with that information.

### Then, it constructs a different spreadsheet with the following columns: Volume, Issue, Article Title, Authors, Page Numbers, Article Link, JEL Classification Code, JEL Classification Description for each article in each issue

### Import Packages

In [112]:
import pandas as pd
import numpy as np
from urllib.request import urlopen
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.common.exceptions import NoSuchElementException
import requests

### Access the AEA issue webpage

In [113]:
url = "https://www.aeaweb.org/journals/aer/issues"
html = urlopen(url)
soup = BeautifulSoup(html, 'lxml')

### Scrape the data for all issues on the webpage

In [119]:
# number of rows
count = 0

# dictionary to save data
dict = {}

for link in soup.findAll('a'):
    # for every link on the issues webpage
    if ('Vol.' in link.get_text()):
        # if link-text has 'Vol.' in it

        # save url of the issue
        issue_url = link.get('href')
        
        # save text of the link
        url_text = link.get_text()

        # start index for volume number
        start = url_text.find('Vol.')+4

        # end index for volume number
        end = url_text.find(',')

        # get volume number
        vol = (url_text[start: end])

        # save data in dictionary
        dict[count] = {'Volume': 'Volume '+str(vol), 'Link': "https://www.aeaweb.org"+issue_url, 'Issue': url_text}

        # update the counter
        count += 1
# save data in dataframe
excel = pd.DataFrame.from_dict(dict).T
excel

Unnamed: 0,Volume,Link,Issue
0,Volume 110,https://www.aeaweb.org/issues/614,"December 2020 (Vol. 110, No.12 )"
1,Volume 110,https://www.aeaweb.org/issues/611,"November 2020 (Vol. 110, No.11 )"
2,Volume 110,https://www.aeaweb.org/issues/607,"October 2020 (Vol. 110, No.10 )"
3,Volume 110,https://www.aeaweb.org/issues/604,"September 2020 (Vol. 110, No.9 )"
4,Volume 110,https://www.aeaweb.org/issues/600,"August 2020 (Vol. 110, No.8 )"
...,...,...,...
159,Volume 89,https://www.aeaweb.org/issues/156,"December 1999 (Vol. 89, No.5 )"
160,Volume 89,https://www.aeaweb.org/issues/157,"September 1999 (Vol. 89, No.4 )"
161,Volume 89,https://www.aeaweb.org/issues/158,"June 1999 (Vol. 89, No.3 )"
162,Volume 89,https://www.aeaweb.org/issues/159,"May 1999 (Vol. 89, No.2 )"


### Scrape data for all the articles 

In [163]:
# dictionary to save data
dic = {}

# count number of rows
count = 1
for link in excel['Link']:
    # for every issue on AEA webpage

    # get volume number from previous dataframe
    volume = excel[excel['Link']==link].set_index('Link').loc[link,"Volume"]

    # get issue number from previous dataframe
    issue = excel[excel['Link']==link].set_index('Link').loc[link,"Issue"]
    
    # open the webpage containing articles in that issue
    html = urlopen(link)
    soup = BeautifulSoup(html, 'lxml')

    # for link on this webpage
    for art in soup.findAll('a'):

        # if the link is an article
        if '/articles' in art.get('href') and 'Front Matter' not in art.get_text():

            # save url of article
            article_url = "https://www.aeaweb.org" + art.get('href')
            
            # open article url
            html = urlopen(article_url)
            soup = BeautifulSoup(html, 'lxml')

            # save title of the article
            title = soup.findAll('h1', class_= "title")[0].get_text()
            authors = ""

            # save the names of all authors
            for item in soup.findAll('li', class_= "author"):
                authors = authors + item.get_text().strip() +", "
                authors = authors[:len(authors)-2]
            
            # save the page numbers of the article
            pages = soup.findAll('li', class_= "pages")[0].get_text()
            pages = pages[1:len(pages)-1]

            try:
                # save JEL classification code and description
                jel = soup.findAll('ul', class_= "jel-codes")[0]
                for li in jel.findAll('li'):
                    x = li.get_text()
                    jel_code = x[:4].strip()
                    jel_desc = x[12:].strip()

                    # save all the above information into a dictionary
                    dic[count] = {'volume': volume, 'issue': issue, 'article_title': title, 'authors': authors, 'page_numbers': pages, 'article_link': article_url , 'jel_code': jel_code, 'jel_description': jel_desc  }
                    count += 1
            except:
                print("hi")
# save the data into a dataframe
excel2 = pd.DataFrame.from_dict(dic).T
excel2

Unnamed: 0,volume,issue,article_title,authors,page_numbers,article_link,jel_code,jel_description
1,Volume 110,"December 2020 (Vol. 110, No.12 )",Competition and Entry in Agricultural Markets:...,Lauren Falcao BergquistMichael Dinerstein,pp. 3705-47,https://www.aeaweb.org/articles?id=10.1257/aer...,L13,Oligopoly and Other Imperfect Markets
2,Volume 110,"December 2020 (Vol. 110, No.12 )",Competition and Entry in Agricultural Markets:...,Lauren Falcao BergquistMichael Dinerstein,pp. 3705-47,https://www.aeaweb.org/articles?id=10.1257/aer...,O13,Economic Development: Agriculture; Natural Res...
3,Volume 110,"December 2020 (Vol. 110, No.12 )",Competition and Entry in Agricultural Markets:...,Lauren Falcao BergquistMichael Dinerstein,pp. 3705-47,https://www.aeaweb.org/articles?id=10.1257/aer...,Q11,Agriculture: Aggregate Supply and Demand Analy...
4,Volume 110,"December 2020 (Vol. 110, No.12 )",Competition and Entry in Agricultural Markets:...,Lauren Falcao BergquistMichael Dinerstein,pp. 3705-47,https://www.aeaweb.org/articles?id=10.1257/aer...,Q12,"Micro Analysis of Farm Firms, Farm Households,..."
5,Volume 110,"December 2020 (Vol. 110, No.12 )",Competition and Entry in Agricultural Markets:...,Lauren Falcao BergquistMichael Dinerstein,pp. 3705-47,https://www.aeaweb.org/articles?id=10.1257/aer...,Q13,Agricultural Markets and Marketing; Cooperativ...
...,...,...,...,...,...,...,...,...
15076,Volume 89,"March 1999 (Vol. 89, No.1 )",Social Distance and Other-Regarding Behavior i...,Iris BohnetBruno S. Frey,pp. 335-339,https://www.aeaweb.org/articles?id=10.1257/aer...,C72,Noncooperative Games
15077,Volume 89,"March 1999 (Vol. 89, No.1 )",Social Distance and Other-Regarding Behavior i...,Elizabeth HoffmanKevin McCabeVernon L. Smith,pp. 340-341,https://www.aeaweb.org/articles?id=10.1257/aer...,C72,Noncooperative Games
15078,Volume 89,"March 1999 (Vol. 89, No.1 )","Optimal Inflation Targets, ""Conservative"" Cent...",Roel M. W. J. BeetsmaHenrik Jensen,pp. 342-347,https://www.aeaweb.org/articles?id=10.1257/aer...,E52,Monetary Policy
15079,Volume 89,"March 1999 (Vol. 89, No.1 )","Optimal Inflation Targets, ""Conservative"" Cent...",Roel M. W. J. BeetsmaHenrik Jensen,pp. 342-347,https://www.aeaweb.org/articles?id=10.1257/aer...,E31,Price Level; Inflation; Deflation
