# Data Cleaning and Put into SQL

In [1]:
import pandas as pd
import sqlite3
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
import bs4
import time
import random
import numpy as np

## Clean author_article.csv

In [2]:
# clean author_article
df_art_au = pd.read_csv("author_article.csv", encoding = "utf-8-sig").drop_duplicates()
df_art_au.to_csv("author_article.csv", encoding = "utf-8-sig", index = False)
df_art_au.to_csv("author_article_no_header.csv", encoding = "utf-8-sig", index = False, header = False)

## Clean article.csv

In [3]:
# select valid records with valid doi, issue format, and not duplicated
def select_valid(df):
    df_s = df[(df["doi"].str[:3] == "10.") & (df["volume"].str.contains("Supplement") == False) & 
              (df["volume"].str.contains("_Part_") == False) & 
              (df["volume"].str.contains("S1") == False) & 
              (df["volume"].str.contains("S2") == False) &
              (df["volume"].str.contains("issue ") == True)].drop_duplicates(subset=["doi"]).reset_index(drop = True)
    return df_s

# convert volume to date
def conv_volume_to_date(v,freq):
    v_list = v.split(", ")
    year = v_list[0]
    issue = v_list[2].replace("issue ","")
    if freq == 4:
        d = year+"-"+str(int(issue)*3-1)+"-15"
    elif freq == 6 or freq == 5:
        d = year+"-"+str(int(issue)*2)+"-01"
    elif freq == 12:
        d = year+"-"+issue+"-15"
    elif freq == 3:
        d = year+"-"+str(int(issue)*3)+"-01"
    elif freq == 2:
        d = year+"-"+str(int(issue)*6-2)+"-01"
    elif freq == 1:
        d = year+"-"+str(7)+"-01"
    
    if len(d) < 10:
        return d.split("-")[0]+"-"+"0"+d.split("-")[1]+"-"+d.split("-")[2]
    else:
        return d
    
# calculate issue frequency
def cal_freq(vs):
    dict = {}
    years = [int(v.split(", ")[0]) for v in vs]
    unique_years = set(years)
    issues = [int(v.split(", ")[2].replace("issue ","")) for v in vs]
    for unique_year in unique_years:
        max_issue = 0
        for i in range(len(years)):
            if unique_year == years[i]:
                if issues[i] > max_issue:
                    max_issue = issues[i]
        dict[unique_year] = max_issue

    return dict


### 1. RFS

In [4]:
# clean rfs
df_rfs = pd.read_csv("data/journals/raw_data/rfs.csv", encoding = "utf-8-sig").astype("str")
df_rfs["abstract"] = df_rfs["abstract"].apply(lambda x: x.replace("Abstract: ",""))
df_rfs_s = select_valid(df_rfs)
freq = cal_freq(df_rfs_s["volume"])

dates = []
for idx in df_rfs_s.index:
    year = int(df_rfs_s.loc[idx,"volume"].split(", ")[0])
    dates.append(conv_volume_to_date(df_rfs_s.loc[idx,"volume"],freq[year]))

df_rfs_s["date"] = dates
df_rfs_s.head()

Unnamed: 0,journal,title,authors,volume,jel,abstract,url,doi,date
0,Review of Financial Studies,Dynamic Equilibrium with Liquidity Constraints,"['Jerome Detemple', 'Angel Serrat']","2003, vol. 16, issue 2, 597-629",[],This article studies an intertemporal economy ...,https://econpapers.repec.org/article/ouprfinst...,10.1093/rfs/hhg003,2003-05-15
1,Review of Financial Studies,Stochastic Discount Factor Bounds with Conditi...,"['Wayne Ferson', 'Andrew F. Siegel']","2003, vol. 16, issue 2, 567-595",[],Hansen and Jagannathan (1991) (hereafter HJ) d...,https://econpapers.repec.org/article/ouprfinst...,10.1093/rfs/hhg004,2003-05-15
2,Review of Financial Studies,Delta-Hedged Gains and the Negative Market Vol...,"['Gurdip Bakshi', 'Nikunj Kapadia']","2003, vol. 16, issue 2, 527-566",[],We investigate whether the volatility risk pre...,https://econpapers.repec.org/article/ouprfinst...,10.1093/rfs/hhg002,2003-05-15
3,Review of Financial Studies,"Differences of Opinion, Short-Sales Constraint...","['Harrison Hong', 'Jeremy Stein']","2003, vol. 16, issue 2, 487-525",[],We develop a theory of market crashes based on...,https://econpapers.repec.org/article/ouprfinst...,10.1093/rfs/hhg006,2003-05-15
4,Review of Financial Studies,Risk Adjustment and Trading Strategies,"['Dong-Hyun Ahn', 'Jennifer Conrad', 'Robert D...","2003, vol. 16, issue 2, 459-485",[],We assess the profitability of momentum strate...,https://econpapers.repec.org/article/ouprfinst...,10.1093/rfs/hhg001,2003-05-15


### 2. RES

In [5]:
# clean res
df_res = pd.read_csv("data/journals/raw_data/res.csv", encoding = "utf-8-sig").astype("str")
df_res["abstract"] = df_res["abstract"].apply(lambda x: x.replace("Abstract: ",""))
df_res_s = select_valid(df_res)
freq = cal_freq(df_res_s["volume"])

dates = []
for idx in df_res_s.index:
    year = int(df_res_s.loc[idx,"volume"].split(", ")[0])
    dates.append(conv_volume_to_date(df_res_s.loc[idx,"volume"],freq[year]))

df_res_s["date"] = dates
df_res_s.tail()

Unnamed: 0,journal,title,authors,volume,jel,abstract,url,doi,date
3575,Review of Economic Studies,Profit in American Economic Theory,['William S. Hopkins'],"1933, vol. 1, issue 1, 60-66",[],"DURING the past sixty years, there has been a ...",https://econpapers.repec.org/article/ouprestud...,10.2307/2967438,1933-07-01
3576,Review of Economic Studies,I.—A Note on Relative Shares,['P. M. Sweezy'],"1933, vol. 1, issue 1, 67-68",[],,https://econpapers.repec.org/article/ouprestud...,10.2307/2967439,1933-07-01
3577,Review of Economic Studies,II.—The Diagrammatical Representation,['A. P. Lerner'],"1933, vol. 1, issue 1, 68-71",[],,https://econpapers.repec.org/article/ouprestud...,10.2307/2967440,1933-07-01
3578,Review of Economic Studies,III.—The Elasticity of Substitution and the Re...,['R. F. Kahn'],"1933, vol. 1, issue 1, 72-78",[],WE owe to Dr. Hicks the extremely interesting ...,https://econpapers.repec.org/article/ouprestud...,10.2307/restud/1.1.72,1933-07-01
3579,Review of Economic Studies,IV.—A Note on Mr. Kahn's Paper,['John Hicks'],"1933, vol. 1, issue 1, 78-80",[],,https://econpapers.repec.org/article/ouprestud...,10.2307/2967442,1933-07-01


### 3. QJE

In [6]:
# clean qje
df_qje = pd.read_csv("data/journals/raw_data/qje.csv", encoding = "utf-8-sig").astype("str")
df_qje["abstract"] = df_qje["abstract"].apply(lambda x: x.replace("Abstract: ",""))
df_qje_s = select_valid(df_qje)
freq = cal_freq(df_qje_s["volume"])

dates = []
for idx in df_qje_s.index:
    year = int(df_qje_s.loc[idx,"volume"].split(", ")[0])
    dates.append(conv_volume_to_date(df_qje_s.loc[idx,"volume"],freq[year]))

df_qje_s["date"] = dates
df_qje_s.tail()

Unnamed: 0,journal,title,authors,volume,jel,abstract,url,doi,date
5415,The Quarterly Journal of Economics,Private Monopolies and Public Rights,['Arthur T. Hadley'],"1886, vol. 1, issue 1, 28-44",[],,https://econpapers.repec.org/article/oupqjecon...,10.2307/1883107,1886-07-01
5416,The Quarterly Journal of Economics,Silver Before Congress in 1886,['S. Dana Horton'],"1886, vol. 1, issue 1, 45-75",[],,https://econpapers.repec.org/article/oupqjecon...,10.2307/1883108,1886-07-01
5417,The Quarterly Journal of Economics,"The Arithmetic, Geometric, and Harmonic Means",['F. Coggeshall'],"1886, vol. 1, issue 1, 83-86",[],This chapter is devoted to the properties and ...,https://econpapers.repec.org/article/oupqjecon...,10.2307/1883111,1886-07-01
5418,The Quarterly Journal of Economics,Legislation for Labor Arbitration,['H. M. Williams'],"1886, vol. 1, issue 1, 86-91",[],,https://econpapers.repec.org/article/oupqjecon...,10.2307/1883112,1886-07-01
5419,The Quarterly Journal of Economics,Correspondence,['Arthur Mangin'],"1886, vol. 1, issue 1, 91-102",[],,https://econpapers.repec.org/article/oupqjecon...,10.2307/1883113,1886-07-01


### 4. JPE

In [7]:
# clean jpe
df_jpe = pd.read_csv("data/journals/raw_data/jpe.csv", encoding = "utf-8-sig").astype("str")
df_jpe["abstract"] = df_jpe["abstract"].apply(lambda x: x.replace("Abstract: ",""))
df_jpe["doi"] = df_jpe["doi"].apply(lambda x: x.replace("http://dx.doi.org/",""))
df_jpe_s = select_valid(df_jpe)
freq = cal_freq(df_jpe_s["volume"])

dates = []
for idx in df_jpe_s.index:
    year = int(df_jpe_s.loc[idx,"volume"].split(", ")[0])
    dates.append(conv_volume_to_date(df_jpe_s.loc[idx,"volume"],freq[year]))

df_jpe_s["date"] = dates

# modify date for 2023
for idx in df_jpe_s.index:
    year = int(df_jpe_s.loc[idx,"volume"].split(", ")[0])
    if year == 2023:
        df_jpe_s.loc[idx, "date"] = conv_volume_to_date(df_jpe_s.loc[idx,"volume"],12)

df_jpe_s.tail()

Unnamed: 0,journal,title,authors,volume,jel,abstract,url,doi,date
3324,Journal of Political Economy,Some Evidence on the Demand for Money,['David Laidler'],"1966, vol. 74, issue 1, 55-68",[],I ALTHOUGH the demand function for money balan...,https://econpapers.repec.org/article/ucpjpolec...,10.1086/259109,1966-07-01
3325,Journal of Political Economy,Paul Baran: An Appreciation,['M. Bronfenbrenner'],"1966, vol. 74, issue 1, 69-71",[],HTEN Joseph Dorfman's monumental W Economic Mi...,https://econpapers.repec.org/article/ucpjpolec...,10.1086/259110,1966-07-01
3326,Journal of Political Economy,Walras' Law and the Patinkin Paradox: A Qualit...,"['James G. Witte, Jr.']","1966, vol. 74, issue 1, 72-76",[],"TN HIS Money, Interest, and Prices Patinkin (1...",https://econpapers.repec.org/article/ucpjpolec...,10.1086/259111,1966-07-01
3327,Journal of Political Economy,Factor-Intensity Reversals in International Co...,['David Stafford Ball'],"1966, vol. 74, issue 1, 77-80",[],THE role of two assumptions inthe factorpropor...,https://econpapers.repec.org/article/ucpjpolec...,10.1086/259112,1966-07-01
3328,Journal of Political Economy,Testing Economic Assumptions: A Comment,['T. W. Hutchison'],"1966, vol. 74, issue 1, 81-83",[],Hutchison is responsible for considerable conf...,https://econpapers.repec.org/article/ucpjpolec...,10.1086/259113,1966-07-01


### 5. E

In [8]:
# clean e
df_e = pd.read_csv("data/journals/raw_data/e.csv", encoding = "utf-8-sig").astype("str")
df_e["abstract"] = df_e["abstract"].apply(lambda x: x.replace("Abstract: ",""))
df_e["doi"] = df_e["doi"].apply(lambda x: x.replace("https://doi.org/",""))
df_e_s = select_valid(df_e)
freq = cal_freq(df_e_s["volume"])

dates = []
for idx in df_e_s.index:
    year = int(df_e_s.loc[idx,"volume"].split(", ")[0])
    dates.append(conv_volume_to_date(df_e_s.loc[idx,"volume"],freq[year]))

df_e_s["date"] = dates

# modify date for 2023
for idx in df_e_s.index:
    year = int(df_e_s.loc[idx,"volume"].split(", ")[0])
    if year == 2023:
        df_e_s.loc[idx, "date"] = conv_volume_to_date(df_e_s.loc[idx,"volume"],6)

df_e_s.tail()

Unnamed: 0,journal,title,authors,volume,jel,abstract,url,doi,date
1000,Econometrica,Higher Order Properties of Gmm and Generalized...,"['Whitney Newey', 'Richard Smith']","2004, vol. 72, issue 1, 219-255",[],In an effort to improve the small sample prope...,https://econpapers.repec.org/article/ecmemetrp...,10.1111/j.1468-0262.2004.00482.x,2004-02-01
1001,Econometrica,Random Matching Under Dichotomous Preferences,"['Anna Bogomolnaia', 'Herve Moulin']","2004, vol. 72, issue 1, 257-279",[],We consider bilateral matching problems where ...,https://econpapers.repec.org/article/ecmemetrp...,10.1111/j.1468-0262.2004.00483.x,2004-02-01
1002,Econometrica,Notes and Comments the Amsterdam Auction,"['Jacob Goeree', 'Theo Offerman']","2004, vol. 72, issue 1, 281-294",[],The Amsterdam auction has been used to sell re...,https://econpapers.repec.org/article/ecmemetrp...,10.1111/j.1468-0262.2004.00484.x,2004-02-01
1003,Econometrica,Random Effects Estimators with many Instrument...,"['Gary Chamberlain', 'Guido Imbens']","2004, vol. 72, issue 1, 295-306",[],In this paper we propose a new estimator for a...,https://econpapers.repec.org/article/ecmemetrp...,10.1111/j.1468-0262.2004.00485.x,2004-02-01
1004,Econometrica,The Econometric Society Annual Reports Report ...,"['Eddie Dekel', 'Glenn Ellison', 'Joel Horowit...","2004, vol. 72, issue 1, 336-338",[],,https://econpapers.repec.org/article/ecmemetrp...,10.1111/j.1468-0262.2004.00490.x,2004-02-01


### 6. AER

In [9]:
# clean aer
df_aer = pd.read_csv("data/journals/raw_data/aer.csv", encoding = "utf-8-sig").astype("str")
df_aer["journal"] = df_aer["journal"].apply(lambda x: "American Economic Review")
df_aer["abstract"] = df_aer["abstract"].apply(lambda x: x.replace("\n","").replace("\t","").replace("Abstract","").strip())
df_aer["volume"] = df_aer["volume"].apply(lambda x: x.replace("\n","").replace("\t","").strip())
df_aer["doi"] = df_aer["doi"].apply(lambda x: x.strip())
df_aer = df_aer[df_aer["doi"].str[:3] == "10."].drop_duplicates(subset = ["doi"]).reset_index(drop=True)

dates = []
for idx in df_aer.index:
    date_text = df_aer.loc[idx,"volume"].split(",")[2].strip()
    year = date_text.split(" ")[1]
    month_text = date_text.split(" ")[0]
    if month_text == "January":
        month = "01"
    elif month_text == "February":
        month = "02"
    elif month_text == "March":
        month = "03"
    elif month_text == "April":
        month = "04"
    elif month_text == "May":
        month = "05"
    elif month_text == "June":
        month = "06"
    elif month_text == "July":
        month = "07"
    elif month_text == "August":
        month = "08"
    elif month_text == "September":
        month = "09"
    elif month_text == "October":
        month = "10"
    elif month_text == "November":
        month = "11"
    elif month_text == "December":
        month = "12"

    dates.append(year+"-"+month+"-"+"15")

df_aer["date"] = dates
df_aer_s = df_aer.drop_duplicates(subset = ["doi"]).reset_index(drop=True)[["journal","title","authors","volume","jel","abstract","url","doi","date"]]
df_aer_s.tail()

Unnamed: 0,journal,title,authors,volume,jel,abstract,url,doi,date
4799,American Economic Review,Enabling or Limiting Cognitive Flexibility? Ev...,"['\n Silvia Saccardo ', '\n ...","American Economic Review \r\r\rvol. 113,\rno. ...","[('C91', 'Design of Experiments: Laboratory, I...",Moral behavior is more prevalent when individu...,https://www.aeaweb.org/articles?id=10.1257/aer...,10.1257/aer.20201333,2023-02-15
4800,American Economic Review,"Droughts, Deluges, and (River) Diversions: Val...",['\n Will Rafey '],"American Economic Review \r\r\rvol. 113,\rno. ...","[('D23', 'Organizational Behavior; Transaction...",This paper develops and applies a method to va...,https://www.aeaweb.org/articles?id=10.1257/aer...,10.1257/aer.20201434,2023-02-15
4801,American Economic Review,"Technology Gaps, Trade, and Income",['\n Thomas Sampson '],"American Economic Review \r\r\rvol. 113,\rno. ...","[('D21', 'Firm Behavior: Theory'), ('D24', 'Pr...",This paper quantifies the contribution of tech...,https://www.aeaweb.org/articles?id=10.1257/aer...,10.1257/aer.20201940,2023-02-15
4802,American Economic Review,Electronic Food Vouchers: Evidence from an At-...,"['\n Abhijit Banerjee ', '\n ...","American Economic Review \r\r\rvol. 113,\rno. ...","[('H53', 'National Government Expenditures and...",We compare how in-kind food assistance and an ...,https://www.aeaweb.org/articles?id=10.1257/aer...,10.1257/aer.20210461,2023-02-15
4803,American Economic Review,The Voice of Monetary Policy,"['\n Yuriy Gorodnichenko ', '\n ...","American Economic Review \r\r\rvol. 113,\rno. ...","[('D83', 'Search; Learning; Information and Kn...",We develop a deep learning model to detect emo...,https://www.aeaweb.org/articles?id=10.1257/aer...,10.1257/aer.20220129,2023-02-15


### 7. JF

In [10]:
# clean jf
df_jf = pd.read_csv("data/journals/raw_data/jf.csv", encoding = "utf-8-sig").astype("str")
df_jf["abstract"] = df_jf["abstract"].apply(lambda x: x.replace("Abstract: ",""))
df_jf["doi"] = df_jf["doi"].apply(lambda x: x.replace("https://doi.org/",""))
df_jf_s = select_valid(df_jf)
freq = cal_freq(df_jf_s["volume"])

dates = []
for idx in df_jf_s.index:
    year = int(df_jf_s.loc[idx,"volume"].split(", ")[0])
    dates.append(conv_volume_to_date(df_jf_s.loc[idx,"volume"],freq[year]))

df_jf_s["date"] = dates

# modify date for 2023
for idx in df_jf_s.index:
    year = int(df_jf_s.loc[idx,"volume"].split(", ")[0])
    if year == 2023:
        df_jf_s.loc[idx, "date"] = conv_volume_to_date(df_jf_s.loc[idx,"volume"],6)

df_jf_s.head()

Unnamed: 0,journal,title,authors,volume,jel,abstract,url,doi,date
0,Journal of Finance,Optimal Financial Transaction Taxes,['Eduardo Dávila'],"2023, vol. 78, issue 1, 5-61",[],This paper characterizes the optimal transacti...,https://econpapers.repec.org/article/blajfinan...,10.1111/jofi.13188,2023-02-01
1,Journal of Finance,"Less Mainstream Credit, More Payday Borrowing?...",['Julia Fonseca'],"2023, vol. 78, issue 1, 63-103",[],Governments regulate debt collectors to protec...,https://econpapers.repec.org/article/blajfinan...,10.1111/jofi.13189,2023-02-01
2,Journal of Finance,Disruption and Credit Markets,"['Bo Becker', 'Victoria Ivashina']","2023, vol. 78, issue 1, 105-139",[],"We show that over the past half‐century, innov...",https://econpapers.repec.org/article/blajfinan...,10.1111/jofi.13187,2023-02-01
3,Journal of Finance,How Risky Are U.S. Corporate Assets?,"['Tetiana Davydiuk', 'Scott Richard', 'Ivan Sh...","2023, vol. 78, issue 1, 141-208",[],We use market data on corporate bonds and equi...,https://econpapers.repec.org/article/blajfinan...,10.1111/jofi.13196,2023-02-01
4,Journal of Finance,International Yield Curves and Currency Puzzles,"['Mikhail Chernov', 'Drew Creal']","2023, vol. 78, issue 1, 209-245",[],The currency depreciation rate is often comput...,https://econpapers.repec.org/article/blajfinan...,10.1111/jofi.13191,2023-02-01


## 8. JEP

In [11]:
# clean jep
df_jep = pd.read_csv("data/journals/raw_data/jep.csv", encoding = "utf-8-sig").astype("str")
df_jep["journal"] = df_jep["journal"].apply(lambda x: "Journal of Economic Perspectives")
df_jep["abstract"] = df_jep["abstract"].apply(lambda x: x.replace("\n","").replace("\t","").replace("Abstract","").strip())
df_jep["volume"] = df_jep["volume"].apply(lambda x: x.replace("\n","").replace("\t","").strip())
df_jep["doi"] = df_jep["doi"].apply(lambda x: x.strip())
df_jep = df_jep[df_jep["doi"].str[:3] == "10."].drop_duplicates(subset = ["doi"]).reset_index(drop=True)

dates = []
for idx in df_jep.index:
    date_text = df_jep.loc[idx,"volume"].lower().split(" ")
    season = date_text[0]
    year = date_text[1]
    if season == "spring":
        md = "02-15"
    elif season == "summer":
        md = "05-15"
    elif season == "fall":
        md = "08-15"
    elif month_text == "winter":
        md = "11-15"

    dates.append(year+"-"+md)

df_jep["date"] = dates
df_jep_s = df_jep.drop_duplicates(subset = ["doi"]).reset_index(drop=True)[["journal","title","authors","volume","jel","abstract","url","doi","date"]]
df_jep_s.tail()

Unnamed: 0,journal,title,authors,volume,jel,abstract,url,doi,date
1830,Journal of Economic Perspectives,Emerging Market Sovereign Debt in the Aftermat...,['\n Kenneth Rogoff '],"Fall 2022 (Vol. 36, No.4 )","[('E23', 'Macroeconomics: Production'), ('E62'...","For emerging markets, fiscal space is a very r...",https://www.aeaweb.org/articles?id=10.1257/jep...,10.1257/jep.36.4.147,2022-08-15
1831,Journal of Economic Perspectives,Popular Personal Financial Advice versus the P...,['\n James J. Choi '],"Fall 2022 (Vol. 36, No.4 )","[('D15', 'Intertemporal Household Choice; Life...",I survey the advice given by the fifty most po...,https://www.aeaweb.org/articles?id=10.1257/jep...,10.1257/jep.36.4.167,2022-08-15
1832,Journal of Economic Perspectives,A Linear Panel Model with Heterogeneous Coeffi...,"['\n Liyang Sun ', '\n Jess...","Fall 2022 (Vol. 36, No.4 )","[('C23', 'Single Equation Models; Single Varia...",Linear panel models featuring unit and time fi...,https://www.aeaweb.org/articles?id=10.1257/jep...,10.1257/jep.36.4.193,2022-08-15
1833,Journal of Economic Perspectives,"Sadie T. M. Alexander: Black Women and a ""Tast...",['\n Nina Banks '],"Fall 2022 (Vol. 36, No.4 )","[('B31', 'History of Economic Thought: Individ...",The employment history of African American wom...,https://www.aeaweb.org/articles?id=10.1257/jep...,10.1257/jep.36.4.205,2022-08-15
1834,Journal of Economic Perspectives,Recommendations for Further Reading,['\n Timothy Taylor '],"Fall 2022 (Vol. 36, No.4 )","[('Y50', 'Further Reading (unclassified)')]",N\A,https://www.aeaweb.org/articles?id=10.1257/jep...,10.1257/jep.36.4.221,2022-08-15


## 9. AEJM

In [12]:
# clean aejm
df_aejm = pd.read_csv("data/journals/raw_data/aejm.csv", encoding = "utf-8-sig").astype("str")
df_aejm = df_aejm[df_aejm["volume"].str.contains("pages")]
df_aejm["abstract"] = df_aejm["abstract"].apply(lambda x: x.replace("\n","").replace("\t","").replace("Abstract","").strip())
df_aejm["doi"] = df_aejm["doi"].apply(lambda x: x.strip())
df_aejm = df_aejm[df_aejm["doi"].str[:3] == "10."].drop_duplicates(subset = ["doi"]).reset_index(drop=True)

dates = []
for idx in df_aejm.index:
    year = df_aejm.loc[idx,"year"]
    date_text = df_aejm.loc[idx,"volume"]
    month_text = date_text.split(", ")[2].replace(".","")
    if month_text == "January":
        month = "01"
    elif month_text == "February":
        month = "02"
    elif month_text == "March":
        month = "03"
    elif month_text == "April":
        month = "04"
    elif month_text == "May":
        month = "05"
    elif month_text == "June":
        month = "06"
    elif month_text == "July":
        month = "07"
    elif month_text == "August":
        month = "08"
    elif month_text == "September":
        month = "09"
    elif month_text == "October":
        month = "10"
    elif month_text == "November":
        month = "11"
    elif month_text == "December":
        month = "12"

    dates.append(year+"-"+month+"-"+"01")

df_aejm["date"] = dates
df_aejm_s = df_aejm.drop_duplicates(subset = ["doi"]).reset_index(drop=True)[["journal","title","authors","volume","jel","abstract","url","doi","date"]]
df_aejm_s.tail()

Unnamed: 0,journal,title,authors,volume,jel,abstract,url,doi,date
476,American Economic Journal: Macroeconomics,Civic Virtue and Labor Market Institutions,"['Pierre Cahuc', 'Yann Algan']","1(1), pages 111-145, January.",N\A,We argue civic virtue plays a key role in expl...,https://ideas.repec.org/a/aea/aejmac/v1y2009i1...,10.1257/mac.1.1.111,2009-01-01
477,American Economic Journal: Macroeconomics,Rent Preservation and the Persistence of Under...,['Raghuram G. Rajan'],"1(1), pages 178-218, January.",N\A,When citizens in a poor constrained society ar...,https://ideas.repec.org/a/aea/aejmac/v1y2009i1...,10.1257/mac.1.1.178,2009-01-01
478,American Economic Journal: Macroeconomics,Border Effect or Country Effect? Seattle May N...,"['Yuriy Gorodnichenko', 'Linda L. Tesar']","1(1), pages 219-241, January.",N\A,This paper reexamines the evidence on the bord...,https://ideas.repec.org/a/aea/aejmac/v1y2009i1...,10.1257/mac.1.1.219,2009-01-01
479,American Economic Journal: Macroeconomics,New Keynesian Models: Not Yet Useful for Polic...,"['V. V. Chari', 'Patrick J. Kehoe', 'Ellen R. ...","1(1), pages 242-266, January.",N\A,Macroeconomists have largely converged on meth...,https://ideas.repec.org/a/aea/aejmac/v1y2009i1...,10.1257/mac.1.1.242,2009-01-01
480,American Economic Journal: Macroeconomics,Convergence in Macroeconomics: The Labor Wedge,['Robert Shimer'],"1(1), pages 280-297, January.",N\A,I review research on the behavior of the labor...,https://ideas.repec.org/a/aea/aejmac/v1y2009i1...,10.1257/mac.1.1.280,2009-01-01


## 10. EER

In [13]:
# clean eer
df_eer = pd.read_csv("data/journals/raw_data/eer.csv", encoding = "utf-8-sig").astype("str")
df_eer["abstract"] = df_eer["abstract"].apply(lambda x: x.replace("\n","").replace("\t","").replace("Abstract","").strip())
df_eer = df_eer.drop_duplicates(subset = ["doi"]).reset_index(drop=True)
df_eer["doi"] = df_eer["doi"].apply(lambda x: x.strip())
df_eer = df_eer[df_eer["doi"].str[:3] == "10."].drop_duplicates(subset = ["doi"]).reset_index(drop=True)

months = ["January","February","March","April","May","June",
          "July","August","September","October","November","December"]
dates = []
for idx in df_eer.index:
    year = df_eer.loc[idx,"year"]
    date_parts = df_eer.loc[idx,"volume"].split("(")
    vol = int(date_parts[0])
    month_text =  date_parts[1].split(", ")[-1].replace(".","")

    if month_text in months:
        if month_text == "January":
            month = "01"
        elif month_text == "February":
            month = "02"
        elif month_text == "March":
            month = "03"
        elif month_text == "April":
            month = "04"
        elif month_text == "May":
            month = "05"
        elif month_text == "June":
            month = "06"
        elif month_text == "July":
            month = "07"
        elif month_text == "August":
            month = "08"
        elif month_text == "September":
            month = "09"
        elif month_text == "October":
            month = "10"
        elif month_text == "November":
            month = "11"
        elif month_text == "December":
            month = "12"
    # 2016-2023
    elif int(year) >= 2016:
        if vol % 10 == 0:
            month = "11"
        elif vol % 10 == 9:
            month = "10"
        elif vol % 10 == 8:
            month = "09"
        elif vol % 10 == 7:
            month = "08"
        elif vol % 10 == 6:
            month = "07"
        elif vol % 10 == 5:
            month = "06"
        elif vol % 10 == 4:
            month = "05"
        elif vol % 10 == 3:
            month = "04"
        elif vol % 10 == 2:
            month = "02"
        elif vol % 10 == 1:
            month = "01"
    # 2013-2015
    elif int(year) >= 2013:
        if vol % 8 == 0:
            month = "11"
        elif vol % 8 == 7:
            month = "10"
        elif vol % 8 == 6:
            month = "08"
        elif vol % 8 == 5:
            month = "07"
        elif vol % 8 == 4:
            month = "05"
        elif vol % 8 == 3:
            month = "04"
        elif vol % 8 == 2:
            month = "02"
        elif vol % 8 == 1:
            month = "01"
    # 2005-2012
    elif int(year) >= 2005:
        issue = int(date_parts[1][0])
        if issue == 8:
            month = "11"
        elif issue == 7:
            month = "10"
        elif issue == 6:
            month = "08"
        elif issue == 5:
            month = "07"
        elif issue == 4:
            month = "05"
        elif issue == 3:
            month = "04"
        elif issue == 2:
            month = "02"
        elif issue == 1:
            month = "01"
    # 2003-2004
    elif int(year) >= 2003:
        issue = int(date_parts[1][0])
        if issue == 6:
            month = "12"
        elif issue == 5:
            month = "10"
        elif issue == 4:
            month = "08"
        elif issue == 3:
            month = "06"
        elif issue == 2:
            month = "04"
        elif issue == 1:
            month = "02"
    # 2000-2002
    elif int(year) == 2002:
        issue = date_parts[1].split(")")[0]
        if issue == "10":
            month = "12"
        elif issue == "9":
            month = "10"
        elif issue == "8":
            month = "09"
        elif issue == "7":
            month = "07"
        elif issue == "6":
            month = "06"
        elif issue == "4-5":
            month = "05"
        elif issue == "3":
            month = "03"
        elif issue == "2":
            month = "02"
        elif issue == "1":
            month = "01"
    elif int(year) == 2001:
        issue = date_parts[1].split(")")[0]
        if issue == "10":
            month = "12"
        elif issue == "9":
            month = "10"
        elif issue == "8":
            month = "08"
        elif issue == "7":
            month = "06"
        elif issue == "4-6":
            month = "05"
        elif issue == "3":
            month = "03"
        elif issue == "2":
            month = "02"
        elif issue == "1":
            month = "01"
    elif int(year) == 2000:
        issue = date_parts[1].split(")")[0]
        if issue == "10":
            month = "12"
        elif issue == "9":
            month = "10"
        elif issue == "8":
            month = "08"
        elif issue == "7":
            month = "06"
        elif issue == "4-6":
            month = "05"
        elif issue == "3":
            month = "03"
        elif issue == "2":
            month = "02"
        elif issue == "1":
            month = "01"
    else:
        issue = date_parts[1][0]
        if issue == "9":
            month = "09"
        elif issue == "8":
            month = "08"
        elif issue == "7":
            month = "07"
        elif issue == "6":
            month = "06"
        elif issue == "5":
            month = "05"
        elif issue == "4":
            month = "04"
        elif issue == "3":
            month = "03"
        elif issue == "2":
            month = "02"
        elif issue == "1":
            month = "01"

    dates.append(year+"-"+month+"-"+"15")

df_eer["date"] = dates
df_eer_s = df_eer.drop_duplicates(subset = ["doi"]).reset_index(drop=True)[["journal","title","authors","volume","jel","abstract","url","doi","date"]]
df_eer_s.tail()

Unnamed: 0,journal,title,authors,volume,jel,abstract,url,doi,date
1631,European Economic Review,Intergenerational exchange mobility and econom...,"['Markandya, Anil']","17(3), pages 307-324.",N\A,The concepts of exchange and structural mobili...,https://ideas.repec.org/a/eee/eecrev/v17y1982i...,10.1016/S0014-2921(82)80066-4,1982-03-15
1632,European Economic Review,The estimation of welfare levels of a cardinal...,"['Buyze, Jeannine']","17(3), pages 325-332.",N\A,In order to measure an individual's welfare fu...,https://ideas.repec.org/a/eee/eecrev/v17y1982i...,10.1016/S0014-2921(82)80067-6,1982-03-15
1633,European Economic Review,"Earnings and education in Greece, 1960–1977","['Psacharopoulos, George']","17(3), pages 333-347.",N\A,This paper analyses the structure of earnings ...,https://ideas.repec.org/a/eee/eecrev/v17y1982i...,10.1016/S0014-2921(82)80068-8,1982-03-15
1634,European Economic Review,The testing and estimation of complete demand ...,"['Ray, Ranjan']","17(3), pages 349-369.",N\A,A recent demand system (AIDS) is extended to i...,https://ideas.repec.org/a/eee/eecrev/v17y1982i...,10.1016/S0014-2921(82)80069-X,1982-03-15
1635,European Economic Review,Modelling consumers' expenditure,"['Rossi, Nicola', 'Schiantarelli, Fabio']","17(3), pages 371-391.",N\A,The purpose of this paper is to model the dyna...,https://ideas.repec.org/a/eee/eecrev/v17y1982i...,10.1016/S0014-2921(82)80070-6,1982-03-15


## 11. EL

In [14]:
# clean el
df_el = pd.read_csv("data/journals/raw_data/el.csv", encoding = "utf-8-sig").astype("str")
df_el["abstract"] = df_el["abstract"].apply(lambda x: x.replace("\n","").replace("\t","").replace("Abstract","").strip())
df_el = df_el.drop_duplicates(subset = ["doi"]).reset_index(drop=True)
df_el["doi"] = df_el["doi"].apply(lambda x: x.strip())
df_el = df_el[df_el["doi"].str[:3] == "10."].drop_duplicates(subset = ["doi"]).reset_index(drop=True)

dates = []
for idx in df_el.index:
    year = df_el.loc[idx,"year"]
    date_parts = df_el.loc[idx,"volume"].split("(")
    vol = int(date_parts[0])

    # 2015-2023
    if int(year) >= 2015:
        if vol % 12 == 5:
            month = "12"
        elif vol % 12 == 4:
            month = "11"
        elif vol % 12 == 3:
            month = "10"
        elif vol % 12 == 2:
            month = "09"
        elif vol % 12 == 1:
            month = "08"
        elif vol % 12 == 0:
            month = "07"
        elif vol % 12 == 11:
            month = "06"
        elif vol % 12 == 10:
            month = "05"
        elif vol % 12 == 9:
            month = "04"
        elif vol % 12 == 8:
            month = "03"
        elif vol % 12 == 7:
            month = "02"
        elif vol % 12 == 6:
            month = "01"
    else:
        issue = int(date_parts[1][0])
        if vol % 4 == 1 and issue == 3:
            month = "12"
        elif vol % 4 == 1 and issue == 2:
            month = "11"
        elif vol % 4 == 1 and issue == 1:
            month = "10"
        elif vol % 4 == 0 and issue == 3:
            month = "09"
        elif vol % 4 == 0 and issue == 2:
            month = "08"
        elif vol % 4 == 0 and issue == 1:
            month = "07"
        elif vol % 4 == 3 and issue == 3:
            month = "06"
        elif vol % 4 == 3 and issue == 2:
            month = "05"
        elif vol % 4 == 3 and issue == 1:
            month = "04"
        elif vol % 4 == 2 and issue == 3:
            month = "03"
        elif vol % 4 == 2 and issue == 2:
            month = "02"
        elif vol % 4 == 2 and issue == 1:
            month = "01"
            
    dates.append(year+"-"+month+"-"+"15")

df_el["date"] = dates
df_el_s = df_el.drop_duplicates(subset = ["doi"]).reset_index(drop=True)[["journal","title","authors","volume","jel","abstract","url","doi","date"]]
df_el_s.tail()

Unnamed: 0,journal,title,authors,volume,jel,abstract,url,doi,date
4938,Economics Letters,A robust test for multivariate normality,"['Jönsson, Kristian']","113(2), pages 199-201.",N\A,The size of the Jarque–Bera test for multivari...,https://ideas.repec.org/a/eee/ecolet/v113y2011...,10.1016/j.econlet.2011.06.018,2011-11-15
4939,Economics Letters,A differential measure of the real wage index,"['Baye, Michael R.', 'Black, Dan A.']","36(3), pages 295-298.",N\A,In this paper we derive an approximation of th...,https://ideas.repec.org/a/eee/ecolet/v36y1991i...,10.1016/0165-1765(91)90036-K,1991-09-15
4940,Economics Letters,"Aggregate price indexes, cointegration, and te...","['Johnson, Paul A.']","36(3), pages 305-309.",N\A,In this paper I show that cointegration tests ...,https://ideas.repec.org/a/eee/ecolet/v36y1991i...,10.1016/0165-1765(91)90038-M,1991-09-15
4941,Economics Letters,Valuation effects of Canadian stock split anno...,"['Kryzanowski, Lawrence', 'Zhang, Hao']","36(3), pages 317-322.",N\A,The abnormal returns for two types of announce...,https://ideas.repec.org/a/eee/ecolet/v36y1991i...,10.1016/0165-1765(91)90040-R,1991-09-15
4942,Economics Letters,An index of relative deprivation,"['Paul, Satya']","36(3), pages 337-341.",N\A,This paper proposes an index of relative depri...,https://ideas.repec.org/a/eee/ecolet/v36y1991i...,10.1016/0165-1765(91)90043-K,1991-09-15


## 12. EM

In [15]:
# clean em
df_em = pd.read_csv("data/journals/raw_data/em.csv", encoding = "utf-8-sig").astype("str")
df_em["abstract"] = df_em["abstract"].apply(lambda x: x.replace("\n","").replace("\t","").replace("Abstract","").strip())
df_em = df_em.drop_duplicates(subset = ["doi"]).reset_index(drop=True)
df_em["doi"] = df_em["doi"].apply(lambda x: x.strip())
df_em = df_em[df_em["doi"].str[:3] == "10."].drop_duplicates(subset = ["doi"]).reset_index(drop=True)

dates = []
for idx in df_em.index:
    year = df_em.loc[idx,"year"]
    date_parts = df_em.loc[idx,"volume"].split("(")
    try:
        vol = int(date_parts[0])
    except:
        try:
            date_parts = df_em.loc[idx-1,"volume"].split("(")
            vol = int(date_parts[0])
        except:
            try:
                date_parts = df_em.loc[idx-2,"volume"].split("(")
                vol = int(date_parts[0])
            except:
                date_parts = df_em.loc[idx-3,"volume"].split("(")
                vol = int(date_parts[0])

    # 2021-2023
    if int(year) >= 2021:
        if vol % 12 == 9:
            month = "12"
        elif vol % 12 == 8:
            month = "11"
        elif vol % 12 == 7:
            month = "10"
        elif vol % 12 == 6:
            month = "09"
        elif vol % 12 == 5:
            month = "08"
        elif vol % 12 == 4:
            month = "07"
        elif vol % 12 == 3:
            month = "06"
        elif vol % 12 == 2:
            month = "05"
        elif vol % 12 == 1:
            month = "04"
        elif vol % 12 == 0:
            month = "03"
        elif vol % 12 == 11:
            month = "02"
        elif vol % 12 == 10:
            month = "01"
    # 2020
    elif int(year) == 2020:
        if vol % 10 == 3:
            month = "12"
        elif vol % 10 == 2:
            month = "11"
        elif vol % 10 == 1:
            month = "09"
        elif vol % 10 == 0:
            month = "08"
        elif vol % 10 == 9:
            month = "07"
        elif vol % 10 == 8:
            month = "06"
        elif vol % 10 == 7:
            month = "05"
        elif vol % 10 == 6:
            month = "03"
        elif vol % 10 == 5:
            month = "02"
        elif vol % 10 == 4:
            month = "01"
    # 2015-2019
    elif int(year) >= 2015:
        if vol % 8 == 3:
            month = "12"
        elif vol % 8 == 2:
            month = "11"
        elif vol % 8 == 1:
            month = "09"
        elif vol % 8 == 0:
            month = "08"
        elif vol % 8 == 7:
            month = "06"
        elif vol % 8 == 6:
            month = "04"
        elif vol % 8 == 5:
            month = "02"
        elif vol % 8 == 4:
            month = "01"
    # 2014
    elif int(year) == 2014:
        if vol % 9 == 8 or vol % 9 == 7:
            month = "12"
        elif vol % 9 == 6:
            month = "10"
        elif vol % 9 == 5:
            month = "08"
        elif vol % 9 == 4:
            month = "06"
        elif vol % 9 == 3:
            month = "04"
        elif vol % 9 == 2:
            month = "02"
        elif vol % 9 == 1:
            month = "02"
        elif vol % 9 == 0:
            month = "01"
    # 2013
    elif int(year) == 2013:
        if vol % 6 == 5:
            month = "09"
        elif vol % 6 == 4:
            month = "08"
        elif vol % 6 == 3:
            month = "07"
        elif vol % 6 == 2:
            month = "05"
        elif vol % 6 == 1:
            month = "03"
        elif vol % 6 == 0:
            month = "01"
    else:
        issue = date_parts[1][0]
        if issue == "6":
            month = "11"
        elif issue == "5":
            month = "09"
        elif issue == "4":
            month = "07"
        elif issue == "3":
            month = "05"
        elif issue == "2":
            month = "03"
        elif issue == "1":
            month = "01"
        
    dates.append(year+"-"+month+"-"+"15")

df_em["date"] = dates
df_em_s = df_em.drop_duplicates(subset = ["doi"]).reset_index(drop=True)[["journal","title","authors","volume","jel","abstract","url","doi","date"]]
df_em_s.tail()

Unnamed: 0,journal,title,authors,volume,jel,abstract,url,doi,date
3884,Economic Modelling,The effectiveness of the sunshine effect in Ta...,"['Lee, Yuan-Ming', 'Wang, Kuan-Min']","28(1), pages 710-727.",N\A,This study constructs a variety of GARCH model...,https://ideas.repec.org/a/eee/ecmode/v28y2011i...,10.1016/j.econmod.2010.05.008,2011-01-15
3885,Economic Modelling,The optimality of a gulf currency union: Commo...,"['Rafiq, M.S.']","28(1), pages 728-740.",N\A,A high degree of shared national elements that...,https://ideas.repec.org/a/eee/ecmode/v28y2011i...,10.1016/j.econmod.2010.05.007,2011-01-15
3886,Economic Modelling,Long-term macroeconometric models,"['Welfe, Wladyslaw']","28(1), pages 741-753.",N\A,Long-term forecasts and scenario analysis shou...,https://ideas.repec.org/a/eee/ecmode/v28y2011i...,10.1016/j.econmod.2010.05.002,2011-01-15
3887,Economic Modelling,The effects of the monetary policy regime shif...,"['Reschreiter, Andreas']","28(1), pages 754-759.",N\A,This paper studies the effects of the monetary...,https://ideas.repec.org/a/eee/ecmode/v28y2011i...,10.1016/j.econmod.2010.04.009,2011-01-15
3888,Economic Modelling,WITHDRAWN: More powerful non-linear panel unit...,"['Lau, Marco Chi-Keung']","28(1), pages 760-760.",N\A,This article has been withdrawn at the request...,https://ideas.repec.org/a/eee/ecmode/v28y2011i...,10.1016/j.econmod.2010.06.007,2011-01-15


## 13. JDE

In [16]:
# clean jde
df_jde = pd.read_csv("data/journals/raw_data/jde.csv", encoding = "utf-8-sig").astype("str")
df_jde["abstract"] = df_jde["abstract"].apply(lambda x: x.replace("\n","").replace("\t","").replace("Abstract","").strip())
df_jde = df_jde.drop_duplicates(subset = ["doi"]).reset_index(drop=True)
df_jde["doi"] = df_jde["doi"].apply(lambda x: x.strip())
df_jde = df_jde[df_jde["doi"].str[:3] == "10."].drop_duplicates(subset = ["doi"]).reset_index(drop=True)

dates = []
for idx in df_jde.index:
    year = df_jde.loc[idx,"year"]
    date_parts = df_jde.loc[idx,"volume"].split("(")
    try:
        vol = int(date_parts[0])
    except:
        try:
            date_parts = df_jde.loc[idx-1,"volume"].split("(")
            vol = int(date_parts[0])
        except:
            try:
                date_parts = df_jde.loc[idx-2,"volume"].split("(")
                vol = int(date_parts[0])
            except:
                date_parts = df_jde.loc[idx-3,"volume"].split("(")
                vol = int(date_parts[0])
    
    # 2013-2023
    if int(year) >= 2013:
        if vol % 6 == 3:
            month = "11"
        elif vol % 6 == 2:
            month = "09"
        elif vol % 6 == 1:
            month = "06"
        elif vol % 6 == 0:
            month = "05"
        elif vol % 6 == 5:
            month = "03"
        elif vol % 6 == 4:
            month = "01"
    else:
        issue = date_parts[1][0]
        if vol % 3  == 0 and issue == "2":
            month = "11"
        elif vol % 3 == 0 and issue == "1":
            month = "09"
        elif vol % 3 == 2 and issue == "2":
            month = "07"
        elif vol % 3 == 2 and issue == "1":
            month = "05"
        elif vol % 3 == 1 and issue == "2":
            month = "03"
        elif vol % 3 == 1 and issue == "1":
            month = "01"
        
    dates.append(year+"-"+month+"-"+"15")

df_jde["date"] = dates
df_jde_s = df_jde.drop_duplicates(subset = ["doi"]).reset_index(drop=True)[["journal","title","authors","volume","jel","abstract","url","doi","date"]]
df_jde_s.tail()

Unnamed: 0,journal,title,authors,volume,jel,abstract,url,doi,date
1198,Journal of Development Economics,Agricultural terms of trade and distributional...,"['Andrews, Margaret S.']","17(1), pages 117-129.",N\A,The comparative-static effects of a change in ...,https://ideas.repec.org/a/eee/deveco/v17y1985i...,10.1016/0304-3878(85)90025-2,1985-05-15
1199,Journal of Development Economics,"Profit, rent and the terms of trade","['Gibson, Bill', 'McLeod, Darryl']","17(1), pages 131-139.",N\A,Andrews (1985) investigates the theoretical co...,https://ideas.repec.org/a/eee/deveco/v17y1985i...,10.1016/0304-3878(85)90026-4,1985-05-15
1200,Journal of Development Economics,"Profit, rent and the terms of trade","['Andrews, Margaret S.']","17(1), pages 141-149.",N\A,"In response to my article in this issue, Gibso...",https://ideas.repec.org/a/eee/deveco/v17y1985i...,10.1016/0304-3878(85)90027-6,1985-05-15
1201,Journal of Development Economics,More on the employment effects of innovation,"['Hall, P.H.', 'Heffernan, S.A.']","17(1), pages 151-162.",N\A,This paper discusses in the context of a fully...,https://ideas.repec.org/a/eee/deveco/v17y1985i...,10.1016/0304-3878(85)90028-8,1985-05-15
1202,Journal of Development Economics,More on the employment effects of innovation,"['Hagen, Everett E.']","17(1), pages 163-173.",N\A,This article argues the lack of relevance to r...,https://ideas.repec.org/a/eee/deveco/v17y1985i...,10.1016/0304-3878(85)90029-X,1985-05-15


## 14. JEEA

In [17]:
# clean jeea
df_jeea = pd.read_csv("data/journals/raw_data/jeea.csv", encoding = "utf-8-sig").astype("str")
df_jeea["abstract"] = df_jeea["abstract"].apply(lambda x: x.replace("\n","").replace("\t","").replace("Abstract","").strip())
df_jeea = df_jeea.drop_duplicates(subset = ["doi"]).reset_index(drop=True)
df_jeea["doi"] = df_jeea["doi"].apply(lambda x: x.strip().replace("http://hdl.handle.net/",""))
df_jeea = df_jeea[(df_jeea["doi"].str[:3] == "10.") & (df_jeea["doi"].str.contains("issue") == False)].drop_duplicates(subset = ["doi"]).reset_index(drop=True)

dates = []
for idx in df_jeea.index:
    year = df_jeea.loc[idx,"year"]
    date_parts = df_jeea.loc[idx,"volume"].split("(")
    try:
        vol = int(date_parts[0])
    except:
        try:
            date_parts = df_jeea.loc[idx-1,"volume"].split("(")
            vol = int(date_parts[0])
        except:
            try:
                date_parts = df_jeea.loc[idx-2,"volume"].split("(")
                vol = int(date_parts[0])
            except:
                date_parts = df_jeea.loc[idx-3,"volume"].split("(")
                vol = int(date_parts[0])
    
    
    issue = date_parts[1][0]
    if issue == "6":
        month = "12"
    elif issue == "5":
        month = "10"
    elif issue == "4":
        month = "08"
    elif issue == "3":
        month = "06"
    elif issue == "2":
        month = "04"
    elif issue == "1":
        month = "02"
        
    dates.append(year+"-"+month+"-"+"15")

df_jeea["date"] = dates
df_jeea_s = df_jeea.drop_duplicates(subset = ["doi"]).reset_index(drop=True)[["journal","title","authors","volume","jel","abstract","url","doi","date"]]
df_jeea_s.tail()

Unnamed: 0,journal,title,authors,volume,jel,abstract,url,doi,date
502,Journal of the European Economic Association,Do Immigrants Cause Crime?,"['Milo Bianchi', 'Paolo Buonanno', 'Paolo Pino...","10(6), pages 1318-1347, December.",N\A,We examine the empirical relationship between ...,https://ideas.repec.org/a/bla/jeurec/v10y2012i...,10.1111/j.1542-4774.2012.01085.x,2012-12-15
503,Journal of the European Economic Association,Risk Aversion And Expected Utility Theory: An ...,"['Matilde Bombardini', 'Francesco Trebbi']","10(6), pages 1348-1399, December.",N\A,No abstract is available for this item.,https://ideas.repec.org/a/bla/jeurec/v10y2012i...,10.1111/j.1542-4774.2012.01086.x,2012-12-15
504,Journal of the European Economic Association,Can We Infer Hospital Quality From Medical Gra...,"['Matilde P. Machado', 'Ricardo Mora', 'Antoni...","10(6), pages 1400-1424, December.",N\A,"In this paper, we propose an alternative metho...",https://ideas.repec.org/a/bla/jeurec/v10y2012i...,10.1111/j.1542-4774.2012.01087.x,2012-12-15
505,Journal of the European Economic Association,"The Relationship Between Child Health, Develop...","['Martin Salm', 'Daniel Schunk']","10(6), pages 1425-1449, December.",N\A,No abstract is available for this item.,https://ideas.repec.org/a/bla/jeurec/v10y2012i...,10.1111/j.1542-4774.2012.01089.x,2012-12-15
506,Journal of the European Economic Association,Uninsured Countercyclical Risk: An Aggregation...,"['R. Anton Braun', 'Tomoyuki Nakajima']","10(6), pages 1450-1474, December.",N\A,We consider an incomplete markets economy with...,https://ideas.repec.org/a/bla/jeurec/v10y2012i...,10.1111/j.1542-4774.2012.01091.x,2012-12-15


## 15. JEL

In [18]:
# clean jel
df_jel = pd.read_csv("data/journals/raw_data/jel.csv", encoding = "utf-8-sig").astype("str")
df_jel["abstract"] = df_jel["abstract"].apply(lambda x: x.replace("\n","").replace("\t","").replace("Abstract","").strip())
df_jel = df_jel.drop_duplicates(subset = ["doi"]).reset_index(drop=True)
df_jel["doi"] = df_jel["doi"].apply(lambda x: x.strip().replace("http://hdl.handle.net/",""))
df_jel = df_jel[(df_jel["doi"].str[:3] == "10.") & (df_jel["doi"].str.contains("issue") == False)].drop_duplicates(subset = ["doi"]).reset_index(drop=True)
df_jel["journal"] = df_jel["journal"].apply(lambda x: "Journal of Economic Literature")

dates = []
for idx in df_jel.index:
    year = df_jel.loc[idx,"year"]
    if "(" not in df_jel.loc[idx,"volume"]:
        dates.append(year+"-06-15")
    else:
        date_parts = df_jel.loc[idx,"volume"].split("(")
        issue = date_parts[1][0]
        if issue == "4":
            month = "12"
        elif issue == "3":
            month = "09"
        elif issue == "2":
            month = "06"
        elif issue == "1":
            month = "03"
        else:
            month = "12"
            
        dates.append(year+"-"+month+"-"+"01")

df_jel["date"] = dates
df_jel_s = df_jel.drop_duplicates(subset = ["doi"]).reset_index(drop=True)[["journal","title","authors","volume","jel","abstract","url","doi","date"]]
df_jel_s.tail()

Unnamed: 0,journal,title,authors,volume,jel,abstract,url,doi,date
508,Journal of Economic Literature,The Numerical Reliability of Econometric Software,"['H. D. Vinod', 'B. D. McCullough']","37(2), pages 633-665, June.",N\A,Numerous examples show that some econometric s...,https://ideas.repec.org/a/aea/jeclit/v37y1999i...,10.1257/jel.37.2.633,1999-06-01
509,Journal of Economic Literature,The Provision of Incentives in Firms,['Canice Prendergast'],"37(1), pages 7-63, March.",N\A,This paper provides an overview of the existin...,https://ideas.repec.org/a/aea/jeclit/v37y1999i...,10.1257/jel.37.1.7,1999-03-01
510,Journal of Economic Literature,Explaining African Economic Performance,"['Jan Willem Gunning', 'Paul Collier']","37(1), pages 64-111, March.",N\A,Africa has had slow growth and a massive exodu...,https://ideas.repec.org/a/aea/jeclit/v37y1999i...,10.1257/jel.37.1.64,1999-03-01
511,Journal of Economic Literature,The New Growth Evidence,['Jonathan Temple'],"37(1), pages 112-156, March.",N\A,Why do growth rates differ? This paper surveys...,https://ideas.repec.org/a/aea/jeclit/v37y1999i...,10.1257/jel.37.1.112,1999-03-01
512,Journal of Economic Literature,The Young Person's Guide to Writing Economic T...,['William Thomson'],"37(1), pages 157-183, March.",N\A,I formulate recommendations to young authors f...,https://ideas.repec.org/a/aea/jeclit/v37y1999i...,10.1257/jel.37.1.157,1999-03-01


## 16. JIE

In [19]:
# clean jie
df_jie = pd.read_csv("data/journals/raw_data/jie.csv", encoding = "utf-8-sig").astype("str")
df_jie["abstract"] = df_jie["abstract"].apply(lambda x: x.replace("\n","").replace("\t","").replace("Abstract","").strip())
df_jie = df_jie.drop_duplicates(subset = ["doi"]).reset_index(drop=True)
df_jie["doi"] = df_jie["doi"].apply(lambda x: x.strip().replace("http://hdl.handle.net/",""))
df_jie = df_jie[(df_jie["doi"].str[:3] == "10.") & (df_jie["doi"].str.contains("issue") == False)].drop_duplicates(subset = ["doi"]).reset_index(drop=True)

dates = []
for idx in df_jie.index:
    year = df_jie.loc[idx,"year"]
    date_parts = df_jie.loc[idx,"volume"].split("(")
    try:
        vol = int(date_parts[0])
    except:
        try:
            date_parts = df_jie.loc[idx-1,"volume"].split("(")
            vol = int(date_parts[0])
        except:
            try:
                date_parts = df_jie.loc[idx-2,"volume"].split("(")
                vol = int(date_parts[0])
            except:
                date_parts = df_jie.loc[idx-3,"volume"].split("(")
                vol = int(date_parts[0])
    # 2016-2023
    if int(year) >= 2016:
        if vol % 6 == 1:
            month = "11"
        elif vol % 6 == 0:
            month = "09"
        elif vol % 6 == 5:
            month = "07"
        elif vol % 6 == 4:
            month = "05"
        elif vol % 6 == 3:
            month = "03"
        elif vol % 6 == 2:
            month = "01"
    else:
        issue = date_parts[1][0]
        if vol % 3 == 1 and issue == "2":
            month = "11"
        elif vol % 3 == 1 and issue == "1":
            month = "09"
        elif vol % 3 == 0 and issue == "2":
            month = "07"
        elif vol % 3 == 0 and issue == "1":
            month = "05"
        elif vol % 3 == 2 and issue == "2":
            month = "03"
        elif vol % 3 == 2 and issue == "1":
            month = "01"
        
    dates.append(year+"-"+month+"-"+"15")

df_jie["date"] = dates
df_jie_s = df_jie.drop_duplicates(subset = ["doi"]).reset_index(drop=True)[["journal","title","authors","volume","jel","abstract","url","doi","date"]]
df_jie_s.tail()

Unnamed: 0,journal,title,authors,volume,jel,abstract,url,doi,date
1009,Journal of International Economics,"Transportation, freight rates, and economic ge...","['Behrens, Kristian', 'Picard, Pierre M.']","85(2), pages 280-291.",N\A,We investigate the role of competitive transpo...,https://ideas.repec.org/a/eee/inecon/v85y2011i...,10.1016/j.jinteco.2011.06.003,2011-11-15
1010,Journal of International Economics,The impact of trade liberalization on producti...,"['Nataraj, Shanthi']","85(2), pages 292-301.",N\A,Despite a large literature investigating the i...,https://ideas.repec.org/a/eee/inecon/v85y2011i...,10.1016/j.jinteco.2011.07.003,2011-11-15
1011,Journal of International Economics,International real business cycles with endoge...,"['Davis, J. Scott', 'Huang, Kevin X.D.']","85(2), pages 302-316.",N\A,The aggregate impact of decisions made at the ...,https://ideas.repec.org/a/eee/inecon/v85y2011i...,10.1016/j.jinteco.2011.06.004,2011-11-15
1012,Journal of International Economics,"Financial liberalization, structural change, a...","['Meza, Felipe', 'Urrutia, Carlos']","85(2), pages 317-328.",N\A,The last twenty years have witnessed periods o...,https://ideas.repec.org/a/eee/inecon/v85y2011i...,10.1016/j.jinteco.2011.06.001,2011-11-15
1013,Journal of International Economics,"Size, productivity, and international banking","['Buch, Claudia M.', 'Koch, Cathérine T.', 'Ko...","85(2), pages 329-334.",N\A,Heterogeneity in size and productivity is cent...,https://ideas.repec.org/a/eee/inecon/v85y2011i...,10.1016/j.jinteco.2011.07.001,2011-11-15


## 17. JME

In [20]:
# clean jme
df_jme = pd.read_csv("data/journals/raw_data/jme.csv", encoding = "utf-8-sig").astype("str")
df_jme["abstract"] = df_jme["abstract"].apply(lambda x: x.replace("\n","").replace("\t","").replace("Abstract","").strip())
df_jme = df_jme.drop_duplicates(subset = ["doi"]).reset_index(drop=True)
df_jme["doi"] = df_jme["doi"].apply(lambda x: x.strip().replace("http://hdl.handle.net/",""))
df_jme = df_jme[(df_jme["doi"].str[:3] == "10.") & (df_jme["doi"].str.contains("issue") == False)].drop_duplicates(subset = ["doi"]).reset_index(drop=True)

dates = []
for idx in df_jme.index:
    year = df_jme.loc[idx,"year"]
    date_parts = df_jme.loc[idx,"volume"].split("(")
    try:
        vol = int(date_parts[0])
    except:
        try:
            date_parts = df_jme.loc[idx-1,"volume"].split("(")
            vol = int(date_parts[0])
        except:
            try:
                date_parts = df_jme.loc[idx-2,"volume"].split("(")
                vol = int(date_parts[0])
            except:
                date_parts = df_jme.loc[idx-3,"volume"].split("(")
                vol = int(date_parts[0])
    # 2014-2023
    if int(year) >= 2014:
        if vol % 8 == 4:
            month = "11"
        elif vol % 8 == 3:
            month = "10"
        elif vol % 8 == 2:
            month = "09"
        elif vol % 8 == 1:
            month = "07"
        elif vol % 8 == 0:
            month = "05"
        elif vol % 8 == 7:
            month = "04"
        elif vol % 8 == 6:
            month = "03"
        elif vol % 8 == 5:
            month = "01"
    else:
        issue = date_parts[1][0]
        if issue == "8":
            month = "11"
        elif issue == "7":
            month = "10"
        elif issue == "6":
            month = "09"
        elif issue == "5":
            month = "07"
        elif issue == "4":
            month = "05"
        elif issue == "3":
            month = "04"
        elif issue == "2":
            month = "03"
        elif issue == "1":
            month = "01"

    dates.append(year+"-"+month+"-"+"15")

df_jme["date"] = dates
df_jme_s = df_jme.drop_duplicates(subset = ["doi"]).reset_index(drop=True)[["journal","title","authors","volume","jel","abstract","url","doi","date"]]
df_jme_s.tail()

Unnamed: 0,journal,title,authors,volume,jel,abstract,url,doi,date
890,Journal of Monetary Economics,"Incomplete information, higher-order beliefs a...","['Angeletos, George-Marios', 'La’O, Jennifer']","56(S), pages 19-37.",N\A,The question that motivates this paper is how ...,https://ideas.repec.org/a/eee/moneco/v56y2009i...,10.1016/j.jmoneco.2009.07.001,2009-04-15
891,Journal of Monetary Economics,Imperfect information and the business cycle,"['Collard, Fabrice', 'Dellas, Harris', 'Smets,...","56(S), pages 38-56.",N\A,Imperfect information has played a prominent r...,https://ideas.repec.org/a/eee/moneco/v56y2009i...,10.1016/j.jmoneco.2009.06.011,2009-04-15
892,Journal of Monetary Economics,Setting the right prices for the wrong reasons,"['Hellwig, Christian', 'Venkateswaran, Venky']","56(S), pages 57-77.",N\A,Nominal price adjustment is studied in an envi...,https://ideas.repec.org/a/eee/moneco/v56y2009i...,10.1016/j.jmoneco.2009.06.013,2009-04-15
893,Journal of Monetary Economics,Sectoral price data and models of price setting,"['Maćkowiak, Bartosz', 'Moench, Emanuel', 'Wie...","56(S), pages 78-99.",N\A,"In the median sector, 100 percent of the long-...",https://ideas.repec.org/a/eee/moneco/v56y2009i...,10.1016/j.jmoneco.2009.06.012,2009-04-15
894,Journal of Monetary Economics,Information-constrained state-dependent pricing,"['Woodford, Michael']","56(S), pages 100-124.",N\A,A model is presented in which decisions about ...,https://ideas.repec.org/a/eee/moneco/v56y2009i...,10.1016/j.jmoneco.2009.06.014,2009-04-15


## 18. EJ

In [21]:
# clean ej
df_ej = pd.read_csv("data/journals/raw_data/ej.csv", encoding = "utf-8-sig").astype("str")
df_ej["abstract"] = df_ej["abstract"].apply(lambda x: x.replace("\n","").replace("\t","").replace("Abstract","").strip())
df_ej = df_ej.drop_duplicates(subset = ["doi"]).reset_index(drop=True)
df_ej["doi"] = df_ej["doi"].apply(lambda x: x.strip().replace("http://hdl.handle.net/",""))
df_ej = df_ej[(df_ej["doi"].str[:3] == "10.") & (df_ej["doi"].str.contains("issue") == False)].drop_duplicates(subset = ["doi"]).reset_index(drop=True)

months = ["January","February","March","April","May","June",
          "July","August","September","October","November","December"]

dates = []
for idx in df_ej.index:
    year = df_ej.loc[idx,"year"]
    found = False
    for month in months:
        if month in df_ej.loc[idx, "volume"]:
            dates.append(year+"-"+month+"-01")
            found = True
            break
    if not found:
        dates.append(year+"-06-01")

df_ej["date"] = dates
df_ej_s = df_ej.drop_duplicates(subset = ["doi"]).reset_index(drop=True)[["journal","title","authors","volume","jel","abstract","url","doi","date"]]
df_ej_s.tail()

Unnamed: 0,journal,title,authors,volume,jel,abstract,url,doi,date
859,The Economic Journal,The Timing and Persistence of Fiscal Policy Im...,"['Norman Gemmell', 'Richard Kneller', 'Ismael ...","137(1), pages 353-382, November. Pietro F. Per...",N\A,The literatures testing for aggregate short-ru...,https://ideas.repec.org/a/ecj/econjl/v121y2011...,10.1111/j.1468-0297.2010.02414.x,2011-November-01
860,The Economic Journal,Tax Policy for Economic Recovery and Growth,"['Jens Matthias Arnold', 'Bert Brys', 'Christo...","10(2), pages 107-126, March. Devereux, Michael...",N\A,This article identifies tax policy that both s...,https://ideas.repec.org/a/ecj/econjl/v121y2011...,10.1111/j.1468-0297.2010.02415.x,2011-March-01
861,The Economic Journal,Destruction and Distress: Using a Quasi‐Experi...,"['Robert Metcalfe', 'Nattavudh Powdthavee', 'P...","121(550), pages 81-103, February.",N\A,Using a longitudinal household panel dataset i...,https://ideas.repec.org/a/ecj/econjl/v121y2011...,10.1111/j.1468-0297.2010.02416.x,2011-February-01
862,The Economic Journal,Balancing the Banks: Global Lessons from the F...,['David Vines'],"121(550), pages 104-111, February.",N\A,No abstract is available for this item.,https://ideas.repec.org/a/ecj/econjl/v121y2011...,10.1111/j.1468-0297.2010.02417.x,2011-February-01
863,The Economic Journal,Balancing the Banks: Global Lessons from the F...,['David C. Webb'],"121(550), pages 111-118, February.",N\A,No abstract is available for this item.,https://ideas.repec.org/a/ecj/econjl/v121y2011...,10.1111/j.1468-0297.2010.02418.x,2011-February-01


## 19. JPET

In [22]:
# clean jpet
df_jpet = pd.read_csv("data/journals/raw_data/jpet.csv", encoding = "utf-8-sig").astype("str")
df_jpet["abstract"] = df_jpet["abstract"].apply(lambda x: x.replace("\n","").replace("\t","").replace("Abstract","").strip())
df_jpet = df_jpet.drop_duplicates(subset = ["doi"]).reset_index(drop=True)
df_jpet["doi"] = df_jpet["doi"].apply(lambda x: x.strip().replace("http://hdl.handle.net/",""))
df_jpet = df_jpet[(df_jpet["doi"].str[:3] == "10.") & (df_jpet["doi"].str.contains("issue") == False)].drop_duplicates(subset = ["doi"]).reset_index(drop=True)

months = ["January","February","March","April","May","June",
          "July","August","September","October","November","December"]

dates = []
for idx in df_jpet.index:
    year = df_jpet.loc[idx,"year"]
    found = False
    for month in months:
        if month in df_jpet.loc[idx, "volume"]:
            dates.append(year+"-"+month+"-01")
            found = True
            break
    if not found:
        dates.append(year+"-06-01")

df_jpet["date"] = dates
df_jpet_s = df_jpet.drop_duplicates(subset = ["doi"]).reset_index(drop=True)[["journal","title","authors","volume","jel","abstract","url","doi","date"]]
df_jpet_s.tail()

Unnamed: 0,journal,title,authors,volume,jel,abstract,url,doi,date
841,Journal of Public Economic Theory,School Finance Induced Migration and Stratific...,['Thomas J. Nechyba'],"1(1), pages 5-50, January.",N\A,This paper introduces a general equilibrium mo...,https://ideas.repec.org/a/bla/jpbect/v1y1999i1...,10.1111/1097-3923.00002,1999-January-01
842,Journal of Public Economic Theory,"Uncertainty, Commitment, and Optimal Taxation","['Helmuth Cremer', 'Firouz Gahvari']","1(1), pages 51-70, January.",N\A,This paper examines the optimal tax design pro...,https://ideas.repec.org/a/bla/jpbect/v1y1999i1...,10.1111/1097-3923.00003,1999-January-01
843,Journal of Public Economic Theory,Systems of Benevolent Utility Functions,['Theodore C. Bergstrom'],"nt Utility Functions,"" Journal of Public Econo...",N\A,This paper studies systems of utility function...,https://ideas.repec.org/a/bla/jpbect/v1y1999i1...,10.1111/1097-3923.00004,1999-06-01
844,Journal of Public Economic Theory,Inductive Game Theory: Discrimination and Prej...,"['Mamoru Kaneko', 'Akihiko Matsui']","1(1), pages 101-137, January.",N\A,"This paper proposes a new theory, which we cal...",https://ideas.repec.org/a/bla/jpbect/v1y1999i1...,10.1111/1097-3923.00005,1999-January-01
845,Journal of Public Economic Theory,Economies with Public Goods: An Elementary Geo...,['William Thomson'],"1(1), pages 139-176, January.",N\A,This paper explains how to represent economies...,https://ideas.repec.org/a/bla/jpbect/v1y1999i1...,10.1111/1097-3923.00006,1999-January-01


## 20. RESTAT

In [23]:
# clean restat
df_restat = pd.read_csv("data/journals/raw_data/restat.csv", encoding = "utf-8-sig").astype("str")
df_restat["abstract"] = df_restat["abstract"].apply(lambda x: x.replace("\n","").replace("\t","").replace("Abstract","").strip())
df_restat = df_restat.drop_duplicates(subset = ["doi"]).reset_index(drop=True)
df_restat["doi"] = df_restat["doi"].apply(lambda x: x.strip().replace("http://hdl.handle.net/","").replace("http://www.mitpressjournals.org/doi/pdf/",""))
df_restat = df_restat[(df_restat["doi"].str[:3] == "10.") & (df_restat["doi"].str.contains("issue") == False)].drop_duplicates(subset = ["doi"]).reset_index(drop=True)

months = ["January","February","March","April","May","June",
          "July","August","September","October","November","December"]

dates = []
for idx in df_restat.index:
    year = df_restat.loc[idx,"year"]
    found = False
    for month in months:
        if month in df_restat.loc[idx, "volume"]:
            dates.append(year+"-"+month+"-01")
            found = True
            break
    if not found:
        dates.append(year+"-06-01")

df_restat["date"] = dates
df_restat_s = df_restat.drop_duplicates(subset = ["doi"]).reset_index(drop=True)[["journal","title","authors","volume","jel","abstract","url","doi","date"]]
df_restat_s.tail()

Unnamed: 0,journal,title,authors,volume,jel,abstract,url,doi,date
1904,The Review of Economics and Statistics,Market Share Behavior And Mobility: An Analysi...,['John R. Cable'],"79(1), pages 136-141, February.",N\A,A measure of market share mobility is derived ...,https://ideas.repec.org/a/tpr/restat/v79y1997i...,10.1162/003465397556476,1997-February-01
1905,The Review of Economics and Statistics,The Response Of Exchange Rate Pass-Through To ...,['Jaewoo Lee'],"79(1), pages 142-145, February.",N\A,This paper finds that domestic market concentr...,https://ideas.repec.org/a/tpr/restat/v79y1997i...,10.1162/003465397556485,1997-February-01
1906,The Review of Economics and Statistics,The Skill Bias Of Technological Change In Cana...,['Julian R. Betts'],"79(1), pages 146-150, February.",N\A,The paper tests whether technological change h...,https://ideas.repec.org/a/tpr/restat/v79y1997i...,10.1162/003465397556494,1997-February-01
1907,The Review of Economics and Statistics,Another Look At The Impact Of The National Ind...,['Matthew B. Krepps'],"79(1), pages 151-154, February.",N\A,Alexander's (1994) finding that the National R...,https://ideas.repec.org/a/tpr/restat/v79y1997i...,10.1162/003465397556502,1997-February-01
1908,The Review of Economics and Statistics,"Beer Taxes, Workers' Compensation, And Industr...","['Robert L. Ohsfeldt', 'Michael A. Morrisey']","79(1), pages 155-160, February.",N\A,"The apparent effects of beer taxes, workers' c...",https://ideas.repec.org/a/tpr/restat/v79y1997i...,10.1162/003465397556511,1997-February-01


In [41]:
153 % 6

3

In [None]:
for item in df_eer.volume:
    print(item)

In [24]:
# concatenate journals
df = pd.concat([df_rfs_s,df_res_s,df_qje_s,df_jpe_s,df_e_s,
                df_aer_s,df_jf_s,df_jep_s,df_aejm_s,df_eer_s,
                df_el_s,df_em_s,df_jde_s,df_jeea_s,df_jel_s,
                df_jie_s,df_jme_s,df_ej_s,df_jpet_s,df_restat_s]).reset_index(drop=True)
len(df)

43405

In [25]:
df.nunique()

journal        20
title       42773
authors     36259
volume      33692
jel          5742
abstract    38854
url         43405
doi         43405
date         1401
dtype: int64

In [26]:
df_selected = df[["doi","journal","volume","date","title","abstract","url"]]
df_selected.tail()

Unnamed: 0,doi,journal,volume,date,title,abstract,url
43400,10.1162/003465397556476,The Review of Economics and Statistics,"79(1), pages 136-141, February.",1997-February-01,Market Share Behavior And Mobility: An Analysi...,A measure of market share mobility is derived ...,https://ideas.repec.org/a/tpr/restat/v79y1997i...
43401,10.1162/003465397556485,The Review of Economics and Statistics,"79(1), pages 142-145, February.",1997-February-01,The Response Of Exchange Rate Pass-Through To ...,This paper finds that domestic market concentr...,https://ideas.repec.org/a/tpr/restat/v79y1997i...
43402,10.1162/003465397556494,The Review of Economics and Statistics,"79(1), pages 146-150, February.",1997-February-01,The Skill Bias Of Technological Change In Cana...,The paper tests whether technological change h...,https://ideas.repec.org/a/tpr/restat/v79y1997i...
43403,10.1162/003465397556502,The Review of Economics and Statistics,"79(1), pages 151-154, February.",1997-February-01,Another Look At The Impact Of The National Ind...,Alexander's (1994) finding that the National R...,https://ideas.repec.org/a/tpr/restat/v79y1997i...
43404,10.1162/003465397556511,The Review of Economics and Statistics,"79(1), pages 155-160, February.",1997-February-01,"Beer Taxes, Workers' Compensation, And Industr...","The apparent effects of beer taxes, workers' c...",https://ideas.repec.org/a/tpr/restat/v79y1997i...


In [27]:
# save article
df_selected.to_csv("article.csv", index = False, encoding = "utf-8-sig")
df_selected.to_csv("article_no_header.csv", index = False, header = False, encoding = "utf-8-sig")

## Clean and update author.csv, email_affiliation.csv, affiliation.csv

In [28]:
# clean author
df_email = pd.read_csv("email_affiliation.csv", encoding = "utf-8-sig").replace(np.NaN,"")
emaildict = df_email.to_dict("list")
df_au = pd.read_csv("author.csv", encoding = "utf-8-sig")

In [376]:
# open chrome service
chromedriver_path = "D:/chromedriver.exe"
s = Service(chromedriver_path)
driver = webdriver.Chrome(service=s)

In [377]:
base_url = "https://scholar.google.com/scholar?hl=zh-CN&q="
start_idx = 1
for i, e in enumerate(sorted(df_au["emailsuffix"].unique())[start_idx:]):
    if e not in emaildict["subemail"]:
        driver.get(base_url+e)
        driver.refresh()
        soup = bs4.BeautifulSoup(driver.page_source, "html.parser")
        try:
            h3 = soup.find("div", class_ = "gs_r").h3
            if h3.text == "以下机构的用户个人学术档案：":
                div = soup.find("div", class_ = "gs_r").div
                name = div.a.text
                email = div.span.text.replace(" - ", "")
                time.sleep(random.randrange(5,10))
            else:
                raise ValueError("x")
        except:
            try:
                h2 = soup.find("div", class_ = "gs_r").h2
                if "显示的是以下查询字词的结果：" in h2.text.strip():
                    driver.get("https://scholar.google.com"+soup.find("div", class_ = "gs_r").div.a.get("href"))
                    driver.refresh()
                    soup = bs4.BeautifulSoup(driver.page_source, "html.parser")
                    div = soup.find("div", class_ = "gs_r").div
                    name = div.a.text
                    email = div.span.text.replace(" - ", "")
                    time.sleep(random.randrange(5,10))
                else:
                    raise ValueError("x")
            except:
                driver.get(base_url+".".join(e.split(".")[1:]))
                driver.refresh()
                soup = bs4.BeautifulSoup(driver.page_source, "html.parser")
                try:
                    h3 = soup.find("div", class_ = "gs_r").h3
                    if h3.text == "以下机构的用户个人学术档案：":
                        div = soup.find("div", class_ = "gs_r").div
                        name = div.a.text
                        email = div.span.text.replace(" - ", "")
                        time.sleep(random.randrange(5,10))
                    else:
                        raise ValueError("x")
                except:
                    try:
                        h2 = soup.find("div", class_ = "gs_r").h2
                        if "显示的是以下查询字词的结果：" in h2.text.strip():
                            driver.get("https://scholar.google.com"+soup.find("div", class_ = "gs_r").div.a.get("href"))
                            driver.refresh()
                            soup = bs4.BeautifulSoup(driver.page_source, "html.parser")
                            div = soup.find("div", class_ = "gs_r").div
                            name = div.a.text
                            email = div.span.text.replace(" - ", "")
                            time.sleep(random.randrange(5,10))
                        else:
                            raise ValueError("x")
                    except:
                        try:
                            if soup.find("h1").text == "请进行人机身份验证":
                                print("Blocked!!!")
                                break
                            else:
                                name = ""
                                email = ""
                            time.sleep(random.randrange(5,10))
                        except:
                            name = ""
                            email = ""
                            time.sleep(random.randrange(5,10))
        # append info
        try:
            affid = "".join([w[0] for w in name.lower().split(" ")])+"."+email
        except:
            affid = ""

        if "PDF" in affid:
            name = ""
            email = ""
            affid = ""

        emaildict["name"].append(name)
        emaildict["email"].append(email)
        emaildict["subemail"].append(e)
        emaildict["affiliationid"].append(affid)
        print(i+start_idx, e, name, email, affid)

In [29]:
len(emaildict["name"])

3459

In [30]:
df_email = pd.DataFrame(emaildict).drop_duplicates(subset=["subemail"],keep="last")
len(df_email)

3459

In [31]:
df_email.nunique()

name             1393
email            1391
subemail         3459
affiliationid    1393
dtype: int64

In [381]:
df_aff = df_email[["affiliationid","name","email"]][df_email["affiliationid"]!=""].drop_duplicates()
len(df_aff)

1392

In [32]:
# add affiliationid to author
for idx in df_au.index:
    try:
        df_au.loc[idx, "affiliationid"] = df_email.set_index("subemail").loc[df_au.loc[idx, "emailsuffix"],"affiliationid"]
    except:
        df_au.loc[idx, "affiliationid"] = ""

In [33]:
df_aff = pd.read_csv("affiliation.csv", encoding = "utf-8-sig")

df_email.to_csv("email_affiliation.csv", index = False, encoding = "utf-8-sig")
df_email.to_csv("email_affiliation_no_header.csv", index = False, header = False, encoding = "utf-8-sig")
df_aff.to_csv("affiliation.csv", index = False, encoding = "utf-8-sig")
df_aff.to_csv("affiliation_no_header.csv", index = False, header = False, encoding = "utf-8-sig")
df_au.to_csv("author.csv", index = False, encoding = "utf-8-sig")
df_au.to_csv("author_no_header.csv", index = False, header = False, encoding = "utf-8-sig")

## Database

In [35]:
# use this line to run the script directly in Jupyter
! sqlite3 econtop.db < create-econtop.sql

In [41]:
conn = sqlite3.connect("econtop.db")
cur = conn.cursor()

### University Level

In [42]:
# select university level of publications
q = """
    SELECT art.journal, art.title, art.abstract, art.date, au.authorname, af.name
    FROM author_article AS aa JOIN author AS au JOIN article AS art JOIN affiliation AS af
    ON aa.authorid = au.authorid AND
    aa.doi = art.doi AND
    au.affiliationid = af.affiliationid
    WHERE art.date >= '1000-01-01' AND
    art.date < '2105-01-01'
    """

# q = " SELECT authorid FROM author"
# generate valid abstracts with dates
df_af_art = pd.read_sql_query(q, conn).drop_duplicates(subset=["abstract"])
df_af_art.head()

Unnamed: 0,journal,title,abstract,date,authorname,name
0,Review of Financial Studies,Dynamic Equilibrium with Liquidity Constraints,This article studies an intertemporal economy ...,2003-05-15,Jerome Detemple,Boston University
1,Review of Financial Studies,Stochastic Discount Factor Bounds with Conditi...,Hansen and Jagannathan (1991) (hereafter HJ) d...,2003-05-15,Wayne Ferson,University of Southern California
2,Review of Financial Studies,Delta-Hedged Gains and the Negative Market Vol...,We investigate whether the volatility risk pre...,2003-05-15,Nikunj Kapadia,"University of Massachusetts, Amherst"
4,Review of Financial Studies,"Differences of Opinion, Short-Sales Constraint...",We develop a theory of market crashes based on...,2003-05-15,Jeremy C. Stein,Harvard University
6,Review of Financial Studies,Risk Adjustment and Trading Strategies,We assess the profitability of momentum strate...,2003-05-15,Dong-Hyun Ahn (안동현),Seoul National University


In [38]:
df_af_art.nunique()

journal          20
title         27638
abstract      27647
date           1025
authorname    11830
name           1267
dtype: int64

In [43]:
df_af_art.to_csv("abstract.csv", index = False, encoding = "utf-8-sig")

In [44]:
# show top appearance
freq = {}
for doi in df_af_art.index:
    try:
        name = df_af_art.loc[doi][1]
        try:
            freq[name] += 1
        except:
            freq[name] = 1
    except:
        df_af_art_new = df_af_art.loc[doi]
        names = set(list(df_af_art_new.name))
        for name in names:
            try:
                freq[name] += 1
            except:
                freq[name] = 1

In [45]:
freq_sorted = sorted(freq.items(), key=lambda x:x[1], reverse = True)
freq_sorted

[('A Comment on: “State Capacity, Reciprocity, and the Social Contract” by Timothy Besley',
  3),
 ('Panel Discussion', 3),
 ('Returns to Capital in Microenterprises: Evidence from a Field Experiment',
  2),
 ('Verifying the Solution from a Nonlinear Solver: A Case Study: Comment', 2),
 ('Household Finance', 2),
 ('Entrepreneurship education and teacher training in Rwanda', 2),
 ('A Review of the Stern Review on the Economics of Climate Change', 2),
 ('Dynamic Equilibrium with Liquidity Constraints', 1),
 ('Stochastic Discount Factor Bounds with Conditioning Information', 1),
 ('Delta-Hedged Gains and the Negative Market Volatility Risk Premium', 1),
 ('Differences of Opinion, Short-Sales Constraints, and Market Crashes', 1),
 ('Risk Adjustment and Trading Strategies', 1),
 ('An Analysis of Covariance Risk and Pricing Anomalies', 1),
 ('Order Preferencing and Market Quality on U.S. Equity Exchanges', 1),
 ('Market Making with Costly Monitoring: An Analysis of the SOES Controversy',
  1

In [52]:
# collaboration freq
id_pair = {}
name_pair = {}

for doi in df_af_art.index:
    try:
        df_af_art.loc[doi][0]
    except:
        df_af_art_new = df_af_art.loc[doi].sort_values("affiliationid")
        length = len(df_af_art_new)
        for i in range(length-1):
            for j in range(length-1-i):
                try:
                    id_pair[df_af_art_new["affiliationid"][i]+"+"+df_af_art_new["affiliationid"][i+j+1]] += 1
                    name_pair[df_af_art_new["name"][i]+"+"+df_af_art_new["name"][i+j+1]] += 1
                except:
                    id_pair[df_af_art_new["affiliationid"][i]+"+"+df_af_art_new["affiliationid"][i+j+1]] = 1
                    name_pair[df_af_art_new["name"][i]+"+"+df_af_art_new["name"][i+j+1]] = 1

In [None]:
name_pair_sorted = sorted(name_pair.items(), key=lambda x:x[1], reverse = True)
name_pair_sorted

In [46]:
conn.close()