# Data Cleaning and Loading into SQL

In [2]:
import pandas as pd
import sqlite3
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
import bs4
import time
import random
import numpy as np

## Clean author_article.csv

In [80]:
# clean author_article
# Read the author/article table, drop fully duplicated rows, then write it
# back over the input file and also to a copy without a header row.
df_art_au = pd.read_csv("author_article.csv", encoding="utf-8-sig")
df_art_au = df_art_au.drop_duplicates()
df_art_au.to_csv(
    "author_article.csv",
    encoding="utf-8-sig",
    index=False,
)
df_art_au.to_csv(
    "author_article_no_header.csv",
    encoding="utf-8-sig",
    index=False,
    header=False,
)

## Clean article.csv

In [81]:
# select valid records with valid doi, issue format, and not duplicated
def select_valid(df):
    """Return the rows of *df* that describe a regular issue with a DOI.

    A row is kept when its ``doi`` starts with ``"10."``, its ``volume``
    text contains ``"issue "`` and none of the markers ``"Supplement"``,
    ``"_Part_"``, ``"S1"`` or ``"S2"`` occur in ``volume``.  Rows sharing a
    DOI are collapsed to the first occurrence and the index is renumbered
    from zero.
    """
    volume_text = df["volume"]

    # Positive requirements: DOI prefix and an explicit issue number.
    keep = df["doi"].str.slice(0, 3).eq("10.")
    keep = keep & volume_text.str.contains("issue ").eq(True)

    # Negative requirements: none of the special-issue markers may appear.
    for marker in ("Supplement", "_Part_", "S1", "S2"):
        keep = keep & volume_text.str.contains(marker).eq(False)

    unique_rows = df[keep].drop_duplicates(subset=["doi"])
    return unique_rows.reset_index(drop=True)

# convert volume to date
def conv_volume_to_date(v, freq):
    """Convert a volume string into an approximate ``YYYY-MM-DD`` date.

    Parameters
    ----------
    v : str
        Volume text of the form ``"<year>, vol. <n>, issue <k>, <pages>"``;
        the year is the first comma-separated field and the issue number is
        read from the third.
    freq : int
        Number of issues per year, which selects the issue -> month rule:

        * 12    -> month = issue, day 15
        * 5, 6  -> month = issue * 2, day 01
        * 4     -> month = issue * 3 - 1, day 15
        * 3     -> month = issue * 3, day 01
        * 2     -> month = issue * 6 - 2, day 01
        * 1     -> month 7, day 01

    Returns
    -------
    str
        ``"<year>-<MM>-<DD>"`` with the month zero-padded to two digits.

    Raises
    ------
    ValueError
        If *freq* is not one of the supported frequencies, or the issue
        field is not an integer.
    """
    fields = v.split(", ")
    year = fields[0]
    issue = int(fields[2].replace("issue ", ""))

    if freq == 4:
        month, day = issue * 3 - 1, "15"
    elif freq in (5, 6):
        month, day = issue * 2, "01"
    elif freq == 12:
        month, day = issue, "15"
    elif freq == 3:
        month, day = issue * 3, "01"
    elif freq == 2:
        month, day = issue * 6 - 2, "01"
    elif freq == 1:
        month, day = 7, "01"
    else:
        # Fail with a clear message instead of falling through with no date
        # computed for an unsupported frequency.
        raise ValueError(
            f"unsupported issue frequency {freq!r} for volume {v!r}"
        )

    return f"{year}-{month:02d}-{day}"
    
# calculate issue frequency
def cal_freq(vs):
    """Return the highest issue number observed in each year.

    Parameters
    ----------
    vs : iterable of str
        Volume strings of the form ``"<year>, vol. <n>, issue <k>, ..."``.

    Returns
    -------
    dict[int, int]
        Maps each year found in *vs* to the largest issue number seen for
        that year (never less than 0).  Empty input gives an empty dict.
    """
    max_issue_by_year = {}
    # Single pass: keep a running maximum per year instead of rescanning the
    # whole input once for every distinct year.
    for v in vs:
        fields = v.split(", ")
        year = int(fields[0])
        issue = int(fields[2].replace("issue ", ""))
        max_issue_by_year[year] = max(max_issue_by_year.get(year, 0), issue)

    return max_issue_by_year


### 1. RFS

In [82]:
# clean rfs
# Load the raw RFS scrape with every column as text, strip the "Abstract: "
# label from the abstracts and keep only the valid, de-duplicated records.
df_rfs = pd.read_csv("data/journals/raw_data/rfs.csv", encoding="utf-8-sig").astype("str")
df_rfs["abstract"] = [text.replace("Abstract: ", "") for text in df_rfs["abstract"]]
df_rfs_s = select_valid(df_rfs)

# Per-year issue frequency, then one date per record derived from its volume.
freq = cal_freq(df_rfs_s["volume"])
dates = [
    conv_volume_to_date(volume, freq[int(volume.split(", ")[0])])
    for volume in df_rfs_s["volume"]
]

df_rfs_s["date"] = dates
df_rfs_s.head()

Unnamed: 0,journal,title,authors,volume,jel,abstract,url,doi,date
0,Review of Financial Studies,Dynamic Equilibrium with Liquidity Constraints,"['Jerome Detemple', 'Angel Serrat']","2003, vol. 16, issue 2, 597-629",[],This article studies an intertemporal economy ...,https://econpapers.repec.org/article/ouprfinst...,10.1093/rfs/hhg003,2003-05-15
1,Review of Financial Studies,Stochastic Discount Factor Bounds with Conditi...,"['Wayne Ferson', 'Andrew F. Siegel']","2003, vol. 16, issue 2, 567-595",[],Hansen and Jagannathan (1991) (hereafter HJ) d...,https://econpapers.repec.org/article/ouprfinst...,10.1093/rfs/hhg004,2003-05-15
2,Review of Financial Studies,Delta-Hedged Gains and the Negative Market Vol...,"['Gurdip Bakshi', 'Nikunj Kapadia']","2003, vol. 16, issue 2, 527-566",[],We investigate whether the volatility risk pre...,https://econpapers.repec.org/article/ouprfinst...,10.1093/rfs/hhg002,2003-05-15
3,Review of Financial Studies,"Differences of Opinion, Short-Sales Constraint...","['Harrison Hong', 'Jeremy Stein']","2003, vol. 16, issue 2, 487-525",[],We develop a theory of market crashes based on...,https://econpapers.repec.org/article/ouprfinst...,10.1093/rfs/hhg006,2003-05-15
4,Review of Financial Studies,Risk Adjustment and Trading Strategies,"['Dong-Hyun Ahn', 'Jennifer Conrad', 'Robert D...","2003, vol. 16, issue 2, 459-485",[],We assess the profitability of momentum strate...,https://econpapers.repec.org/article/ouprfinst...,10.1093/rfs/hhg001,2003-05-15


### 2. RES

In [83]:
# clean res
# Load the raw RES scrape with every column as text, strip the "Abstract: "
# label from the abstracts and keep only the valid, de-duplicated records.
df_res = pd.read_csv("data/journals/raw_data/res.csv", encoding="utf-8-sig").astype("str")
df_res["abstract"] = [text.replace("Abstract: ", "") for text in df_res["abstract"]]
df_res_s = select_valid(df_res)

# Per-year issue frequency, then one date per record derived from its volume.
freq = cal_freq(df_res_s["volume"])
dates = [
    conv_volume_to_date(volume, freq[int(volume.split(", ")[0])])
    for volume in df_res_s["volume"]
]

df_res_s["date"] = dates
df_res_s.head()

Unnamed: 0,journal,title,authors,volume,jel,abstract,url,doi,date
0,Review of Economic Studies,"The Dynamics of Return Migration, Human Capita...","['Jerome Adda', 'Christian Dustmann', 'Joseph-...","2022, vol. 89, issue 6, 2841-2871",[],This article develops and estimates a dynamic ...,https://econpapers.repec.org/article/ouprestud...,10.1093/restud/rdac003,2022-12-01
1,Review of Economic Studies,The Impact of Car Pollution on Infant and Chil...,"['Diane Alexander', 'Hannes Schwandt']","2022, vol. 89, issue 6, 2872-2910",[],"In 2008, Volkswagen introduced a new generatio...",https://econpapers.repec.org/article/ouprestud...,10.1093/restud/rdac007,2022-12-01
2,Review of Economic Studies,The Welfare Effects of Transportation Infrastr...,"['Treb Allen', 'Costas Arkolakis']","2022, vol. 89, issue 6, 2911-2957",[],"Each year in the US, hundreds of billions of d...",https://econpapers.repec.org/article/ouprestud...,10.1093/restud/rdac001,2022-12-01
3,Review of Economic Studies,Subjective Models of the Macroeconomy: Evidenc...,"['Peter Andrebriq', 'Carlo Pizzinelli', 'Chris...","2022, vol. 89, issue 6, 2958-2991",[],We study people’s subjective models of the mac...,https://econpapers.repec.org/article/ouprestud...,10.1093/restud/rdac008,2022-12-01
4,Review of Economic Studies,"Product Life Cycle, Learning, and Nominal Shocks","['David Argente', 'Chen Yeh']","2022, vol. 89, issue 6, 2992-3054",[],This article documents a new set of stylized f...,https://econpapers.repec.org/article/ouprestud...,10.1093/restud/rdac004,2022-12-01


### 3. QJE

In [84]:
# clean qje
# Load the raw QJE scrape with every column as text, strip the "Abstract: "
# label from the abstracts and keep only the valid, de-duplicated records.
df_qje = pd.read_csv("data/journals/raw_data/qje.csv", encoding="utf-8-sig").astype("str")
df_qje["abstract"] = [text.replace("Abstract: ", "") for text in df_qje["abstract"]]
df_qje_s = select_valid(df_qje)

# Per-year issue frequency, then one date per record derived from its volume.
freq = cal_freq(df_qje_s["volume"])
dates = [
    conv_volume_to_date(volume, freq[int(volume.split(", ")[0])])
    for volume in df_qje_s["volume"]
]

df_qje_s["date"] = dates
df_qje_s.head()

Unnamed: 0,journal,title,authors,volume,jel,abstract,url,doi,date
0,The Quarterly Journal of Economics,Systemic Discrimination Among Large U.S. Emplo...,"['Patrick Kline', 'Evan K Rose', 'Christopher ...","2022, vol. 137, issue 4, 1963-2036",[],We study the results of a massive nationwide c...,https://econpapers.repec.org/article/oupqjecon...,10.1093/qje/qjac024,2022-11-15
1,The Quarterly Journal of Economics,Valuing the Global Mortality Consequences of C...,"['Tamma Carleton', 'Amir Jina', 'Michael Delga...","2022, vol. 137, issue 4, 2037-2105",[],"Using 40 countries’ subnational data, we estim...",https://econpapers.repec.org/article/oupqjecon...,10.1093/qje/qjac020,2022-11-15
2,The Quarterly Journal of Economics,Reshaping Global Trade: The Immediate and Long...,['Chenzi Xu'],"2022, vol. 137, issue 4, 2107-2161",[],I show that a disruption to the financial sect...,https://econpapers.repec.org/article/oupqjecon...,10.1093/qje/qjac016,2022-11-15
3,The Quarterly Journal of Economics,Sexual Harassment and Gender Inequality in the...,"['Olle Folke', 'Johanna Rickne']","2022, vol. 137, issue 4, 2163-2212",[],We describe how sexual harassment contributes ...,https://econpapers.repec.org/article/oupqjecon...,10.1093/qje/qjac018,2022-11-15
4,The Quarterly Journal of Economics,The Quality and Efficiency of Public and Priva...,"['Daniel Knutsson', 'Björn Tyrefors']","2022, vol. 137, issue 4, 2213-2262",[],Economic theory predicts that outsourcing publ...,https://econpapers.repec.org/article/oupqjecon...,10.1093/qje/qjac014,2022-11-15


### 4. JPE

In [85]:
# clean jpe
# Load the raw JPE scrape as text, strip the "Abstract: " label and the
# resolver prefix from the DOI, then keep the valid, de-duplicated records.
df_jpe = pd.read_csv("data/journals/raw_data/jpe.csv", encoding="utf-8-sig").astype("str")
df_jpe["abstract"] = [text.replace("Abstract: ", "") for text in df_jpe["abstract"]]
df_jpe["doi"] = [doi.replace("http://dx.doi.org/", "") for doi in df_jpe["doi"]]
df_jpe_s = select_valid(df_jpe)

# First pass: date every record using the issue frequency observed for its year.
freq = cal_freq(df_jpe_s["volume"])
dates = [
    conv_volume_to_date(volume, freq[int(volume.split(", ")[0])])
    for volume in df_jpe_s["volume"]
]
df_jpe_s["date"] = dates

# Second pass: 2023 records are re-dated with a fixed frequency of 12
# rather than the frequency observed in the data for that year.
for idx, volume in zip(df_jpe_s.index, df_jpe_s["volume"].tolist()):
    if int(volume.split(", ")[0]) == 2023:
        df_jpe_s.loc[idx, "date"] = conv_volume_to_date(volume, 12)

df_jpe_s.head()

Unnamed: 0,journal,title,authors,volume,jel,abstract,url,doi,date
0,Journal of Political Economy,Collective Brand Reputation,"['Volker Nocke', 'Roland Strausz']","2023, vol. 131, issue 1, 1 - 58",[],We develop a theory of collective brand reputa...,https://econpapers.repec.org/article/ucpjpolec...,10.1086/720986,2023-01-15
1,Journal of Political Economy,The Mechanics of the Industrial Revolution,"['Morgan Kelly', 'Joel Mokyr', 'Cormac Ó Gráda']","2023, vol. 131, issue 1, 59 - 94",[],Although there are many competing explanations...,https://econpapers.repec.org/article/ucpjpolec...,10.1086/720890,2023-01-15
2,Journal of Political Economy,Optimal Cooperative Taxation in the Global Eco...,"['V. V. Chari', 'Juan Pablo Nicolini', 'Pedro ...","2023, vol. 131, issue 1, 95 - 130",[],How should countries cooperate in setting fisc...,https://econpapers.repec.org/article/ucpjpolec...,10.1086/720889,2023-01-15
3,Journal of Political Economy,Personalized Pricing and Consumer Welfare,"['Jean-Pierre Dubé', 'Sanjog Misra']","2023, vol. 131, issue 1, 131 - 189",[],We study the welfare implications of personali...,https://econpapers.repec.org/article/ucpjpolec...,10.1086/720793,2023-01-15
4,Journal of Political Economy,A Semistructural Methodology for Policy Counte...,['Martin Beraja'],"2023, vol. 131, issue 1, 190 - 201",[],I propose a methodology for constructing count...,https://econpapers.repec.org/article/ucpjpolec...,10.1086/720982,2023-01-15


### 5. E

In [86]:
# clean e
# Load the raw Econometrica scrape as text, strip the "Abstract: " label and
# the resolver prefix from the DOI, then keep the valid, de-duplicated records.
df_e = pd.read_csv("data/journals/raw_data/e.csv", encoding="utf-8-sig").astype("str")
df_e["abstract"] = [text.replace("Abstract: ", "") for text in df_e["abstract"]]
df_e["doi"] = [doi.replace("https://doi.org/", "") for doi in df_e["doi"]]
df_e_s = select_valid(df_e)

# First pass: date every record using the issue frequency observed for its year.
freq = cal_freq(df_e_s["volume"])
dates = [
    conv_volume_to_date(volume, freq[int(volume.split(", ")[0])])
    for volume in df_e_s["volume"]
]
df_e_s["date"] = dates

# Second pass: 2023 records are re-dated with a fixed frequency of 6
# rather than the frequency observed in the data for that year.
for idx, volume in zip(df_e_s.index, df_e_s["volume"].tolist()):
    if int(volume.split(", ")[0]) == 2023:
        df_e_s.loc[idx, "date"] = conv_volume_to_date(volume, 6)

df_e_s.head()

Unnamed: 0,journal,title,authors,volume,jel,abstract,url,doi,date
0,Econometrica,Invidious Comparisons: Ranking and Selection a...,"['Jiaying Gu', 'Roger Koenker']","2023, vol. 91, issue 1, 1-41",[],"There is an innate human tendency, one might c...",https://econpapers.repec.org/article/wlyemetrp...,10.3982/ECTA19304,2023-02-01
1,Econometrica,A Comment on: “Invidious Comparisons: Ranking ...,['Keisuke Hirano'],"2023, vol. 91, issue 1, 43-46",[],,https://econpapers.repec.org/article/wlyemetrp...,10.3982/ECTA20364,2023-02-01
2,Econometrica,A Comment on: “Invidious Comparisons: Ranking ...,['Patrick Kline'],"2023, vol. 91, issue 1, 47-52",[],,https://econpapers.repec.org/article/wlyemetrp...,10.3982/ECTA20322,2023-02-01
3,Econometrica,A Comment on: “Invidious Comparisons: Ranking ...,"['Magne Mogstad', 'Joseph P. Romano', 'Azeem M...","2023, vol. 91, issue 1, 53-60",[],,https://econpapers.repec.org/article/wlyemetrp...,10.3982/ECTA20460,2023-02-01
4,Econometrica,Reply to: Comments on “Invidious Comparisons: ...,"['Jiaying Gu', 'Roger Koenker']","2023, vol. 91, issue 1, 61-66",[],"There is an innate human tendency, one might c...",https://econpapers.repec.org/article/wlyemetrp...,10.3982/ECTA20537,2023-02-01


### 6. AER

In [87]:
# clean aer
# Load the raw AER scrape as text and normalise the scraped fields: fixed
# journal name, abstracts/volumes without newlines and tabs, trimmed DOIs.
df_aer = pd.read_csv("data/journals/raw_data/aer.csv", encoding="utf-8-sig").astype("str")
df_aer["journal"] = ["American Economic Review" for _ in df_aer["journal"]]
df_aer["abstract"] = [
    text.replace("\n", "").replace("\t", "").replace("Abstract", "").strip()
    for text in df_aer["abstract"]
]
df_aer["volume"] = [
    text.replace("\n", "").replace("\t", "").strip() for text in df_aer["volume"]
]
df_aer["doi"] = [doi.strip() for doi in df_aer["doi"]]

# Keep one row per DOI, and only DOIs that start with "10.".
has_doi = df_aer["doi"].str.slice(0, 3).eq("10.")
df_aer = df_aer[has_doi].drop_duplicates(subset=["doi"]).reset_index(drop=True)

# English month name -> two-digit month number.
aer_month_numbers = {
    "January": "01", "February": "02", "March": "03", "April": "04",
    "May": "05", "June": "06", "July": "07", "August": "08",
    "September": "09", "October": "10", "November": "11", "December": "12",
}

dates = []
for volume in df_aer["volume"]:
    # The third comma-separated field holds "<Month> <year>".
    date_text = volume.split(",")[2].strip()
    tokens = date_text.split(" ")
    year = tokens[1]
    month_text = tokens[0]
    # NOTE: an unrecognised month name leaves `month` at the previous row's value.
    if month_text in aer_month_numbers:
        month = aer_month_numbers[month_text]

    dates.append(f"{year}-{month}-15")

df_aer["date"] = dates
output_columns = ["journal", "title", "authors", "volume", "jel", "abstract", "url", "doi", "date"]
df_aer_s = df_aer.drop_duplicates(subset=["doi"]).reset_index(drop=True)[output_columns]
df_aer_s.tail()

Unnamed: 0,journal,title,authors,volume,jel,abstract,url,doi,date
4799,American Economic Review,Enabling or Limiting Cognitive Flexibility? Ev...,"['\n Silvia Saccardo ', '\n ...","American Economic Review \r\r\rvol. 113,\rno. ...","[('C91', 'Design of Experiments: Laboratory, I...",Moral behavior is more prevalent when individu...,https://www.aeaweb.org/articles?id=10.1257/aer...,10.1257/aer.20201333,2023-02-15
4800,American Economic Review,"Droughts, Deluges, and (River) Diversions: Val...",['\n Will Rafey '],"American Economic Review \r\r\rvol. 113,\rno. ...","[('D23', 'Organizational Behavior; Transaction...",This paper develops and applies a method to va...,https://www.aeaweb.org/articles?id=10.1257/aer...,10.1257/aer.20201434,2023-02-15
4801,American Economic Review,"Technology Gaps, Trade, and Income",['\n Thomas Sampson '],"American Economic Review \r\r\rvol. 113,\rno. ...","[('D21', 'Firm Behavior: Theory'), ('D24', 'Pr...",This paper quantifies the contribution of tech...,https://www.aeaweb.org/articles?id=10.1257/aer...,10.1257/aer.20201940,2023-02-15
4802,American Economic Review,Electronic Food Vouchers: Evidence from an At-...,"['\n Abhijit Banerjee ', '\n ...","American Economic Review \r\r\rvol. 113,\rno. ...","[('H53', 'National Government Expenditures and...",We compare how in-kind food assistance and an ...,https://www.aeaweb.org/articles?id=10.1257/aer...,10.1257/aer.20210461,2023-02-15
4803,American Economic Review,The Voice of Monetary Policy,"['\n Yuriy Gorodnichenko ', '\n ...","American Economic Review \r\r\rvol. 113,\rno. ...","[('D83', 'Search; Learning; Information and Kn...",We develop a deep learning model to detect emo...,https://www.aeaweb.org/articles?id=10.1257/aer...,10.1257/aer.20220129,2023-02-15


### 7. JF

In [88]:
# clean jf
# Load the raw JF scrape as text, strip the "Abstract: " label and the
# resolver prefix from the DOI, then keep the valid, de-duplicated records.
df_jf = pd.read_csv("data/journals/raw_data/jf.csv", encoding="utf-8-sig").astype("str")
df_jf["abstract"] = [text.replace("Abstract: ", "") for text in df_jf["abstract"]]
df_jf["doi"] = [doi.replace("https://doi.org/", "") for doi in df_jf["doi"]]
df_jf_s = select_valid(df_jf)

# First pass: date every record using the issue frequency observed for its year.
freq = cal_freq(df_jf_s["volume"])
dates = [
    conv_volume_to_date(volume, freq[int(volume.split(", ")[0])])
    for volume in df_jf_s["volume"]
]
df_jf_s["date"] = dates

# Second pass: 2023 records are re-dated with a fixed frequency of 6
# rather than the frequency observed in the data for that year.
for idx, volume in zip(df_jf_s.index, df_jf_s["volume"].tolist()):
    if int(volume.split(", ")[0]) == 2023:
        df_jf_s.loc[idx, "date"] = conv_volume_to_date(volume, 6)

df_jf_s.head()

Unnamed: 0,journal,title,authors,volume,jel,abstract,url,doi,date
0,Journal of Finance,Optimal Financial Transaction Taxes,['Eduardo Dávila'],"2023, vol. 78, issue 1, 5-61",[],This paper characterizes the optimal transacti...,https://econpapers.repec.org/article/blajfinan...,10.1111/jofi.13188,2023-02-01
1,Journal of Finance,"Less Mainstream Credit, More Payday Borrowing?...",['Julia Fonseca'],"2023, vol. 78, issue 1, 63-103",[],Governments regulate debt collectors to protec...,https://econpapers.repec.org/article/blajfinan...,10.1111/jofi.13189,2023-02-01
2,Journal of Finance,Disruption and Credit Markets,"['Bo Becker', 'Victoria Ivashina']","2023, vol. 78, issue 1, 105-139",[],"We show that over the past half‐century, innov...",https://econpapers.repec.org/article/blajfinan...,10.1111/jofi.13187,2023-02-01
3,Journal of Finance,How Risky Are U.S. Corporate Assets?,"['Tetiana Davydiuk', 'Scott Richard', 'Ivan Sh...","2023, vol. 78, issue 1, 141-208",[],We use market data on corporate bonds and equi...,https://econpapers.repec.org/article/blajfinan...,10.1111/jofi.13196,2023-02-01
4,Journal of Finance,International Yield Curves and Currency Puzzles,"['Mikhail Chernov', 'Drew Creal']","2023, vol. 78, issue 1, 209-245",[],The currency depreciation rate is often comput...,https://econpapers.repec.org/article/blajfinan...,10.1111/jofi.13191,2023-02-01


### 8. JEP

In [89]:
# clean jep
# Load the raw JEP scrape as text and normalise the scraped fields: fixed
# journal name, abstracts/volumes without newlines and tabs, trimmed DOIs.
df_jep = pd.read_csv("data/journals/raw_data/jep.csv", encoding = "utf-8-sig").astype("str")
df_jep["journal"] = df_jep["journal"].apply(lambda x: "Journal of Economic Perspectives")
df_jep["abstract"] = df_jep["abstract"].apply(lambda x: x.replace("\n","").replace("\t","").replace("Abstract","").strip())
df_jep["volume"] = df_jep["volume"].apply(lambda x: x.replace("\n","").replace("\t","").strip())
df_jep["doi"] = df_jep["doi"].apply(lambda x: x.strip())
# Keep one row per DOI, and only DOIs that start with "10.".
df_jep = df_jep[df_jep["doi"].str[:3] == "10."].drop_duplicates(subset = ["doi"]).reset_index(drop=True)

# Season name (lower-cased) -> "MM-DD" assigned to that issue.
# NOTE(review): the sample output shows "Fall 2022 (Vol. 36, No.4 )" dated
# 2022-08-15; confirm that this season -> month mapping is the intended one.
jep_season_month_day = {
    "spring": "02-15",
    "summer": "05-15",
    "fall": "08-15",
    "winter": "11-15",
}

dates = []
for idx in df_jep.index:
    # Volume text starts with "<Season> <year> ...".
    date_text = df_jep.loc[idx,"volume"].lower().split(" ")
    season = date_text[0]
    year = date_text[1]
    # Look the season up directly; every branch, winter included, must key on
    # `season` (a variable not assigned in this cell would never match).
    # NOTE: an unrecognised season leaves `md` at the previous row's value.
    if season in jep_season_month_day:
        md = jep_season_month_day[season]

    dates.append(year+"-"+md)

df_jep["date"] = dates
df_jep_s = df_jep.drop_duplicates(subset = ["doi"]).reset_index(drop=True)[["journal","title","authors","volume","jel","abstract","url","doi","date"]]
df_jep_s.tail()

Unnamed: 0,journal,title,authors,volume,jel,abstract,url,doi,date
1830,Journal of Economic Perspectives,Emerging Market Sovereign Debt in the Aftermat...,['\n Kenneth Rogoff '],"Fall 2022 (Vol. 36, No.4 )","[('E23', 'Macroeconomics: Production'), ('E62'...","For emerging markets, fiscal space is a very r...",https://www.aeaweb.org/articles?id=10.1257/jep...,10.1257/jep.36.4.147,2022-08-15
1831,Journal of Economic Perspectives,Popular Personal Financial Advice versus the P...,['\n James J. Choi '],"Fall 2022 (Vol. 36, No.4 )","[('D15', 'Intertemporal Household Choice; Life...",I survey the advice given by the fifty most po...,https://www.aeaweb.org/articles?id=10.1257/jep...,10.1257/jep.36.4.167,2022-08-15
1832,Journal of Economic Perspectives,A Linear Panel Model with Heterogeneous Coeffi...,"['\n Liyang Sun ', '\n Jess...","Fall 2022 (Vol. 36, No.4 )","[('C23', 'Single Equation Models; Single Varia...",Linear panel models featuring unit and time fi...,https://www.aeaweb.org/articles?id=10.1257/jep...,10.1257/jep.36.4.193,2022-08-15
1833,Journal of Economic Perspectives,"Sadie T. M. Alexander: Black Women and a ""Tast...",['\n Nina Banks '],"Fall 2022 (Vol. 36, No.4 )","[('B31', 'History of Economic Thought: Individ...",The employment history of African American wom...,https://www.aeaweb.org/articles?id=10.1257/jep...,10.1257/jep.36.4.205,2022-08-15
1834,Journal of Economic Perspectives,Recommendations for Further Reading,['\n Timothy Taylor '],"Fall 2022 (Vol. 36, No.4 )","[('Y50', 'Further Reading (unclassified)')]",N\A,https://www.aeaweb.org/articles?id=10.1257/jep...,10.1257/jep.36.4.221,2022-08-15


### 9. AEJM

In [90]:
# clean aejm
# Load the raw AEJ: Macro scrape as text and keep only rows whose volume
# text mentions "pages"; then tidy abstracts and DOIs.
df_aejm = pd.read_csv("data/journals/raw_data/aejm.csv", encoding="utf-8-sig").astype("str")
df_aejm = df_aejm[df_aejm["volume"].str.contains("pages")]
df_aejm["abstract"] = [
    text.replace("\n", "").replace("\t", "").replace("Abstract", "").strip()
    for text in df_aejm["abstract"]
]
df_aejm["doi"] = [doi.strip() for doi in df_aejm["doi"]]

# Keep one row per DOI, and only DOIs that start with "10.".
has_doi = df_aejm["doi"].str.slice(0, 3).eq("10.")
df_aejm = df_aejm[has_doi].drop_duplicates(subset=["doi"]).reset_index(drop=True)

# English month name -> two-digit month number.
aejm_month_numbers = {
    "January": "01", "February": "02", "March": "03", "April": "04",
    "May": "05", "June": "06", "July": "07", "August": "08",
    "September": "09", "October": "10", "November": "11", "December": "12",
}

dates = []
for idx in df_aejm.index:
    year = df_aejm.at[idx, "year"]
    date_text = df_aejm.at[idx, "volume"]
    # Third ", "-separated field is the month name followed by a full stop,
    # e.g. "1(1), pages 111-145, January.".
    month_text = date_text.split(", ")[2].replace(".", "")
    # NOTE: an unrecognised month name leaves `month` at the previous row's value.
    if month_text in aejm_month_numbers:
        month = aejm_month_numbers[month_text]

    dates.append(f"{year}-{month}-01")

df_aejm["date"] = dates
output_columns = ["journal", "title", "authors", "volume", "jel", "abstract", "url", "doi", "date"]
df_aejm_s = df_aejm.drop_duplicates(subset=["doi"]).reset_index(drop=True)[output_columns]
df_aejm_s.tail()

Unnamed: 0,journal,title,authors,volume,jel,abstract,url,doi,date
476,American Economic Journal: Macroeconomics,Civic Virtue and Labor Market Institutions,"['Pierre Cahuc', 'Yann Algan']","1(1), pages 111-145, January.",N\A,We argue civic virtue plays a key role in expl...,https://ideas.repec.org/a/aea/aejmac/v1y2009i1...,10.1257/mac.1.1.111,2009-01-01
477,American Economic Journal: Macroeconomics,Rent Preservation and the Persistence of Under...,['Raghuram G. Rajan'],"1(1), pages 178-218, January.",N\A,When citizens in a poor constrained society ar...,https://ideas.repec.org/a/aea/aejmac/v1y2009i1...,10.1257/mac.1.1.178,2009-01-01
478,American Economic Journal: Macroeconomics,Border Effect or Country Effect? Seattle May N...,"['Yuriy Gorodnichenko', 'Linda L. Tesar']","1(1), pages 219-241, January.",N\A,This paper reexamines the evidence on the bord...,https://ideas.repec.org/a/aea/aejmac/v1y2009i1...,10.1257/mac.1.1.219,2009-01-01
479,American Economic Journal: Macroeconomics,New Keynesian Models: Not Yet Useful for Polic...,"['V. V. Chari', 'Patrick J. Kehoe', 'Ellen R. ...","1(1), pages 242-266, January.",N\A,Macroeconomists have largely converged on meth...,https://ideas.repec.org/a/aea/aejmac/v1y2009i1...,10.1257/mac.1.1.242,2009-01-01
480,American Economic Journal: Macroeconomics,Convergence in Macroeconomics: The Labor Wedge,['Robert Shimer'],"1(1), pages 280-297, January.",N\A,I review research on the behavior of the labor...,https://ideas.repec.org/a/aea/aejmac/v1y2009i1...,10.1257/mac.1.1.280,2009-01-01


### 10. EER

In [91]:
# clean eer
# Load the raw EER scrape as text, tidy abstracts and DOIs, and keep one row
# per DOI (only DOIs that start with "10.").
df_eer = pd.read_csv("data/journals/raw_data/eer.csv", encoding="utf-8-sig").astype("str")
df_eer["abstract"] = [
    text.replace("\n", "").replace("\t", "").replace("Abstract", "").strip()
    for text in df_eer["abstract"]
]
df_eer = df_eer.drop_duplicates(subset=["doi"]).reset_index(drop=True)
df_eer["doi"] = [doi.strip() for doi in df_eer["doi"]]
has_doi = df_eer["doi"].str.slice(0, 3).eq("10.")
df_eer = df_eer[has_doi].drop_duplicates(subset=["doi"]).reset_index(drop=True)

months = ["January", "February", "March", "April", "May", "June",
          "July", "August", "September", "October", "November", "December"]
# Month name -> two-digit month number, in calendar order.
eer_month_by_name = {name: f"{number:02d}" for number, name in enumerate(months, start=1)}

# Lookup tables used when the volume text carries no month name.
# 2016 and later: keyed by volume number modulo 10.
eer_month_by_vol_mod_10 = {
    1: "01", 2: "02", 3: "04", 4: "05", 5: "06",
    6: "07", 7: "08", 8: "09", 9: "10", 0: "11",
}
# 2013-2015: keyed by volume number modulo 8.
eer_month_by_vol_mod_8 = {
    1: "01", 2: "02", 3: "04", 4: "05",
    5: "07", 6: "08", 7: "10", 0: "11",
}
# 2005-2012: keyed by the first digit of the issue, as an int.
eer_month_by_issue_2005_2012 = {
    1: "01", 2: "02", 3: "04", 4: "05",
    5: "07", 6: "08", 7: "10", 8: "11",
}
# 2003-2004: keyed by the first digit of the issue, as an int.
eer_month_by_issue_2003_2004 = {
    1: "02", 2: "04", 3: "06", 4: "08", 5: "10", 6: "12",
}
# 2002: keyed by the full issue label (text before the closing parenthesis).
eer_month_by_issue_2002 = {
    "1": "01", "2": "02", "3": "03", "4-5": "05", "6": "06",
    "7": "07", "8": "09", "9": "10", "10": "12",
}
# 2000 and 2001: keyed by the full issue label.
eer_month_by_issue_2000_2001 = {
    "1": "01", "2": "02", "3": "03", "4-6": "05",
    "7": "06", "8": "08", "9": "10", "10": "12",
}
# Before 2000: keyed by the first character of the issue label.
eer_month_by_first_digit = {str(digit): f"0{digit}" for digit in range(1, 10)}

dates = []
for idx in df_eer.index:
    year = df_eer.at[idx, "year"]
    # Volume text looks like "<vol>(<issue>), pages ..., <Month>." where the
    # trailing month is not always present.
    date_parts = df_eer.at[idx, "volume"].split("(")
    vol = int(date_parts[0])
    month_text = date_parts[1].split(", ")[-1].replace(".", "")

    if month_text in eer_month_by_name:
        # An explicit month name wins over every year-based rule.
        month = eer_month_by_name[month_text]
    else:
        # Otherwise pick the table and key that apply to the publication year.
        year_number = int(year)
        issue_text = date_parts[1]
        if year_number >= 2016:
            table, key = eer_month_by_vol_mod_10, vol % 10
        elif year_number >= 2013:
            table, key = eer_month_by_vol_mod_8, vol % 8
        elif year_number >= 2005:
            table, key = eer_month_by_issue_2005_2012, int(issue_text[0])
        elif year_number >= 2003:
            table, key = eer_month_by_issue_2003_2004, int(issue_text[0])
        elif year_number == 2002:
            table, key = eer_month_by_issue_2002, issue_text.split(")")[0]
        elif year_number in (2001, 2000):
            table, key = eer_month_by_issue_2000_2001, issue_text.split(")")[0]
        else:
            table, key = eer_month_by_first_digit, issue_text[0]

        # NOTE: a key missing from the table leaves `month` at the previous
        # row's value.
        if key in table:
            month = table[key]

    dates.append(f"{year}-{month}-15")

df_eer["date"] = dates
output_columns = ["journal", "title", "authors", "volume", "jel", "abstract", "url", "doi", "date"]
df_eer_s = df_eer.drop_duplicates(subset=["doi"]).reset_index(drop=True)[output_columns]
df_eer_s.tail()

Unnamed: 0,journal,title,authors,volume,jel,abstract,url,doi,date
1631,European Economic Review,Intergenerational exchange mobility and econom...,"['Markandya, Anil']","17(3), pages 307-324.",N\A,The concepts of exchange and structural mobili...,https://ideas.repec.org/a/eee/eecrev/v17y1982i...,10.1016/S0014-2921(82)80066-4,1982-03-15
1632,European Economic Review,The estimation of welfare levels of a cardinal...,"['Buyze, Jeannine']","17(3), pages 325-332.",N\A,In order to measure an individual's welfare fu...,https://ideas.repec.org/a/eee/eecrev/v17y1982i...,10.1016/S0014-2921(82)80067-6,1982-03-15
1633,European Economic Review,"Earnings and education in Greece, 1960–1977","['Psacharopoulos, George']","17(3), pages 333-347.",N\A,This paper analyses the structure of earnings ...,https://ideas.repec.org/a/eee/eecrev/v17y1982i...,10.1016/S0014-2921(82)80068-8,1982-03-15
1634,European Economic Review,The testing and estimation of complete demand ...,"['Ray, Ranjan']","17(3), pages 349-369.",N\A,A recent demand system (AIDS) is extended to i...,https://ideas.repec.org/a/eee/eecrev/v17y1982i...,10.1016/S0014-2921(82)80069-X,1982-03-15
1635,European Economic Review,Modelling consumers' expenditure,"['Rossi, Nicola', 'Schiantarelli, Fabio']","17(3), pages 371-391.",N\A,The purpose of this paper is to model the dyna...,https://ideas.repec.org/a/eee/eecrev/v17y1982i...,10.1016/S0014-2921(82)80070-6,1982-03-15


### 11. EL

In [92]:
# clean el
# Load the raw Economics Letters scrape as text, tidy abstracts and DOIs, and
# keep one row per DOI (only DOIs that start with "10.").
df_el = pd.read_csv("data/journals/raw_data/el.csv", encoding="utf-8-sig").astype("str")
df_el["abstract"] = [
    text.replace("\n", "").replace("\t", "").replace("Abstract", "").strip()
    for text in df_el["abstract"]
]
df_el = df_el.drop_duplicates(subset=["doi"]).reset_index(drop=True)
df_el["doi"] = [doi.strip() for doi in df_el["doi"]]
has_doi = df_el["doi"].str.slice(0, 3).eq("10.")
df_el = df_el[has_doi].drop_duplicates(subset=["doi"]).reset_index(drop=True)

dates = []
for idx in df_el.index:
    year = df_el.at[idx, "year"]
    # Volume text looks like "<vol>(<issue>), pages ...".
    date_parts = df_el.at[idx, "volume"].split("(")
    vol = int(date_parts[0])

    if int(year) >= 2015:
        # 2015 onwards: the month follows from the volume number alone;
        # vol % 12 == 6 is January, 7 is February, ... and 5 is December.
        month = f"{(vol + 6) % 12 + 1:02d}"
    else:
        # Earlier years: vol % 4 picks the quarter (2 -> Jan-Mar, 3 -> Apr-Jun,
        # 0 -> Jul-Sep, 1 -> Oct-Dec) and issues 1-3 pick the month within it.
        issue = int(date_parts[1][0])
        # NOTE: any other issue digit leaves `month` at the previous row's value.
        if 1 <= issue <= 3:
            month = f"{((vol - 2) % 4) * 3 + issue:02d}"

    dates.append(f"{year}-{month}-15")

df_el["date"] = dates
output_columns = ["journal", "title", "authors", "volume", "jel", "abstract", "url", "doi", "date"]
df_el_s = df_el.drop_duplicates(subset=["doi"]).reset_index(drop=True)[output_columns]
df_el_s.tail()

Unnamed: 0,journal,title,authors,volume,jel,abstract,url,doi,date
4938,Economics Letters,A robust test for multivariate normality,"['Jönsson, Kristian']","113(2), pages 199-201.",N\A,The size of the Jarque–Bera test for multivari...,https://ideas.repec.org/a/eee/ecolet/v113y2011...,10.1016/j.econlet.2011.06.018,2011-11-15
4939,Economics Letters,A differential measure of the real wage index,"['Baye, Michael R.', 'Black, Dan A.']","36(3), pages 295-298.",N\A,In this paper we derive an approximation of th...,https://ideas.repec.org/a/eee/ecolet/v36y1991i...,10.1016/0165-1765(91)90036-K,1991-09-15
4940,Economics Letters,"Aggregate price indexes, cointegration, and te...","['Johnson, Paul A.']","36(3), pages 305-309.",N\A,In this paper I show that cointegration tests ...,https://ideas.repec.org/a/eee/ecolet/v36y1991i...,10.1016/0165-1765(91)90038-M,1991-09-15
4941,Economics Letters,Valuation effects of Canadian stock split anno...,"['Kryzanowski, Lawrence', 'Zhang, Hao']","36(3), pages 317-322.",N\A,The abnormal returns for two types of announce...,https://ideas.repec.org/a/eee/ecolet/v36y1991i...,10.1016/0165-1765(91)90040-R,1991-09-15
4942,Economics Letters,An index of relative deprivation,"['Paul, Satya']","36(3), pages 337-341.",N\A,This paper proposes an index of relative depri...,https://ideas.repec.org/a/eee/ecolet/v36y1991i...,10.1016/0165-1765(91)90043-K,1991-09-15


## 12. EM

In [93]:
# clean em (Economic Modelling): tidy abstracts/DOIs and derive a publication date
def _clean_abstract(text):
    """Remove newlines, tabs and the literal "Abstract" label, then trim."""
    return text.replace("\n","").replace("\t","").replace("Abstract","").strip()


def _volume_parts(frame, row, lookback=3):
    """Return ``(vol, date_parts)`` for ``row`` of ``frame``.

    ``date_parts`` is the "volume" cell split on "(" and ``vol`` its integer
    first piece. If that piece is not an integer, the preceding rows (up to
    ``lookback`` of them) are tried in turn and the first parsable one is used.

    Raises:
        ValueError / KeyError: if the last candidate row is unparsable or
            does not exist.
    """
    for offset in range(lookback + 1):
        try:
            parts = frame.loc[row - offset, "volume"].split("(")
            return int(parts[0]), parts
        except (ValueError, KeyError):
            if offset == lookback:
                raise


# Month tables keyed by the volume number modulo the number of volumes that year.
_EM_MONTH_2020 = {4: "01", 5: "02", 6: "03", 7: "05", 8: "06", 9: "07",
                  0: "08", 1: "09", 2: "11", 3: "12"}          # vol % 10
_EM_MONTH_2015_2019 = {4: "01", 5: "02", 6: "04", 7: "06",
                       0: "08", 1: "09", 2: "11", 3: "12"}     # vol % 8
_EM_MONTH_2014 = {0: "01", 1: "02", 2: "02", 3: "04", 4: "06",
                  5: "08", 6: "10", 7: "12", 8: "12"}          # vol % 9
_EM_MONTH_2013 = {0: "01", 1: "03", 2: "05", 3: "07", 4: "08", 5: "09"}  # vol % 6
# Before 2013 the month follows from the issue number alone.
_EM_MONTH_BY_ISSUE = {"1": "01", "2": "03", "3": "05", "4": "07", "5": "09", "6": "11"}

df_em = pd.read_csv("data/journals/raw_data/em.csv", encoding = "utf-8-sig").astype("str")
df_em["abstract"] = df_em["abstract"].apply(_clean_abstract)
df_em = df_em.drop_duplicates(subset = ["doi"]).reset_index(drop=True)
df_em["doi"] = df_em["doi"].str.strip()
df_em = df_em[df_em["doi"].str[:3] == "10."].drop_duplicates(subset = ["doi"]).reset_index(drop=True)

dates = []
month = None  # last month successfully derived; reused when an issue is unmapped
for idx in df_em.index:
    year = df_em.loc[idx,"year"]
    vol, date_parts = _volume_parts(df_em, idx)
    pub_year = int(year)

    if pub_year >= 2021:
        # 2021-2023: twelve volumes a year; vol % 12 == 10 is January
        month = "%02d" % ((vol - 10) % 12 + 1)
    elif pub_year == 2020:
        month = _EM_MONTH_2020[vol % 10]
    elif pub_year >= 2015:
        month = _EM_MONTH_2015_2019[vol % 8]
    elif pub_year == 2014:
        month = _EM_MONTH_2014[vol % 9]
    elif pub_year == 2013:
        month = _EM_MONTH_2013[vol % 6]
    else:
        issue = date_parts[1][0]
        # unmapped issue labels keep the previous row's month
        month = _EM_MONTH_BY_ISSUE.get(issue, month)

    if month is None:
        raise ValueError("cannot derive a month for row %s (volume %r)" % (idx, df_em.loc[idx,"volume"]))
    dates.append(year+"-"+month+"-"+"15")

df_em["date"] = dates
df_em_s = df_em.drop_duplicates(subset = ["doi"]).reset_index(drop=True)[["journal","title","authors","volume","jel","abstract","url","doi","date"]]
df_em_s.tail()

Unnamed: 0,journal,title,authors,volume,jel,abstract,url,doi,date
3884,Economic Modelling,The effectiveness of the sunshine effect in Ta...,"['Lee, Yuan-Ming', 'Wang, Kuan-Min']","28(1), pages 710-727.",N\A,This study constructs a variety of GARCH model...,https://ideas.repec.org/a/eee/ecmode/v28y2011i...,10.1016/j.econmod.2010.05.008,2011-01-15
3885,Economic Modelling,The optimality of a gulf currency union: Commo...,"['Rafiq, M.S.']","28(1), pages 728-740.",N\A,A high degree of shared national elements that...,https://ideas.repec.org/a/eee/ecmode/v28y2011i...,10.1016/j.econmod.2010.05.007,2011-01-15
3886,Economic Modelling,Long-term macroeconometric models,"['Welfe, Wladyslaw']","28(1), pages 741-753.",N\A,Long-term forecasts and scenario analysis shou...,https://ideas.repec.org/a/eee/ecmode/v28y2011i...,10.1016/j.econmod.2010.05.002,2011-01-15
3887,Economic Modelling,The effects of the monetary policy regime shif...,"['Reschreiter, Andreas']","28(1), pages 754-759.",N\A,This paper studies the effects of the monetary...,https://ideas.repec.org/a/eee/ecmode/v28y2011i...,10.1016/j.econmod.2010.04.009,2011-01-15
3888,Economic Modelling,WITHDRAWN: More powerful non-linear panel unit...,"['Lau, Marco Chi-Keung']","28(1), pages 760-760.",N\A,This article has been withdrawn at the request...,https://ideas.repec.org/a/eee/ecmode/v28y2011i...,10.1016/j.econmod.2010.06.007,2011-01-15


## 13. JDE

In [94]:
# clean jde (Journal of Development Economics): tidy abstracts/DOIs and derive a date
def _clean_abstract(text):
    """Remove newlines, tabs and the literal "Abstract" label, then trim."""
    return text.replace("\n","").replace("\t","").replace("Abstract","").strip()


def _volume_parts(frame, row, lookback=3):
    """Return ``(vol, date_parts)`` for ``row`` of ``frame``.

    ``date_parts`` is the "volume" cell split on "(" and ``vol`` its integer
    first piece. If that piece is not an integer, the preceding rows (up to
    ``lookback`` of them) are tried in turn and the first parsable one is used.

    Raises:
        ValueError / KeyError: if the last candidate row is unparsable or
            does not exist.
    """
    for offset in range(lookback + 1):
        try:
            parts = frame.loc[row - offset, "volume"].split("(")
            return int(parts[0]), parts
        except (ValueError, KeyError):
            if offset == lookback:
                raise


# 2013 onwards: month keyed by vol % 6.
_JDE_MONTH_BY_VOLUME = {4: "01", 5: "03", 0: "05", 1: "06", 2: "09", 3: "11"}
# Before 2013: month keyed by (vol % 3, issue).
_JDE_MONTH_BY_VOLUME_ISSUE = {
    (1, "1"): "01", (1, "2"): "03",
    (2, "1"): "05", (2, "2"): "07",
    (0, "1"): "09", (0, "2"): "11",
}

df_jde = pd.read_csv("data/journals/raw_data/jde.csv", encoding = "utf-8-sig").astype("str")
df_jde["abstract"] = df_jde["abstract"].apply(_clean_abstract)
df_jde = df_jde.drop_duplicates(subset = ["doi"]).reset_index(drop=True)
df_jde["doi"] = df_jde["doi"].str.strip()
df_jde = df_jde[df_jde["doi"].str[:3] == "10."].drop_duplicates(subset = ["doi"]).reset_index(drop=True)

dates = []
month = None  # last month successfully derived; reused when a pair is unmapped
for idx in df_jde.index:
    year = df_jde.loc[idx,"year"]
    vol, date_parts = _volume_parts(df_jde, idx)

    # 2013-2023
    if int(year) >= 2013:
        month = _JDE_MONTH_BY_VOLUME[vol % 6]
    else:
        issue = date_parts[1][0]
        # unmapped (volume, issue) pairs keep the previous row's month
        month = _JDE_MONTH_BY_VOLUME_ISSUE.get((vol % 3, issue), month)

    if month is None:
        raise ValueError("cannot derive a month for row %s (volume %r)" % (idx, df_jde.loc[idx,"volume"]))
    dates.append(year+"-"+month+"-"+"15")

df_jde["date"] = dates
df_jde_s = df_jde.drop_duplicates(subset = ["doi"]).reset_index(drop=True)[["journal","title","authors","volume","jel","abstract","url","doi","date"]]
df_jde_s.tail()

Unnamed: 0,journal,title,authors,volume,jel,abstract,url,doi,date
1198,Journal of Development Economics,Agricultural terms of trade and distributional...,"['Andrews, Margaret S.']","17(1), pages 117-129.",N\A,The comparative-static effects of a change in ...,https://ideas.repec.org/a/eee/deveco/v17y1985i...,10.1016/0304-3878(85)90025-2,1985-05-15
1199,Journal of Development Economics,"Profit, rent and the terms of trade","['Gibson, Bill', 'McLeod, Darryl']","17(1), pages 131-139.",N\A,Andrews (1985) investigates the theoretical co...,https://ideas.repec.org/a/eee/deveco/v17y1985i...,10.1016/0304-3878(85)90026-4,1985-05-15
1200,Journal of Development Economics,"Profit, rent and the terms of trade","['Andrews, Margaret S.']","17(1), pages 141-149.",N\A,"In response to my article in this issue, Gibso...",https://ideas.repec.org/a/eee/deveco/v17y1985i...,10.1016/0304-3878(85)90027-6,1985-05-15
1201,Journal of Development Economics,More on the employment effects of innovation,"['Hall, P.H.', 'Heffernan, S.A.']","17(1), pages 151-162.",N\A,This paper discusses in the context of a fully...,https://ideas.repec.org/a/eee/deveco/v17y1985i...,10.1016/0304-3878(85)90028-8,1985-05-15
1202,Journal of Development Economics,More on the employment effects of innovation,"['Hagen, Everett E.']","17(1), pages 163-173.",N\A,This article argues the lack of relevance to r...,https://ideas.repec.org/a/eee/deveco/v17y1985i...,10.1016/0304-3878(85)90029-X,1985-05-15


## 14. JEEA

In [95]:
# clean jeea (Journal of the European Economic Association)
def _clean_abstract(text):
    """Remove newlines, tabs and the literal "Abstract" label, then trim."""
    return text.replace("\n","").replace("\t","").replace("Abstract","").strip()


def _volume_parts(frame, row, lookback=3):
    """Return ``(vol, date_parts)`` for ``row`` of ``frame``.

    ``date_parts`` is the "volume" cell split on "(" and ``vol`` its integer
    first piece. If that piece is not an integer, the preceding rows (up to
    ``lookback`` of them) are tried in turn and the first parsable one is used.

    Raises:
        ValueError / KeyError: if the last candidate row is unparsable or
            does not exist.
    """
    for offset in range(lookback + 1):
        try:
            parts = frame.loc[row - offset, "volume"].split("(")
            return int(parts[0]), parts
        except (ValueError, KeyError):
            if offset == lookback:
                raise


# Six issues a year: issue k is dated month 2k.
_JEEA_MONTH_BY_ISSUE = {"1": "02", "2": "04", "3": "06", "4": "08", "5": "10", "6": "12"}

df_jeea = pd.read_csv("data/journals/raw_data/jeea.csv", encoding = "utf-8-sig").astype("str")
df_jeea["abstract"] = df_jeea["abstract"].apply(_clean_abstract)
df_jeea = df_jeea.drop_duplicates(subset = ["doi"]).reset_index(drop=True)
# some DOIs are stored as handle URLs; strip the resolver prefix
df_jeea["doi"] = df_jeea["doi"].apply(lambda x: x.strip().replace("http://hdl.handle.net/",""))
df_jeea = df_jeea[(df_jeea["doi"].str[:3] == "10.") & (df_jeea["doi"].str.contains("issue") == False)].drop_duplicates(subset = ["doi"]).reset_index(drop=True)

dates = []
month = None  # last month successfully derived; reused when an issue is unmapped
for idx in df_jeea.index:
    year = df_jeea.loc[idx,"year"]
    # only the issue is needed, but the volume number decides which row's
    # label is considered parsable
    vol, date_parts = _volume_parts(df_jeea, idx)

    issue = date_parts[1][0]
    # unmapped issue labels keep the previous row's month
    month = _JEEA_MONTH_BY_ISSUE.get(issue, month)

    if month is None:
        raise ValueError("cannot derive a month for row %s (volume %r)" % (idx, df_jeea.loc[idx,"volume"]))
    dates.append(year+"-"+month+"-"+"15")

df_jeea["date"] = dates
df_jeea_s = df_jeea.drop_duplicates(subset = ["doi"]).reset_index(drop=True)[["journal","title","authors","volume","jel","abstract","url","doi","date"]]
df_jeea_s.tail()

Unnamed: 0,journal,title,authors,volume,jel,abstract,url,doi,date
502,Journal of the European Economic Association,Do Immigrants Cause Crime?,"['Milo Bianchi', 'Paolo Buonanno', 'Paolo Pino...","10(6), pages 1318-1347, December.",N\A,We examine the empirical relationship between ...,https://ideas.repec.org/a/bla/jeurec/v10y2012i...,10.1111/j.1542-4774.2012.01085.x,2012-12-15
503,Journal of the European Economic Association,Risk Aversion And Expected Utility Theory: An ...,"['Matilde Bombardini', 'Francesco Trebbi']","10(6), pages 1348-1399, December.",N\A,No abstract is available for this item.,https://ideas.repec.org/a/bla/jeurec/v10y2012i...,10.1111/j.1542-4774.2012.01086.x,2012-12-15
504,Journal of the European Economic Association,Can We Infer Hospital Quality From Medical Gra...,"['Matilde P. Machado', 'Ricardo Mora', 'Antoni...","10(6), pages 1400-1424, December.",N\A,"In this paper, we propose an alternative metho...",https://ideas.repec.org/a/bla/jeurec/v10y2012i...,10.1111/j.1542-4774.2012.01087.x,2012-12-15
505,Journal of the European Economic Association,"The Relationship Between Child Health, Develop...","['Martin Salm', 'Daniel Schunk']","10(6), pages 1425-1449, December.",N\A,No abstract is available for this item.,https://ideas.repec.org/a/bla/jeurec/v10y2012i...,10.1111/j.1542-4774.2012.01089.x,2012-12-15
506,Journal of the European Economic Association,Uninsured Countercyclical Risk: An Aggregation...,"['R. Anton Braun', 'Tomoyuki Nakajima']","10(6), pages 1450-1474, December.",N\A,We consider an incomplete markets economy with...,https://ideas.repec.org/a/bla/jeurec/v10y2012i...,10.1111/j.1542-4774.2012.01091.x,2012-12-15


## 15. JEL

In [None]:
# clean jel (Journal of Economic Literature)
def _clean_abstract(text):
    """Remove newlines, tabs and the literal "Abstract" label, then trim."""
    return text.replace("\n","").replace("\t","").replace("Abstract","").strip()


def _volume_parts(frame, row, lookback=3):
    """Return ``(vol, date_parts)`` for ``row`` of ``frame``.

    ``date_parts`` is the "volume" cell split on "(" and ``vol`` its integer
    first piece. If that piece is not an integer, the preceding rows (up to
    ``lookback`` of them) are tried in turn and the first parsable one is used.

    Raises:
        ValueError / KeyError: if the last candidate row is unparsable or
            does not exist.
    """
    for offset in range(lookback + 1):
        try:
            parts = frame.loc[row - offset, "volume"].split("(")
            return int(parts[0]), parts
        except (ValueError, KeyError):
            if offset == lookback:
                raise


# Quarterly journal: issue k is dated month 3k.
_JEL_MONTH_BY_ISSUE = {"1": "03", "2": "06", "3": "09", "4": "12"}
# Placeholder stored for a missing volume. Written with an escaped backslash:
# "\A" is not a valid escape sequence and newer Pythons warn about it.
_MISSING_VOLUME = "N\\A"

df_jel = pd.read_csv("data/journals/raw_data/jel.csv", encoding = "utf-8-sig").astype("str")
df_jel["abstract"] = df_jel["abstract"].apply(_clean_abstract)
df_jel = df_jel.drop_duplicates(subset = ["doi"]).reset_index(drop=True)
# some DOIs are stored as handle URLs; strip the resolver prefix
df_jel["doi"] = df_jel["doi"].apply(lambda x: x.strip().replace("http://hdl.handle.net/",""))
df_jel = df_jel[(df_jel["doi"].str[:3] == "10.") & (df_jel["doi"].str.contains("issue") == False)].drop_duplicates(subset = ["doi"]).reset_index(drop=True)
# normalise the journal name to a single spelling
df_jel["journal"] = "Journal of Economic Literature"

dates = []
month = None  # last month successfully derived; reused when an issue is unmapped
for idx in df_jel.index:
    year = df_jel.loc[idx,"year"]
    if df_jel.loc[idx,"volume"] == _MISSING_VOLUME:
        # no volume information: date the article mid-year
        dates.append(year+"-06-15")
        continue

    vol, date_parts = _volume_parts(df_jel, idx)
    issue = date_parts[1][0]
    # unmapped issue labels keep the previous row's month
    month = _JEL_MONTH_BY_ISSUE.get(issue, month)

    if month is None:
        raise ValueError("cannot derive a month for row %s (volume %r)" % (idx, df_jel.loc[idx,"volume"]))
    dates.append(year+"-"+month+"-"+"15")

df_jel["date"] = dates
df_jel_s = df_jel.drop_duplicates(subset = ["doi"]).reset_index(drop=True)[["journal","title","authors","volume","jel","abstract","url","doi","date"]]
df_jel_s.tail()

In [41]:
# scratch check of a volume number's residue modulo 6
# NOTE(review): presumably used to calibrate the vol % 6 month table for JDE — confirm
153 % 6

3

In [None]:
# dump every raw volume label of df_eer, one per line, for eyeballing the format
for volume_label in df_eer["volume"]:
    print(volume_label)

In [52]:
# concatenate journals: stack the cleaned per-journal frames into one table
# (df_jel_s is not part of this list)
journal_frames = [
    df_rfs_s, df_res_s, df_qje_s, df_jpe_s, df_e_s,
    df_aer_s, df_jf_s, df_jep_s, df_aejm_s, df_eer_s,
    df_el_s, df_em_s, df_jde_s, df_jeea_s,
]
df = pd.concat(journal_frames, ignore_index=True)
len(df)

37364

In [53]:
# distinct values per column of the combined article table
df.nunique()

journal        14
title       36743
authors     30668
volume      28116
jel          5742
abstract    32924
url         37364
doi         37364
date         1142
dtype: int64

In [54]:
# keep only the columns exported to article.csv, in export order
article_columns = ["doi","journal","volume","date","title","abstract","url"]
df_selected = df[article_columns]
df_selected.tail()

Unnamed: 0,doi,journal,volume,date,title,abstract,url
37359,10.1111/j.1542-4774.2012.01085.x,Journal of the European Economic Association,"10(6), pages 1318-1347, December.",2012-12-15,Do Immigrants Cause Crime?,We examine the empirical relationship between ...,https://ideas.repec.org/a/bla/jeurec/v10y2012i...
37360,10.1111/j.1542-4774.2012.01086.x,Journal of the European Economic Association,"10(6), pages 1348-1399, December.",2012-12-15,Risk Aversion And Expected Utility Theory: An ...,No abstract is available for this item.,https://ideas.repec.org/a/bla/jeurec/v10y2012i...
37361,10.1111/j.1542-4774.2012.01087.x,Journal of the European Economic Association,"10(6), pages 1400-1424, December.",2012-12-15,Can We Infer Hospital Quality From Medical Gra...,"In this paper, we propose an alternative metho...",https://ideas.repec.org/a/bla/jeurec/v10y2012i...
37362,10.1111/j.1542-4774.2012.01089.x,Journal of the European Economic Association,"10(6), pages 1425-1449, December.",2012-12-15,"The Relationship Between Child Health, Develop...",No abstract is available for this item.,https://ideas.repec.org/a/bla/jeurec/v10y2012i...
37363,10.1111/j.1542-4774.2012.01091.x,Journal of the European Economic Association,"10(6), pages 1450-1474, December.",2012-12-15,Uninsured Countercyclical Risk: An Aggregation...,We consider an incomplete markets economy with...,https://ideas.repec.org/a/bla/jeurec/v10y2012i...


In [55]:
# save article: once with a header row and once without
article_csv_options = {"index": False, "encoding": "utf-8-sig"}
df_selected.to_csv("article.csv", **article_csv_options)
df_selected.to_csv("article_no_header.csv", header = False, **article_csv_options)

## Clean and update author.csv, email_affiliation.csv, affiliation.csv

In [57]:
# clean author
# Load the suffix -> affiliation lookup gathered so far; missing cells become ""
# so they compare cleanly against strings later. np.nan is used because the
# np.NaN alias no longer exists in NumPy 2.0.
df_email = pd.read_csv("email_affiliation.csv", encoding = "utf-8-sig").replace(np.nan,"")
# column name -> list of values; the scraping loop appends to these lists
emaildict = df_email.to_dict("list")
df_au = pd.read_csv("author.csv", encoding = "utf-8-sig")

  df_au = pd.read_csv("author.csv", encoding = "utf-8-sig")


In [65]:
# open chrome service: start a Selenium-driven Chrome session
# machine-specific path to the chromedriver executable
chromedriver_path = "D:/chromedriver.exe"
s = Service(chromedriver_path)
# `driver` is the browser handle used by the scraping loop below
driver = webdriver.Chrome(service=s)

In [67]:
# Look up every not-yet-known e-mail suffix on Google Scholar and record the
# institution name / verified e-mail domain shown in the first result box.
base_url = "https://scholar.google.com/scholar?hl=zh-CN&q="
start_idx = 2913

# Scholar is queried with hl=zh-CN, so the page markers are Chinese UI strings.
PROFILE_HEADING = "以下机构的用户个人学术档案："        # "User profiles for the following institution:"
CORRECTED_QUERY_NOTE = "显示的是以下查询字词的结果："    # "Showing results for the following query terms:"
CAPTCHA_HEADING = "请进行人机身份验证"                  # "Please complete the human verification"


def _fetch_soup(url):
    """Load ``url`` in the shared browser, refresh once, and parse the page."""
    driver.get(url)
    driver.refresh()
    return bs4.BeautifulSoup(driver.page_source, "html.parser")


def _read_first_result(soup):
    """Return ``(name, email)`` from the first ``div.gs_r`` box of ``soup``."""
    box = soup.find("div", class_ = "gs_r").div
    return box.a.text, box.span.text.replace(" - ", "")


def _scholar_lookup(query):
    """Search Scholar for ``query``.

    Returns ``(result, soup)`` where ``result`` is ``(name, email)`` or
    ``None`` when nothing usable was found, and ``soup`` is the last page
    that was parsed (needed by the caller for the CAPTCHA check).
    """
    soup = _fetch_soup(base_url + query)

    # Case 1: the first box is an institution profile box.
    try:
        if soup.find("div", class_ = "gs_r").h3.text == PROFILE_HEADING:
            return _read_first_result(soup), soup
    except Exception:
        # best effort: a missing tag simply means "not this layout".
        # Exception (not a bare except) keeps Ctrl-C / kernel interrupt working.
        pass

    # Case 2: Scholar rewrote the query; follow its link and read that page.
    try:
        first_box = soup.find("div", class_ = "gs_r")
        if CORRECTED_QUERY_NOTE in first_box.h2.text.strip():
            soup = _fetch_soup("https://scholar.google.com" + first_box.div.a.get("href"))
            return _read_first_result(soup), soup
    except Exception:
        pass

    return None, soup


# set for O(1) membership tests; kept in sync with emaildict["subemail"]
known_suffixes = set(emaildict["subemail"])

for i, e in enumerate(sorted(df_au["emailsuffix"].unique())[start_idx:]):
    if e in known_suffixes:
        continue

    # try the full suffix first, then the suffix without its first label
    result, soup = _scholar_lookup(e)
    if result is None:
        result, soup = _scholar_lookup(".".join(e.split(".")[1:]))

    if result is None:
        heading = soup.find("h1")
        if heading is not None and heading.text == CAPTCHA_HEADING:
            # Scholar is demanding a CAPTCHA: stop so the run can be resumed
            # later from start_idx
            print("Blocked!!!")
            break
        result = ("", "")

    name, email = result
    # random pause between suffixes to stay under Scholar's rate limits
    time.sleep(random.randrange(5,10))

    # append info: the affiliation id is the name's initials + "." + e-mail domain
    try:
        affid = "".join([w[0] for w in name.lower().split(" ")])+"."+email
    except IndexError:
        # empty name (or consecutive spaces) yields an empty word
        affid = ""

    # a "PDF" marker means the first box was a paper, not an institution
    if "PDF" in affid:
        name = ""
        email = ""
        affid = ""

    emaildict["name"].append(name)
    emaildict["email"].append(email)
    emaildict["subemail"].append(e)
    emaildict["affiliationid"].append(affid)
    known_suffixes.add(e)
    print(i+start_idx, e, name, email, affid)

2944 unizar.es   
2951 uns.edu.ar   
2952 unsl.edu.ar   
2954 unsyiah.ac.id   
2956 unza.zm University of Zambia unza.zm uoz.unza.zm
2957 uoa.gr University of Athens uoa.gr uoa.uoa.gr
2958 uob.edu.bh University of Bahrain uob.edu.bh uob.uob.edu.bh
2959 uob.edu.ly University of Benghazi uob.edu.ly uob.uob.edu.ly
2961 uoc.gr University of Crete uoc.gr uoc.uoc.gr
2962 uofcanada.edu.eg   
2964 uoh.edu.pk   
2965 uoi.gr University of Ioannina uoi.gr uoi.uoi.gr
2966 uok.ac.ir University of Kurdistan uok.ac.ir uok.uok.ac.ir
2967 uok.edu.pk University of Karachi uok.edu.pk uok.uok.edu.pk
2968 uom.edu.gr University of Macedonia uom.gr uom.uom.gr
2969 uom.gr University of Macedonia uom.gr uom.uom.gr
2971 uon.edu.au University of Newcastle newcastle.edu.au uon.newcastle.edu.au
2972 uonbi.ac.ke University of Nairobi uonbi.ac.ke uon.uonbi.ac.ke
2973 uop.gr University of Peloponnese uop.gr uop.uop.gr
2975 uos.ac.kr University of Seoul uos.ac.kr uos.uos.ac.kr
2976 uos.de   
2979 uowm.gr   
2980 uowma

In [68]:
# number of suffix records collected so far (loaded + scraped)
len(emaildict["name"])

3284

In [69]:
# rebuild the lookup table from the dict; if a suffix occurs more than once,
# keep="last" retains the most recently appended record
df_email = pd.DataFrame(emaildict).drop_duplicates(subset=["subemail"],keep="last")
len(df_email)

3284

In [70]:
# distinct values per column of the suffix -> affiliation table
df_email.nunique()

name             1360
email            1358
subemail         3284
affiliationid    1360
dtype: int64

In [71]:
# affiliation table: one row per distinct affiliation that was actually resolved
has_affiliation = df_email["affiliationid"] != ""
df_aff = df_email.loc[has_affiliation, ["affiliationid","name","email"]].drop_duplicates()
len(df_aff)

1359

In [72]:
# add affiliationid to author
# Build the suffix -> affiliation id mapping once and apply it to the whole
# column; authors whose suffix is unknown get "".
suffix_to_affid = dict(zip(df_email["subemail"], df_email["affiliationid"]))
df_au["affiliationid"] = df_au["emailsuffix"].map(suffix_to_affid).fillna("")

In [73]:
# write each table twice: <stem>.csv with a header row, <stem>_no_header.csv without
for table, stem in ((df_email, "email_affiliation"), (df_aff, "affiliation"), (df_au, "author")):
    table.to_csv(stem + ".csv", index = False, encoding = "utf-8-sig")
    table.to_csv(stem + "_no_header.csv", index = False, header = False, encoding = "utf-8-sig")

## Database

In [74]:
# IPython shell escape: feed create-econtop.sql to the sqlite3 command-line tool
# against econtop.db, so the script can be run directly from Jupyter
# NOTE(review): presumably the script creates the tables queried below — confirm
! sqlite3 econtop.db < create-econtop.sql

In [75]:
# open the database; `conn` is closed in the notebook's last cell
conn = sqlite3.connect("econtop.db")
cur = conn.cursor()

### University Level

In [76]:
# select university level of publications: articles joined through their
# authors to an affiliation, so only articles with at least one affiliated
# author are returned
q = """
    SELECT art.journal, art.title, art.abstract, art.date
    FROM author_article AS aa JOIN author AS au JOIN article AS art JOIN affiliation AS af
    ON aa.authorid = au.authorid AND
    aa.doi = art.doi AND
    au.affiliationid = af.affiliationid
    WHERE art.date >= '1000-01-01' AND
    art.date < '2105-01-01'
    """

# debugging alternative: q = " SELECT authorid FROM author"
# generate valid abstracts with dates: the join yields one row per
# (article, author) pair, so keep a single row per distinct abstract
# NOTE(review): articles sharing an identical abstract text collapse to one row
joined_rows = pd.read_sql_query(q, conn)
df_af_art = joined_rows.drop_duplicates(subset=["abstract"])
df_af_art.head()

Unnamed: 0,journal,title,abstract,date
0,Review of Financial Studies,Dynamic Equilibrium with Liquidity Constraints,This article studies an intertemporal economy ...,2003-05-15
1,Review of Financial Studies,Stochastic Discount Factor Bounds with Conditi...,Hansen and Jagannathan (1991) (hereafter HJ) d...,2003-05-15
2,Review of Financial Studies,Delta-Hedged Gains and the Negative Market Vol...,We investigate whether the volatility risk pre...,2003-05-15
4,Review of Financial Studies,"Differences of Opinion, Short-Sales Constraint...",We develop a theory of market crashes based on...,2003-05-15
6,Review of Financial Studies,Risk Adjustment and Trading Strategies,We assess the profitability of momentum strate...,2003-05-15


In [77]:
# distinct values per column of the de-duplicated abstract table
df_af_art.nunique()

journal        13
title       22340
abstract    22347
date          772
dtype: int64

In [78]:
# export the abstract table (with header, without the index)
df_af_art.to_csv("abstract.csv", index = False, encoding = "utf-8-sig")

In [50]:
# show top appearance: count how often each name occurs across index labels
# NOTE(review): this cell's execution counter is older than the query cell's,
# and the query now selects only journal/title/abstract/date. It appears to
# expect df_af_art to be indexed by DOI (possibly non-unique) with a "name"
# column — confirm before re-running.
freq = {}
for doi in df_af_art.index:
    try:
        # single-row label: .loc returns a Series, [1] picks its second field
        name = df_af_art.loc[doi][1]
        try:
            freq[name] += 1
        except:
            freq[name] = 1
    except:
        # fallback path when the lookup above raised
        # presumably a label shared by several rows, so .loc gives a DataFrame — verify
        df_af_art_new = df_af_art.loc[doi]
        # count each distinct name once for this label
        names = set(list(df_af_art_new.name))
        for name in names:
            try:
                freq[name] += 1
            except:
                freq[name] = 1

In [None]:
# (name, count) pairs ordered from most to least frequent
freq_sorted = sorted(freq.items(), key=lambda x:x[1], reverse = True)
freq_sorted

In [52]:
# collaboration freq: count co-occurring affiliation pairs per index label
# NOTE(review): like the appearance count, this seems written for an older
# df_af_art indexed by DOI with "affiliationid" and "name" columns, which the
# current query does not select — confirm before re-running.
id_pair = {}    # "idA+idB" -> number of labels on which both appear
name_pair = {}  # "nameA+nameB" -> same count keyed by names

for doi in df_af_art.index:
    try:
        # succeeds when the label maps to a single row: nothing to pair up
        df_af_art.loc[doi][0]
    except:
        # presumably several rows share this label — verify
        # sort so a pair's key is built in a consistent order
        df_af_art_new = df_af_art.loc[doi].sort_values("affiliationid")
        length = len(df_af_art_new)
        # enumerate every unordered pair (i, i+j+1) of rows
        for i in range(length-1):
            for j in range(length-1-i):
                try:
                    id_pair[df_af_art_new["affiliationid"][i]+"+"+df_af_art_new["affiliationid"][i+j+1]] += 1
                    name_pair[df_af_art_new["name"][i]+"+"+df_af_art_new["name"][i+j+1]] += 1
                except:
                    # first time this pair is seen (or the lookup failed): start at 1
                    id_pair[df_af_art_new["affiliationid"][i]+"+"+df_af_art_new["affiliationid"][i+j+1]] = 1
                    name_pair[df_af_art_new["name"][i]+"+"+df_af_art_new["name"][i+j+1]] = 1

In [None]:
# (pair, count) entries ordered from most to least frequent
name_pair_sorted = sorted(name_pair.items(), key=lambda x:x[1], reverse = True)
name_pair_sorted

In [79]:
# release the SQLite connection opened for the queries above
conn.close()