# Data Cleaning and Loading into SQL

In [1]:
import os
import random
import sqlite3
import time

import bs4
import numpy as np
import pandas as pd
from selenium import webdriver
from selenium.webdriver.chrome.service import Service

## Clean author_article.csv

In [20]:
# clean author_article
# Drop exact duplicate rows, then overwrite the source file and write a
# header-less twin of it.
df_art_au = pd.read_csv("author_article.csv", encoding="utf-8-sig")
df_art_au = df_art_au.drop_duplicates()
df_art_au.to_csv("author_article.csv", index=False, encoding="utf-8-sig")
df_art_au.to_csv("author_article_no_header.csv", index=False, header=False, encoding="utf-8-sig")

## Clean article.csv

In [21]:
# select valid records with valid doi, issue format, and not duplicated
def select_valid(df):
    """Filter a journal frame down to regular-issue articles, one row per DOI.

    A row is kept when its ``doi`` starts with ``"10."`` and its ``volume``
    text contains ``"issue "`` but none of ``"Supplement"``, ``"_Part_"``,
    ``"S1"`` or ``"S2"``. Rows whose ``doi`` or ``volume`` is missing are
    dropped. Later duplicates of a DOI are discarded and the index is
    renumbered from zero.
    """
    volume_text = df["volume"].str

    keep = df["doi"].str.startswith("10.", na=False)
    # A missing volume counts as "contains the marker", so the row is rejected.
    for marker in ("Supplement", "_Part_", "S1", "S2"):
        keep = keep & ~volume_text.contains(marker, na=True)
    keep = keep & volume_text.contains("issue ", na=False)

    kept_rows = df[keep]
    unique_rows = kept_rows.drop_duplicates(subset=["doi"])
    return unique_rows.reset_index(drop=True)

# convert volume to date
def conv_volume_to_date(v, freq):
    """Turn a volume string into an approximate ``YYYY-MM-DD`` publication date.

    Parameters
    ----------
    v : str
        Volume text of the form ``"<year>, vol. <n>, issue <k>, <pages>"``;
        the fields are separated by ``", "``.
    freq : int
        Number of issues published in that year. Supported values are
        1, 2, 3, 4, 5, 6 and 12.

    Returns
    -------
    str
        The year from ``v`` followed by a zero-padded month and a day of
        ``"01"`` or ``"15"``, both chosen from the issue number and ``freq``.

    Raises
    ------
    ValueError
        If ``freq`` is not one of the supported values (previously this
        surfaced as an UnboundLocalError), or if the issue is not an integer.
    """
    # freq -> (months per issue, month offset, day of month)
    schedule = {
        4: (3, -1, "15"),
        6: (2, 0, "01"),
        5: (2, 0, "01"),
        12: (1, 0, "15"),
        3: (3, 0, "01"),
        2: (6, -2, "01"),
    }

    fields = v.split(", ")
    year = fields[0]
    issue = fields[2].replace("issue ", "")

    if freq == 1:
        # A single yearly issue is pinned to mid-year; the issue number is not used.
        month, day = 7, "01"
    else:
        rule = schedule.get(freq)
        if rule is None:
            raise ValueError(
                "unsupported issue frequency %r for volume %r" % (freq, v)
            )
        step, offset, day = rule
        month = int(issue) * step + offset

    return year + "-" + str(month).zfill(2) + "-" + day
    
# calculate issue frequency
def cal_freq(vs):
    """Return the highest issue number seen in each year.

    Parameters
    ----------
    vs : iterable of str
        Volume strings of the form ``"<year>, vol. <n>, issue <k>, <pages>"``.

    Returns
    -------
    dict
        Maps each year (int) to the largest issue number (int) found for that
        year. The value never drops below 0. An empty input gives ``{}``.

    Raises
    ------
    ValueError
        If a year or issue field is not an integer.
    IndexError
        If a volume string has fewer than three ``", "``-separated fields.
    """
    # Single pass; the previous version rescanned every row for each year and
    # shadowed the built-in ``dict``.
    max_issue_by_year = {}
    for v in vs:
        fields = v.split(", ")
        year = int(fields[0])
        issue = int(fields[2].replace("issue ", ""))
        max_issue_by_year[year] = max(issue, max_issue_by_year.get(year, 0))
    return max_issue_by_year


### 1. RFS

In [22]:
# clean rfs
# Read the raw scrape as text and strip the "Abstract: " label from abstracts.
df_rfs = pd.read_csv("data/journals/raw_data/rfs.csv", encoding="utf-8-sig").astype("str")
df_rfs["abstract"] = df_rfs["abstract"].str.replace("Abstract: ", "", regex=False)

# Keep regular-issue rows only, then find the highest issue number per year.
df_rfs_s = select_valid(df_rfs)
freq = cal_freq(df_rfs_s["volume"])

# Date each article from its volume text and its own year's issue count.
dates = [
    conv_volume_to_date(volume, freq[int(volume.split(", ")[0])])
    for volume in df_rfs_s["volume"]
]

df_rfs_s["date"] = dates
df_rfs_s.head()

Unnamed: 0,journal,title,authors,volume,jel,abstract,url,doi,date
0,Review of Financial Studies,Dynamic Equilibrium with Liquidity Constraints,"['Jerome Detemple', 'Angel Serrat']","2003, vol. 16, issue 2, 597-629",[],This article studies an intertemporal economy ...,https://econpapers.repec.org/article/ouprfinst...,10.1093/rfs/hhg003,2003-05-15
1,Review of Financial Studies,Stochastic Discount Factor Bounds with Conditi...,"['Wayne Ferson', 'Andrew F. Siegel']","2003, vol. 16, issue 2, 567-595",[],Hansen and Jagannathan (1991) (hereafter HJ) d...,https://econpapers.repec.org/article/ouprfinst...,10.1093/rfs/hhg004,2003-05-15
2,Review of Financial Studies,Delta-Hedged Gains and the Negative Market Vol...,"['Gurdip Bakshi', 'Nikunj Kapadia']","2003, vol. 16, issue 2, 527-566",[],We investigate whether the volatility risk pre...,https://econpapers.repec.org/article/ouprfinst...,10.1093/rfs/hhg002,2003-05-15
3,Review of Financial Studies,"Differences of Opinion, Short-Sales Constraint...","['Harrison Hong', 'Jeremy Stein']","2003, vol. 16, issue 2, 487-525",[],We develop a theory of market crashes based on...,https://econpapers.repec.org/article/ouprfinst...,10.1093/rfs/hhg006,2003-05-15
4,Review of Financial Studies,Risk Adjustment and Trading Strategies,"['Dong-Hyun Ahn', 'Jennifer Conrad', 'Robert D...","2003, vol. 16, issue 2, 459-485",[],We assess the profitability of momentum strate...,https://econpapers.repec.org/article/ouprfinst...,10.1093/rfs/hhg001,2003-05-15


### 2. RES

In [23]:
# clean res
# Read the raw scrape as text and strip the "Abstract: " label from abstracts.
df_res = pd.read_csv("data/journals/raw_data/res.csv", encoding="utf-8-sig").astype("str")
df_res["abstract"] = df_res["abstract"].str.replace("Abstract: ", "", regex=False)

# Keep regular-issue rows only, then find the highest issue number per year.
df_res_s = select_valid(df_res)
freq = cal_freq(df_res_s["volume"])

# Date each article from its volume text and its own year's issue count.
dates = [
    conv_volume_to_date(volume, freq[int(volume.split(", ")[0])])
    for volume in df_res_s["volume"]
]

df_res_s["date"] = dates
df_res_s.head()

Unnamed: 0,journal,title,authors,volume,jel,abstract,url,doi,date
0,Review of Economic Studies,"The Dynamics of Return Migration, Human Capita...","['Jerome Adda', 'Christian Dustmann', 'Joseph-...","2022, vol. 89, issue 6, 2841-2871",[],This article develops and estimates a dynamic ...,https://econpapers.repec.org/article/ouprestud...,10.1093/restud/rdac003,2022-12-01
1,Review of Economic Studies,The Impact of Car Pollution on Infant and Chil...,"['Diane Alexander', 'Hannes Schwandt']","2022, vol. 89, issue 6, 2872-2910",[],"In 2008, Volkswagen introduced a new generatio...",https://econpapers.repec.org/article/ouprestud...,10.1093/restud/rdac007,2022-12-01
2,Review of Economic Studies,The Welfare Effects of Transportation Infrastr...,"['Treb Allen', 'Costas Arkolakis']","2022, vol. 89, issue 6, 2911-2957",[],"Each year in the US, hundreds of billions of d...",https://econpapers.repec.org/article/ouprestud...,10.1093/restud/rdac001,2022-12-01
3,Review of Economic Studies,Subjective Models of the Macroeconomy: Evidenc...,"['Peter Andrebriq', 'Carlo Pizzinelli', 'Chris...","2022, vol. 89, issue 6, 2958-2991",[],We study people’s subjective models of the mac...,https://econpapers.repec.org/article/ouprestud...,10.1093/restud/rdac008,2022-12-01
4,Review of Economic Studies,"Product Life Cycle, Learning, and Nominal Shocks","['David Argente', 'Chen Yeh']","2022, vol. 89, issue 6, 2992-3054",[],This article documents a new set of stylized f...,https://econpapers.repec.org/article/ouprestud...,10.1093/restud/rdac004,2022-12-01


### 3. QJE

In [24]:
# clean qje
# Read the raw scrape as text and strip the "Abstract: " label from abstracts.
df_qje = pd.read_csv("data/journals/raw_data/qje.csv", encoding="utf-8-sig").astype("str")
df_qje["abstract"] = df_qje["abstract"].str.replace("Abstract: ", "", regex=False)

# Keep regular-issue rows only, then find the highest issue number per year.
df_qje_s = select_valid(df_qje)
freq = cal_freq(df_qje_s["volume"])

# Date each article from its volume text and its own year's issue count.
dates = [
    conv_volume_to_date(volume, freq[int(volume.split(", ")[0])])
    for volume in df_qje_s["volume"]
]

df_qje_s["date"] = dates
df_qje_s.head()

Unnamed: 0,journal,title,authors,volume,jel,abstract,url,doi,date
0,The Quarterly Journal of Economics,Systemic Discrimination Among Large U.S. Emplo...,"['Patrick Kline', 'Evan K Rose', 'Christopher ...","2022, vol. 137, issue 4, 1963-2036",[],We study the results of a massive nationwide c...,https://econpapers.repec.org/article/oupqjecon...,10.1093/qje/qjac024,2022-11-15
1,The Quarterly Journal of Economics,Valuing the Global Mortality Consequences of C...,"['Tamma Carleton', 'Amir Jina', 'Michael Delga...","2022, vol. 137, issue 4, 2037-2105",[],"Using 40 countries’ subnational data, we estim...",https://econpapers.repec.org/article/oupqjecon...,10.1093/qje/qjac020,2022-11-15
2,The Quarterly Journal of Economics,Reshaping Global Trade: The Immediate and Long...,['Chenzi Xu'],"2022, vol. 137, issue 4, 2107-2161",[],I show that a disruption to the financial sect...,https://econpapers.repec.org/article/oupqjecon...,10.1093/qje/qjac016,2022-11-15
3,The Quarterly Journal of Economics,Sexual Harassment and Gender Inequality in the...,"['Olle Folke', 'Johanna Rickne']","2022, vol. 137, issue 4, 2163-2212",[],We describe how sexual harassment contributes ...,https://econpapers.repec.org/article/oupqjecon...,10.1093/qje/qjac018,2022-11-15
4,The Quarterly Journal of Economics,The Quality and Efficiency of Public and Priva...,"['Daniel Knutsson', 'Björn Tyrefors']","2022, vol. 137, issue 4, 2213-2262",[],Economic theory predicts that outsourcing publ...,https://econpapers.repec.org/article/oupqjecon...,10.1093/qje/qjac014,2022-11-15


### 4. JPE

In [25]:
# clean jpe
# Read the raw scrape as text, strip the "Abstract: " label, and reduce the
# DOI from a resolver URL to the bare identifier.
df_jpe = pd.read_csv("data/journals/raw_data/jpe.csv", encoding="utf-8-sig").astype("str")
df_jpe["abstract"] = df_jpe["abstract"].str.replace("Abstract: ", "", regex=False)
df_jpe["doi"] = df_jpe["doi"].str.replace("http://dx.doi.org/", "", regex=False)

# Keep regular-issue rows only, then find the highest issue number per year.
df_jpe_s = select_valid(df_jpe)
freq = cal_freq(df_jpe_s["volume"])

# Date each article from its volume text and its own year's issue count.
dates = [
    conv_volume_to_date(volume, freq[int(volume.split(", ")[0])])
    for volume in df_jpe_s["volume"]
]

df_jpe_s["date"] = dates

# modify date for 2023
# 2023 rows are re-dated with a fixed 12 issues per year instead of the
# highest issue seen (presumably because 2023 was incomplete when scraped).
for idx, volume in df_jpe_s["volume"].items():
    if int(volume.split(", ")[0]) == 2023:
        df_jpe_s.loc[idx, "date"] = conv_volume_to_date(volume, 12)

df_jpe_s.head()

Unnamed: 0,journal,title,authors,volume,jel,abstract,url,doi,date
0,Journal of Political Economy,Collective Brand Reputation,"['Volker Nocke', 'Roland Strausz']","2023, vol. 131, issue 1, 1 - 58",[],We develop a theory of collective brand reputa...,https://econpapers.repec.org/article/ucpjpolec...,10.1086/720986,2023-01-15
1,Journal of Political Economy,The Mechanics of the Industrial Revolution,"['Morgan Kelly', 'Joel Mokyr', 'Cormac Ó Gráda']","2023, vol. 131, issue 1, 59 - 94",[],Although there are many competing explanations...,https://econpapers.repec.org/article/ucpjpolec...,10.1086/720890,2023-01-15
2,Journal of Political Economy,Optimal Cooperative Taxation in the Global Eco...,"['V. V. Chari', 'Juan Pablo Nicolini', 'Pedro ...","2023, vol. 131, issue 1, 95 - 130",[],How should countries cooperate in setting fisc...,https://econpapers.repec.org/article/ucpjpolec...,10.1086/720889,2023-01-15
3,Journal of Political Economy,Personalized Pricing and Consumer Welfare,"['Jean-Pierre Dubé', 'Sanjog Misra']","2023, vol. 131, issue 1, 131 - 189",[],We study the welfare implications of personali...,https://econpapers.repec.org/article/ucpjpolec...,10.1086/720793,2023-01-15
4,Journal of Political Economy,A Semistructural Methodology for Policy Counte...,['Martin Beraja'],"2023, vol. 131, issue 1, 190 - 201",[],I propose a methodology for constructing count...,https://econpapers.repec.org/article/ucpjpolec...,10.1086/720982,2023-01-15


### 5. E (Econometrica)

In [26]:
# clean e
# Read the raw scrape as text, strip the "Abstract: " label, and reduce the
# DOI from a resolver URL to the bare identifier.
df_e = pd.read_csv("data/journals/raw_data/e.csv", encoding="utf-8-sig").astype("str")
df_e["abstract"] = df_e["abstract"].str.replace("Abstract: ", "", regex=False)
df_e["doi"] = df_e["doi"].str.replace("https://doi.org/", "", regex=False)

# Keep regular-issue rows only, then find the highest issue number per year.
df_e_s = select_valid(df_e)
freq = cal_freq(df_e_s["volume"])

# Date each article from its volume text and its own year's issue count.
dates = [
    conv_volume_to_date(volume, freq[int(volume.split(", ")[0])])
    for volume in df_e_s["volume"]
]

df_e_s["date"] = dates

# modify date for 2023
# 2023 rows are re-dated with a fixed 6 issues per year instead of the
# highest issue seen (presumably because 2023 was incomplete when scraped).
for idx, volume in df_e_s["volume"].items():
    if int(volume.split(", ")[0]) == 2023:
        df_e_s.loc[idx, "date"] = conv_volume_to_date(volume, 6)

df_e_s.head()

Unnamed: 0,journal,title,authors,volume,jel,abstract,url,doi,date
0,Econometrica,Invidious Comparisons: Ranking and Selection a...,"['Jiaying Gu', 'Roger Koenker']","2023, vol. 91, issue 1, 1-41",[],"There is an innate human tendency, one might c...",https://econpapers.repec.org/article/wlyemetrp...,10.3982/ECTA19304,2023-02-01
1,Econometrica,A Comment on: “Invidious Comparisons: Ranking ...,['Keisuke Hirano'],"2023, vol. 91, issue 1, 43-46",[],,https://econpapers.repec.org/article/wlyemetrp...,10.3982/ECTA20364,2023-02-01
2,Econometrica,A Comment on: “Invidious Comparisons: Ranking ...,['Patrick Kline'],"2023, vol. 91, issue 1, 47-52",[],,https://econpapers.repec.org/article/wlyemetrp...,10.3982/ECTA20322,2023-02-01
3,Econometrica,A Comment on: “Invidious Comparisons: Ranking ...,"['Magne Mogstad', 'Joseph P. Romano', 'Azeem M...","2023, vol. 91, issue 1, 53-60",[],,https://econpapers.repec.org/article/wlyemetrp...,10.3982/ECTA20460,2023-02-01
4,Econometrica,Reply to: Comments on “Invidious Comparisons: ...,"['Jiaying Gu', 'Roger Koenker']","2023, vol. 91, issue 1, 61-66",[],,https://econpapers.repec.org/article/wlyemetrp...,10.3982/ECTA20537,2023-02-01


### 6. AER

In [27]:
# clean aer
# Read the raw AER scrape as text, force a single journal label, and remove
# layout whitespace from the abstract and volume fields.
df_aer = pd.read_csv("data/journals/raw_data/aer.csv", encoding = "utf-8-sig").astype("str")
df_aer["journal"] = df_aer["journal"].apply(lambda x: "American Economic Review")
# NOTE(review): replace("Abstract","") removes the word everywhere in the text,
# not only a leading label - confirm that is intended.
df_aer["abstract"] = df_aer["abstract"].apply(lambda x: x.replace("\n","").replace("\t","").replace("Abstract","").strip())
df_aer["volume"] = df_aer["volume"].apply(lambda x: x.replace("\n","").replace("\t","").strip())
df_aer = df_aer.drop_duplicates(subset = ["doi"]).reset_index(drop=True)

# English month name -> two-digit month number.
month_numbers = {
    "January": "01", "February": "02", "March": "03", "April": "04",
    "May": "05", "June": "06", "July": "07", "August": "08",
    "September": "09", "October": "10", "November": "11", "December": "12",
}

dates = []
for idx in df_aer.index:
    # The third comma-separated field of the volume text starts with
    # "<Month> <Year>" (e.g. "February 2023", judging by the sample output).
    date_text = df_aer.loc[idx,"volume"].split(",")[2].strip()
    year = date_text.split(" ")[1]
    month_text = date_text.split(" ")[0]
    # Fail loudly on an unknown month: the former if/elif chain had no fallback
    # and silently reused the month of the previous row.
    if month_text not in month_numbers:
        raise ValueError(
            "unrecognised month %r in AER volume %r (row %r)"
            % (month_text, df_aer.loc[idx,"volume"], idx)
        )
    month = month_numbers[month_text]

    # Every AER issue is dated to the 15th of its month.
    dates.append(year+"-"+month+"-"+"15")

df_aer["date"] = dates
df_aer_s = df_aer.drop_duplicates(subset = ["doi"]).reset_index(drop=True)[["journal","title","authors","volume","jel","abstract","url","doi","date"]]
df_aer_s.tail()

Unnamed: 0,journal,title,authors,volume,jel,abstract,url,doi,date
4799,American Economic Review,Enabling or Limiting Cognitive Flexibility? Ev...,"['\n Silvia Saccardo ', '\n ...","American Economic Review vol. 113,no. 2, Febru...","[('C91', 'Design of Experiments: Laboratory, I...",Moral behavior is more prevalent when individu...,https://www.aeaweb.org/articles?id=10.1257/aer...,10.1257/aer.20201333,2023-02-15
4800,American Economic Review,"Droughts, Deluges, and (River) Diversions: Val...",['\n Will Rafey '],"American Economic Review vol. 113,no. 2, Febru...","[('D23', 'Organizational Behavior; Transaction...",This paper develops and applies a method to va...,https://www.aeaweb.org/articles?id=10.1257/aer...,10.1257/aer.20201434,2023-02-15
4801,American Economic Review,"Technology Gaps, Trade, and Income",['\n Thomas Sampson '],"American Economic Review vol. 113,no. 2, Febru...","[('D21', 'Firm Behavior: Theory'), ('D24', 'Pr...",This paper quantifies the contribution of tech...,https://www.aeaweb.org/articles?id=10.1257/aer...,10.1257/aer.20201940,2023-02-15
4802,American Economic Review,Electronic Food Vouchers: Evidence from an At-...,"['\n Abhijit Banerjee ', '\n ...","American Economic Review vol. 113,no. 2, Febru...","[('H53', 'National Government Expenditures and...",We compare how in-kind food assistance and an ...,https://www.aeaweb.org/articles?id=10.1257/aer...,10.1257/aer.20210461,2023-02-15
4803,American Economic Review,The Voice of Monetary Policy,"['\n Yuriy Gorodnichenko ', '\n ...","American Economic Review vol. 113,no. 2, Febru...","[('D83', 'Search; Learning; Information and Kn...",We develop a deep learning model to detect emo...,https://www.aeaweb.org/articles?id=10.1257/aer...,10.1257/aer.20220129,2023-02-15


### 7. JF

In [28]:
# clean jf
# Read the raw scrape as text, strip the "Abstract: " label, and reduce the
# DOI from a resolver URL to the bare identifier.
df_jf = pd.read_csv("data/journals/raw_data/jf.csv", encoding="utf-8-sig").astype("str")
df_jf["abstract"] = df_jf["abstract"].str.replace("Abstract: ", "", regex=False)
df_jf["doi"] = df_jf["doi"].str.replace("https://doi.org/", "", regex=False)

# Keep regular-issue rows only, then find the highest issue number per year.
df_jf_s = select_valid(df_jf)
freq = cal_freq(df_jf_s["volume"])

# Date each article from its volume text and its own year's issue count.
dates = [
    conv_volume_to_date(volume, freq[int(volume.split(", ")[0])])
    for volume in df_jf_s["volume"]
]

df_jf_s["date"] = dates

# modify date for 2023
# 2023 rows are re-dated with a fixed 6 issues per year instead of the
# highest issue seen (presumably because 2023 was incomplete when scraped).
for idx, volume in df_jf_s["volume"].items():
    if int(volume.split(", ")[0]) == 2023:
        df_jf_s.loc[idx, "date"] = conv_volume_to_date(volume, 6)

df_jf_s.head()

Unnamed: 0,journal,title,authors,volume,jel,abstract,url,doi,date
0,Journal of Finance,Optimal Financial Transaction Taxes,['Eduardo Dávila'],"2023, vol. 78, issue 1, 5-61",[],This paper characterizes the optimal transacti...,https://econpapers.repec.org/article/blajfinan...,10.1111/jofi.13188,2023-02-01
1,Journal of Finance,"Less Mainstream Credit, More Payday Borrowing?...",['Julia Fonseca'],"2023, vol. 78, issue 1, 63-103",[],Governments regulate debt collectors to protec...,https://econpapers.repec.org/article/blajfinan...,10.1111/jofi.13189,2023-02-01
2,Journal of Finance,Disruption and Credit Markets,"['Bo Becker', 'Victoria Ivashina']","2023, vol. 78, issue 1, 105-139",[],"We show that over the past half‐century, innov...",https://econpapers.repec.org/article/blajfinan...,10.1111/jofi.13187,2023-02-01
3,Journal of Finance,How Risky Are U.S. Corporate Assets?,"['Tetiana Davydiuk', 'Scott Richard', 'Ivan Sh...","2023, vol. 78, issue 1, 141-208",[],We use market data on corporate bonds and equi...,https://econpapers.repec.org/article/blajfinan...,10.1111/jofi.13196,2023-02-01
4,Journal of Finance,International Yield Curves and Currency Puzzles,"['Mikhail Chernov', 'Drew Creal']","2023, vol. 78, issue 1, 209-245",[],The currency depreciation rate is often comput...,https://econpapers.repec.org/article/blajfinan...,10.1111/jofi.13191,2023-02-01


### 8. JEP

In [29]:
# clean jep
# Read the raw JEP scrape as text, force a single journal label, and remove
# layout whitespace from the abstract and volume fields.
df_jep = pd.read_csv("data/journals/raw_data/jep.csv", encoding = "utf-8-sig").astype("str")
df_jep["journal"] = df_jep["journal"].apply(lambda x: "Journal of Economic Perspectives")
df_jep["abstract"] = df_jep["abstract"].apply(lambda x: x.replace("\n","").replace("\t","").replace("Abstract","").strip())
df_jep["volume"] = df_jep["volume"].apply(lambda x: x.replace("\n","").replace("\t","").strip())
df_jep = df_jep.drop_duplicates(subset = ["doi"]).reset_index(drop=True)

# Lower-cased season name -> "MM-DD" used for that issue.
# NOTE(review): the sample output labels "Fall 2022" as No.4, yet fall maps to
# August and winter to November here - confirm the mapping matches the
# journal's real issue order.
season_month_day = {
    "spring": "02-15",
    "summer": "05-15",
    "fall": "08-15",
    "winter": "11-15",
}

dates = []
for idx in df_jep.index:
    # The volume text starts with "<Season> <Year>", e.g. "Fall 2022 (Vol. 36, No.4 )".
    date_text = df_jep.loc[idx,"volume"].lower().split(" ")
    season = date_text[0]
    year = date_text[1]
    # The winter branch used to test an unrelated variable (`month_text`), so
    # winter issues never matched and inherited the previous row's date.
    if season not in season_month_day:
        raise ValueError(
            "unrecognised season %r in JEP volume %r (row %r)"
            % (season, df_jep.loc[idx,"volume"], idx)
        )
    md = season_month_day[season]

    dates.append(year+"-"+md)

df_jep["date"] = dates
df_jep_s = df_jep.drop_duplicates(subset = ["doi"]).reset_index(drop=True)[["journal","title","authors","volume","jel","abstract","url","doi","date"]]
df_jep_s.tail()

Unnamed: 0,journal,title,authors,volume,jel,abstract,url,doi,date
1830,Journal of Economic Perspectives,Emerging Market Sovereign Debt in the Aftermat...,['\n Kenneth Rogoff '],"Fall 2022 (Vol. 36, No.4 )","[('E23', 'Macroeconomics: Production'), ('E62'...","For emerging markets, fiscal space is a very r...",https://www.aeaweb.org/articles?id=10.1257/jep...,10.1257/jep.36.4.147,2022-08-15
1831,Journal of Economic Perspectives,Popular Personal Financial Advice versus the P...,['\n James J. Choi '],"Fall 2022 (Vol. 36, No.4 )","[('D15', 'Intertemporal Household Choice; Life...",I survey the advice given by the fifty most po...,https://www.aeaweb.org/articles?id=10.1257/jep...,10.1257/jep.36.4.167,2022-08-15
1832,Journal of Economic Perspectives,A Linear Panel Model with Heterogeneous Coeffi...,"['\n Liyang Sun ', '\n Jess...","Fall 2022 (Vol. 36, No.4 )","[('C23', 'Single Equation Models; Single Varia...",Linear panel models featuring unit and time fi...,https://www.aeaweb.org/articles?id=10.1257/jep...,10.1257/jep.36.4.193,2022-08-15
1833,Journal of Economic Perspectives,"Sadie T. M. Alexander: Black Women and a ""Tast...",['\n Nina Banks '],"Fall 2022 (Vol. 36, No.4 )","[('B31', 'History of Economic Thought: Individ...",The employment history of African American wom...,https://www.aeaweb.org/articles?id=10.1257/jep...,10.1257/jep.36.4.205,2022-08-15
1834,Journal of Economic Perspectives,Recommendations for Further Reading,['\n Timothy Taylor '],"Fall 2022 (Vol. 36, No.4 )","[('Y50', 'Further Reading (unclassified)')]",N\A,https://www.aeaweb.org/articles?id=10.1257/jep...,10.1257/jep.36.4.221,2022-08-15


In [None]:
# Debug aid: print every JEP volume label, one per line.
for volume_text in df_jep["volume"]:
    print(volume_text)

In [30]:
# concatenate journals
# Stack the eight cleaned journal frames and renumber the rows from zero.
journal_frames = [
    df_rfs_s,
    df_res_s,
    df_qje_s,
    df_jpe_s,
    df_e_s,
    df_aer_s,
    df_jf_s,
    df_jep_s,
]
df = pd.concat(journal_frames, ignore_index=True)
len(df)

24813

In [31]:
# Distinct values per column; a doi count equal to the row count means no DOI
# is repeated across journals.
df.nunique()

journal         8
title       24200
authors     19009
volume      18506
jel          5741
abstract    16279
url         24813
doi         24813
date         1109
dtype: int64

In [32]:
# Keep only the columns written to article.csv, in this order.
article_columns = ["doi", "journal", "volume", "date", "title", "abstract", "url"]
df_selected = df.loc[:, article_columns]
df_selected.head()

Unnamed: 0,doi,journal,volume,date,title,abstract,url
0,10.1093/rfs/hhg003,Review of Financial Studies,"2003, vol. 16, issue 2, 597-629",2003-05-15,Dynamic Equilibrium with Liquidity Constraints,This article studies an intertemporal economy ...,https://econpapers.repec.org/article/ouprfinst...
1,10.1093/rfs/hhg004,Review of Financial Studies,"2003, vol. 16, issue 2, 567-595",2003-05-15,Stochastic Discount Factor Bounds with Conditi...,Hansen and Jagannathan (1991) (hereafter HJ) d...,https://econpapers.repec.org/article/ouprfinst...
2,10.1093/rfs/hhg002,Review of Financial Studies,"2003, vol. 16, issue 2, 527-566",2003-05-15,Delta-Hedged Gains and the Negative Market Vol...,We investigate whether the volatility risk pre...,https://econpapers.repec.org/article/ouprfinst...
3,10.1093/rfs/hhg006,Review of Financial Studies,"2003, vol. 16, issue 2, 487-525",2003-05-15,"Differences of Opinion, Short-Sales Constraint...",We develop a theory of market crashes based on...,https://econpapers.repec.org/article/ouprfinst...
4,10.1093/rfs/hhg001,Review of Financial Studies,"2003, vol. 16, issue 2, 459-485",2003-05-15,Risk Adjustment and Trading Strategies,We assess the profitability of momentum strate...,https://econpapers.repec.org/article/ouprfinst...


In [33]:
# save article
# Write the table twice: once with a header row and once without.
for out_path, with_header in (("article.csv", True), ("article_no_header.csv", False)):
    df_selected.to_csv(out_path, index=False, header=with_header, encoding="utf-8-sig")

## Clean and update author.csv, email_affiliation.csv, affiliation.csv

In [34]:
# clean author
# Load the known email-suffix -> affiliation rows, with missing cells as "".
# np.nan is used because the np.NaN alias was removed in NumPy 2.0.
df_email = pd.read_csv("email_affiliation.csv", encoding = "utf-8-sig").replace(np.nan,"")
# Column-name -> list-of-values dict; the lookup loop appends new rows to these lists.
emaildict = df_email.to_dict("list")
df_au = pd.read_csv("author.csv", encoding = "utf-8-sig")

In [35]:
# open chrome service
# The driver location can be overridden with the CHROMEDRIVER_PATH environment
# variable; without it the original hard-coded path is used.
chromedriver_path = os.environ.get("CHROMEDRIVER_PATH", "D:/chromedriver.exe")
s = Service(chromedriver_path)
driver = webdriver.Chrome(service=s)

In [37]:
base_url = "https://scholar.google.com/scholar?hl=zh-CN&q="
start_idx = 1

# Headings of the zh-CN Google Scholar pages that the parser keys on.
# "User profiles for the following institution:"
PROFILE_HEADING = "以下机构的用户个人学术档案："
# "Showing results for the following query terms:"
SUGGESTION_HEADING = "显示的是以下查询字词的结果："
# "Please complete the human verification" (captcha page)
CAPTCHA_HEADING = "请进行人机身份验证"


def _load_soup(url):
    """Open ``url`` in the shared driver, refresh once, and parse the page source."""
    driver.get(url)
    driver.refresh()
    return bs4.BeautifulSoup(driver.page_source, "html.parser")


def _read_institution(soup):
    """Return ``(name, email)`` from the first ``gs_r`` box; raises if the markup is missing."""
    div = soup.find("div", class_ = "gs_r").div
    return div.a.text, div.span.text.replace(" - ", "")


def _search_institution(query):
    """Search Scholar for ``query`` and try to read an institution from the result.

    Returns ``(found, soup)`` where ``found`` is ``(name, email)`` or ``None``
    and ``soup`` is the last page parsed. Errors while loading the first page
    propagate; parsing errors and errors while following the suggestion link
    count as "not found".
    """
    soup = _load_soup(base_url+query)
    # Case 1: the first result is the institution's profile box.
    try:
        if soup.find("div", class_ = "gs_r").h3.text == PROFILE_HEADING:
            return _read_institution(soup), soup
    except Exception:
        pass
    # Case 2: Scholar proposes a corrected query; follow its link and retry.
    try:
        first = soup.find("div", class_ = "gs_r")
        if SUGGESTION_HEADING in first.h2.text.strip():
            soup = _load_soup("https://scholar.google.com"+first.div.a.get("href"))
            return _read_institution(soup), soup
    except Exception:
        pass
    return None, soup


# Suffixes already resolved in email_affiliation.csv; df_email is not modified
# inside the loop, so the set is built once.
known_suffixes = set(df_email.subemail)

for i, e in enumerate(sorted(df_au["emailsuffix"].unique())[start_idx:]):
    if e in known_suffixes:
        continue

    # Try the full suffix first, then the suffix without its leading label
    # (e.g. "cl.cam.ac.uk" -> "cam.ac.uk").
    found, soup = _search_institution(e)
    if found is None:
        found, soup = _search_institution(".".join(e.split(".")[1:]))

    if found is None:
        try:
            blocked = soup.find("h1").text == CAPTCHA_HEADING
        except Exception:
            blocked = False
        if blocked:
            # Stop without recording anything for this suffix.
            print("Blocked!!!")
            break
        name = ""
        email = ""
    else:
        name, email = found

    # One randomised pause per looked-up suffix.
    time.sleep(random.randrange(5,10))

    # append info
    # Affiliation id = initials of the institution name + "." + email domain;
    # an empty name (or a name with empty words) yields "".
    try:
        affid = "".join([w[0] for w in name.lower().split(" ")])+"."+email
    except Exception:
        affid = ""

    # A result mentioning "PDF" is treated as a paper hit, not an institution.
    if "PDF" in affid:
        name = ""
        email = ""
        affid = ""

    emaildict["name"].append(name)
    emaildict["email"].append(email)
    emaildict["subemail"].append(e)
    emaildict["affiliationid"].append(affid)
    print(i+start_idx, e, name, email, affid)

3 abmjconsulting.com   
164 cablelabs.com   
213 chula.ac.th Chulalongkorn University chula.ac.th cu.chula.ac.th
223 cl.cam.ac.uk University of Cambridge cam.ac.uk uoc.cam.ac.uk
284 darpa.mil   
307 dlsu.edu.ph De La Salle University dlsu.edu.ph dlsu.dlsu.edu.ph
310 downstate.edu SUNY Downstate Medical Center downstate.edu sdmc.downstate.edu
362 econ.usyd.edu.au University of Sydney sydney.edu.au uos.sydney.edu.au
374 ecu.edu East Carolina University ecu.edu ecu.ecu.edu
398 enriques.eu   
405 epi.org   
416 eshcc.eur.nl Erasmus University Rotterdam eur.nl eur.eur.nl
442 fcjs.unl.edu.ar   
452 fibl.org   
506 gueron.org   
518 haverford.edu Haverford College haverford.edu hc.haverford.edu
526 hendrix.edu Hendrix College hendrix.edu hc.hendrix.edu
535 holycross.edu College of the Holy Cross holycross.edu cothc.holycross.edu
545 hud.gov   
551 hws.edu Hobart and William Smith Colleges hws.edu hawsc.hws.edu
571 ifad.org   
585 ijs.si Jozef Stefan Institute ijs.si jsi.ijs.si
623 ivie.es   


In [38]:
# Rows held in emaildict: those loaded from email_affiliation.csv plus any
# appended by the lookup loop.
len(emaildict["name"])

1510

In [39]:
# Rebuild the frame from the extended dict; when a suffix occurs more than
# once, only its last row is kept.
df_email = pd.DataFrame(emaildict)
df_email = df_email.drop_duplicates(subset="subemail", keep="last")
len(df_email)

1510

In [40]:
# Distinct values per column; subemail should equal the row count after the
# de-duplication on subemail.
df_email.nunique()

name              753
email             752
subemail         1510
affiliationid     753
dtype: int64

In [41]:
# Affiliation table: the distinct (id, name, email) rows that have a non-empty id.
has_affiliation = df_email["affiliationid"] != ""
df_aff = df_email.loc[has_affiliation, ["affiliationid", "name", "email"]].drop_duplicates()
len(df_aff)

752

In [42]:
# add affiliationid to author
# Translate each author's email suffix through a suffix -> affiliationid table
# built once (the old loop re-indexed df_email for every author row and hid
# all errors behind a bare except). Suffixes without a match get "".
suffix_to_affiliation = dict(zip(df_email["subemail"], df_email["affiliationid"]))
df_au["affiliationid"] = df_au["emailsuffix"].map(suffix_to_affiliation).fillna("")

In [43]:
# Write each table twice: once with a header row and once without.
for frame, header_path, no_header_path in (
    (df_email, "email_affiliation.csv", "email_affiliation_no_header.csv"),
    (df_aff, "affiliation.csv", "affiliation_no_header.csv"),
    (df_au, "author.csv", "author_no_header.csv"),
):
    frame.to_csv(header_path, index=False, encoding="utf-8-sig")
    frame.to_csv(no_header_path, index=False, header=False, encoding="utf-8-sig")

## Database

In [44]:
# Feed create-econtop.sql to the sqlite3 command-line shell against econtop.db.
# The leading "!" hands the line to the system shell, so the sqlite3 CLI must be on PATH.
# NOTE(review): presumably the script creates the tables and imports the CSV files written above - confirm.
! sqlite3 econtop.db < create-econtop.sql

In [46]:
# Open the SQLite database file; the pandas query below reads through `conn`.
conn = sqlite3.connect("econtop.db")
# NOTE(review): `cur` is not used by any cell visible in this notebook.
cur = conn.cursor()

### University Level

In [47]:
# select university level of publications
# One row per author-article link whose author has a known affiliation, limited
# to articles dated from 2000-01-01 up to (not including) 2005-01-01.
q = """
    SELECT af.affiliationid, af.name, art.journal, aa.doi, art.date
    FROM author_article AS aa JOIN author AS au JOIN article AS art JOIN affiliation AS af
    ON aa.authorid = au.authorid AND
    aa.doi = art.doi AND
    au.affiliationid = af.affiliationid
    WHERE art.date >= '2000-01-01' AND
    art.date < '2005-01-01'
    """

# Index by DOI; a DOI repeats once per affiliated author row.
df_af_art = pd.read_sql_query(q, conn)
df_af_art = df_af_art.set_index("doi")
df_af_art.head()

Unnamed: 0_level_0,affiliationid,name,journal,date
doi,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
10.1093/rfs/hhg003,bu.bu.edu,Boston University,Review of Financial Studies,2003-05-15
10.1093/rfs/hhg004,uosc.usc.edu,University of Southern California,Review of Financial Studies,2003-05-15
10.1093/rfs/hhg002,uoma.umass.edu,"University of Massachusetts, Amherst",Review of Financial Studies,2003-05-15
10.1093/rfs/hhg002,tu.temple.edu,Temple University,Review of Financial Studies,2003-05-15
10.1093/rfs/hhg006,hu.harvard.edu,Harvard University,Review of Financial Studies,2003-05-15


In [48]:
# Number of joined rows (one per author-article link with an affiliation).
len(df_af_art)

2583

In [49]:
# Distinct affiliations, journals and dates among the selected rows.
df_af_art.nunique()

affiliationid    351
name             351
journal            7
date              70
dtype: int64

In [50]:
# show top appearance
# Count, for each affiliation name, the number of distinct articles it appears on.
#
# The earlier loop walked the DOI index, which repeats a DOI once per author
# row, so an article with k rows added k to each of its affiliations. It also
# relied on positional Series indexing (`row[1]`), which recent pandas versions
# deprecate, hidden behind a bare except.
article_affiliations = df_af_art.reset_index()[["doi", "name"]].drop_duplicates()

freq = {}
for name in article_affiliations["name"]:
    freq[name] = freq.get(name, 0) + 1

In [None]:
# Affiliations ordered from most to fewest appearances, as (name, count) pairs.
freq_sorted = sorted(freq.items(), key=lambda pair: pair[1], reverse=True)
freq_sorted

In [52]:
# collaboration freq
# For every article with at least two affiliated author rows, count each
# unordered pair of rows once. Pairs are keyed "A+B", with rows ordered by
# affiliationid so the same two institutions always give the same key.
#
# The earlier loop walked the DOI index, which repeats a DOI once per author
# row, so every pair of a k-row article was counted k times. It also relied on
# positional Series indexing, which recent pandas versions deprecate, and used
# a bare except for control flow.
id_pair = {}
name_pair = {}

for _, article_rows in df_af_art.groupby(level=0, sort=False):
    if len(article_rows) < 2:
        continue
    article_rows = article_rows.sort_values("affiliationid")
    ids = article_rows["affiliationid"].tolist()
    names = article_rows["name"].tolist()
    for i in range(len(ids) - 1):
        for j in range(i + 1, len(ids)):
            id_key = ids[i]+"+"+ids[j]
            name_key = names[i]+"+"+names[j]
            id_pair[id_key] = id_pair.get(id_key, 0) + 1
            name_pair[name_key] = name_pair.get(name_key, 0) + 1

In [None]:
# Institution pairs ordered from most to fewest joint rows, as (pair, count) tuples.
name_pair_sorted = sorted(name_pair.items(), key=lambda pair: pair[1], reverse=True)
name_pair_sorted

In [50]:
# Release the SQLite connection opened for the queries above.
conn.close()