In [None]:
import numpy as np
import pandas as pd
from tqdm import tqdm
import re

# Profiles as features for authors

Here, we match the faculty profiles we created from external sources to the authors of papers published in finance journals and to the authors of the citing papers found on Google Scholar.

Then, we use the profile information as features for the authors.

In [None]:
# Faculty profiles compiled from external sources (first column is used as the index)
author_profiles = pd.read_excel("Author_Profile.xlsx", index_col=0)

# Papers published in JFE (first column is used as the index)
jfe = pd.read_excel("Published_Papers_JFE.xlsx", index_col=0)

# Citing papers collected from Google Scholar (GS)
df_ref = pd.read_excel("JFE_GS_DATA.xlsx")

In [None]:
#### Collect the distinct names of the authors who published in JFE and clean them

# 'Authors' holds one ', '-separated string per paper: split it into one entry
# per author and keep each distinct raw name once.
JFE_paper_aut = (
    jfe['Authors']
    .str.split(', ')
    .explode()
    .drop_duplicates()
    .tolist()
)

# Keep only the text that precedes the first non-breaking space (\xa0).
JFE_paper_aut1 = [raw_name.split('\xa0')[0] for raw_name in JFE_paper_aut]

def check_even(number):
    """Return the middle name(s) of a full name, or NaN when there are none.

    Despite its name, this helper is unrelated to parity: ``number`` is a
    full-name string. The name is split on whitespace; when it has at least
    three tokens, every token between the first and the last is joined with a
    single space and returned.

    Parameters
    ----------
    number : str
        Full name, e.g. ``"John D. Turner"``. (The parameter name is kept for
        backward compatibility.)

    Returns
    -------
    str or float
        The middle part of the name (``"D."`` in the example above), or
        ``np.nan`` when the name has fewer than three tokens or when the input
        is not a string (e.g. a missing value).
    """
    # Missing values (NaN/None) have no .split(); treat them as "no middle name"
    # instead of raising.
    if not isinstance(number, str):
        return np.nan

    tokens = number.split()
    if len(tokens) >= 3:
        return ' '.join(tokens[1:-1])
    return np.nan

In [None]:
# Build a name table for the JFE authors: the first whitespace-separated token
# is the first name, the last token is the last name, and anything in between
# is the middle name (NaN when absent).
JFE_paper_aut1 = pd.DataFrame(JFE_paper_aut1, columns = ['Authors'])
JFE_paper_aut1['First Name'] = JFE_paper_aut1['Authors'].apply(lambda x: ' '.join(x.split()[:1]))
JFE_paper_aut1['Middle Name'] = JFE_paper_aut1['Authors'].apply(check_even)
JFE_paper_aut1['Last Name'] = JFE_paper_aut1['Authors'].apply(lambda x: ' '.join(x.split()[-1:]))

# Normalise the casing of every string cell (first character upper-case, the
# rest lower-case) and drop rows that become identical afterwards.
# DataFrame.applymap is deprecated and emits a FutureWarning; mapping each
# column with Series.map is the element-wise equivalent and works on both old
# and new pandas versions. Non-string cells (NaN middle names) pass through.
JFE_paper_aut2 = (
    JFE_paper_aut1
    .apply(lambda column: column.map(lambda x: x.capitalize() if isinstance(x, str) else x))
    .drop_duplicates()
)
JFE_paper_aut2

  JFE_paper_aut2 = JFE_paper_aut1.applymap(lambda x: x.capitalize() if isinstance(x, str) else x).drop_duplicates()


Unnamed: 0,Authors,First Name,Middle Name,Last Name
0,Xuewen liu,Xuewen,,Liu
1,Pengfei wang,Pengfei,,Wang
2,Zhongchao yang,Zhongchao,,Yang
3,Yongjin kim,Yongjin,,Kim
4,Lars-alexander kuehn,Lars-alexander,,Kuehn
...,...,...,...,...
3443,Simon benninga,Simon,,Benninga
3444,Mark helmantel,Mark,,Helmantel
3445,Oded sarig,Oded,,Sarig
3446,Robert bloomfield,Robert,,Bloomfield


In [None]:
# Left join: keep every JFE author and attach the profile columns wherever the
# (last name, first name) pair is found in the faculty profiles.
JFE_authors_match = JFE_paper_aut2.merge(
    author_profiles,
    how='left',
    on=['Last Name', 'First Name'],
)

In [None]:
# Identify matched authors from faculty profiles: after the left join, a row
# with a non-missing 'University' is treated as having found a profile.
matched = JFE_authors_match[JFE_authors_match['University'].notna()]

In [None]:
# The same author can be matched more than once, e.g. a "John Smith" profile
# from both EFA and AFA. To count each matched author only once, keep the first
# matching row per original author string (Authors_x).
unique_matches = matched.drop_duplicates(subset='Authors_x')
unique_matches

Unnamed: 0,Authors_x,First Name,Middle Name_x,Last Name,Authors_y,University,Source,Middle Name_y,Country
1,Pengfei wang,Pengfei,,Wang,Pengfei wang,Professor of economics hong kong university of...,Cepr,,China
3,Yongjin kim,Yongjin,,Kim,Yongjin kim,"city university of hong kong, department of e...",Afa,,Hong Kong
5,Kai li,Kai,,Li,Kai li,university of british columbia,Abfer,,Canada
7,Gareth campbell,Gareth,,Campbell,Gareth campbell,Professor in finance queen's university belfast,Cepr,,Unknown
9,John d. turner,John,D.,Turner,John turner,"Professor of finance and financial history, qu...",Cepr,,Unknown
...,...,...,...,...,...,...,...,...,...
4127,David c. parsley,David,C.,Parsley,David c parsley,"vanderbilt university, owen graduate school o...",Afa,C,USA
4128,Michael b. mikhail,Michael,B.,Mikhail,Michael mikhail,university of illinois-chicago,Afa,,USA
4131,John h. cochrane,John,H.,Cochrane,John cochrane,"stanford university, hoover institution",Afa,,USA
4136,Oded sarig,Oded,,Sarig,Oded sarig,"tel aviv university, department of finance",Afa,,Israel


About half of the JFE authors are identified in the profile list collected from external sources.

In [None]:
# Now, identify authors of citing papers on GS with the profiles.
# Our motive here is to see how many authors of the citing papers are recognized finance faculty members.
# We expect to have much less identification here than we did for the authors of papers published on JFE.

# Drop the columns that are entirely empty and keep the first five remaining ones.
# .copy() makes df_jf1 an independent frame: new columns are added to it below,
# and assigning into a bare iloc slice can trigger pandas' SettingWithCopyWarning.
df_jf1 = df_ref.dropna(axis=1, how='all').iloc[:, :5].copy()

In [None]:
# Clean the data of authors of the GS citing papers

# Split every ', '-separated author string once and reuse the result below.
# Missing author lists are replaced by '' first; otherwise astype(str) would
# turn NaN into the literal author name 'nan'.
author_lists_jf1 = df_jf1['Authors'].fillna('').astype(str).apply(lambda x: x.split(', '))
max_authors_jf1 = author_lists_jf1.apply(len).max()

# One 'Author k' column per author position, padded with '' for papers that
# have fewer authors than the longest author list.
for i in range(max_authors_jf1):
    column_name = f'Author {i+1}'
    df_jf1[column_name] = author_lists_jf1.apply(lambda parts, i=i: parts[i] if i < len(parts) else '')

# Create new columns for each author
for i in range(max_authors_jf1):
    column_name = f'Author {i+1}'
    # Create new columns for first, middle, and last names
    df_jf1[f'First Name{i+1}'] = df_jf1[column_name].apply(lambda x: ' '.join(x.split()[:1]))
    df_jf1[f'Middle Name{i+1}'] = df_jf1[column_name].apply(check_even)
    df_jf1[f'Last Name{i+1}'] = df_jf1[column_name].apply(lambda x: ' '.join(x.split()[-1:]))

# Keep the first five columns plus everything from column position 12 onwards.
# NOTE(review): the hard-coded 12 presumably skips the 'Author k' columns, which
# is only exact when max_authors_jf1 == 7 -- confirm against the data.
gs_full1 = pd.concat([df_jf1.iloc[:, :5], df_jf1.iloc[:, 12:]], axis=1)

# Long list of distinct raw author strings. dropna() removes papers without an
# author list so that they do not become a spurious author called 'nan'.
gs_full_authors = gs_full1['Authors'].str.split(', ').explode().dropna().drop_duplicates().tolist()
# Keep only the text that precedes the first non-breaking space (\xa0).
gs_full_authors1 = [str(i).split('\xa0')[0] for i in gs_full_authors]
gs_full_df = pd.DataFrame(gs_full_authors1, columns = ['Authors'])
gs_full_df['First Name'] = gs_full_df['Authors'].apply(lambda x: ' '.join(x.split()[:1]))
gs_full_df['Middle Name'] = gs_full_df['Authors'].apply(check_even)
gs_full_df['Last Name'] = gs_full_df['Authors'].apply(lambda x: ' '.join(x.split()[-1:]))

# Normalise the casing of every string cell (first character upper-case, the
# rest lower-case). DataFrame.applymap is deprecated; mapping each column with
# Series.map is the element-wise equivalent. Non-string cells pass through.
gs_full_df1 = gs_full_df.apply(
    lambda column: column.map(lambda x: x.capitalize() if isinstance(x, str) else x)
)

In [None]:
# Preview the GS author table without fully identical rows. The result is only
# displayed; gs_full_df1 itself is not modified by this cell.
gs_full_df1.drop_duplicates()

Unnamed: 0,Authors,First Name,Middle Name,Last Name
0,A krishnamurthy,A,,Krishnamurthy
1,W li,W,,Li
2,W li,W,,Li
3,Z li,Z,,Li
4,S xu,S,,Xu
...,...,...,...,...
172979,Ас наседкина,Ас,,Наседкина
172980,F şahut,F,,Şahut
172981,C viorica,C,,Viorica
172982,粟芳， 初立苹,粟芳，,,初立苹


In [None]:
# Left join of the de-duplicated GS authors with the faculty profiles on the
# exact (last name, first name) pair.
GS_afa = gs_full_df1.drop_duplicates().merge(
    author_profiles,
    how='left',
    on=['Last Name', 'First Name'],
)

In [None]:
# Rows with a non-missing 'University' after the left join are treated as matched.
matched_gs = GS_afa[GS_afa['University'].notna()]

In [None]:
# Number of matched rows
matched_gs.shape[0]

58

58/150539 matches is a poor result. 

To improve upon this, we observe that there is a wide range of formats for the names of the authors of citing papers. Specifically, initials are commonly used, unlike the full names listed in our faculty profile list.

We account for this in the following by matching the names based not only on the entire first and last name, but also allowing for initials for first names. However, this approach might be misleading for common last names. 

In [None]:
# Keep one row per distinct combination of all columns before matching, so the
# matching loop below does not process fully identical rows twice.
gs_full_df1 = gs_full_df1.drop_duplicates()

# Function to check if the first name is the initial(s) of the first name in the profiles
def is_initials(first_name_trial, first_name_afa):
    """Return True when one first name is a leading part of the other.

    The two names are compared character by character (case-sensitively) over
    the length of the shorter one. An initial such as ``"A"`` therefore matches
    ``"Arvind"``; note that any longer shared prefix (``"Al"`` vs ``"Alice"``),
    in either direction, matches as well.

    Parameters
    ----------
    first_name_trial : str
        First name (often just an initial) taken from the citing-paper authors.
    first_name_afa : str
        First name taken from the faculty profile.

    Returns
    -------
    bool
        False when either name is missing or empty, otherwise whether the
        shorter name is a prefix of the longer one.
    """
    # Handle cases where either first name is NaN
    if pd.isna(first_name_trial) or pd.isna(first_name_afa):
        return False

    # An empty name has no characters to compare: all() over an empty zip is
    # vacuously True, which would "match" every profile. Reject it explicitly.
    if not first_name_trial or not first_name_afa:
        return False

    # Check if the trial first name is the initials of AFA first name
    return all(
        trial_char == afa_char
        for trial_char, afa_char in zip(first_name_trial, first_name_afa)
    )

# Index the profiles by lower-cased last name once, instead of re-lowercasing
# and re-scanning the whole profile table for every GS author. Profile order is
# preserved within each last name; non-string last names can never match.
profiles_by_last_name = {}
for profile_last, profile_first, profile_middle in zip(
    author_profiles['Last Name'],
    author_profiles['First Name'],
    author_profiles['Middle Name'],
):
    if isinstance(profile_last, str):
        profiles_by_last_name.setdefault(profile_last.lower(), []).append(
            (profile_first, profile_middle)
        )

# Snapshot the GS names before gs_full_df1 is updated in place below.
gs_names = list(zip(gs_full_df1.index, gs_full_df1['First Name'], gs_full_df1['Last Name']))

# Iterate over the GS authors and match them with the profiles
for index, first_name, last_name in gs_names:
    # Profiles sharing this (case-insensitive) last name
    candidates = profiles_by_last_name.get(last_name.lower(), [])

    for profile_first, profile_middle in candidates:
        # Check if the GS first name is the initial(s) of the profile first name
        if is_initials(first_name, profile_first):
            # Replace the GS first and middle names with the profile's ones so
            # that the exact merge below can find the profile.
            # NOTE(review): the first candidate in profile order wins, which is
            # arbitrary for common last names.
            gs_full_df1.at[index, 'First Name'] = profile_first
            gs_full_df1.at[index, 'Middle Name'] = profile_middle
            break  # Once a match is found, no need to check further

GS_afa = pd.merge(gs_full_df1,author_profiles, how = 'left', on = ['Last Name','First Name'])

In [None]:
# Rows with a non-missing 'University' after the left join are treated as matched.
matched_gs = GS_afa[GS_afa['University'].notna()]
# Keep the first matching profile row per original GS author string (Authors_x).
unique_matches_gs = matched_gs.drop_duplicates(subset='Authors_x')

In [None]:
# Display the matched GS authors, one row per distinct original author string.
unique_matches_gs

Unnamed: 0,Authors_x,First Name,Middle Name_x,Last Name,Authors_y,University,Source,Middle Name_y,Country
0,A krishnamurthy,Arvind,,Krishnamurthy,Arvind krishnamurthy,stanford university,Abfer,,USA
4,W li,Wendy,C.y.,Li,Wendy c.y. li,Executive director moon economics institute,Cepr,C.y.,Unknown
5,W li,Wendy,C.y.,Li,Wendy c.y. li,Executive director moon economics institute,Cepr,C.y.,Unknown
6,Z li,Zhan,,Li,Zhan li,"Postdoctoral researcher in economics, national...",Cepr,,China
8,K li,Kai,,Li,Kai li,university of british columbia,Abfer,,Canada
...,...,...,...,...,...,...,...,...,...
152487,C eckel,Carsten,,Eckel,Carsten eckel,Professor of economics bibliothek wirtscharfts...,Cepr,,Germany
152525,S winston smith,Stanley,D,Smith,Stanley d smith,university of central florida,Afa,D,USA
152538,R stehrer,Robert,,Stehrer,Robert stehrer,Scientific director the vienna institute for i...,Cepr,,Austria
152566,J kren,Janez,,Kren,Janez kren,Doctoral researcher ku leuven,Cepr,,Unknown


Now, the share of authors identified increases significantly, from about 0.04% (58 of 150,539) to 7.5%.