# Codes for MACS 30122 Final Project

In [2]:
# import libraries
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
import winsound

import recordlinkage
from recordlinkage.index import Full

import csv
import pandas as pd
import numpy as np
import re
import math
import sqlite3
import time
import random

import plotly.express as px
import plotly.graph_objects as go
import networkx as nx
import plotly.figure_factory as ff

from bertopic import BERTopic
from umap import UMAP
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

## 1. Data Scraping and Wrangling

### 1.1 Article Scraping

- 3 Main Sources of Aritcles: 1): Offcial Websites 2): econpapers.repec.org 3): ideas.repec.org
- Store every journal (20 in total) separately in a csv file.
- Take the example of ideas.repec.org (using requests): 

In [None]:
# Functions to scrape ideas.repec.org

def find_url_list(start_url, page_number):
    '''
    Get the papers url list for all the journal database/

    Inputs:
        start_url(str): The url of the first page of the journal database.
        page_number(int): The number of pages of the journal database.

    Outputs:
        Volume(DataFrame): A DataFrame record the title and url for all\
            papers of a specified journal.
    '''
    Volume = {}
    Volume = read_one_page(start_url)

    for i in range(2, page_number):
        url = start_url[:-5]
        page = "{}.html".format(i)
        url = url + page
        Volume_one_page = read_one_page(start_url)
        Volume["Title"].extend(Volume_one_page["Title"])
        Volume["Link"].extend(Volume_one_page["Link"])

    Volume = pd.DataFrame(data=Volume)
    return Volume


def read_one_page(url):
    '''
    Crawl the paper title and paper url in one page of the journal database.

    Inputs:
        url(str): The url of one page of the journal database.

    Outputs:
        Volume(Dictionary): The dict record the title and url for each paper.
    '''
    
    Volume = {"Title":[],"Link":[]}
    # Read the journal database page and find all the paper links.
    response = requests.get(url)
    soup = BeautifulSoup(response.text,"html")
    papers = soup.find_all("li", class_ = "list-group-item downgate")
    papers.extend(soup.find_all("li", class_ = "list-group-item downfree"))
    papers.extend(soup.find_all("li", class_ = "list-group-item downnone"))
    # Record all the papers title and link in Volume.
    for paper in papers:
        title = paper.a.text
        href = paper.a.get("href")
        Volume["Title"].append(title)
        Volume["Link"].append("https://ideas.repec.org" + href)
    
    return Volume


def crawl_all_url(Volume, start = None, end = None, sleep_time = 0.5):
    '''
    Crawl all the paper informations recorded in the Volume.

    Inputs:
        Volume(DataFrame): A DataFrame record the title and url for all\
            papers of a specified journal.
        start(int): The start Volume index of the paper we want to start to crawl.
        end(int): The end Volume index of the paper we want to end the crawl.
        sleep_time(int): The seelp time for the crawl for each papaer.

    Outputs:
        df(DataFrame): The DataFrame record all the crawl information for all\
            papers in the journal.
    '''

    df = pd.DataFrame({"journal":[], "title":[],"authors":[],"abstract":[],\
                       "cite":[], "JEL":[],"url":[],"vol":[], "doi":[], "year":[]})
    count = start
    # Iterate over all volumes
    for _, row in Volume.iloc[start: end].iterrows():
        title, link = row["Title"], row["Link"]
        print(count, "Currently at", title, "link =", link)

        # Iterate over articles of the volumes
        article = obtain_articles_infos(link)
        df = pd.concat([df, article],ignore_index=True)
        time.sleep(sleep_time)
        count += 1
    
    return df


def obtain_articles_infos(url):
    '''
    Obtain and record the article's journal name, title, authors, abstract,\
        cite, url, vol, publish time and the doi on the page of a single url.
        (This code is designed for The Economic Journal in ideas.repec.org\
        database as an example, we also crawl the other journals in other databases)
    
        
    Inputs:
        url(str): The string of the URL we want to crawl.

    Outputs:
        info(DataFrame): The DataFrame with all the recorded information we\
                         said above.
    '''
    
    # Define the structure of the DataFrame.
    info = {"journal":[], "title":[],"authors":[],"abstract":[], "cite":[],\
            "JEL":[],"url":[],"vol":[], "doi":[], "year":[]}
    journal = "The Economic Journal"

    # Request the url and get the soup.
    response = requests.get(url)
    soup = BeautifulSoup(response.text,"html")

    # Crawl the title of the journal
    title = soup.find("h1").text
    
    # ideas.repec.org have three type of keyword in cite and vol, try each keyword.
    try:
        cite = soup.find("li", class_="list-group-item downgate").text
    except:
        try:
            cite = soup.find("li", class_="list-group-item downfree").text
        except:
            try:
                cite = soup.find("li", class_="list-group-item downnone").text
            except:
                cite = "N\A"
                vol = "N\A"

    # If we crawl the cite successfully, clean and record cite and vol.
    if cite != "N\A":
        vol = re.split(r"vol.", cite)[1]
        cite = re.sub('(\t|\n)'," ",cite)
        cite = re.sub('\s+'," ",cite).strip()
        vol = re.sub('(\t|\n)'," ",vol)
        vol = re.sub('\s+'," ",vol).strip()
        vol.strip(".")

    # Crawl, clean and record the author names.
    try:
        authors = soup.find_all("li", class_="authorname")
        authors = [author.text for author in authors]
    except:
        authors = []        

    # Crawl, clean and record the Abstract.
    try:
        abstract = soup.find("div", id="abstract-body").text
    except:
        abstract = "N\A"

    # Crawl, clean and record the doi.
    try:
        doi = soup.find("div", id="biblio-body").text
        [doi] = re.findall(r"DOI:...+", doi)
        doi = re.sub(r"DOI:","", doi)
    except:
        doi = "N\A"

    # If there no "doi" in ideas.repec.org, record the url of the offical website.
    if doi == "N\A":
        try:
            doi = soup.find("span", style="word-break:break-all").text
        except:
            doi = "N\A"

    # Crawl, clean and record the publish time.
    try:
        year = soup.find("i", style="word-break:break-all").text
        [year] = re.findall(r"y:....", year)
        year = re.sub("y:","",year)
    except:
        year = "N\A"

    # Only journals in AEA(American Economic Association) have JEL.
    jel = "N\A"

    # Make all the information into DataFrame.
    info["journal"].append(journal)
    info["title"].append(title)
    info["authors"].append(authors)
    info["abstract"].append(abstract)
    info["cite"].append(cite)
    info["JEL"].append(jel)
    info["url"].append(url)
    info["vol"].append(vol)
    info["doi"].append(doi)
    info["year"].append(year)
    info = pd.DataFrame(info)

    return info



In [None]:
# implementation
# Get the url for all the papers in one journal.
Volume = find_url_list("https://ideas.repec.org/s/ecj/econjl.html", 18)

# Crawl the information of a certain papers in the journal. (the first 10 papers)
df = crawl_all_url(Volume, start = 0, end = 10, sleep_time = 0.5)

# Write the DataFrame to .csv
df.to_csv("The Economic Journal_test.csv", encoding="utf-8")

### 1.2 Article_author Link Scraping (Selenium)

- We scrape Google Scholars by searching for article doi and get for the authors.
- We generate/update a file containing article_links.
- We use Google userid as *authorid* and doi as *aritlceid*.
- Besides, we also download the Google Scholars profiles locally as html during this process.

In [None]:
# open chrome service
chromedriver_path = "D:/chromedriver.exe"
s = Service(chromedriver_path)
driver = webdriver.Chrome(service=s)

In [None]:
# initialize ids to record
authorids = []
articleids = []

# get existing author_article links
base_url = "https://scholar.google.com/scholar?hl=zh-CN&q="
start_idx = 0
existing_doi = list(pd.read_csv("author_article.csv", encoding = "utf-8-sig")["doi"])

for i, doi in enumerate(df["doi"][start_idx:]):

    # scrape if this article has not been scraped yet
    if ("10." == doi[:3]) and (doi not in existing_doi) and ("issue" not in doi):
        try:
            driver.get(base_url+doi)
            driver.refresh()
            soup = bs4.BeautifulSoup(driver.page_source, "html.parser")

            # get author links
            authordiv = soup.find("div", "gs_a")
            authorlinks = ["https://scholar.google.com/"+tag.get("href") for tag in authordiv.find_all("a")]

            # open author links to download profiles locally
            if authorlinks != []:
                for link in authorlinks:
                    # get authorid
                    authorid = link.split("user=")[1].split("&")[0]
                    driver.get(link)
                    driver.refresh()
                    soup = bs4.BeautifulSoup(driver.page_source, "html.parser")

                    # save to html, use authorid as filename
                    with open("data/authors/html/"+authorid+".html", "w", encoding = "utf-8-sig") as file:
                        file.write(str(soup))
                    file.close()

                    # append authorid
                    authorids.append(authorid)

                    # append articleid
                    articleids.append(doi)
                    print(str(i+start_idx)+","+authorid+","+doi)
                    time.sleep(random.randrange(1,10))     # random sleep for some time
            else:
                time.sleep(random.randrange(1,10))

        # no article is found
        except:
            # not blocked then go to next doi
            try:
                time.sleep(random.randrange(1,10))
                res = soup.find("div", id = "gs_res_ccl_top").text

            # get blocked, stop process and save to files
            except:
                with open("author_article.csv", "a", newline="") as file:
                    writer = csv.writer(file)
                    for j in range(len(authorids)):
                        writer.writerow([authorids[j],articleids[j]])
                file.close()
                print("Blocked!!!")         # print block information
                winsound.Beep(2500,1000)    # produce sound notification
                break

### 1.3 Author Information Scraping

- This step does not require use of requests or selenium since we have downloaded author profiles locally before.
- We scrape directly from local html files and save the following variables:
- *name, google userid, emailsuffix, miscellaneous information about titles, positions, h-index, citations for every year since 1980*

In [None]:
# get existing author records
authordic = pd.read_csv("author.csv", encoding = "utf-8-sig").to_dict("list")

# get author html files locally
files = os.listdir("data/authors/html/")


for i, file in enumerate(files):

    # if the local file has not been scraped, scrape it
    if file.replace(".html","") not in authordic["authorid"]:
        print(i,file)
        with open("data/authors/html/"+file, encoding = "utf-8-sig") as f:
            content = f.read()
            soup = BeautifulSoup(content, 'html.parser')
        f.close()

        # authorid
        authorid = file.replace(".html","")
        # name
        name = soup.find('div', id="gsc_prf_in").text
        # emailsuffix
        emailsuffix = re.findall("[a-zA-Z0-9._-]+", soup.find('div', id="gsc_prf_ivh").text)[0]
        # misc
        misc = soup.find('div',class_="gsc_prf_il").text
        # index
        indexspan = soup.find("table").find_all("td")
        for i,s in enumerate(indexspan):
            if s.text == "Citations":
                t2302cite = indexspan[i+1].text
                s18t2302cite = indexspan[i+2].text
            elif s.text == "h-index":
                t2302hindex = indexspan[i+1].text
                s18t2302hindex = indexspan[i+2].text
            elif s.text == "i10-index":
                t2302i10index = indexspan[i+1].text
                s18t2302i10index = indexspan[i+2].text

        # citations
        try:
            yeartags = soup.find("div", class_="gsc_md_hist_b").find_all("span", class_= "gsc_g_t")
            years = dict([(str(len(yeartags)-i), "cite"+tag.text) for i, tag in enumerate(yeartags)])
            cites = dict([(tag.get("style").split(":")[-1], tag.text) for tag in soup.find("div", class_="gsc_md_hist_b").find_all("a", class_= "gsc_g_a")])

            yearcitedic = {}
            for idx in years.keys():
                try:
                    yearcitedic[years[idx]] = cites[idx]
                except:
                    yearcitedic[years[idx]] = 0

            for i in range(2024-1980):
                if "cite"+str(1980+i) not in yearcitedic.keys():
                    authordic["cite"+str(1980+i)].append("")
                else:
                    authordic["cite"+str(1980+i)].append(yearcitedic["cite"+str(1980+i)])  # append citations number

        # no yearly citations info is found
        except:
            for i in range(2024-1980):
                authordic["cite"+str(1980+i)].append("")

        # append other info
        authordic["authorid"].append(authorid)
        authordic["authorname"].append(name)
        authordic["misc"].append(misc)
        authordic["emailsuffix"].append(emailsuffix.lower())
        authordic["t2302cite"].append(t2302cite)
        authordic["s18t2302cite"].append(s18t2302cite)
        authordic["t2302hindex"].append(t2302hindex)
        authordic["s18t2302hindex"].append(s18t2302hindex)
        authordic["t2302i10index"].append(t2302i10index)
        authordic["s18t2302i10index"].append(s18t2302i10index)
        authordic["affiliationid"].append("")   # set affiliationid empty for now, later update with email_affiliation.csv

# save data to local
df = pd.DataFrame(authordic)
df.to_csv("author.csv", index = False, encoding = "utf-8-sig")
df.to_csv("author_no_header.csv", index = False, header = False, encoding = "utf-8-sig")

### 1.4 Affiliation Scraping (Selenium)

- When we get the author.csv, we want to match these author records to their current affiliations.
- The miscellaneous information line about title, position, institution is too messy to use.
- The solution is we search their emailsuffix in Google Scholar to obtain the standard affiliation names.
- Actually, each institution will have only one main email suffix and different versions of subemail suffix.
- Thus we need to record the subemail-affliationid links as well.
- We also get the affiliation dataset containining, *name, affiliationid, main email suffix*.
- The affiliation id is generated by first letters of words in school name + . + emailsuffix.

In [None]:
# load existing email_affiliation links and author file
df_email = pd.read_csv("email_affiliation.csv", encoding = "utf-8-sig").replace(np.NaN,"")
emaildict = df_email.to_dict("list")
df_au = pd.read_csv("author.csv", encoding = "utf-8-sig")

# set base url and start_idx
base_url = "https://scholar.google.com/scholar?hl=zh-CN&q="
start_idx = 1

for i, e in enumerate(sorted(df_au["emailsuffix"].unique())[start_idx:]):

    # scrape if the subemail is not recorded
    if e not in emaildict["subemail"]:
        driver.get(base_url+e)
        driver.refresh()
        soup = BeautifulSoup(driver.page_source, "html.parser")
        h3 = soup.find("div", class_ = "gs_r").h3

        # get name and main email suffix
        div = soup.find("div", class_ = "gs_r").div
        name = div.a.text
        email = div.span.text.replace(" - ", "")
        time.sleep(random.randrange(5,10))
                
        # generate affiliation id
        affid = "".join([w[0] for w in name.lower().split(" ")])+"."+email

        # append to emaildictt
        emaildict["name"].append(name)
        emaildict["email"].append(email)
        emaildict["subemail"].append(e)
        emaildict["affiliationid"].append(affid)

        # monitor process
        print(i+start_idx, e, name, email, affid)

### 1.5 Match Affiliation with Its Economic Research Rank

- We match affiliations with their economic research rank (if any) evaluated by ideas.repec.org based on the last 10 years' performance.
- We use record_linkage to compute the string simiarity scores of names in our dataset and in the rank file.
- We select the highest matched string in the rank file for each institution in our dataset.
- We use 0.9 as a threshold for a correct match.
- For records below 0.9 and above 0.8, we do eyeballing to select correct matches.

In [3]:
# Affiliations to be matched are stored in affiliations_df
# IDEAS ranking of economic institutions is stored in university_df

###  Preprocessing
path = "/Users/zhangyicheng/Library/CloudStorage/Dropbox/My Mac (张苡铖’s MacBook Pro (2))/Desktop/MACS122/Project/"
affiliations_df = pd.read_csv(path +"affiliation.csv")
affiliations_df["institution"] = affiliations_df.name

# Take out the location part in the name
for i, row in affiliations_df.iterrows():
    ls = row.loc["name"].split(",")
    ele = ls[0] 
    row.loc['institution'] = ele

# To make the comparison more sensitive, strip "university" and "of" from the string
affiliations_df["institution"] = affiliations_df["institution"]\
                                    .apply(lambda x: x.lower())\
                                    .apply(lambda x: x.strip())\
                                    .apply(lambda x: x.replace("university",""))\
                                    .apply(lambda x: x.replace("of",""))


path = "/Users/zhangyicheng/Library/CloudStorage/Dropbox/My Mac (张苡铖’s MacBook Pro (2))/Desktop/MACS122/Project/code/"
university_df = pd.read_csv(path +"university_ranking.csv")
university_df = university_df.rename(columns={"rank": "Rank"})

university_df["institution"] = university_df["University"]\
                                    .apply(lambda x: x.lower())\
                                    .apply(lambda x: x.strip())\
                                    .apply(lambda x: x.replace("university",""))\
                                    .apply(lambda x: x.replace("of",""))\
                                    .apply(lambda x: re.sub("\(.*\)","",x))
university_df = university_df.drop_duplicates()

###  Indexing
index_full = Full()
candidate_links = index_full.index(affiliations_df, university_df)

our_comparison = recordlinkage.Compare()
# compare all our columns
our_comparison.string("institution", "institution", method = "jarowinkler", label = "jw_uni_name")
features = our_comparison.compute(candidate_links, affiliations_df, university_df)

thr_matches = features[(features["jw_uni_name"] >= 0)]

# Store the matching info to a dictionary of list
jwdic = {}
for i, row in thr_matches.iterrows():
    author_index = i[0]
    if author_index in jwdic:
        jwdic[author_index].append((row.jw_uni_name, i[1]))                                     
    else: 
        jwdic[author_index] = []

###  Get the best match (highest match score) for each author
links = {"author_index":[], "university_index":[], "match_score":[]}
for key, values in jwdic.items():
    max_score = 0
    for value in values:
        if value[0] > max_score:
            max_score = value[0]
            index = value[1]
        else:
            continue
    links["author_index"].append(key)
    links["university_index"].append(index)
    links["match_score"].append(max_score)

links_df = pd.DataFrame(links)

# Concatenate the two dataframe 
links_df["author_ins"] = links_df["author_index"]
links_df["univer_ins"] = links_df["author_index"]
links_df["Rank"] = links_df["author_index"]

for r, row in links_df.iterrows():
    author_uni = affiliations_df.iloc[int(row.author_index)].institution
    univer = university_df.iloc[int(row.university_index)].University
    rank = university_df.iloc[int(row.university_index)].Rank
    links_df["author_ins"].at[r] = author_uni
    links_df["univer_ins"].at[r] = univer
    links_df["Rank"].at[r] = rank

pd_con = pd.concat([affiliations_df, links_df], axis=1)
affilication_ranking_linked = pd_con[["affiliationid", "name", "email", "match_score", "univer_ins", "Rank"]]
affilication_ranking_linked.head(10)





Unnamed: 0,affiliationid,name,email,match_score,univer_ins,Rank
0,au.aalto.fi,Aalto University,aalto.fi,0.796296,Carleton University,309
1,kriot.kth.se,KTH Royal Institute of Technology,kth.se,0.747008,Beijing Institute of Technology,78
2,uocla.ucla.edu,"University of California, Los Angeles",ucla.edu,0.933333,University of California-Davis,39
3,aei.aei.org,American Enterprise Institute,aei.org,0.759988,Economic and Social Research Institute,212
4,miot.mit.edu,Massachusetts Institute of Technology,mit.edu,1.0,Massachusetts Institute of Technology,8
5,ciot.caltech.edu,California Institute of Technology,caltech.edu,0.867773,Beijing Institute of Technology,78
6,hu.harvard.edu,Harvard University,harvard.edu,1.0,Harvard University,5
7,su.stanford.edu,Stanford University,stanford.edu,1.0,Stanford University,14
8,uop.upenn.edu,University of Pennsylvania,upenn.edu,1.0,University of Pennsylvania,55
9,ai.amazon.com,Amazon Inc.,amazon.com,0.693182,Chapman University,448


### 1.6 Clean Article Dataset

- The main task is to match the issue information of each article to a data (for later Dynamic TM).
- Other cleaning includes select only articles with doi available, and replace out string "Abstract:" in abstarcts.
- After cleaning and selecting all valid articles in all 20 journals, we concatenate them into a single csv for database storage purpose.
- Take the example of *Journal of Economic Perspectives*:

In [None]:
# clean jep
df_jep = pd.read_csv("data/journals/raw_data/jep.csv", encoding = "utf-8-sig").astype("str")

# replace out miscellaneous strings in abstracts, volumes, doi
df_jep["abstract"] = df_jep["abstract"].apply(lambda x: x.replace("\n","").replace("\t","").replace("Abstract","").strip())
df_jep["volume"] = df_jep["volume"].apply(lambda x: x.replace("\n","").replace("\t","").strip())
df_jep["doi"] = df_jep["doi"].apply(lambda x: x.strip())
df_jep = df_jep[df_jep["doi"].str[:3] == "10."].drop_duplicates(subset = ["doi"]).reset_index(drop=True)

# clean volume to get dates
dates = []
for idx in df_jep.index:
    date_text = df_jep.loc[idx,"volume"].lower().split(" ")
    season = date_text[0]
    year = date_text[1]
    # assign dates according to issuing season
    if season == "spring":
        md = "02-15"
    elif season == "summer":
        md = "05-15"
    elif season == "fall":
        md = "08-15"
    elif season == "winter":
        md = "11-15"

    dates.append(year+"-"+md)

# append dates
df_jep["date"] = dates
# select desired attributes
df_jep_s = df_jep[["journal","title","authors","volume","jel","abstract","url","doi","date"]]

### 1.7 Database Creation

- Our database will have 4 tables, *article, author, affiliation, author_article*.
- We write a sql file (create-econtop.sql) to create this database from csv files.
- We run the below code to create it in ipynb:

In [None]:
# use this line to run the script directly in Jupyter
! sqlite3 ../data/econtop.db < create-econtop.sql

## 2. Basic Descriptive Stats of Dataset

- We mainly focus on the publications during 2012-2022, i.e., the past decade.

In [2]:
# connect to database
conn = sqlite3.connect("../data/econtop.db")
cur = conn.cursor()

### 2.1 Show Number of Ariticles by Journals

In [3]:
# extract title, date, journal, from database, starting from 2012-2022
q = """
    SELECT art.doi, art.journal, art.title, art.date
    FROM author_article AS aa JOIN author AS au JOIN article AS art JOIN affiliation AS af
    ON aa.authorid = au.authorid AND
    aa.doi = art.doi AND
    au.affiliationid = af.affiliationid
    WHERE art.date >= '2012-01-01' AND
    art.date < '2023-01-01'
    """

# remove duplicated abstracts, set as strings
df = pd.read_sql_query(q, conn).drop_duplicates(subset=["doi"]).astype("str")
df["date"] = pd.to_datetime(df["date"])   # convert date to datetime
df.head()

Unnamed: 0,doi,journal,title,date
0,10.1093/rfs/hhr069,Review of Financial Studies,The Inventory Growth Spread,2012-01-15
2,10.1093/rfs/hhr109,Review of Financial Studies,Takeovers and Divergence of Investor Opinion,2012-01-15
3,10.1093/rfs/hhr081,Review of Financial Studies,Corporate Governance Objectives of Labor Union...,2012-01-15
4,10.1093/rfs/hhr076,Review of Financial Studies,Managerial Attributes and Executive Compensation,2012-01-15
7,10.1093/rfs/hhr092,Review of Financial Studies,The Road Less Traveled: Strategy Distinctivene...,2012-01-15


In [4]:
# grougby and count
df_count_by_journal = df.groupby("journal").count()[["doi"]].rename(columns={"doi":"cnt"})

fig = go.Figure()
fig.add_trace(
    go.Bar(
        y = df_count_by_journal.index,
        x = df_count_by_journal.cnt,
        text = df_count_by_journal.cnt,
        orientation = "h",
        marker={'color': df_count_by_journal.cnt,
        'colorscale': 'Peach'}
    )
)
fig.update_layout(title_text="Number of Collected Publications from 2012 to 2022 by Journal",
#                 template="plotly_dark",
                  yaxis={'categoryorder':'total ascending'},
                  width=800,
                  height=600
                  )
fig.show()

### 2.2 Show Top 20 Authors who have most Publications

In [5]:
# extract title, date, journal, from database, starting from 2012-2022
q = """
    SELECT art.doi, au.authorname, af.name, au.authorid, art.title, art.date
    FROM author_article AS aa JOIN author AS au JOIN article AS art JOIN affiliation AS af
    ON aa.authorid = au.authorid AND
    aa.doi = art.doi AND
    au.affiliationid = af.affiliationid
    WHERE art.date >= '2012-01-01' AND
    art.date < '2023-01-01'
    """

# remove duplicated abstracts, set as strings
df = pd.read_sql_query(q, conn).astype("str")
df["date"] = pd.to_datetime(df["date"])   # convert date to datetime
df.head()

Unnamed: 0,doi,authorname,name,authorid,title,date
0,10.1093/rfs/hhr069,Frederico Belo,INSEAD,4Weq9mEAAAAJ,The Inventory Growth Spread,2012-01-15
1,10.1093/rfs/hhr069,Xiaoji Lin,University of Minnesota,7QwB9o8AAAAJ,The Inventory Growth Spread,2012-01-15
2,10.1093/rfs/hhr109,Kose John,New York University,dJlFSXkAAAAJ,Takeovers and Divergence of Investor Opinion,2012-01-15
3,10.1093/rfs/hhr081,Ashwini Agrawal,London School of Economics,apUAKLgAAAAJ,Corporate Governance Objectives of Labor Union...,2012-01-15
4,10.1093/rfs/hhr076,Si Li,Wilfrid Laurier University,lmzraDAAAAAJ,Managerial Attributes and Executive Compensation,2012-01-15


In [6]:
# Top 20 Authors
df_count_by_author = df.groupby("authorid").count()[["doi"]].rename(columns={"doi":"cnt"}).sort_values("cnt", ascending=False)[:20]
# match authorid to name
names = []
for id in df_count_by_author.index:
    for idx in df.index:
        if id == df.loc[idx,"authorid"]:
            names.append(df.loc[idx,"authorname"]+", "+df.loc[idx,"name"])
            break

# reset index
df_count_by_author["name"] = names

fig = go.Figure()
fig.add_trace(
    go.Bar(
        y = df_count_by_author.name,
        x = df_count_by_author.cnt,
        text = df_count_by_author.cnt,
        orientation = "h",
        marker={'color': df_count_by_author.cnt,
        'colorscale': 'Tealgrn'}
    )
)
fig.update_layout(title_text="Number of Collected Publications from 2012 to 2022 by Authors",
#                 template="plotly_dark",
                  yaxis={'categoryorder':'total ascending'},
                  width=1200,
                  height=700
                  )
fig.show()

### 2.3 Show Top 20 Institutions who have authors with top publications

In [7]:
# extract title, date, journal, from database, starting from 2012-2022
q = """
    SELECT art.doi, af.name, art.title, art.date
    FROM author_article AS aa JOIN author AS au JOIN article AS art JOIN affiliation AS af
    ON aa.authorid = au.authorid AND
    aa.doi = art.doi AND
    au.affiliationid = af.affiliationid
    WHERE art.date >= '2012-01-01' AND
    art.date < '2023-01-01'
    """

# remove duplicated abstracts, set as strings
df = pd.read_sql_query(q, conn).drop_duplicates(subset=["doi","name"]).astype("str")
df["date"] = pd.to_datetime(df["date"])   # convert date to datetime
df.head()

Unnamed: 0,doi,name,title,date
0,10.1093/rfs/hhr069,INSEAD,The Inventory Growth Spread,2012-01-15
1,10.1093/rfs/hhr069,University of Minnesota,The Inventory Growth Spread,2012-01-15
2,10.1093/rfs/hhr109,New York University,Takeovers and Divergence of Investor Opinion,2012-01-15
3,10.1093/rfs/hhr081,London School of Economics,Corporate Governance Objectives of Labor Union...,2012-01-15
4,10.1093/rfs/hhr076,Wilfrid Laurier University,Managerial Attributes and Executive Compensation,2012-01-15


In [8]:
# Top 20 Institutions
df_count_by_aff = df.groupby("name").count()[["doi"]].rename(columns={"doi":"cnt"}).sort_values("cnt", ascending=False)[:20]

fig = go.Figure()
fig.add_trace(
    go.Bar(
        y = df_count_by_aff.index,
        x = df_count_by_aff.cnt,
        text = df_count_by_aff.cnt,
        orientation = "h",
        marker={'color': df_count_by_aff.cnt,
        'colorscale': 'Purpor'}
    )
)
fig.update_layout(title_text="Number of Collected Publications from 2012 to 2022 by Institutions",
#                 template="plotly_dark",
                  yaxis={'categoryorder':'total ascending'},
                  width=1000,
                  height=700
                  )
fig.show()

## 3. Collaboration Network Analysis

- This section does for the network analysis for Top Economics publications collaboration from 2012 to 2022.
- We create a new variable here as "Tier". Top 1-50 will be Tier0, Top 51-100 will be Tier1, so on and so forth.

3.1 Draw Data from Database and Append Tier Information

In [9]:
# extract journal, title, date, institution name, from 2012 to 2022
q = """
    SELECT  art.doi, art.journal, art.title, art.date, au.authorname, af.name, af.rank
    FROM author_article AS aa JOIN author AS au JOIN article AS art JOIN affiliation AS af
    ON aa.authorid = au.authorid AND
    aa.doi = art.doi AND
    au.affiliationid = af.affiliationid
    WHERE art.date >= '2012-01-01' AND
    art.date < '2023-01-01'
    """

df = pd.read_sql_query(q, conn).astype("str")
df["date"] = pd.to_datetime(df["date"])   # convert date to datetime
df.set_index("doi", inplace = True)
df.head()

Unnamed: 0_level_0,journal,title,date,authorname,name,rank
doi,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
10.1093/rfs/hhr069,Review of Financial Studies,The Inventory Growth Spread,2012-01-15,Frederico Belo,INSEAD,
10.1093/rfs/hhr069,Review of Financial Studies,The Inventory Growth Spread,2012-01-15,Xiaoji Lin,University of Minnesota,164.0
10.1093/rfs/hhr109,Review of Financial Studies,Takeovers and Divergence of Investor Opinion,2012-01-15,Kose John,New York University,50.0
10.1093/rfs/hhr081,Review of Financial Studies,Corporate Governance Objectives of Labor Union...,2012-01-15,Ashwini Agrawal,London School of Economics,1.0
10.1093/rfs/hhr076,Review of Financial Studies,Managerial Attributes and Executive Compensation,2012-01-15,Si Li,Wilfrid Laurier University,


In [10]:
# add institutions tiers to each record, not no rank, assign -1
df["tier"] = df["rank"].apply(lambda x: -1 if x == "" else int(x) // 50)

### 3.2 Institutional Level Analysis

#### 3.2.1 Get Institutional Collaboration Links

In [11]:
# get collaboration freq, dictionary {(ins1, ins2): num of colab, [...,...]: ...}, tuple as keys
# note that it is possible to have (ins1, ins1) because there can be co-authors from the same university!
links = {}

# iterate over all doi
for doi in df.index:
    try:
        df.loc[doi][0]  # check if is only single record, if not there is collaboration!
    except:
        df_new = df.loc[doi].sort_values("name")   # to prevent duplicated records like (A, B) and (B, A) in the results
        length = len(df_new)
        
        # go over all authors for this article
        for i in range(length-1):
            for j in range(length-1-i):
                try:
                    links[(df_new["name"][i], df_new["name"][i+j+1])] += 1
                except:
                    links[(df_new["name"][i], df_new["name"][i+j+1])] = 1

#### 3.2.2 Convert into a list of triple-element lists

In [13]:
# convert dict into a list of triple-element lists ([[ins1,ins2,num_coau],[,,],[,,],])

# all links for cross-rank scores analysis
links_all = [[key[0],key[1],links[key]] for key in links.keys()]

# cross-school links for network construction
links_diff = [[key[0],key[1],links[key]] for key in links.keys() if key[0] != key[1]]

#### 3.2.3 Institutional-level Network Density

In [16]:
# building institutioanl network here
G = nx.Graph()

G.add_weighted_edges_from(links_diff)  # add nodes, edges, and weights

# show density
print("Institutional: ", nx.density(G))

Institutional:  0.012322998537377243


#### 3.2.4 Institutional-level Network Centralities

In [17]:
# betweenness centrality
betweenness_centrality = nx.centrality.betweenness_centrality(G)
betweenness_list = (sorted(betweenness_centrality.items(), key=lambda item: item[1], reverse=True))[:10]

In [18]:
fig = go.Figure()
fig.add_trace(
    go.Bar(
        y = [item[0] for item in betweenness_list],
        x = [item[1] for item in betweenness_list],
        text = [round(item[1],4) for item in betweenness_list],
        orientation = "h",
#       marker_color = "gold",
        marker={'color': [item[1] for item in betweenness_list],
                'colorscale': 'Gnbu'}
    )
)
fig.update_layout(title_text="Top 10 Betweenness Centralities of Economic Instutitions",
#                 template="plotly_dark",
                  yaxis={'categoryorder':'total ascending'},
                  width=800,
                  height=600
                  )
fig.show()

In [19]:
# degree centrality
degree_centrality = nx.centrality.degree_centrality(G)  
degree_list = (sorted(degree_centrality.items(), key=lambda item: item[1], reverse=True))[:10]

In [20]:
fig = go.Figure()
fig.add_trace(
    go.Bar(
        y = [item[0] for item in degree_list],
        x = [item[1] for item in degree_list],
        text = [round(item[1],4) for item in degree_list],
        orientation = "h",
        marker={'color': [item[1] for item in degree_list],
                'colorscale': 'Peach'}
    )
)
fig.update_layout(title_text="Top 10 Degree Centralities of Economic Instutitions",
#                 template="plotly_dark",
                  yaxis={'categoryorder':'total ascending'},
                  width=800,
                  height=600
                  )
fig.show()

In [21]:
# eigenvector centrality
eigenvector_centrality = nx.centrality.eigenvector_centrality(G)
eigenvector_list = (sorted(eigenvector_centrality.items(), key=lambda item: item[1], reverse=True))[:10]

In [22]:
fig = go.Figure()
fig.add_trace(
    go.Bar(
        y = [item[0] for item in eigenvector_list],
        x = [item[1] for item in eigenvector_list],
        text = [round(item[1],4) for item in eigenvector_list],
        orientation = "h",
 #      marker_color = "skyblue",
        marker={'color': [item[1] for item in eigenvector_list],
                'colorscale': 'Purpor'}
    )
)
fig.update_layout(title_text="Top 10 Eigenvector Centralities of Economic Instutitions",
#                 template="plotly_dark",
                  yaxis={'categoryorder':'total ascending'},
                  width=800,
                  height=600
                  )
fig.show()

In [23]:
# add up 3 centrality and take average
average_centrality = [(key,
                      (degree_centrality[key]+
                       betweenness_centrality[key]+
                       eigenvector_centrality[key]) / 3) for key in degree_centrality.keys()]

centrality_list = (sorted(degree_centrality.items(), key=lambda item: item[1], reverse=True))[:10]

In [24]:
fig = go.Figure()
fig.add_trace(
    go.Bar(
        y = [item[0] for item in centrality_list],
        x = [item[1] for item in centrality_list],
        text = [round(item[1],4) for item in centrality_list],
        orientation = "h",
 #      marker_color = "skyblue",
        marker={'color': [item[1] for item in centrality_list],
                'colorscale': 'Reds'}
    )
)
fig.update_layout(title_text="Top 10 Average Centralities of Economic Instutitions",
#                 template="plotly_dark",
                  yaxis={'categoryorder':'total ascending'},
                  width=800,
                  height=600
                  )
fig.show()

#### 3.2.5 Institutional-level Cross-rank Score Analysis

- We define the cross-rank score of Institution A to be (sum of absolute difference in ranks against another Institution times the cooperation number) / (total collaborations).
- The cross-rank score reflects how likely an institution is to collaborate those of different levels.
- e.g., an institution which only has within-institution collaboration will have a 0 cross-rank score.

In [25]:
# calculate school's cross-rank score to measure cooperation freedom cross hieraichy
# we only consider the institutions which are ranked as Top 10% of all economic instutitions (around 200)

df_univ_rank = df[["name","rank"]][df["rank"] != ""].drop_duplicates().set_index("name").astype({"rank":"int"})
crscores = []   # list of cross-rank scores

for univ in df_univ_rank.index:
    sum_coau_num = 0
    sum_score = 0
    for link in links_all:
        if univ in link:
            try:
                univ_a = df_univ_rank.loc[link[0],"rank"]
                univ_b = df_univ_rank.loc[link[1],"rank"]
                # add coauthor number, add crscores
                sum_coau_num += link[2]   # add on number of coauthorships
                sum_score += abs(univ_a-univ_b) * link[2]   # add on rank difference * number of coauthorships
            except:
                pass   # the coauthor univ not ranked, pass

    # append to list
    try:
        crscores.append(sum_score / sum_coau_num)   # to deal with zero denominator
    except:
        crscores.append(0)

df_univ_rank["crscore"] = crscores   # append to df
df_univ_rank.head()    # show result

Unnamed: 0_level_0,rank,crscore
name,Unnamed: 1_level_1,Unnamed: 2_level_1
University of Minnesota,164,118.29595
New York University,50,90.403344
London School of Economics,1,96.413699
McMaster University,449,233.791667
Duke University,84,92.403131


In [27]:
# plot the distribution
fig = ff.create_distplot([df_univ_rank["crscore"]], 
                         ["Matched Top 10% Economic Institutions"],
                         bin_size=20,
                         show_rug = False
                         )

fig.update_layout(title_text = "Cross-rank Scores Distributuion",
                  xaxis_title = "Cross-rank Scores",
                  yaxis_title = "Density",
#                 template = "plotly_dark",
                  width = 600,
                  height = 500,
                  legend=dict(
                    yanchor="bottom",
                    y=1.00,
                    xanchor="left",
                    x=0.35
                    )
                  )
fig.show()

In [28]:
# show cross-rank against institition ranks
fig = px.scatter(df_univ_rank, 
                 x="rank", 
                 y="crscore", 
                 trendline="lowess",
                 trendline_options=dict(frac=0.5),
                 color="crscore",
                 color_continuous_scale=px.colors.sequential.Purp)

fig.update_layout(title_text = "Cross-rank Score vs Rank by Economic Research",
                  xaxis_title = "Rank",
                  yaxis_title = "Cross-rank Score",
#                 template = "plotly_dark",
                  width = 800,
                  height = 600,
                  legend=dict(
                    yanchor="bottom",
                    y=1.00,
                    xanchor="left",
                    x=0.35
                    )
                  )
fig.show()

#### 3.2.6 Tier-level Analysis

- Similar to the previous analysis of institutional level, here we do something similar at tier level.
- The tiers as mentioned before, cluster institutions by their ranks.
- We have 11 tiers in total.

In [29]:
# collaboration freq, dictionary {(tier1, tier2): num of colab, [...,...]: ...}, tuple as keys
links_tiers = {}

for doi in df.index:
    try:
        df.loc[doi][0]  # check if is only single record, if not there is collaboration!
    except:
        df_new = df.loc[doi].sort_values("tier")   # to prevent duplicated records like (A, B) and (B, A) in the results
        length = len(df_new)

        for i in range(length-1):
            for j in range(length-1-i):
                try:
                    links_tiers[(df_new["tier"][i], df_new["tier"][i+j+1])] += 1
                except:
                    links_tiers[(df_new["tier"][i], df_new["tier"][i+j+1])] = 1

In [30]:
# convert to list of lists
links_all_tiers = [[key[0],key[1],links_tiers[key]] for key in links_tiers.keys()]
links_diff_tiers = [[key[0],key[1],links_tiers[key]] for key in links_tiers.keys() if key[0] != key[1]]

In [31]:
# build tier level network
G_tier = nx.Graph()
G_tier.add_weighted_edges_from([link for link in links_diff_tiers if (link[0] != -1) and (link[1] != -1)])
# show density
print("Tier: ", nx.density(G_tier))

Tier:  1.0


In [32]:
# calculate tier-level cross-rank scores

df_tier_rank = df[["tier"]][df["tier"] != -1].drop_duplicates().set_index("tier")
crscores = []   # list of cross-rank scores
coau_num = []

for tier in df_tier_rank.index:
    sum_coau_num = 0
    sum_score = 0
    for link in links_all_tiers:
        if tier in link:
            univ_a = link[0]
            univ_b = link[1]
            sum_coau_num += link[2]   # add on number of coauthorships
            sum_score += 50 * abs(univ_a-univ_b) * link[2]   # add on rank difference * number of coauthorships

    # append to list
    try:
        crscores.append(round(sum_score / sum_coau_num,2))   # to deal with zero denominator
    except:
        crscores.append(0)

    coau_num.append(sum_coau_num)

df_tier_rank["crscore"] = crscores   # append to df
df_tier_rank["totalcoau"] = coau_num
df_tier_rank["txt"] = ["Tier"+str(tier)+": "+str(df_tier_rank.loc[tier,"crscore"]) for tier in df_tier_rank.index]

In [34]:
fig = px.scatter(df_tier_rank, 
                 x=df_tier_rank.index, 
                 y="crscore", 
#                trendline="lowess",
#                trendline_options=dict(frac=0.5),
                 color="crscore",
				 size="totalcoau",
				 size_max=150,
                 color_continuous_scale=px.colors.sequential.Purp,
                 text="txt"
                 )

fig.update_traces(marker=dict(
                  line=dict(width=3,
                  color='Skyblue')),
                  textposition='middle center',
				 )
				 
fig.update_layout(title_text = "Cross-rank Score vs Tier by Economic Research",
                  xaxis_title = "Tier",
                  yaxis_title = "Cross-rank Score",
#                 template = "plotly_dark",
                  width = 1200,
                  height = 800,
                  legend=dict(
                  yanchor="bottom",
                  y=1.00,
                  xanchor="left",
                  x=0.35
                  )
                  )
fig.show()

#### 3.2.7 Save Network to gexf file

- Save Network to gexf file for further processing in Gephi
- The generated figures can be found in present/graph folder, slides, and final report.

In [None]:
# save to gexf file, and further process in Gephi interface
nx.write_gexf(G, 'raw_institutional_network.gexf')
nx.write_gexf(G_tier, 'raw_tier_network.gexf')

## 4. Dynamic Topic Modeling

- In this section, we use BERTopic technique to conduct TM analysis of abstracts collected from 2012-2022.

### 4.1 Get and Clean Date

In [41]:
# connect to database
conn = sqlite3.connect("../data/econtop.db")
cur = conn.cursor()

In [56]:
# extract title, abstract, date, and journal from database, starting from 2012-2022
q = """
    SELECT art.doi, art.journal, art.title, art.abstract, art.date, af.name, af.rank
    FROM author_article AS aa JOIN author AS au JOIN article AS art JOIN affiliation AS af
    ON aa.authorid = au.authorid AND
    aa.doi = art.doi AND
    au.affiliationid = af.affiliationid
    WHERE art.date >= '2012-01-01' AND
    art.date < '2023-01-01'
    """

# remove duplicated abstracts, set as strings
df = pd.read_sql_query(q, conn).drop_duplicates(subset=["abstract"]).astype("str")
df["date"] = pd.to_datetime(df["date"])   # convert date to datetime
df = df[(df["abstract"] != "nan") & (df["abstract"] != "N\A")].dropna(subset=["abstract"])
df.head()

Unnamed: 0,doi,journal,title,abstract,date,name,rank
0,10.1093/rfs/hhr069,Review of Financial Studies,The Inventory Growth Spread,Previous studies show that firms with low inve...,2012-01-15,INSEAD,
2,10.1093/rfs/hhr109,Review of Financial Studies,Takeovers and Divergence of Investor Opinion,We test several hypotheses on how takeover pre...,2012-01-15,New York University,50.0
3,10.1093/rfs/hhr081,Review of Financial Studies,Corporate Governance Objectives of Labor Union...,Labor union pension funds have become increasi...,2012-01-15,London School of Economics,1.0
4,10.1093/rfs/hhr076,Review of Financial Studies,Managerial Attributes and Executive Compensation,We study the role of firm- and manager-specifi...,2012-01-15,Wilfrid Laurier University,
7,10.1093/rfs/hhr092,Review of Financial Studies,The Road Less Traveled: Strategy Distinctivene...,We investigate whether skilled hedge fund mana...,2012-01-15,"University of California, Irvine",


In [57]:
# generate time bins
timebins = []
for i in range(2023-2012):
    timebins.append((pd.to_datetime(str(i+2012)+"-01-01"), pd.to_datetime(str(i+2013)+"-01-01")))

# sample 1400 articles from each year
df_s = pd.concat([df[(df["date"] < bin[1]) & (df["date"] >= bin[0])].sample(n=1400, random_state=0) for bin in timebins]).reset_index(drop=True)
df_s.tail(5)

Unnamed: 0,doi,journal,title,abstract,date,name,rank
15395,10.1093/restud/rdab057,Review of Economic Studies,A Theory of Monetary Union and Financial Integ...,"Since the creation of the euro, capital flows ...",2022-08-01,Centre de Recerca en Economia Internacional,
15396,10.1016/j.econmod.2022.105874,Economic Modelling,"COVID-19 regulations, culture, and the environ...",The economic and social disruptions caused by ...,2022-08-15,Nanyang Technological University,457.0
15397,10.1016/j.euroecorev.2022.104090,European Economic Review,Signalling creditworthiness with fiscal austerity,Sovereign borrowers may tighten their fiscal s...,2022-05-15,Bocconi University,
15398,10.1016/j.econmod.2022.105891,Economic Modelling,Unemployment claims during COVID-19 and econom...,Governments want to know how effective COVID-1...,2022-08-15,University of Macedonia,
15399,10.1016/j.econlet.2022.110806,Economics Letters,The personal saving rate: Data revisions and f...,Revisions to the U.S. personal saving rate are...,2022-10-15,University of Richmond,


In [58]:
# clear stopwords in abstracts
sw = stopwords.words("english")+["jel"]
df_s.abstract = df_s.abstract.apply(lambda x: " ".join([word for word in word_tokenize(x.lower()) if word not in sw]))

In [59]:
# get cleaned abstracts text and dates
abstracts = df_s.abstract.to_list()
dates = df_s['date'].to_list()

### 4.2 Run BERTopic TM on All Abstracts

In [60]:
# n_neighbors: smaller local strcuture lose connection, bigger all connection lose local structure
# n_component: dimensionality of reduced dimension space
# min_dist: how tightly UMAP is allowed to pack points together
umap_model = UMAP(n_neighbors=15, n_components=5, 
                  min_dist=0.01, metric='euclidean', random_state=1)

topic_model = BERTopic(min_topic_size=70,verbose=True,umap_model=umap_model)
topics, probs = topic_model.fit_transform(abstracts)
topics_over_time = topic_model.topics_over_time(abstracts, dates, evolution_tuning=True, global_tuning=True,  nr_bins=60)

Batches:   0%|          | 0/482 [00:00<?, ?it/s]

2023-03-06 17:41:16,849 - BERTopic - Transformed documents to Embeddings
2023-03-06 17:41:26,934 - BERTopic - Reduced dimensionality
2023-03-06 17:41:27,620 - BERTopic - Clustered reduced embeddings
60it [00:10,  5.97it/s]


#### 4.2.1 Intertopic Distance Map

In [47]:
topic_model.visualize_topics()

#### 4.2.2 Dynamic Topic Trends Over Time

In [48]:
topic_model.visualize_topics_over_time(topics_over_time, top_n_topics = 10, height=600)

#### 4.2.3 Top 5 Keywords in Each Topic

In [49]:
topic_model.visualize_barchart(n_words = 5, top_n_topics = 8, width=300, height=300, title="Topic Key Words in Each Economic Topics")

#### 4.2.4 Similarity Matrix of Topics

In [50]:
fig = topic_model.visualize_heatmap(top_n_topics = 10, width = 600, height = 600)
fig

### 4.3 Further Split of the Dominating Topic 0

In [62]:
# get the most significant topic an abstract belongs to 
df_s["topic"] = topics
df_s.head()

Unnamed: 0,doi,journal,title,abstract,date,name,rank,topic
0,10.1257/aer.102.7.3774,American Economic Review,Growth Dynamics: The Myth of Economic Recovery...,comment highlights different ways coding crisi...,2012-12-15,Spanish National Research Council,,18
1,10.1093/rfs/hhs063,Review of Financial Studies,"Asymmetric Information, Portfolio Managers, an...",propose model delegated asset management expla...,2012-07-15,University of Toronto,71.0,0
2,10.1093/rfs/hhs054,Review of Financial Studies,Dynamic Compensation Contracts with Private Sa...,article studies dynamic agency problem risk-av...,2012-05-15,University of Chicago,9.0,-1
3,10.1016/j.econlet.2011.12.032,Economics Letters,Welfare of naive and sophisticated players in ...,abdulkadiroglu et al . ( 2011 ) show naive par...,2012-05-15,Pompeu Fabra University,,-1
4,10.1093/rfs/hhr131,Review of Financial Studies,Optimal Corporate Governance and Compensation ...,"model long-run firm performance , management c...",2012-02-15,University of Oxford,28.0,0


In [52]:
# get topic 0 abstracts
abstracts = df_s[df_s["topic"] == 0].abstract.to_list()
dates = df_s[df_s["topic"] == 0]['date'].to_list()

In [53]:
# n_neighbors: smaller local strcuture lose connection, bigger all connection lose local structure
# n_component: dimensionality of reduced dimension space
# min_dist: how tightly UMAP is allowed to pack points together
umap_model = UMAP(n_neighbors=15, n_components=10, 
                  min_dist=0.01, metric='euclidean', random_state=1)

topic_model = BERTopic(min_topic_size=70,verbose=True,umap_model=umap_model)
topics, probs = topic_model.fit_transform(abstracts)
topics_over_time = topic_model.topics_over_time(abstracts, dates, evolution_tuning=True, global_tuning=True,  nr_bins=60)

Batches:   0%|          | 0/172 [00:00<?, ?it/s]

2023-03-06 17:36:06,096 - BERTopic - Transformed documents to Embeddings
2023-03-06 17:36:14,471 - BERTopic - Reduced dimensionality
2023-03-06 17:36:14,704 - BERTopic - Clustered reduced embeddings
60it [00:03, 19.15it/s]


#### 4.3.1 Keywords in Each Subtopic of Topic 0

In [54]:
topic_model.visualize_barchart(n_words = 5, top_n_topics = 8, width=300, height=300, title="Subtopics in Topic 0")

### 4.4 Institutional-level Topics Distribution

- In this section, we see for a specific institution, what research topics it is interested in.
- Also, we see under each topic, who are the most contributors.

#### 4.4.1 Example: Fed Topic Distribution

In [63]:
df_grouby_univ = df_s[(df_s["name"] == "US Federal Reserve Board") & 
                      (df_s["topic"] != -1)].groupby("topic").count()[["doi"]].rename(columns={"doi":"cnt"}).sort_values(by=["cnt"], ascending=False)
df_grouby_univ["keywords"] = list(pd.Series(df_grouby_univ.index).apply(lambda x: str(x)+": "+" ".join([t[0] for t in topic_model.get_topic(topic=x)[:5]])))
df_grouby_univ.head()

Unnamed: 0_level_0,cnt,keywords
topic,Unnamed: 1_level_1,Unnamed: 2_level_1
0,88,0: model financial policy risk shocks
2,6,2: wage workers unemployment labor job
3,4,3: trade export firms countries exports
7,3,7: students school schools education student
12,3,12: inequality income wealth consumption house...


In [64]:
# plot the result
fig = px.pie(df_grouby_univ, 
             values="cnt", 
             names="keywords",
             color="cnt",
             color_discrete_sequence=px.colors.diverging.Spectral,
             width=800,
#            height=400,
             title="US Federal Reserve Board: Topics Distribution of Top Econ Publications")
fig.update_traces(textinfo='percent')
fig.show()

#### 4.4.2 Example: Top Contributors for Topic 4

In [65]:
df_univ_by_topics = df_s[df_s["topic"] == 4].groupby(["name"]).count()[["doi"]].rename(columns={"doi":"cnt"}).sort_values(by=["cnt"], ascending=False)

In [66]:
fig = go.Figure()
fig.add_trace(
    go.Bar(
        y = df_univ_by_topics.index[:10],
        x = df_univ_by_topics.cnt[:10],
        text = df_univ_by_topics.cnt[:10],
        orientation = "h",
        marker={'color': df_univ_by_topics.cnt[:10],
                'colorscale': 'Magenta'}
    )
)
fig.update_layout(title_text="Top 10 Instutitons for Topic 4: consumers, price, firms, competition, market",
#                 template="plotly_dark",
                  yaxis={'categoryorder':'total ascending'},
                  width=800,
                  height=600
                  )
fig.show()

## 5. Combining SNA and TM

- In this section, we want combine SNA and TM to ask a question: do centralities research the same topics?
- We use discrepancy score as a metric to measure whether 2 institutions are similar in their topics.
- Discrepancy score is defined as the sum of absolute difference of 2 institutions' proportions of any topic.
- e.g, A's topic distribution is [Topic0: 0.7, Topic1: 0.3], B's is [Topic0: 1, Topic1: 0], then the discrepancy score of this pair = |0.7-1|+|0.3-0| = 0.6.
- We do this for each pair in Top 10 centralities.

In [68]:
# set top 10 centralities and topics
centralities = ["New York University","University of California, Berkeley", "University of Chicago",
                "Harvard University", "Columbia University", "London School of Economics",
                "World Bank", "Stanford University", "Massachusetts Institute of Technology",
                "University of Pennsylvania"]
centralities_abbr = ["NYU","UCB", "UChicago",
                    "Harvard", "ColumbiaU", "LSE",
                    "World Bank", "Stanford", "MIT",
                    "UPenn"]
topics = list(range(22))

In [69]:
# get topic distributions for all centralities
df_groupby_inst = pd.DataFrame({"topic":[],"cnt":[],"name":[],"keywords":[],"proportion":[]}).set_index("topic")

for c in centralities:
    df_temp = df_s[(df_s["name"] == c) & (df_s["topic"] != -1)].groupby("topic").count()[["doi"]].rename(columns={"doi":"cnt"})
    # fill in missing topics
    for topic in topics:
        if topic not in df_temp.index:
            df_temp.loc[topic] = [0]

    df_temp["name"] = c
    df_temp["keywords"] = list(pd.Series(df_temp.index).apply(lambda x: str(x)+": "+" ".join([t[0] for t in topic_model.get_topic(topic=x)[:5]])))

    total = sum(df_temp["cnt"])
    df_temp["proportion"] = df_temp.cnt.apply(lambda x: x / total)
    df_temp.sort_index(inplace=True)

    df_groupby_inst = pd.concat([df_groupby_inst, df_temp])

df_groupby_inst.head()

Unnamed: 0_level_0,cnt,name,keywords,proportion
topic,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,85.0,New York University,0: model financial policy risk shocks,0.544872
1,19.0,New York University,1: game agent games information preferences,0.121795
2,8.0,New York University,2: wage workers unemployment labor job,0.051282
3,6.0,New York University,3: trade export firms countries exports,0.038462
4,1.0,New York University,4: consumers price firms competition market,0.00641


### 5.1 Example: UChicago's Discrepancy against Other Centralities

In [70]:
# calculate the discrepancy score
df_ds = pd.DataFrame({"name":[],"ds":[]})
df_uchicago = df_groupby_inst[df_groupby_inst["name"] == "University of Chicago"]
for c in centralities:
    if c != "University of Chicago":
        df_b = df_groupby_inst[df_groupby_inst["name"] == c]
        ds = round(sum([abs(df_uchicago.loc[t,"proportion"] - df_b.loc[t,"proportion"]) for t in df_uchicago.index]),4)
        df_ds.loc[len(df_ds)] = [c, ds]

df_ds = df_ds.sort_values(by=["ds"]).reset_index(drop=True)
df_ds.head(10)

Unnamed: 0,name,ds
0,Columbia University,0.3368
1,London School of Economics,0.3442
2,Stanford University,0.3652
3,New York University,0.3678
4,Harvard University,0.3888
5,University of Pennsylvania,0.4737
6,"University of California, Berkeley",0.5521
7,Massachusetts Institute of Technology,0.6245
8,World Bank,0.7026


In [71]:
# show the plot
fig = go.Figure()
fig.add_trace(
    go.Bar(
        y = df_ds.name,
        x = df_ds.ds,
        text = df_ds.ds,
        orientation = "h",
        marker={'color': df_ds.ds,
                'colorscale': 'Peach'}
    )
)
fig.update_layout(title_text="Discrepancy Score: UChicago vs Other Institutions",
#                 template="plotly_dark",
                  yaxis={'categoryorder':'total descending'},
                  width=800,
                  height=600
                  )
fig.show()

### 5.2 Discrepancy Score Matrix

- Get all pairs of discrepancy scores and form into a matrix.

In [72]:
# create a matrix to store discrepancy score
ds_matrix = np.zeros((10,10))

# calculate the discrepancy score
for i,a in enumerate(centralities):
    df_a = df_groupby_inst[df_groupby_inst["name"] == a]
    for j,b in enumerate(centralities):
        if b != a:
            df_b = df_groupby_inst[df_groupby_inst["name"] == b]
            ds = round(sum([abs(df_a.loc[t,"proportion"] - df_b.loc[t,"proportion"]) for t in df_a.index]),4)
            ds_matrix[i,j] = ds

In [73]:
# create heatmap
fig = px.imshow(ds_matrix,
                labels=dict(x="Centrality", y="Centrality", color="Discrepancy Score"),
                x=centralities_abbr,
                y=centralities_abbr,
                width=1000,
                height=800,
                text_auto=True
               )
fig.update_xaxes(side="top")
fig.show()