# Topic Modeling of Article Abstracts with BERTopic

In [20]:
# import libraries
from bertopic import BERTopic
import pandas as pd
import sqlite3
from umap import UMAP
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
from nltk.tokenize import word_tokenize

[nltk_data] Downloading package stopwords to C:\Users\Jerry
[nltk_data]     CHENG\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
# Open the SQLite file that stores the article metadata and keep a cursor
# next to the connection for ad-hoc statements.
# NOTE(review): sqlite3.connect creates an empty database when the path does
# not exist, so a wrong working directory only surfaces later as a
# "no such table" error.
conn = sqlite3.connect(database="econtop.db")
cur = conn.cursor()

In [3]:
# Pull journal, title, abstract and date for every article dated from
# 2012-01-01 up to (but not including) 2023-01-01, i.e. the years 2012-2022.
# The query joins through author_article, author and affiliation, so it yields
# one row per matching (article, author) combination; the repeats are collapsed
# by the de-duplication on "abstract" below.
q = """
    SELECT art.journal, art.title, art.abstract, art.date
    FROM author_article AS aa JOIN author AS au JOIN article AS art JOIN affiliation AS af
    ON aa.authorid = au.authorid AND
    aa.doi = art.doi AND
    au.affiliationid = af.affiliationid
    WHERE art.date >= '2012-01-01' AND
    art.date < '2023-01-01'
    """

df = (
    pd.read_sql_query(q, conn)
    # A NULL abstract would survive as the literal text "None" after the string
    # cast and end up in the corpus as a document, so drop it first.
    .dropna(subset=["abstract"])
    # keep the first row of each distinct abstract
    .drop_duplicates(subset=["abstract"])
    # every column as plain strings; "date" is parsed right after
    .astype("str")
)
df["date"] = pd.to_datetime(df["date"])   # convert date to datetime
df.head()

Unnamed: 0,journal,title,abstract,date
0,Review of Financial Studies,The Inventory Growth Spread,Previous studies show that firms with low inve...,2012-01-15
2,Review of Financial Studies,Takeovers and Divergence of Investor Opinion,We test several hypotheses on how takeover pre...,2012-01-15
3,Review of Financial Studies,Corporate Governance Objectives of Labor Union...,Labor union pension funds have become increasi...,2012-01-15
4,Review of Financial Studies,Managerial Attributes and Executive Compensation,We study the role of firm- and manager-specifi...,2012-01-15
7,Review of Financial Studies,The Road Less Traveled: Strategy Distinctivene...,We investigate whether skilled hedge fund mana...,2012-01-15


In [4]:
# number of rows left after de-duplicating on the abstract text
df.shape[0]

17835

In [14]:
# Sampling configuration: one bin per calendar year, same cap for every year.
FIRST_YEAR, LAST_YEAR = 2012, 2022
DOCS_PER_YEAR = 1400

# generate time bins: half-open [Jan 1 of year, Jan 1 of year + 1) intervals
timebins = [
    (pd.Timestamp(f"{year}-01-01"), pd.Timestamp(f"{year + 1}-01-01"))
    for year in range(FIRST_YEAR, LAST_YEAR + 1)
]

# sample up to DOCS_PER_YEAR abstracts from each year so no year dominates
yearly_samples = []
for start, end in timebins:
    in_year = df[(df["date"] >= start) & (df["date"] < end)]
    # DataFrame.sample raises ValueError when n exceeds the number of rows;
    # cap n so a sparse year contributes everything it has instead of
    # aborting the notebook. Years with enough rows are sampled exactly as before.
    yearly_samples.append(
        in_year.sample(n=min(DOCS_PER_YEAR, len(in_year)), random_state=0)
    )

df_s = pd.concat(yearly_samples).reset_index(drop=True)
df_s.tail(5)

Unnamed: 0,journal,title,abstract,date
15395,Review of Economic Studies,A Theory of Monetary Union and Financial Integ...,"Since the creation of the euro, capital flows ...",2022-08-01
15396,Economic Modelling,"COVID-19 regulations, culture, and the environ...",The economic and social disruptions caused by ...,2022-08-15
15397,European Economic Review,Signalling creditworthiness with fiscal austerity,Sovereign borrowers may tighten their fiscal s...,2022-05-15
15398,Economic Modelling,Unemployment claims during COVID-19 and econom...,Governments want to know how effective COVID-1...,2022-08-15
15399,Economics Letters,The personal saving rate: Data revisions and f...,Revisions to the U.S. personal saving rate are...,2022-10-15


In [15]:
# optional switch left by the author: uncomment to process every abstract
# instead of the yearly sample
# df_s = df

# English stopwords plus the token "jel" (presumably the JEL classification
# label that appears in abstracts; confirm against the data).
sw = stopwords.words("english")+["jel"]
stopword_set = frozenset(sw)   # constant-time membership tests in the filter below

# word_tokenize depends on NLTK's punkt tokenizer models, which the imports
# cell does not download; fetch them only when the tokenizer reports them missing.
try:
    word_tokenize("probe")
except LookupError:
    nltk.download("punkt")
    nltk.download("punkt_tab")   # resource name used by newer NLTK releases

# Lower-case, tokenise, drop stopwords and re-join with single spaces.
# NOTE(review): BERTopic's FAQ advises against stripping stopwords before the
# embedding step and suggests passing a CountVectorizer with stop words as
# vectorizer_model instead; kept as-is here to preserve the notebook's results.
df_s["abstract"] = df_s["abstract"].apply(
    lambda text: " ".join(
        token for token in word_tokenize(text.lower()) if token not in stopword_set
    )
)

In [16]:
# Plain Python lists taken from the same frame, so abstracts[i] and dates[i]
# describe the same article.
abstracts = df_s["abstract"].tolist()
dates = df_s["date"].tolist()

In [24]:
# UMAP configuration handed to BERTopic; random_state is fixed so repeated
# runs produce the same reduced embeddings.
umap_model = UMAP(
    n_neighbors=15,
    n_components=5,
    min_dist=0.0,
    metric='cosine',
    random_state=0,
)

# Fit the topic model on the preprocessed abstracts; `topics` holds one topic
# id per abstract, in the same order as `abstracts`.
topic_model = BERTopic(
    umap_model=umap_model,
    min_topic_size=50,
    verbose=True,
)
topics, probs = topic_model.fit_transform(abstracts)

# Topic representations over time, with the publication dates grouped into 30 bins.
topics_over_time = topic_model.topics_over_time(
    abstracts,
    dates,
    nr_bins=30,
    evolution_tuning=True,
    global_tuning=True,
)

Batches:   0%|          | 0/482 [00:00<?, ?it/s]

2023-02-25 18:10:35,436 - BERTopic - Transformed documents to Embeddings
2023-02-25 18:10:44,677 - BERTopic - Reduced dimensionality
2023-02-25 18:10:45,263 - BERTopic - Clustered reduced embeddings
30it [00:05,  5.03it/s]


In [25]:
# overview plot of all topics in the fitted model
topic_model.visualize_topics()

In [26]:
# plot how the topics with ids 0-9 develop across the time bins
topic_model.visualize_topics_over_time(topics_over_time, topics=list(range(10)))

In [27]:
# dict mapping each topic id to a list of (term, weight) pairs
# NOTE(review): id -1 is presumably BERTopic's outlier bucket — confirm
topic_model.get_topics()

{-1: [('growth', 0.01332837719835727),
  ('data', 0.012622724311517255),
  ('countries', 0.012332411574622313),
  ('economic', 0.012173933195447465),
  ('find', 0.011740407846997522),
  ('model', 0.011664764568511338),
  ('using', 0.011053951714759942),
  ('effects', 0.011036393433095159),
  ('income', 0.01096135412776372),
  ('paper', 0.010595958675500602)],
 0: [('model', 0.017391281691893953),
  ('financial', 0.016360997440543028),
  ('policy', 0.013630123803159498),
  ('risk', 0.012972489549086865),
  ('shocks', 0.012449110217549643),
  ('monetary', 0.012392575385174619),
  ('market', 0.012287313717136534),
  ('debt', 0.01161327994327117),
  ('inflation', 0.011456884609441798),
  ('paper', 0.011349912312273027)],
 1: [('information', 0.017175153320380755),
  ('game', 0.015251248028771336),
  ('agent', 0.015101048262838613),
  ('agents', 0.015016160946649579),
  ('preferences', 0.014864005562516797),
  ('equilibrium', 0.0147508933171382),
  ('show', 0.013116274632981198),
  ('games'

In [28]:
# BERTopic's built-in bar-chart view of the topics' terms
topic_model.visualize_barchart()

In [36]:
# build the topic heatmap and save it as similarity.html (relative path, so it
# lands in the current working directory)
fig = topic_model.visualize_heatmap()
fig.write_html("similarity.html")

In [30]:
# representative abstracts the model keeps for topic 0
representative_docs = topic_model.get_representative_docs()
representative_docs[0]

['paper revisits relationship interest rates exchange rates small open emerging economy using wavelet-based methodologies . based data romania , results confirm theoretical predictions interest rate - exchange rate relationship turmoil policy changes . short term , relationship negative , confirming sticky-price models , long term , relationship positive , confirming purchasing power parity theory . beginning turmoil , exchange rate movements generally take lead interest rates first month , monetary authorities take lead afterwards . results reveal small open emerging economy direct inflation targeting monetary policy regime , relationship exchange rates interest rate fundamentally different advanced economy . also , results stress necessity central bank must pay simultaneous attention variables order achieve monetary policy targets .',
 'paper studies monetary policy jointly affects asset prices real economy united states . develop estimator uses high-frequency surprises proxy structu

In [31]:
# get the most significant topic an abstract belongs to 
df_rep = pd.DataFrame({"Document": abstracts, "Topic": topics})
df_rep.head(20)

Unnamed: 0,Document,Topic
0,"fisher et al . ( 2012 ) â€ “ â€ “ henceforth ,...",-1
1,propose model delegated asset management expla...,0
2,article studies dynamic agency problem risk-av...,1
3,"using 219-year sample , find us output growth ...",0
4,"model long-run firm performance , management c...",0
5,study examines whether pre-crisis internationa...,0
6,report experiment subjects indifferent real-mo...,1
7,compare market prices risk economies identical...,0
8,presence background risk increases self-protec...,1
9,widely recognized 2006 massachusetts health re...,12


In [38]:
# sum of all freq equal to the number of docs
# Sanity check: the per-topic document counts must add up to the number of
# modelled documents. Summing the full frequency table avoids hard-coding how
# many topics the model happened to find on this run.
n_assigned = int(topic_model.get_topic_freq()["Count"].sum())
assert n_assigned == len(abstracts), "topic frequencies do not cover every document"
n_assigned

15400

In [39]:
# number of documents assigned to topic 0
topic_model.get_topic_freq(topic=0)

5501

In [33]:
# cross-check: count the topic-0 rows directly in df_rep
int((df_rep["Topic"] == 0).sum())

5501