# Topic Modeling for Abstract - BERTopic

In [6]:
# import libraries
from bertopic import BERTopic
import pandas as pd
import sqlite3
from umap import UMAP
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import plotly.express as px
import plotly.graph_objects as go
import numpy as np
import math

[nltk_data] Downloading package stopwords to C:\Users\Jerry
[nltk_data]     CHENG\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
# connect to database
# Opens the SQLite file "econtop.db"; the path is relative to the kernel's working directory.
conn = sqlite3.connect("econtop.db")
# NOTE(review): `cur` is not referenced by any later cell visible in this notebook.
cur = conn.cursor()

In [76]:
# Extract doi, journal, title, abstract, date and the author's affiliation
# (name, rank) for articles dated 2012-01-01 up to, but excluding, 2023-01-01.
q = """
    SELECT art.doi, art.journal, art.title, art.abstract, art.date, af.name, af.rank
    FROM author_article AS aa JOIN author AS au JOIN article AS art JOIN affiliation AS af
    ON aa.authorid = au.authorid AND
    aa.doi = art.doi AND
    au.affiliationid = af.affiliationid
    WHERE art.date >= '2012-01-01' AND
    art.date < '2023-01-01'
    """

# Missing abstracts must be dropped BEFORE the cast to str: once every cell is a
# string, dropna() can no longer match anything. Then keep one row per distinct
# abstract and cast every column to str.
df = (
    pd.read_sql_query(q, conn)
    .dropna(subset=["abstract"])
    .drop_duplicates(subset=["abstract"])
    .astype("str")
)
df["date"] = pd.to_datetime(df["date"])   # convert date to datetime

# Remove placeholder abstracts. "N\\A" is the literal three-character text N\A,
# i.e. the same value the previous "N\A" literal produced, without relying on an
# invalid escape sequence.
# NOTE(review): if the placeholder in the database is actually "N/A", add it here.
df = df[~df["abstract"].isin(["nan", "N\\A"])]
df.head()

Unnamed: 0,doi,journal,title,abstract,date,name,rank
0,10.1093/rfs/hhr069,Review of Financial Studies,The Inventory Growth Spread,Previous studies show that firms with low inve...,2012-01-15,INSEAD,
2,10.1093/rfs/hhr109,Review of Financial Studies,Takeovers and Divergence of Investor Opinion,We test several hypotheses on how takeover pre...,2012-01-15,New York University,50.0
3,10.1093/rfs/hhr081,Review of Financial Studies,Corporate Governance Objectives of Labor Union...,Labor union pension funds have become increasi...,2012-01-15,London School of Economics,1.0
4,10.1093/rfs/hhr076,Review of Financial Studies,Managerial Attributes and Executive Compensation,We study the role of firm- and manager-specifi...,2012-01-15,Wilfrid Laurier University,
7,10.1093/rfs/hhr092,Review of Financial Studies,The Road Less Traveled: Strategy Distinctivene...,We investigate whether skilled hedge fund mana...,2012-01-15,"University of California, Irvine",


In [77]:
# number of rows left after de-duplicating and filtering abstracts
len(df)

17833

In [78]:
# One [start, end) pair of timestamps per calendar year, 2012 through 2022.
timebins = [
    (pd.to_datetime(f"{year}-01-01"), pd.to_datetime(f"{year + 1}-01-01"))
    for year in range(2012, 2023)
]

# Draw 1400 rows from every year (fixed seed), then stack the yearly samples.
# sample() raises if a year holds fewer than 1400 rows.
yearly_samples = []
for start, end in timebins:
    in_year = df[(df["date"] < end) & (df["date"] >= start)]
    yearly_samples.append(in_year.sample(n=1400, random_state=0))

df_s = pd.concat(yearly_samples).reset_index(drop=True)
df_s.tail(5)

Unnamed: 0,doi,journal,title,abstract,date,name,rank
15395,10.1093/restud/rdab057,Review of Economic Studies,A Theory of Monetary Union and Financial Integ...,"Since the creation of the euro, capital flows ...",2022-08-01,Centre de Recerca en Economia Internacional,
15396,10.1016/j.econmod.2022.105874,Economic Modelling,"COVID-19 regulations, culture, and the environ...",The economic and social disruptions caused by ...,2022-08-15,Nanyang Technological University,457.0
15397,10.1016/j.euroecorev.2022.104090,European Economic Review,Signalling creditworthiness with fiscal austerity,Sovereign borrowers may tighten their fiscal s...,2022-05-15,Bocconi University,
15398,10.1016/j.econmod.2022.105891,Economic Modelling,Unemployment claims during COVID-19 and econom...,Governments want to know how effective COVID-1...,2022-08-15,University of Macedonia,
15399,10.1016/j.econlet.2022.110806,Economics Letters,The personal saving rate: Data revisions and f...,Revisions to the U.S. personal saving rate are...,2022-10-15,University of Richmond,


In [79]:
# df_s = df
# English stop words plus "jel" (kept as a list under the original name).
sw = stopwords.words("english")+["jel"]
# Every token of every abstract is tested for membership, so use a set for
# O(1) lookups instead of scanning the list each time.
sw_lookup = frozenset(sw)

# set lemmatizer to deal with plural/ing/ed forms
# lemmatizer = WordNetLemmatizer()
# df_s.abstract = df_s.abstract.apply(lambda x: " ".join([lemmatizer.lemmatize(word) 
#                                                        for word in word_tokenize(x.lower()) if lemmatizer.lemmatize(word) not in sw]))

# Lower-case, tokenize, drop stop words, and re-join with single spaces.
# This overwrites the "abstract" column in place, so re-running the cell
# tokenizes text that has already been processed.
df_s["abstract"] = df_s["abstract"].apply(
    lambda text: " ".join(tok for tok in word_tokenize(text.lower()) if tok not in sw_lookup)
)

## Run TM on All Abstracts

In [80]:
# Parallel lists (same row order): cleaned abstract text and publication date.
abstracts = df_s["abstract"].tolist()
dates = df_s["date"].tolist()

In [81]:
# n_neighbors: smaller values focus on local structure and lose the global connections;
#              larger values keep global connections and lose local structure
# n_components: dimensionality of the reduced space
# min_dist: how tightly UMAP is allowed to pack points together
# random_state is fixed so the reduction is repeatable between runs
umap_model = UMAP(n_neighbors=15, n_components=5, 
                  min_dist=0.01, metric='euclidean', random_state=1)

# Fit BERTopic on all sampled abstracts, then compute topic frequencies over 60 time bins.
topic_model = BERTopic(min_topic_size=70,verbose=True,umap_model=umap_model)
topics, probs = topic_model.fit_transform(abstracts)
topics_over_time = topic_model.topics_over_time(abstracts, dates, evolution_tuning=True, global_tuning=True,  nr_bins=60)

Batches:   0%|          | 0/482 [00:00<?, ?it/s]

2023-03-06 00:41:50,490 - BERTopic - Transformed documents to Embeddings
2023-03-06 00:41:59,786 - BERTopic - Reduced dimensionality
2023-03-06 00:42:00,456 - BERTopic - Clustered reduced embeddings
60it [00:08,  7.30it/s]


In [82]:
# topic overview figure for the model fitted on all abstracts
topic_model.visualize_topics()

In [83]:
# frequency over time of the 10 largest topics
topic_model.visualize_topics_over_time(topics_over_time, top_n_topics = 10, height=600)

In [None]:
# full topic -> (word, score) listing; this cell has no saved output
topic_model.get_topics()

In [84]:
# top 5 words for each of the first 8 topics
topic_model.visualize_barchart(n_words = 5, top_n_topics = 8, width=300, height=300, title="Topic Key Words in Each Economic Topics")

In [85]:
# topic-by-topic similarity heatmap for the 10 largest topics
fig = topic_model.visualize_heatmap(top_n_topics = 10, width = 600, height = 600)
fig
# fig.write_html("similarity.html")

## Further Split of the Dominating Topic 0

In [86]:
# get the most significant topic an abstract belongs to 
# `topics` must be the output of fit_transform on ALL of df_s (same length and row order);
# the value -1 appears in the output below for some rows.
df_s["topic"] = topics
df_s.head()

Unnamed: 0,doi,journal,title,abstract,date,name,rank,topic
0,10.1257/aer.102.7.3774,American Economic Review,Growth Dynamics: The Myth of Economic Recovery...,comment highlights different ways coding crisi...,2012-12-15,Spanish National Research Council,,18
1,10.1093/rfs/hhs063,Review of Financial Studies,"Asymmetric Information, Portfolio Managers, an...",propose model delegated asset management expla...,2012-07-15,University of Toronto,71.0,0
2,10.1093/rfs/hhs054,Review of Financial Studies,Dynamic Compensation Contracts with Private Sa...,article studies dynamic agency problem risk-av...,2012-05-15,University of Chicago,9.0,-1
3,10.1016/j.econlet.2011.12.032,Economics Letters,Welfare of naive and sophisticated players in ...,abdulkadiroglu et al . ( 2011 ) show naive par...,2012-05-15,Pompeu Fabra University,,-1
4,10.1093/rfs/hhr131,Review of Financial Studies,Optimal Corporate Governance and Compensation ...,"model long-run firm performance , management c...",2012-02-15,University of Oxford,28.0,0


In [33]:
# Restrict to the documents assigned to topic 0.
# `abstracts` and `dates` are rebound here and no longer describe all of df_s.
topic0_docs = df_s[df_s["topic"] == 0]
abstracts = topic0_docs["abstract"].tolist()
dates = topic0_docs["date"].tolist()

In [44]:
# n_neighbors: smaller values focus on local structure and lose the global connections;
#              larger values keep global connections and lose local structure
# n_components: dimensionality of the reduced space
# min_dist: how tightly UMAP is allowed to pack points together
umap_model = UMAP(n_neighbors=15, n_components=10, 
                  min_dist=0.01, metric='euclidean', random_state=1)

# Fit a second BERTopic model on the topic-0 abstracts only.
# NOTE(review): this rebinds `topic_model`, `topics` and `topics_over_time`. Later cells
# call topic_model.get_topic(...) with topic ids taken from df_s["topic"] (the model fitted
# on all abstracts), so on a top-to-bottom run they would read this sub-model instead.
# Consider distinct names for the sub-model.
topic_model = BERTopic(min_topic_size=70,verbose=True,umap_model=umap_model)
topics, probs = topic_model.fit_transform(abstracts)
topics_over_time = topic_model.topics_over_time(abstracts, dates, evolution_tuning=True, global_tuning=True,  nr_bins=60)

Batches:   0%|          | 0/172 [00:00<?, ?it/s]

2023-03-05 22:22:35,595 - BERTopic - Transformed documents to Embeddings
2023-03-05 22:22:45,676 - BERTopic - Reduced dimensionality
2023-03-05 22:22:45,904 - BERTopic - Clustered reduced embeddings
60it [00:03, 17.36it/s]


In [45]:
# topic overview figure for whichever model `topic_model` currently refers to
topic_model.visualize_topics()

In [46]:
# frequency over time of the 10 largest topics
topic_model.visualize_topics_over_time(topics_over_time, top_n_topics = 10, height=600)

In [47]:
# top 5 words for each of the first 8 subtopics
topic_model.visualize_barchart(n_words = 5, top_n_topics = 8, width=300, height=300, title="Subtopics in Topic 0")

In [51]:
# topic-by-topic similarity heatmap for the 8 largest topics
fig = topic_model.visualize_heatmap(top_n_topics = 8, width = 550, height = 550)
fig
# fig.write_html("similarity.html")

## To see what topics some of the institutions are interested in

In [89]:
# Number of topic-0 papers per institution, largest count first.
# NOTE(review): the chart built from this frame is titled "Topic 4"; confirm which topic is intended.
topic0_rows = df_s[df_s["topic"] == 0]
df_univ_by_topics = (
    topic0_rows.groupby("name")["doi"]
    .count()
    .to_frame("cnt")
    .sort_values(by="cnt", ascending=False)
)

In [90]:
# Horizontal bar chart of the 10 institutions with the most papers in df_univ_by_topics.
top10 = df_univ_by_topics.head(10)

fig = go.Figure()
fig.add_trace(
    go.Bar(
        y = top10.index,
        x = top10.cnt,
        text = top10.cnt,
        orientation = "h",
        # bar colour encodes the same count that sets the bar length
        marker={'color': top10.cnt,
                'colorscale': 'Magenta'}
    )
)
# NOTE(review): the title names Topic 4, but the cell that builds df_univ_by_topics
# filters on topic == 0. One of the two is stale; confirm before publishing.
fig.update_layout(title_text="Top 10 Institutions for Topic 4: consumers, price, firms, competition, market",
#                 template="plotly_dark",
                  yaxis={'categoryorder':'total ascending'},   # largest bar at the top
                  width=800,
                  height=600
                  )
fig.show()

In [91]:
# Papers per topic for one institution, ignoring topic -1, most frequent topic first.
fed_docs = df_s[(df_s["name"] == "US Federal Reserve Board") & (df_s["topic"] != -1)]
df_grouby_univ = (
    fed_docs.groupby("topic")["doi"]
    .count()
    .to_frame("cnt")
    .sort_values(by="cnt", ascending=False)
)

# Label each topic as "<id>: w1 w2 w3 w4 w5" using the first five words from topic_model.
# `topic_model` must be the model whose ids are stored in df_s["topic"].
df_grouby_univ["keywords"] = [
    str(topic_id) + ": " + " ".join(pair[0] for pair in topic_model.get_topic(topic=topic_id)[:5])
    for topic_id in df_grouby_univ.index
]
df_grouby_univ.head()

Unnamed: 0_level_0,cnt,keywords
topic,Unnamed: 1_level_1,Unnamed: 2_level_1
0,88,0: model financial policy risk shocks
2,6,2: wage workers unemployment labor job
3,4,3: trade export firms countries exports
7,3,7: students school schools education student
12,3,12: inequality income wealth consumption house...


In [92]:
# Pie chart of the institution's paper counts per topic; slices are labelled by
# the "keywords" column and show percentages only.
fig = px.pie(df_grouby_univ, 
             values="cnt", 
             names="keywords",
             color="cnt",
             color_discrete_sequence=px.colors.diverging.Spectral,
             width=800,
#            height=400,
             title="US Federal Reserve Board: Topics Distribution of Top Econ Publications")
fig.update_traces(textinfo='percent')
fig.show()

In [113]:
# Papers per topic for one institution, ignoring topic -1, most frequent topic first.
# (Same steps as the Federal Reserve cell; a shared helper would remove the duplication.)
upenn_docs = df_s[(df_s["name"] == "University of Pennsylvania") & (df_s["topic"] != -1)]
df_grouby_univ = (
    upenn_docs.groupby("topic")["doi"]
    .count()
    .to_frame("cnt")
    .sort_values(by="cnt", ascending=False)
)

# Label each topic as "<id>: w1 w2 w3 w4 w5" using the first five words from topic_model.
df_grouby_univ["keywords"] = [
    str(topic_id) + ": " + " ".join(pair[0] for pair in topic_model.get_topic(topic=topic_id)[:5])
    for topic_id in df_grouby_univ.index
]
df_grouby_univ.head()

Unnamed: 0_level_0,cnt,keywords
topic,Unnamed: 1_level_1,Unnamed: 2_level_1
0,79,0: model financial policy risk shocks
1,10,1: game agent games information preferences
9,7,9: health insurance care retirement medicare
4,6,4: consumers price firms competition market
7,5,7: students school schools education student


In [114]:
# Pie chart of the institution's paper counts per topic; slices are labelled by
# the "keywords" column and show percentages only.
fig = px.pie(df_grouby_univ, 
             values="cnt", 
             names="keywords",
             color="cnt",
             color_discrete_sequence=px.colors.diverging.Spectral,
             width=800,
             height=600,
             title="University of Pennsylvania: Topics Distribution of Top Econ Publications")
fig.update_traces(textinfo='percent')
fig.show()

## Comparison of Centralities' Topic Distribution

In [142]:
# (full institution name, short label used on chart axes), in display order
_institution_labels = [
    ("New York University", "NYU"),
    ("University of California, Berkeley", "UCB"),
    ("University of Chicago", "UChicago"),
    ("Harvard University", "Harvard"),
    ("Columbia University", "ColumbiaU"),
    ("London School of Economics", "LSE"),
    ("World Bank", "World Bank"),
    ("Stanford University", "Stanford"),
    ("Massachusetts Institute of Technology", "MIT"),
    ("University of Pennsylvania", "UPenn"),
]
centralities = [full_name for full_name, _ in _institution_labels]
centralities_abbr = [short_name for _, short_name in _institution_labels]

# Topic ids 0..21. NOTE(review): hard-coded; presumably the number of topics in the
# model fitted on all abstracts. This also rebinds `topics` from fit_transform.
topics = [*range(22)]

In [143]:
# Build one table with, for every institution in `centralities` and every topic:
#   cnt        - number of the institution's papers in that topic (topic -1 excluded)
#   name       - institution name
#   keywords   - "<topic id>: " followed by the topic's first five words
#   proportion - cnt divided by the institution's total cnt
# The per-institution frames are collected in a list and concatenated once, rather
# than growing a frame inside the loop from an empty float-typed seed (which also
# turned the integer counts into floats).
inst_frames = []

for c in centralities:
    counts = (
        df_s[(df_s["name"] == c) & (df_s["topic"] != -1)]
        .groupby("topic")
        .count()[["doi"]]
        .rename(columns={"doi": "cnt"})
    )

    # Fill in missing topics with a zero count. The union keeps any topic id that
    # is present in the data but not listed in `topics`, and returns a sorted index.
    df_temp = counts.reindex(counts.index.union(topics), fill_value=0).rename_axis("topic")

    df_temp["name"] = c
    df_temp["keywords"] = [
        str(t) + ": " + " ".join(pair[0] for pair in topic_model.get_topic(topic=t)[:5])
        for t in df_temp.index
    ]

    total = df_temp["cnt"].sum()
    # An institution with no classified papers gets proportion 0 instead of 0/0.
    df_temp["proportion"] = (df_temp["cnt"] / total) if total else 0.0

    inst_frames.append(df_temp.sort_index())

df_groupby_inst = pd.concat(inst_frames)
df_groupby_inst.head()

Unnamed: 0_level_0,cnt,name,keywords,proportion
topic,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,85.0,New York University,0: model financial policy risk shocks,0.544872
1,19.0,New York University,1: game agent games information preferences,0.121795
2,8.0,New York University,2: wage workers unemployment labor job,0.051282
3,6.0,New York University,3: trade export firms countries exports,0.038462
4,1.0,New York University,4: consumers price firms competition market,0.00641


In [144]:
# Discrepancy score of each institution against the University of Chicago:
# the sum over topics of |proportion_UChicago - proportion_other|, rounded to 4 places.
reference_inst = "University of Chicago"
df_uchicago = df_groupby_inst[df_groupby_inst["name"] == reference_inst]

df_ds = pd.DataFrame({"name":[],"ds":[]})
for c in centralities:
    if c == reference_inst:
        continue   # no score against itself
    df_b = df_groupby_inst[df_groupby_inst["name"] == c]
    gaps = [abs(df_uchicago.loc[t, "proportion"] - df_b.loc[t, "proportion"])
            for t in df_uchicago.index]
    ds = round(sum(gaps), 4)
    df_ds.loc[len(df_ds)] = [c, ds]

# smallest discrepancy first
df_ds = df_ds.sort_values(by=["ds"]).reset_index(drop=True)
df_ds.head(10)

Unnamed: 0,name,ds
0,Columbia University,0.3368
1,London School of Economics,0.3442
2,Stanford University,0.3652
3,New York University,0.3678
4,Harvard University,0.3888
5,University of Pennsylvania,0.4737
6,"University of California, Berkeley",0.5521
7,Massachusetts Institute of Technology,0.6245
8,World Bank,0.7026


In [145]:
# Horizontal bars: one per institution, length and colour both given by its discrepancy score.
ds_bars = go.Bar(
    y=df_ds.name,
    x=df_ds.ds,
    text=df_ds.ds,
    orientation="h",
    marker={'color': df_ds.ds, 'colorscale': 'Peach'},
)

fig = go.Figure()
fig.add_trace(ds_bars)

layout_options = dict(
    title_text="Discrepancy Score: UChicago vs Other Institutions",
    # template="plotly_dark",
    yaxis={'categoryorder':'total descending'},
    width=800,
    height=600,
)
fig.update_layout(**layout_options)
fig.show()

In [157]:
# Pairwise discrepancy scores between all institutions in `centralities`.
# ds_matrix[i, j] = sum over topics of |proportion_i - proportion_j|, rounded to 4 places;
# the diagonal stays 0. The matrix is sized from `centralities` rather than a literal 10,
# so the institution list can change without touching this cell.
n_inst = len(centralities)
ds_matrix = np.zeros((n_inst, n_inst))

# Look up each institution's proportion column once instead of re-filtering
# df_groupby_inst inside the inner loop.
proportion_by_inst = {
    inst: df_groupby_inst.loc[df_groupby_inst["name"] == inst, "proportion"]
    for inst in centralities
}

# calculate the discrepancy score
for i, a in enumerate(centralities):
    prop_a = proportion_by_inst[a]
    for j, b in enumerate(centralities):
        if b != a:
            prop_b = proportion_by_inst[b]
            ds = round(sum(abs(prop_a.loc[t] - prop_b.loc[t]) for t in prop_a.index), 4)
            ds_matrix[i, j] = ds

In [158]:
# create heatmap
# Rows and columns are labelled with centralities_abbr, so ds_matrix must follow the
# same institution order; text_auto prints each score inside its cell.
fig = px.imshow(ds_matrix,
                labels=dict(x="Centrality", y="Centrality", color="Discrepancy Score"),
                x=centralities_abbr,
                y=centralities_abbr,
                width=1000,
                height=800,
                text_auto=True
               )
# put the column labels above the matrix
fig.update_xaxes(side="top")
fig.show()

In [478]:
# pairwise similarity scores among topics (hard-coded 22 x 22 matrix)
sim_matrix = np.array([[1.0000000000000002,0.7147984475534911,0.7014265407639376,0.7006396257199394,0.7778786047625297,0.5922797684354025,0.5022375439392693,0.5531182528435046,0.6001441770957048,0.6000236844562628,0.4625063111998686,0.6409650879315685,0.7227618739555189,0.555888913090619,0.5581763156546831,0.600151910176733,0.592229711028879,0.5269338979616478,0.547177802969863,0.702776718479375,0.6435280282007646,0.6592394189551795],[0.7147984475534911,1.0000000000000002,0.6284216212907492,0.6320490711101101,0.6804351140485507,0.5383010283871257,0.5738934063448802,0.6201421904336015,0.6294546660414082,0.566983817052494,0.4504401945996297,0.4514997242208933,0.6067436943480378,0.6325297009597531,0.5798112478875728,0.5046660575559667,0.5308512925363487,0.5218729787726594,0.6567495082773667,0.6099382385810952,0.6130405139788322,0.5575090733972362],[0.7014265407639376,0.6284216212907492,1.0000000000000004,0.6609332232573086,0.6729068782699498,0.5061633122729859,0.4921872479800501,0.560295780574679,0.579782305444816,0.5661429379977096,0.6050782762250435,0.584749404966878,0.6869079862452971,0.5820507666292603,0.5838313301774724,0.5095852241440804,0.5702718115933102,0.5960230118171184,0.5500748885764796,0.6648252032810669,0.5989703814337206,0.6250668888446331],[0.7006396257199394,0.6320490711101101,0.6609332232573086,0.9999999999999999,0.7764920194289504,0.5144630801637335,0.4548392719816442,0.4376551486653905,0.44561409449929806,0.4406325828077863,0.5555709770108007,0.596077314120969,0.5988384129727505,0.4923721079923043,0.4453615289772871,0.5348859849786131,0.6269861881015292,0.5367347815342356,0.5221263234562231,0.6248715174932555,0.6095426447934813,0.508840112984511],[0.7778786047625297,0.6804351140485507,0.6729068782699498,0.7764920194289504,0.9999999999999996,0.522163487970389,0.5066375660264243,0.5007192907620102,0.5236222975116621,0.49166176360780944,0.4383826395830004,0.5233785367642482,0.641346095047594,0.5496025529163614,0.505388839227404,0.6219539795586883,0.69
56947818026169,0.5206727045689543,0.5429115093813152,0.6735153440794649,0.6111579801155391,0.5634829268435186],[0.5922797684354025,0.5383010283871257,0.5061633122729859,0.5144630801637335,0.522163487970389,0.9999999999999999,0.4604255353650464,0.43793502148395347,0.503628961458128,0.42973176229642174,0.4283888814309866,0.4342395969823304,0.53466014998095,0.457750025584157,0.4134285261639455,0.46584466680974956,0.2969539009334088,0.5186201661086887,0.5196024113236934,0.5861059367305141,0.6054199448901427,0.48030883077476183],[0.5022375439392693,0.5738934063448802,0.4921872479800501,0.4548392719816442,0.5066375660264243,0.4604255353650464,0.9999999999999996,0.49034459163657174,0.514394710505574,0.4131424753009892,0.4619417518732485,0.4139491183643704,0.44927017482420134,0.48038035084794084,0.5004357017786182,0.3270087419837477,0.44235901378147924,0.430363432858759,0.594282711295201,0.40752581662021475,0.4944405864031689,0.45586974897823995],[0.5531182528435046,0.6201421904336015,0.560295780574679,0.4376551486653905,0.5007192907620102,0.43793502148395347,0.49034459163657174,0.9999999999999996,0.6035688122754659,0.4762564452741156,0.43320159794931795,0.3773128277169012,0.5121162390795809,0.5310415626895615,0.5008588489040519,0.4299054920020871,0.3643768599423658,0.46454818627782474,0.5144713867793365,0.5443445663791564,0.5517992645446195,0.5275232216684699],[0.6001441770957048,0.6294546660414082,0.579782305444816,0.44561409449929806,0.5236222975116621,0.503628961458128,0.514394710505574,0.6035688122754659,0.9999999999999994,0.6035421588661116,0.4896537623099643,0.4176780569812582,0.6084741752330263,0.6798275471926655,0.5265016590201328,0.4638790885681924,0.3835771707156729,0.5472753083558389,0.6245939542416146,0.535742357367783,0.5035541009322053,0.6158697913676973],[0.6000236844562628,0.566983817052494,0.5661429379977096,0.4406325828077863,0.49166176360780944,0.42973176229642174,0.4131424753009892,0.4762564452741156,0.6035421588661116,0.9999999999999999,0.4209121378108
677,0.43880480059702837,0.555724172352225,0.4751212052153899,0.4557333731149137,0.507394118113269,0.34579421947632066,0.430261629309888,0.5049714512989767,0.5148631029697484,0.4829168514225163,0.5213726694445988],[0.4625063111998686,0.4504401945996297,0.6050782762250435,0.5555709770108007,0.4383826395830004,0.4283888814309866,0.4619417518732485,0.43320159794931795,0.4896537623099643,0.4209121378108677,1.0,0.3883062832676293,0.47991109306795515,0.4294628338291644,0.4719169399721731,0.373630509095245,0.3357652220086118,0.5583867567335257,0.5573541389561089,0.47781387417905374,0.5849969763665227,0.5114869075540638],[0.6409650879315685,0.4514997242208933,0.584749404966878,0.596077314120969,0.5233785367642482,0.4342395969823304,0.4139491183643704,0.3773128277169012,0.4176780569812582,0.43880480059702837,0.3883062832676293,1.0000000000000002,0.6445289302254054,0.3823617457849751,0.4173155661478176,0.44837276117831026,0.4512385478249868,0.44242005876232626,0.39297934834158954,0.48614121445411795,0.47472813801510083,0.5373132590176808],[0.7227618739555189,0.6067436943480378,0.6869079862452971,0.5988384129727505,0.641346095047594,0.53466014998095,0.44927017482420134,0.5121162390795809,0.6084741752330263,0.555724172352225,0.47991109306795515,0.6445289302254054,0.9999999999999998,0.5106811733030785,0.43534405189186076,0.4936846011897392,0.501048766571092,0.5806219525925604,0.5120045251765157,0.6662545437522298,0.572861282774108,0.8251717931649691],[0.555888913090619,0.6325297009597531,0.5820507666292603,0.4923721079923043,0.5496025529163614,0.457750025584157,0.48038035084794084,0.5310415626895615,0.6798275471926655,0.4751212052153899,0.4294628338291644,0.3823617457849751,0.5106811733030785,0.9999999999999998,0.4475717617963131,0.40396264206225746,0.4157966948868965,0.44197917566119355,0.586042888401972,0.4836532209238663,0.5021880123386631,0.48154316508847894],[0.5581763156546831,0.5798112478875728,0.5838313301774724,0.4453615289772871,0.505388839227404,0.4134285261639455,0.50
04357017786182,0.5008588489040519,0.5265016590201328,0.4557333731149137,0.4719169399721731,0.4173155661478176,0.43534405189186076,0.4475717617963131,1.0,0.4502399630768949,0.38690183406697465,0.4066290498760969,0.5826451205855495,0.4456876652295271,0.5216934336627665,0.4631428825275372],[0.600151910176733,0.5046660575559667,0.5095852241440804,0.5348859849786131,0.6219539795586883,0.46584466680974956,0.3270087419837477,0.4299054920020871,0.4638790885681924,0.507394118113269,0.373630509095245,0.44837276117831026,0.4936846011897392,0.40396264206225746,0.4502399630768949,1.0,0.4084621707638882,0.43888430003206785,0.4387783139124419,0.7231455557188894,0.49830651122417696,0.42990637893943334],[0.592229711028879,0.5308512925363487,0.5702718115933102,0.6269861881015292,0.6956947818026169,0.2969539009334088,0.44235901378147924,0.3643768599423658,0.3835771707156729,0.34579421947632066,0.3357652220086118,0.4512385478249868,0.501048766571092,0.4157966948868965,0.38690183406697465,0.4084621707638882,1.0,0.3887277642480377,0.3968220703579022,0.3843326076284886,0.4092515174645326,0.41297481842897904],[0.5269338979616478,0.5218729787726594,0.5960230118171184,0.5367347815342356,0.5206727045689543,0.5186201661086887,0.430363432858759,0.46454818627782474,0.5472753083558389,0.430261629309888,0.5583867567335257,0.44242005876232626,0.5806219525925604,0.44197917566119355,0.4066290498760969,0.43888430003206785,0.3887277642480377,0.9999999999999998,0.5284727614396312,0.5987386361704146,0.6154958682392577,0.5984958338977379],[0.547177802969863,0.6567495082773667,0.5500748885764796,0.5221263234562231,0.5429115093813152,0.5196024113236934,0.594282711295201,0.5144713867793365,0.6245939542416146,0.5049714512989767,0.5573541389561089,0.39297934834158954,0.5120045251765157,0.586042888401972,0.5826451205855495,0.4387783139124419,0.3968220703579022,0.5284727614396312,0.9999999999999997,0.5300451691553256,0.5637201065241644,0.529620642955999],[0.702776718479375,0.6099382385810952,0.6648252032810669,0
.6248715174932555,0.6735153440794649,0.5861059367305141,0.40752581662021475,0.5443445663791564,0.535742357367783,0.5148631029697484,0.47781387417905374,0.48614121445411795,0.6662545437522298,0.4836532209238663,0.4456876652295271,0.7231455557188894,0.3843326076284886,0.5987386361704146,0.5300451691553256,1.0,0.6626945175998177,0.5669410194954438],[0.6435280282007646,0.6130405139788322,0.5989703814337206,0.6095426447934813,0.6111579801155391,0.6054199448901427,0.4944405864031689,0.5517992645446195,0.5035541009322053,0.4829168514225163,0.5849969763665227,0.47472813801510083,0.572861282774108,0.5021880123386631,0.5216934336627665,0.49830651122417696,0.4092515174645326,0.6154958682392577,0.5637201065241644,0.6626945175998177,0.9999999999999996,0.5571531889215439],[0.6592394189551795,0.5575090733972362,0.6250668888446331,0.508840112984511,0.5634829268435186,0.48030883077476183,0.45586974897823995,0.5275232216684699,0.6158697913676973,0.5213726694445988,0.5114869075540638,0.5373132590176808,0.8251717931649691,0.48154316508847894,0.4631428825275372,0.42990637893943334,0.41297481842897904,0.5984958338977379,0.529620642955999,0.5669410194954438,0.5571531889215439,0.9999999999999997]])

In [479]:
# Min-max rescale every entry of sim_matrix in place. The upper bound is fixed at 1
# rather than taken from the data, so the smallest entry maps to 0.
# Re-running this cell rescales the already-rescaled matrix.
min_sim = np.amin(sim_matrix)
max_sim = 1
sim_matrix[...] = (sim_matrix - min_sim) / (max_sim - min_sim)

In [55]:
# calculate documents similarity given a time period
def cal_sim(data, univ):
    '''
    univ: university name
    '''
    df = data[data["name"] == univ]
    cnt = len(df)
    total_comb = math.comb(cnt,2)
    sum_sim = 0
    for i in range(cnt - 1):
        for j in range(cnt - i - 1):
            sum_sim += int(data["topic"][i] == data["topic"][j+i+1])
    try:
        ave_sim = sum_sim / total_comb
    except:
        ave_sim = 1
    return ave_sim

In [413]:
# predicted_topics, predicted_probs = topic_model.transform(abstracts[100])

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

2023-02-28 00:06:45,756 - BERTopic - Reduced dimensionality
2023-02-28 00:06:45,757 - BERTopic - Predicted clusters


In [56]:
# One row per institution that has a non-empty rank: (name, rank).
# .copy() makes this an independent frame, so the later
# `df_ranked_aff["sim_score"] = ...` assignment does not write to a slice of df_s
# (which raises SettingWithCopyWarning).
df_ranked_aff = (
    df_s.loc[df_s["rank"] != ""]
    .drop_duplicates(subset=["name"])[["name", "rank"]]
    .copy()
)

In [57]:
# number of distinct ranked institutions
len(df_ranked_aff)

201

In [58]:
# similarity score of every ranked institution, in df_ranked_aff row order
# (one score per institution over the whole sample, not per year)
sim_score_by_univ = [cal_sim(df_s, univ) for univ in list(df_ranked_aff.name)]

In [59]:
# Attach the scores, cast rank to int, and index the table by institution name.
# Not re-runnable as is: after the first run "name" is the index, so set_index("name") fails.
df_ranked_aff = (
    df_ranked_aff
    .assign(sim_score=sim_score_by_univ)
    .astype({"rank": "int"})
    .set_index("name")
)
df_ranked_aff.head()

Unnamed: 0_level_0,rank,sim_score
name,Unnamed: 1_level_1,Unnamed: 2_level_1
University of Toronto,71,0.208739
University of Chicago,9,0.225666
University of Oxford,28,0.220448
University of Michigan,73,0.211125
Cardiff University,385,0.361905


In [60]:
# Scatter of similarity score against institution rank, with a LOWESS trend line
# (frac=0.5 is passed through as the smoothing option).
fig = px.scatter(df_ranked_aff, 
                 x="rank", 
                 y="sim_score", 
                 trendline="lowess",
                 trendline_options=dict(frac=0.5))

# titles, dark theme, fixed size, and legend position
fig.update_layout(title_text = "Sim Score vs rank",
                  xaxis_title = "rank",
                  yaxis_title = "Sim Score",
                  template = "plotly_dark",
                  width = 800,
                  height = 600,
                  legend=dict(
                    yanchor="bottom",
                    y=1.00,
                    xanchor="left",
                    x=0.35
                    )
                  )
fig.show()

In [38]:
# sum of all freq equal to the number of docs
sum([topic_model.get_topic_freq(topic=i-1) for i in range(26)])

15400

In [493]:
# number of documents assigned to topic -1
topic_model.get_topic_freq(topic=-1)

4001

In [281]:
# number of sampled documents assigned to topic 20
int((df_s["topic"] == 20).sum())

78

In [504]:
# top (word, score) pairs of topic -1, useful when judging parameter choices
# NOTE(review): the previous comment ("select the best params") did not describe this call.
topic_model.get_topic(topic=-1)

[('model', 0.012883202647841381),
 ('growth', 0.012353077984280625),
 ('economic', 0.011588927976575852),
 ('firms', 0.01132543937807906),
 ('countries', 0.011183940862236376),
 ('show', 0.010724626462491265),
 ('results', 0.01024150923327465),
 ('income', 0.010203913712682889),
 ('social', 0.009226047289390658),
 ('also', 0.008846088013882203)]