In [1]:
# Mount Google Drive at /content/drive; the CSV read below and every file
# written later in this notebook live under that mount point (Colab only).
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import pandas as pd

# Number of CSV rows to load; the same value is embedded in the saved-model
# file name further down.
rows = 75000

# Read only the first `rows` records of the prepared dataset from Drive.
df = pd.read_csv(
  '/content/drive/MyDrive/Topic Modeling/final_data.csv',
  nrows=rows,
)
df.head()

Unnamed: 0,short_description,category,category_encode,cleaned_short_description,topic
0,Health experts said it is too early to predict...,U.S. NEWS,35,health expert said early predict demand match ...,13
1,He was subdued by passengers and crew when he ...,U.S. NEWS,35,subdued passenger crew fled aircraft confronta...,19
2,"""Until you have a dog you don't understand wha...",COMEDY,5,dog understand eaten,7
3,"""Accidentally put grown-up toothpaste on my to...",PARENTING,22,accidentally grownup toothpaste toddler toothb...,41
4,Amy Cooper accused investment firm Franklin Te...,U.S. NEWS,35,amy cooper accused investment firm franklin te...,11


In [3]:
# Discard the 'topic' column that ships with the CSV; a fresh 'topics' column
# is attached after the model has been fitted.
# Rebinding instead of inplace=True, together with errors='ignore', keeps the
# cell idempotent: running it a second time no longer raises KeyError.
df = df.drop(columns='topic', errors='ignore')
df.shape

(75000, 4)

In [4]:
%%capture

!pip install bertopic
!pip install sentence_transformers
!pip install cuml

In [6]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sentence_transformers import SentenceTransformer
from bertopic import BERTopic
from umap import UMAP
from hdbscan import HDBSCAN

# Apply the 'fivethirtyeight' matplotlib style sheet to figures drawn below.
plt.style.use('fivethirtyeight')
# Render matplotlib figures inline in the notebook output.
%matplotlib inline

In [7]:
# Keep only rows whose cleaned description contains at least one character,
# renumber the index from 0 and force the text column to str.
# The shape is printed before and after so the number of removed rows is visible.
print(df.shape)
mask = df['cleaned_short_description'].str.len().ge(1)
df = (
  df.loc[mask]
  .reset_index(drop=True)
  .assign(
    cleaned_short_description=lambda frame: frame['cleaned_short_description'].astype('str')
  )
)
print(df.shape)

(75000, 4)
(73361, 4)


In [8]:
# Sentence encoder used to embed every cleaned description.
embedding_model = SentenceTransformer("distilbert-base-nli-mean-tokens")

# Dimensionality reduction applied to the embeddings; random_state is fixed
# so repeated runs use the same seed.
umap_model = UMAP(
  n_neighbors=15,
  n_components=5,
  metric='cosine',
  random_state=21,
  min_dist=0.0,
)

# Density-based clustering of the reduced embeddings.
hdbscan_model = HDBSCAN(
  min_cluster_size=10,
  min_samples=10,
  metric='euclidean',
  prediction_data=True,
)

# Assemble the pipeline; calculate_probabilities=True is required by the
# cells below that read topic_model.probabilities_.
topic_model = BERTopic(
  embedding_model=embedding_model,
  umap_model=umap_model,
  hdbscan_model=hdbscan_model,
  calculate_probabilities=True,
)

# Fit on the cleaned descriptions and keep the per-document results.
topics, probabilities = topic_model.fit_transform(df['cleaned_short_description'])

Downloading:   0%|          | 0.00/690 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/190 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/3.99k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/550 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/122 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/265M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/450 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/229 [00:00<?, ?B/s]

In [9]:
# Persist the fitted model to Drive; the row count is embedded in the file name.
# NOTE(review): the file name is spelled "BARTopic" (not "BERTopic"). The load
# call near the end of this notebook uses the same spelling, so rename both
# together or not at all.
topic_model.save(f"/content/drive/MyDrive/Topic Modeling/BARTopic_model_{rows}")

  self._set_arrayXarray(i, j, x)


In [10]:
# Summary table of every topic found by the model (columns: Topic, Count, Name).
topic_model.get_topic_info()

Unnamed: 0,Topic,Count,Name
0,-1,43241,-1_said_year_woman_people
1,0,4732,0_trump_donald_administration_president
2,1,1150,1_food_eat_recipe_eating
3,2,642,2_bad_worse_scary_wrong
4,3,573,3_russia_russian_putin_vladimir
...,...,...,...
564,563,10,563_ceasefire_warring_gunsense_observance
565,564,10,564_book_publisher_upandcomer_kater
566,565,10,565_pittclooney_snakeskin_spud_despacitooo
567,566,10,566_hmmm_hmmmm_hmm_fam


In [11]:
# Same summary table, limited to its first 42 rows.
topic_model.get_topic_info().head(42)

Unnamed: 0,Topic,Count,Name
0,-1,43241,-1_said_year_woman_people
1,0,4732,0_trump_donald_administration_president
2,1,1150,1_food_eat_recipe_eating
3,2,642,2_bad_worse_scary_wrong
4,3,573,3_russia_russian_putin_vladimir
5,4,495,4_climate_oil_change_coal
6,5,366,5_democratic_democrat_presidential_primary
7,6,365,6_black_panther_african_matter
8,7,315,7_twitter_facebook_instagram_google
9,8,277,8_chug_benghazi_omg_doh


In [12]:
# Keep the first 42 rows of the topic summary for inspection in the next cell.
top_topics=topic_model.get_topic_info().head(42)

In [14]:
# Show the row for topic -1 (the label BERTopic assigns to outlier documents).
top_topics[top_topics['Topic'] == -1]

Unnamed: 0,Topic,Count,Name
0,-1,43241,-1_said_year_woman_people


In [17]:
# Per-topic bar charts for the top 42 topics.
topic_model.visualize_barchart(top_n_topics=42)

In [38]:
# Term-rank plot with a logarithmic scale enabled.
topic_model.visualize_term_rank(log_scale=True)

In [24]:
# Topic-distance plot restricted to the top 42 topics.
topic_model.visualize_topics(top_n_topics=42)

In [25]:
# Hierarchy plot of the top 42 topics.
topic_model.visualize_hierarchy(top_n_topics=42)

In [26]:
# Heatmap for the top 42 topics.
topic_model.visualize_heatmap(top_n_topics=42)

In [34]:
# Summarise row 0 of the fitted probability matrix.
# NOTE(review): row 0 is presumably the topic distribution of the first
# document only, not of the whole corpus - confirm that is what is intended.
first_doc_probs = topic_model.probabilities_[0]
print(
  f"Maximum Topic Probability: {np.round(first_doc_probs.max(), 6)}\n"
  f"Minimum Topic Probability: {np.round(first_doc_probs.min(), 6)}\n"
  f"Average Topic Probability: {np.round(first_doc_probs.mean(), 6)}"
)

Maximum Topic Probability: 0.001021
Minimum Topic Probability: 1.6e-05
Average Topic Probalility: 0.000175


In [35]:
# Plot the topic probabilities of row 0, hiding topics below the threshold.
# 0.000175 is the average probability printed by the previous cell; it is
# hard-coded, so it must be updated by hand if the model is refitted.
topic_model.visualize_distribution(topic_model.probabilities_[0], min_probability=0.000175)

In [37]:
# Build every figure once, then export each as a standalone HTML file on Drive.
top_topic_words = topic_model.visualize_barchart(top_n_topics=42)
term_score = topic_model.visualize_term_rank(log_scale=True)
topic_distances = topic_model.visualize_topics(top_n_topics=42)
topic_hierarchy = topic_model.visualize_hierarchy(top_n_topics=42)
topic_heatmap = topic_model.visualize_heatmap(top_n_topics=42)
chart = topic_model.visualize_distribution(topic_model.probabilities_[0], min_probability=0.000175)

# (figure, destination) pairs, written in the same order the figures were built.
for figure, html_path in (
  (top_topic_words, '/content/drive/MyDrive/Topic Modeling/top_topic_words.html'),
  (term_score, "/content/drive/MyDrive/Topic Modeling/term_score.html"),
  (topic_distances, "/content/drive/MyDrive/Topic Modeling/top_topics_distances.html"),
  (topic_hierarchy, "/content/drive/MyDrive/Topic Modeling/top_topics_hierarchy.html"),
  (topic_heatmap, "/content/drive/MyDrive/Topic Modeling/top_topics_heatmap.html"),
  (chart, "/content/drive/MyDrive/Topic Modeling/news_topic_probability_distribution.html"),
):
  figure.write_html(html_path)

In [39]:
# Copy the per-document topic assignments stored on the fitted model and
# attach them to the frame as a new 'topics' column.
# NOTE(review): this relies on df still having the same rows, in the same
# order, as when the model was fitted.
topic_prediction=topic_model.topics_[:]
df['topics']=topic_prediction
df.head()

Unnamed: 0,short_description,category,category_encode,cleaned_short_description,topics
0,Health experts said it is too early to predict...,U.S. NEWS,35,health expert said early predict demand match ...,-1
1,He was subdued by passengers and crew when he ...,U.S. NEWS,35,subdued passenger crew fled aircraft confronta...,529
2,"""Until you have a dog you don't understand wha...",COMEDY,5,dog understand eaten,1
3,"""Accidentally put grown-up toothpaste on my to...",PARENTING,22,accidentally grownup toothpaste toddler toothb...,1
4,Amy Cooper accused investment firm Franklin Te...,U.S. NEWS,35,amy cooper accused investment firm franklin te...,-1


In [41]:
def find_topic(docs, n_topics=5, model=None):
  """Print the topics most similar to ``docs`` together with their keywords.

  Args:
    docs: Text handed to the model's ``find_topics`` as the search query.
    n_topics: Maximum number of similar topics to request (passed as ``top_n``).
    model: Object exposing ``find_topics`` and ``get_topic``. When omitted,
      the notebook-level ``topic_model`` is used, as before.

  Returns:
    None. All results are written to stdout.
  """
  if model is None:
    # Preserve the original behaviour: fall back to the notebook-global model.
    model = topic_model

  similar_topics, similarity = model.find_topics(docs, top_n=n_topics)
  print(f"The top {len(similar_topics)} similar topics are {similar_topics}, and the similarities are {np.round(similarity, 2)}\n")

  # Iterate over what was actually returned rather than range(n_topics):
  # indexing by position raised IndexError whenever fewer than n_topics
  # topics came back.
  for topic_id in similar_topics:
    print(f'The top keywords for topic {topic_id} are: \n{model.get_topic(topic_id)}\n')

In [42]:
# Sample article text (about the Qatar 2022 World Cup) that is passed to
# find_topic() in the next cell as the query.
document='''

Qatar 2022 is unravelling to be the most unpredictable World Cup in years. Upsets have become a staple this tournament, with heavyweights Argentina and 
Germany already tasting defeats against Saudi Arabia and Japan respectively. Even Canada and Ghana nearly pulled off the unthinkable against Belgium and 
Portugal – only to be denied by a penalty save and an unfortunate slip by Inaki Williams. The scorelines of the Saudi and Japan games illustrate only a 
fraction of the bigger picture. The hunger, passion and infectious energy shown by the players has made a huge impression on everyone watching the games. 
During those matches, it looked as if eleven warriors entered the field, fearless in their quest to bring the giants to its knees. The Green Falcons and the 
Blue Samurai have shown that perceived inferiority means very little as long as they have their hearts set on bringing glory to their nations. These results 
reemphasise the need for the FIFA World Cup (FWC) to be open to everyone. This might also be the catalyst to finally awaken Asian football from their eternal 
slumber and propel it towards success in the world stage. Playing against lower ranked teams not only keep the elites grounded in reality, it also helps spread
the love of the game to new shores. With time, footballing infrastructures can develop, and, with growing fan interest, these nations might become footballing
forces to be reckoned with. The USA is a glaring example in this regard. Coming into the 1950 Brazil World Cup with a ragtag team of immigrants looking for 
American citizenship, their chances were written off from the start. That squad comprised of schoolteachers, mailmen and dishwashers. Yet, they defied all 
odds in their match-up against England and recorded a historic 1-0 win. US football has never looked back since. Although they didn't appear in the World Cup 
again until 1990, the seeds were sown for football to develop. Their efforts in developing professionalism, tactics and infrastructure have led to the 
formation of somewhat of a golden generation in US football today. Humane romanticism towards underdog stories have been around since the beginning of time, 
and football is no exception either. The feeling of revolt is especially turbocharged if the teams are demeaned. The game between Wales and Iran is a case in 
point. In an interview with BBC radio, former Wales midfielder Robbie Savage claimed, "It's only Iran… I could still play against them and win." Yet, it was 
Iran who emerged victorious, scoring two goals in the dying minutes of the game. Jürgen Klinsmann, German legend and former manager of the US national team, 
also made derogatory comments after Iran's win against Wales, calling their fouls "their culture". Making derogatory comments about teams is indicative of 
nothing but self-entitlement. It is a blatant display of arrogance and unsportsmanlike behaviour. The Disgrace of Gijon in 1982 is a humiliating episode in 
the history of World Cup. Needing to win by a slim margin, West Germany and Austria treated fans with a slugfest. Both teams refrained from attacking after 
Germany scored and took the lead. This ultimately eliminated the free-flowing Algeria from the group stages. Algeria's 1982 campaign was pivotal for the 
future of African football. Not only did they become a regional powerhouse afterwards, but their World Cup exploits also attracted sponsors to invest in 
African football with fruitful results. Since Cameroon's quarter-final appearance in 1990, teams like Nigeria, Senegal, Ghana, Morocco and Egypt have been 
constant threats in international football. Currently, Asian teams are following a similar trajectory to what Africa did three decades ago. In fact, the Japan 
team which defeated Germany had eight players playing in the top two German football divisions. Playing top tier European leagues has allowed Asian players 
to develop a sound understanding of tactics and technical skills, which has translated well in their international duties. By allowing more teams to play, 
football can be globalised to a bigger extent. Thankfully for fans, things seem to be on the right track as there has been talk of expanding the World Cup 
to 48 teams from 2026. With expanded formats, fans hope it will be the start of a new era – one where success isn't only limited to a handful of teams.

'''

In [43]:
# Print the 3 topics most similar to the sample article, with their keywords.
find_topic(docs=document, n_topics=3)

The top 3 similar topics are [170, 362, 69], and the similarities are [0.08 0.07 0.06]

The top keywords for topic 170 are: 
[('richest', 0.07745551098562825), ('wealthy', 0.07437315269974644), ('rich', 0.06913062100504944), ('millionaire', 0.05556253566979357), ('billionaire', 0.03195079228832774), ('powerful', 0.029605207714608405), ('wealthiest', 0.02877546354167724), ('overwhelmingly', 0.02692181160008152), ('fortune', 0.023343416649734605), ('womenwhose', 0.02247145298003518)]

The top keywords for topic 362 are: 
[('wwe', 0.2229044599013641), ('wrestler', 0.21718056825674784), ('wrestling', 0.18039578867791853), ('hulk', 0.1526481357315677), ('hogan', 0.14610527725206096), ('professional', 0.07255201897191242), ('jinder', 0.05019171249555303), ('knappenberger', 0.05019171249555303), ('mcmahon', 0.05019171249555303), ('mahals', 0.05019171249555303)]

The top keywords for topic 69 are: 
[('olympic', 0.14857991499353765), ('olympics', 0.09052483241486341), ('rio', 0.0692187623849708

In [45]:
# Reload the model written by the save cell earlier in this notebook; the
# path (including its "BARTopic" spelling) must match that cell exactly.
saved_model_path = f"/content/drive/MyDrive/Topic Modeling/BARTopic_model_{rows}"
model = BERTopic.load(saved_model_path)

In [46]:
# Draw the hierarchy plot from the reloaded model to check that it is usable
# after the save/load round trip.
model.visualize_hierarchy(top_n_topics=42)