In [1]:
%env TOKENIZERS_PARALLELISM=false
!pip install bertopic

env: TOKENIZERS_PARALLELISM=false
Collecting bertopic
  Obtaining dependency information for bertopic from https://files.pythonhosted.org/packages/06/49/f395e2e4d21dd49803494c8aec6087db61ea0ba211c6e5e57540b23334eb/bertopic-0.15.0-py2.py3-none-any.whl.metadata
  Downloading bertopic-0.15.0-py2.py3-none-any.whl.metadata (20 kB)
Collecting hdbscan>=0.8.29 (from bertopic)
  Downloading hdbscan-0.8.33.tar.gz (5.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.2/5.2 MB[0m [31m39.1 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l- \ | / - \ | / - done
[?25h  Getting requirements to build wheel ... [?25l- done
[?25h  Preparing metadata (pyproject.toml) ... [?25l- done
Collecting sentence-transformers>=0.4.1 (from bertopic)
  Downloading sentence-transformers-2.2.2.tar.gz (85 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m86.0/86.0 kB[0m [31m9.2 MB/s[0m eta [36m0:00:00[0m
[?25

In [2]:
import pandas as pd
df = pd.read_csv(filepath_or_buffer='/kaggle/input/global-news-dataset/data.csv', index_col=['article_id'],
                 parse_dates=['published_at']).drop(columns=['url', 'url_to_image'])
df['date'] = pd.to_datetime(df['published_at'].apply(func=lambda x: x.split()[0]))
df.head()

Unnamed: 0_level_0,source_id,source_name,author,title,description,published_at,content,category,full_content,date
article_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
81664,,Forbes,"Elizabeth Brownfield, Contributor, \n Elizabet...",Superstar Chef Yannick Alléno Brings Refined F...,Now open in Mayfair at Four Seasons Hotel Lond...,2023-11-01 03:27:21.000000,"Pavyllon London, at Four Seasons Hotel London ...",Monaco,"Pavyllon London, at Four Seasons Hotel London ...",2023-11-01
81666,,Eurosport.fr,,"Battu sur le fil à Nancy, Paris cède son trône","""Le Paris Basketball n\u0027est plus invincibl...",2023-10-05 21:36:49.000000,"T.J. Shorts, Paris Basketball\nCrédit: Imago",Monaco,,2023-10-05
81667,,CNA,,Nice claim top spot in Ligue 1 with late win a...,Nice moved into provisional first place in the...,2023-10-27 21:28:48.000000,Nice moved into provisional first place in the...,Monaco,Nice moved into provisional first place in the...,2023-10-27
81680,,Paul Tan's Automotive News,Mohan K Ramanujam,"Lotus reveals Type 136 First Edition e-bike, R...",Fusing Formula 1 engineering with electric bic...,2023-11-02 05:32:49.000000,Fusing Formula 1 engineering with electric bic...,Monaco,,2023-11-02
81683,,Autocar,Felix Page,New Mazda concept previews rotary-electric MX-...,Rotary-electric powertrain is said to allow fo...,2023-10-25 00:31:45.000000,Mazda has given a radical vision of what the f...,Monaco,,2023-10-25


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 73133 entries, 81664 to 261138
Data columns (total 10 columns):
 #   Column        Non-Null Count  Dtype         
---  ------        --------------  -----         
 0   source_id     14723 non-null  object        
 1   source_name   73133 non-null  object        
 2   author        64914 non-null  object        
 3   title         73093 non-null  object        
 4   description   72756 non-null  object        
 5   published_at  73133 non-null  object        
 6   content       73133 non-null  object        
 7   category      73100 non-null  object        
 8   full_content  26190 non-null  object        
 9   date          73133 non-null  datetime64[ns]
dtypes: datetime64[ns](1), object(9)
memory usage: 6.1+ MB


In [4]:
df.nunique()

source_id          61
source_name      2379
author          17608
title           70379
description     70683
published_at    59797
content         70291
category          257
full_content    25774
date               39
dtype: int64

In [5]:
from plotly.express import histogram
histogram(data_frame=df, x='date', color='source_id')

Our data has some odd distribution by source in time. The October articles mostly follow a weekly pattern, but the November articles are look very different.

In [6]:
from plotly.express import bar
bar(data_frame=df['source_name'].value_counts().to_frame().reset_index().sort_values(ascending=False, by='count').head(n=20),
    x='source_name', y='count', title='Top 20 sources by article count')

We could build a single topic model for the whole corpus, but since we have natural source buckets it probably makes sense to look at select sources individually.

In [7]:
from arrow import now
from bertopic import BERTopic
from plotly.express import histogram
from sklearn.feature_extraction.text import CountVectorizer

def topic_model(arg_df: pd.DataFrame, source: str, ):
    model = BERTopic(verbose=True, top_n_words=15, nr_topics=None, language='english', 
                     vectorizer_model=CountVectorizer(stop_words='english', lowercase=True, min_df=MIN_DF, max_df=MAX_DF, ))
    topics, probabilities = model.fit_transform(df[df['source_name'] == source]['content'].values)
    return model, topics, probabilities

MAX_DF = 0.7 # this is a guess and we should probably base it on something
MIN_DF = 2 # our documents are short and there is probably no point in keeping any word that appears twice or less
# we can dial this up and down for testing; a full run (1.0) takes nearly seven minutes
SAMPLE_FRACTION = 1.0

time_start = now()
etf_model, etf_topics, etf_probabilities = topic_model(arg_df=df, source='ETF Daily News')
print('After {} our model says we have {} topics.'.format(now() - time_start, len(set(etf_topics))))
print('And we have {} documents with no topic ({}%).'.format(etf_topics.count(-1), round(100 * etf_topics.count(-1)/len(etf_topics))))
etf_model.visualize_topics(height=800, width=1200)

Downloading (…)e9125/.gitattributes:   0%|          | 0.00/1.18k [00:00<?, ?B/s]

Downloading (…)_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Downloading (…)7e55de9125/README.md:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

Downloading (…)55de9125/config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

Downloading (…)ce_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

Downloading (…)125/data_config.json:   0%|          | 0.00/39.3k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

Downloading (…)nce_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading (…)e9125/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

Downloading (…)9125/train_script.py:   0%|          | 0.00/13.2k [00:00<?, ?B/s]

Downloading (…)7e55de9125/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)5de9125/modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

Batches:   0%|          | 0/181 [00:00<?, ?it/s]

2023-11-13 15:17:54,137 - BERTopic - Transformed documents to Embeddings
2023-11-13 15:18:34,501 - BERTopic - Reduced dimensionality
2023-11-13 15:18:34,798 - BERTopic - Clustered reduced embeddings


After 0:02:27.152433 our model says we have 115 topics.
And we have 786 documents with no topic (14%).


In [8]:
time_start = now()
bbc_model, bbc_topics, bbc_probabilities = topic_model(arg_df=df, source='BBC News')
print('After {} our model says we have {} topics.'.format(now() - time_start, len(set(bbc_topics))))
print('And we have {} documents with no topic ({}%).'.format(bbc_topics.count(-1), round(100 * bbc_topics.count(-1)/len(bbc_topics))))
bbc_model.visualize_topics(height=800, width=1200)

Batches:   0%|          | 0/70 [00:00<?, ?it/s]

2023-11-13 15:19:29,361 - BERTopic - Transformed documents to Embeddings
2023-11-13 15:19:41,617 - BERTopic - Reduced dimensionality
2023-11-13 15:19:41,724 - BERTopic - Clustered reduced embeddings


After 0:00:59.803175 our model says we have 23 topics.
And we have 300 documents with no topic (13%).


Almost all of the BBC articles are either about sports or about the conflict in Gaza and Israel.

In [9]:
time_start = now()
phys_model, phys_topics, phys_probabilities = topic_model(arg_df=df, source='Phys.Org')
print('After {} our model says we have {} topics.'.format(now() - time_start, len(set(phys_topics))))
print('And we have {} documents with no topic ({}%).'.format(phys_topics.count(-1), round(100 * phys_topics.count(-1)/len(phys_topics))))
phys_model.visualize_topics(height=800, width=1200)

Batches:   0%|          | 0/25 [00:00<?, ?it/s]

2023-11-13 15:19:54,463 - BERTopic - Transformed documents to Embeddings
2023-11-13 15:19:59,707 - BERTopic - Reduced dimensionality
2023-11-13 15:19:59,747 - BERTopic - Clustered reduced embeddings


After 0:00:16.496377 our model says we have 17 topics.
And we have 188 documents with no topic (24%).


Funny how the social science articles end up in their own topic cluster.

In [10]:
time_start = now()
rt_model, rt_topics, rt_probabilities = topic_model(arg_df=df, source='RT')
print('After {} our model says we have {} topics.'.format(now() - time_start, len(set(rt_topics))))
print('And we have {} documents with no topic ({}%).'.format(rt_topics.count(-1), round(100 * rt_topics.count(-1)/len(rt_topics))))
rt_model.visualize_topics(height=800, width=1200)

Batches:   0%|          | 0/20 [00:00<?, ?it/s]

2023-11-13 15:20:09,592 - BERTopic - Transformed documents to Embeddings
2023-11-13 15:20:13,777 - BERTopic - Reduced dimensionality
2023-11-13 15:20:13,809 - BERTopic - Clustered reduced embeddings


After 0:00:12.640852 our model says we have 12 topics.
And we have 181 documents with no topic (29%).


Very broadly the RT articles are either in one of the domestic issues clusters or in the foreign affairs cluster.