In [13]:
pip install bertopic



In [10]:
# Imports used throughout the notebook
import pandas as pd
from bertopic import BERTopic
import torch
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

from string import punctuation
from transformers import BertTokenizerFast, AutoModelForSequenceClassification
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, TransformerMixin

In [11]:
# word_tokenize needs NLTK's Punkt tokenizer data. Newer NLTK releases look up the
# 'punkt_tab' resource instead of the pickled 'punkt' models, so fetch both to keep
# the notebook runnable on either version.
nltk.download('punkt')
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [14]:
# Load the article corpus into a DataFrame.
# NOTE(review): the rendered frame shows sequences such as "Ã©" and "Ã¬" in the
# original-language columns, which usually means a UTF-8 file was decoded as
# ISO-8859-1 — confirm the file's real encoding.
csv_path = "/content/majestic_data.csv"
df = pd.read_csv(csv_path, encoding='ISO-8859-1')
df

Unnamed: 0.1,Unnamed: 0,version,headline,news_source,datePublished,region,articleSection,length,byline,text,highlight,translated,country,datePublished_clean
0,1,MAJESTIC,Het Lombokplein wordt te gevaarlijk'; Bus- en ...,de telegraaf,"April 19, 2023 Wednesday",Utrecht,REGIO; Blz. 12,423 words,Michiel,Bus- en tramkruising terug naar tekentafel doo...,,Bus and tram intersection back to drawing boar...,NL,19/04/2023
1,2,MAJESTIC,Utrecht is echt een fietshemel'; Route Vuelta ...,de telegraaf,"April 20, 2023 Thursday",Nederland,BINNENLAND; Blz. 12,276 words,Eric Roeske,Route Vuelta ligt er strak bij door Eric Roe...,,Route Vuelta is in good shape by Eric Roeske U...,NL,20/04/2023
2,4,MAJESTIC,"Als de auto wegvalt, crasht de economie'; Bran...",de telegraaf,"April 21, 2023 Friday",Haaglanden,REGIO; Blz. 13,447 words,Ingrid de Groot,Brandbrief honderden ondernemers over plannen ...,,Urgent letter to hundreds of entrepreneurs abo...,NL,21/04/2023
3,5,MAJESTIC,Beseffen jullie wel hoe perfÃ©ct Amsterdam is?...,nrc handelsblad,"April 22, 2023 Saturday",1ste Editie,AMSTERDAM; Blz. 1,1992 words,Stan Putman Foto Simon Lenskens,ABSTRACT Not Just Bikes De filmpjes die de Can...,,ABSTRACT Not Just Bikes The videos that Canadi...,NL,22/04/2023
4,6,MAJESTIC,Bewijs dat betaalbare woning nog steeds kan'; ...,de telegraaf,"April 23, 2023 Sunday",Nederland,FINANCIEEL; Blz. 25,627 words,Yteke de Jong,Aannemer Van Wijnen schroeft bouwproductie op ...,,Contractor Van Wijnen increases construction p...,NL,23/04/2023
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5356,5431,MAJESTIC,Mi dica per quale partito ha votato; Il sindac...,italiaoggi,"December 28, 2016 Wednesday",,PRIMO PIANO; Pag. 6,1081 words,Carlo Valentini Twitter: @cavalent,Bando pubblico: chi vuole essere assunto (per ...,La difesa: il prescelto deve avere la mia fiducia,Public announcement: who wants to be hired (fo...,IT,28/12/2016
5357,5432,MAJESTIC,"Multiutility sÃ¬, ma integrata; Le aziende loc...",italiaoggi,"March 6, 2009 Friday",,"RAPPORTO BELLUNO, TREVISO E VICENZA; Pag. 25",849 words,Samuele Nottegar,"Acqua, rifiuti, energia: in quale direzione si...","Le societÃ che si muovono tra acqua, rifiuti,...","Water, waste, energy: in which direction are V...",IT,06/03/2009
5358,5433,MAJESTIC,risiko agricolo,italiaoggi,"January 22, 2014 Wednesday",,ATTUALITÃ; Pag. 36,926 words,,Granarolo entra in un segmento strategico e in...,,Granarolo enters a strategic and rapidly growi...,IT,22/01/2014
5359,5434,MAJESTIC,commenti,italiaoggi,"August 15, 2018 Wednesday",,PRIMO PIANO; Pag. 11,676 words,,In Italia l'unico ponte che non crolla Ã¨ quel...,,In Italy the only bridge that doesn't collapse...,IT,15/08/2018


In [15]:
# Keep only the source, date, country and English-translation columns; the
# original-language text and layout metadata are not used by the cells below.
unused_columns = [
    'Unnamed: 0', 'version', 'headline', 'region', 'articleSection',
    'length', 'byline', 'text', 'highlight',
]
df = df.drop(labels=unused_columns, axis="columns")
df

Unnamed: 0,news_source,datePublished,translated,country,datePublished_clean
0,de telegraaf,"April 19, 2023 Wednesday",Bus and tram intersection back to drawing boar...,NL,19/04/2023
1,de telegraaf,"April 20, 2023 Thursday",Route Vuelta is in good shape by Eric Roeske U...,NL,20/04/2023
2,de telegraaf,"April 21, 2023 Friday",Urgent letter to hundreds of entrepreneurs abo...,NL,21/04/2023
3,nrc handelsblad,"April 22, 2023 Saturday",ABSTRACT Not Just Bikes The videos that Canadi...,NL,22/04/2023
4,de telegraaf,"April 23, 2023 Sunday",Contractor Van Wijnen increases construction p...,NL,23/04/2023
...,...,...,...,...,...
5356,italiaoggi,"December 28, 2016 Wednesday",Public announcement: who wants to be hired (fo...,IT,28/12/2016
5357,italiaoggi,"March 6, 2009 Friday","Water, waste, energy: in which direction are V...",IT,06/03/2009
5358,italiaoggi,"January 22, 2014 Wednesday",Granarolo enters a strategic and rapidly growi...,IT,22/01/2014
5359,italiaoggi,"August 15, 2018 Wednesday",In Italy the only bridge that doesn't collapse...,IT,15/08/2018


In [17]:
# Count missing values per remaining column
df.isnull().sum()

news_source            0
datePublished          0
translated             0
country                0
datePublished_clean    0
dtype: int64

In [18]:
# Preview the English translations that are preprocessed and modelled below
df['translated']

0       Bus and tram intersection back to drawing boar...
1       Route Vuelta is in good shape by Eric Roeske U...
2       Urgent letter to hundreds of entrepreneurs abo...
3       ABSTRACT Not Just Bikes The videos that Canadi...
4       Contractor Van Wijnen increases construction p...
                              ...                        
5356    Public announcement: who wants to be hired (fo...
5357    Water, waste, energy: in which direction are V...
5358    Granarolo enters a strategic and rapidly growi...
5359    In Italy the only bridge that doesn't collapse...
5360    Let's not call them trips for the elderly. Whe...
Name: translated, Length: 5361, dtype: object

In [19]:
# Fetch NLTK's stop-word corpus and build the set of tokens to discard:
# English stop words plus every single ASCII punctuation character.
nltk.download('stopwords')
stop_words = set(stopwords.words('english')) | set(punctuation)


def preprocess_text(text):
    """Return ``text`` lower-cased, tokenized, and stripped of stop-word tokens.

    The input is lower-cased and split with ``word_tokenize``; tokens found in
    the module-level ``stop_words`` set are dropped and the remaining tokens
    are joined back together with single spaces.
    """
    kept = []
    for token in word_tokenize(text.lower()):
        if token in stop_words:
            continue
        kept.append(token)
    return ' '.join(kept)


# Preprocess every translated article into a new column
df['processed_text'] = df['translated'].apply(preprocess_text)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [20]:
# Initialize BERTopic for the whole (all-country) corpus.
# fit_transform below returns per-document topic ids plus a probabilities object.
topic_model = BERTopic(language="english", calculate_probabilities=True, verbose=True)

# Fit the model on preprocessed text, passed as a plain list of strings
# (the input type fit_transform documents).
# NOTE(review): no random seed is fixed, so topic ids may differ between runs, while
# later cells hard-code topic ids taken from one particular fit — confirm after re-running.
topics, probabilities = topic_model.fit_transform(df['processed_text'].tolist())

# Add topics to the DataFrame
df['topic'] = topics

# Rich display instead of print(df): the plain-text dump wraps columns across
# several blocks and is hard to read.
df

2024-06-20 08:23:39,715 - BERTopic - Embedding - Transforming documents to embeddings.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Batches:   0%|          | 0/168 [00:00<?, ?it/s]

2024-06-20 08:24:10,069 - BERTopic - Embedding - Completed ✓
2024-06-20 08:24:10,070 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-06-20 08:24:44,017 - BERTopic - Dimensionality - Completed ✓
2024-06-20 08:24:44,020 - BERTopic - Cluster - Start clustering the reduced embeddings
2024-06-20 08:24:45,844 - BERTopic - Cluster - Completed ✓
2024-06-20 08:24:45,854 - BERTopic - Representation - Extracting topics from clusters using representation models.
2024-06-20 08:24:48,160 - BERTopic - Representation - Completed ✓


          news_source                datePublished  \
0        de telegraaf     April 19, 2023 Wednesday   
1        de telegraaf      April 20, 2023 Thursday   
2        de telegraaf        April 21, 2023 Friday   
3     nrc handelsblad      April 22, 2023 Saturday   
4        de telegraaf        April 23, 2023 Sunday   
...               ...                          ...   
5356       italiaoggi  December 28, 2016 Wednesday   
5357       italiaoggi         March 6, 2009 Friday   
5358       italiaoggi   January 22, 2014 Wednesday   
5359       italiaoggi    August 15, 2018 Wednesday   
5360        la stampa        July 7, 2016 Thursday   

                                             translated country  \
0     Bus and tram intersection back to drawing boar...      NL   
1     Route Vuelta is in good shape by Eric Roeske U...      NL   
2     Urgent letter to hundreds of entrepreneurs abo...      NL   
3     ABSTRACT Not Just Bikes The videos that Canadi...      NL   
4     Contractor

In [21]:
# datePublished_clean holds day-first date strings such as "19/04/2023".
timestamps = df.datePublished_clean.to_list()
text = df.processed_text.to_list()

# State the date format explicitly so the day/month order is never left to
# inference (an ambiguous first value such as "04/05/2023" could otherwise be
# read month-first). Timestamps are grouped into 100 bins.
topics_over_time = topic_model.topics_over_time(
    text,
    timestamps,
    nr_bins=100,
    datetime_format="%d/%m/%Y",
)

100it [00:42,  2.34it/s]


In [27]:
# Summary table of the fitted topics (Topic id, Count, Name, Representation).
topic_info = topic_model.get_topic_info()

# Rich display instead of print(): the plain-text dump wraps and truncates columns.
topic_info

    Topic  Count                                     Name  \
0      -1   1831                  -1_also_city_one_people   
1       0    308                 0_madrid_lanes_bike_city   
2       1    256          1_including_flights_offers_trip   
3       2    224          2_mainz_traffic_city_department   
4       3    148         3_market_business_investors_said   
..    ...    ...                                      ...   
81     80     11           80_parking_spaces_space_garage   
82     81     11          81_climate_macron_pollution_air   
83     82     11       82_delivery_cajoo_deliveries_paris   
84     83     11  83_delivery_companies_vehicles_packages   
85     84     11    84_pollution_particles_turin_electric   

                                       Representation  \
0   [also, city, one, people, new, cycle, cycling,...   
1   [madrid, lanes, bike, city, lane, council, bic...   
2   [including, flights, offers, trip, holiday, ho...   
3   [mainz, traffic, city, department, 

In [28]:
# Plot the topics of the all-country model in 2-D; the cluster positions
# discussed in the text below refer to this figure.
topic_model.visualize_topics()

Our analysis reveals several distinct clusters, each characterized by different thematic information. There are approximately seven primary clusters of interest. In the top right quadrant, a cluster pertaining to military conflicts is observed. The term "cycle" appears to have a different connotation in this context and is thus deemed irrelevant to our primary analysis. Adjacent to this, in the upper-center-right area, is a cluster focused on France and urban cycling.

The model has also identified clusters corresponding to cycling in the Netherlands, Germany, Italy, the United Kingdom, and Spain, aligning with our initial dataset. Additionally, there is a cluster devoted to sports-related topics, prominently featuring the three major pro-tour races: the Tour de France, the Vuelta a España, and the Giro d'Italia. This cluster also includes topics related to football and the Olympic Games.

Closer to the center, there is a significant cluster encompassing topics related to institutions and infrastructure.

## Visualize Topics over Time

In [26]:
# Hand-picked topic ids to show in the time-series plot.
# NOTE(review): these ids come from one specific fit of topic_model — confirm
# they still point at the intended topics after re-fitting.
topics_to_plot = [
    0, 2, 3, 11, 14, 16, 18, 20, 21, 23, 26, 29, 34, 37,
    40, 42, 46, 49, 50, 53, 54, 55, 56, 58, 59, 60, 63,
    64, 67, 70, 71, 73, 76, 79, 80, 81, 82, 83, 84,
]
topic_model.visualize_topics_over_time(topics_over_time, topics=topics_to_plot)

## Visualize Topic Hierarchy

The topics that were created can be hierarchically reduced. To understand the potential hierarchical structure of the topics, we can use `scipy.cluster.hierarchy` to create clusters and visualize how they relate to one another. This can help in selecting an appropriate `nr_topics` when reducing the number of topics that you have created.

In [30]:
# Show how the topics group hierarchically, limited via top_n_topics=50.
topic_model.visualize_hierarchy(top_n_topics=50)

In [31]:
# Bar-chart view of the topics, limited via top_n_topics=15.
topic_model.visualize_barchart(top_n_topics=15)

## Visualize Topic Similarity
Having generated topic embeddings, through both c-TF-IDF and embeddings, we can create a similarity matrix by simply applying cosine similarities through those topic embeddings. The result will be a matrix indicating how similar certain topics are to each other.

In [48]:
# Topic ids included in the similarity heatmap.
# NOTE(review): ids are tied to one specific fit of topic_model — confirm after re-fitting.
heatmap_topics = [
    0, 2, 3, 11, 14, 16, 18, 20, 21, 23, 26, 29, 34, 37,
    40, 42, 46, 49, 50, 53, 54, 55, 56, 58, 59, 60, 63,
    64, 67, 70, 71, 73, 76, 79, 80, 81, 82, 83, 84,
]
topic_model.visualize_heatmap(
    topics=heatmap_topics,
    n_clusters=20,
    width=1000,
    height=1000,
)

In [47]:
# Calculate the topic distributions on a token-level
topic_distr, topic_token_distr = topic_model.approximate_distribution(text, calculate_tokens=True)

# Visualize the token-level distributions for the second document (index 1).
# The result gets its own name: binding it to `df` replaced the article DataFrame,
# which breaks the per-country `df[df['country'] == ...]` cells further down when
# the notebook is run top to bottom.
token_distr_view = topic_model.visualize_approximate_distribution(text[1], topic_token_distr[1])
token_distr_view

100%|██████████| 6/6 [00:46<00:00,  7.75s/it]


Unnamed: 0,route,vuelta,good,shape,eric,roeske,utrecht,organization,spanish,vuelta.1,netherlands,visit,various,start,finish,locations,utrecht.1,breda,den,bosch,cycling,race,starts,friday,august,19,jaarbeurs,team,time,trial,team.1,presentation,day,vredenburg,team.2,time.1,trial.1,combination,old,new,visiting,ledig,erf,maliebaan,also,overvecht,leidsche,rijn,says,utrecht.2,ron,looij,dutch,project,manager,technology,safety,mobility,tour,spain,day.1,include,grebbeberg,den.1,bosch.1,mountain,jersey,amerongse,berg,finish.1,leuvenlaan,utrecht.3,science,park,proud,utrecht.4,city,three,receives,major,cycling.1,tours,spanish.1,marketing,director,charles,ojalvo,happy,vuelta.2,visiting.1,cathedral,city.1,two,years,utrecht.5,truly,cycling.2,heaven,therefore,city.2,future,still,learn,lot,spain.1,convinced,vuelta.3,netherlands.1,promote,cycling.3,spain.2,sustainability,important,utrecht.6,councilor,klaas,verschuure,received,delegation,city.3,office,tuesday,evening,thanks,route.1,allow,many,utrecht.7,residents,possible,enjoy,vuelta.4,week,ago,also.1,completed,route.2,informed,utrecht.8,route.3,nice,tight,bring,vuelta.5,team.3,time.2,trial.2,combination.1,old.1,new.1,pdf,file,document
0_madrid_lanes_bike_city,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.106,0.208,0.208,0.208,0.103,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2_mainz_traffic_city_department,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.105,0.105,0.105,0.105,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5_utrecht_municipality_van_cyclists,0.0,0.0,0.0,0.225,0.458,0.7,0.988,0.763,0.53,0.287,0.0,0.0,0.0,0.311,0.552,0.789,1.011,0.701,0.459,0.222,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.318,0.584,0.774,0.954,0.636,0.37,0.18,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.189,0.399,0.631,0.939,1.058,1.226,1.362,1.401,1.093,0.714,0.346,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.556,0.914,1.285,1.554,0.998,0.639,0.269,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.32,0.674,0.939,1.147,0.827,0.473,0.208,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.432,0.866,1.356,1.768,1.337,0.903,0.412,0.0,0.0,0.0,0.0,0.317,0.658,0.982,1.293,0.975,0.635,0.311,0.0,0.0,0.0,0.0,0.0,0.0,0.103,0.23,0.23,0.23,0.127
6_olympic_cycling_team_london,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.11,0.11,0.11,0.11,0.0,0.0,0.0,0.143,0.143,0.143,0.143,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
12_tour_france_de_stage,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.159,0.345,0.575,0.786,0.627,0.441,0.211,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
14_bicycle_bicycles_parking_amsterdam,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.101,0.101,0.101,0.101,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
17_amsterdam_city_van_netherlands,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.103,0.103,0.103,0.103,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.102,0.102,0.102,0.102,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.105,0.206,0.206,0.206,0.1
18_accidents_road_speed_roads,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.119,0.119,0.119,0.119,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
19_vuelta_contador_tour_spanish,0.191,0.342,0.342,0.342,0.151,0.0,0.228,0.456,0.69,0.861,0.633,0.405,0.171,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.12,0.12,0.12,0.12,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.11,0.248,0.372,0.372,0.262,0.124,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.126,0.236,0.337,0.337,0.211,0.1,0.109,0.219,0.365,0.521,0.412,0.302,0.156,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.191,0.369,0.522,0.725,0.534,0.356,0.203,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.189,0.39,0.595,0.83,0.641,0.441,0.236,0.0,0.0,0.0,0.0,0.0,0.146,0.332,0.58,0.834,0.688,0.502,0.254,0.0,0.0,0.0,0.0,0.0,0.0
31_giro_race_italian_coppi,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.115,0.115,0.115,0.115,0.0,0.0,0.0,0.108,0.108,0.108,0.108,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.101,0.101,0.101,0.101,0.0,0.0,0.0,0.0,0.0,0.0


## Topics in the Netherlands


In [33]:
# Rows of the corpus whose country code is 'NL'
df_nl = df.loc[df['country'].eq('NL')]

In [34]:
# Initialize BERTopic for NL
topic_model_nl = BERTopic(language="english", calculate_probabilities=True, verbose=True)

# Fit the model on the preprocessed NL text, passed as a plain list of strings.
# Note: `probabilities` is rebound by every per-country fit in this notebook.
topics_nl, probabilities = topic_model_nl.fit_transform(df_nl['processed_text'].tolist())

# Add topics to the DataFrame. df_nl is a filtered slice of df, so build a new
# frame with assign() instead of setting a column on the slice, which pandas
# flags with SettingWithCopyWarning.
df_nl = df_nl.assign(topic=topics_nl)

2024-06-20 08:33:30,626 - BERTopic - Embedding - Transforming documents to embeddings.


Batches:   0%|          | 0/23 [00:00<?, ?it/s]

2024-06-20 08:33:35,450 - BERTopic - Embedding - Completed ✓
2024-06-20 08:33:35,453 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-06-20 08:33:40,064 - BERTopic - Dimensionality - Completed ✓
2024-06-20 08:33:40,067 - BERTopic - Cluster - Start clustering the reduced embeddings
2024-06-20 08:33:40,159 - BERTopic - Cluster - Completed ✓
2024-06-20 08:33:40,168 - BERTopic - Representation - Extracting topics from clusters using representation models.
2024-06-20 08:33:40,623 - BERTopic - Representation - Completed ✓


In [35]:
# Plot the topics of the NL model; the clusters described below refer to this figure.
topic_model_nl.visualize_topics()

In this model, fitted on the Netherlands subsample, we can observe three primary clusters. The top-right cluster focuses on the institutional aspects of cycling. The left-center cluster contains various topics, predominantly related to policy matters. The bottom-center cluster encompasses discussions about cycling in different countries.

## Topics in UK

In [36]:
# Rows of the corpus whose country code is 'UK'
df_uk = df.loc[df['country'].eq('UK')]

In [37]:
# Initialize BERTopic for UK
topic_model_uk = BERTopic(language="english", calculate_probabilities=True, verbose=True)

# Fit the model on the preprocessed UK text, passed as a plain list of strings.
# Note: `probabilities` is rebound by every per-country fit in this notebook.
topics_uk, probabilities = topic_model_uk.fit_transform(df_uk['processed_text'].tolist())

# Add topics to the DataFrame. df_uk is a filtered slice of df, so build a new
# frame with assign() instead of setting a column on the slice, which pandas
# flags with SettingWithCopyWarning.
df_uk = df_uk.assign(topic=topics_uk)

2024-06-20 08:33:41,861 - BERTopic - Embedding - Transforming documents to embeddings.


Batches:   0%|          | 0/31 [00:00<?, ?it/s]

2024-06-20 08:33:48,122 - BERTopic - Embedding - Completed ✓
2024-06-20 08:33:48,125 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-06-20 08:33:53,916 - BERTopic - Dimensionality - Completed ✓
2024-06-20 08:33:53,922 - BERTopic - Cluster - Start clustering the reduced embeddings
2024-06-20 08:33:54,037 - BERTopic - Cluster - Completed ✓
2024-06-20 08:33:54,044 - BERTopic - Representation - Extracting topics from clusters using representation models.
2024-06-20 08:33:55,167 - BERTopic - Representation - Completed ✓


In [38]:
# Plot the topics of the UK model; the themes described below refer to this figure.
topic_model_uk.visualize_topics()

This graph for UK topics does not exhibit strictly defined clusters; however, it encompasses the same themes as the previous model but within the UK context. The primary topics include transport infrastructure, government and institutions, and road cycling as a sport.

## Topics in Germany

In [39]:
# Rows whose country code is 'DE'. Take an explicit copy so that adding the
# 'topic' column below modifies an independent frame instead of a slice of df
# (avoids pandas' SettingWithCopyWarning).
df_de = df[df['country'] == 'DE'].copy()

# Initialize BERTopic for DE
topic_model_de = BERTopic(language="english", calculate_probabilities=True, verbose=True)

# Fit the model on the preprocessed DE text, passed as a plain list of strings.
# Note: `probabilities` is rebound by every per-country fit in this notebook.
topics_de, probabilities = topic_model_de.fit_transform(df_de['processed_text'].tolist())

# Add topics to the DataFrame
df_de['topic'] = topics_de

2024-06-20 08:33:56,569 - BERTopic - Embedding - Transforming documents to embeddings.


Batches:   0%|          | 0/31 [00:00<?, ?it/s]

2024-06-20 08:34:01,387 - BERTopic - Embedding - Completed ✓
2024-06-20 08:34:01,389 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-06-20 08:34:05,696 - BERTopic - Dimensionality - Completed ✓
2024-06-20 08:34:05,698 - BERTopic - Cluster - Start clustering the reduced embeddings
2024-06-20 08:34:05,808 - BERTopic - Cluster - Completed ✓
2024-06-20 08:34:05,815 - BERTopic - Representation - Extracting topics from clusters using representation models.
2024-06-20 08:34:06,370 - BERTopic - Representation - Completed ✓


In [40]:
# Plot the topics of the DE model; the clusters described below refer to this figure.
topic_model_de.visualize_topics()

In the German subsample, clusters are more distinctly defined than in the previous (UK) model. The top-left cluster pertains to transport, traffic, and construction matters. The right-center cluster focuses on urban cycling, while the bottom-center cluster addresses urban planning and public spaces.

## Topics in France

In [41]:
# Rows whose country code is 'FR'. Take an explicit copy so that adding the
# 'topic' column below modifies an independent frame instead of a slice of df
# (avoids pandas' SettingWithCopyWarning).
df_fr = df[df['country'] == 'FR'].copy()

# Initialize BERTopic for FR
topic_model_fr = BERTopic(language="english", calculate_probabilities=True, verbose=True)

# Fit the model on the preprocessed FR text, passed as a plain list of strings.
# Note: `probabilities` is rebound by every per-country fit in this notebook.
topics_fr, probabilities = topic_model_fr.fit_transform(df_fr['processed_text'].tolist())

# Add topics to the DataFrame
df_fr['topic'] = topics_fr

2024-06-20 08:34:08,192 - BERTopic - Embedding - Transforming documents to embeddings.


Batches:   0%|          | 0/28 [00:00<?, ?it/s]

2024-06-20 08:34:13,396 - BERTopic - Embedding - Completed ✓
2024-06-20 08:34:13,399 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-06-20 08:34:17,171 - BERTopic - Dimensionality - Completed ✓
2024-06-20 08:34:17,173 - BERTopic - Cluster - Start clustering the reduced embeddings
2024-06-20 08:34:17,244 - BERTopic - Cluster - Completed ✓
2024-06-20 08:34:17,249 - BERTopic - Representation - Extracting topics from clusters using representation models.
2024-06-20 08:34:17,702 - BERTopic - Representation - Completed ✓


In [42]:
# Plot the topics of the FR model; the clusters described below refer to this figure.
topic_model_fr.visualize_topics()

In the French subsample, all the previously discussed topics are consolidated into one primary cluster, encompassing delivery and e-scooters, road cycling, and emissions. Additionally, there is a distinct cluster at the bottom center that focuses on policy and strategies. This cluster combines discussions on attacks and war with topics related to sustainability.

## Topics in Italy

In [43]:
# Rows whose country code is 'IT'. Take an explicit copy so that adding the
# 'topic' column below modifies an independent frame instead of a slice of df
# (avoids pandas' SettingWithCopyWarning). The bare `df_it` expression that
# followed was removed: mid-cell it displayed nothing.
df_it = df[df['country'] == 'IT'].copy()

# Initialize BERTopic for IT
topic_model_it = BERTopic(language="english", calculate_probabilities=True, verbose=True)

# Fit the model on the preprocessed IT text, passed as a plain list of strings.
# Note: `probabilities` is rebound by every per-country fit in this notebook.
topics_it, probabilities = topic_model_it.fit_transform(df_it['processed_text'].tolist())

# Add topics to the DataFrame
df_it['topic'] = topics_it

2024-06-20 08:34:19,129 - BERTopic - Embedding - Transforming documents to embeddings.


Batches:   0%|          | 0/28 [00:00<?, ?it/s]

2024-06-20 08:34:24,534 - BERTopic - Embedding - Completed ✓
2024-06-20 08:34:24,539 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-06-20 08:34:28,948 - BERTopic - Dimensionality - Completed ✓
2024-06-20 08:34:28,950 - BERTopic - Cluster - Start clustering the reduced embeddings
2024-06-20 08:34:29,030 - BERTopic - Cluster - Completed ✓
2024-06-20 08:34:29,037 - BERTopic - Representation - Extracting topics from clusters using representation models.
2024-06-20 08:34:29,380 - BERTopic - Representation - Completed ✓


In [44]:
# Plot the topics of the IT model; the clusters described below refer to this figure.
topic_model_it.visualize_topics()

The Italian subsample features several distinct clusters. The top-left cluster pertains to municipalities and ministries. The center-left cluster focuses on cycling infrastructure, combined with topics related to the Italian pro-tour race, the Giro d'Italia. The bottom-center cluster includes discussions on transport pollution, construction, and design.