In [21]:
import os
import pandas as pd
from bertopic import BERTopic
from src import preprocess_data, extract_date_days, generate_topic_labels, save_result_topics

# International news

Read data

In [2]:
# The workbook lives one directory above the notebook.
file_path = os.path.join("..", "data.xlsx")

# Load only the "Міжнародні" sheet of the workbook.
data = pd.read_excel(file_path, sheet_name="Міжнародні")

**Analyze data**

In [3]:
# Preview the first five rows to see which columns are available.
data.head(n=5)

Unnamed: 0,Дата,Джерело,Заголовок,Опис,Посилання,Автор,Популярність джерела,Мова,Країна
0,2025-02-13 13:07:18,Reuters,"""50% battle-ready"": Germany misses military ta...",The German army's battle-readiness is less tha...,https://www.reuters.com/world/europe/50-battle...,Sabine Siebold,101,en,GB
1,2025-02-13 13:09:45,The Straits Times,"""50% battle-ready"": Germany misses military ta...",BERLIN - The German army's battle-readiness is...,https://www.straitstimes.com/world/europe/50-b...,,1187,en,SG
2,2025-02-13 13:06:39,AOL,"""50% battle-ready"": Germany misses military ta...",The German army's battle-readiness is less tha...,https://www.aol.com/news/50-battle-ready-germa...,Sabine Siebold,295,en,US
3,2025-02-15 17:09:24,NDTV,"""Armed Forces Of Europe Must Be Created"": Zele...",Ukrainian President Volodymyr Zelensky called ...,https://www.ndtv.com/world-news/armed-forces-o...,Agence France-Presse,841,en,IN
4,2025-02-09 11:21:49,NDTV,"""Can't Confirm Or Deny"": Kremlin On Reports Of...",The Kremlin on Sunday declined to confirm or d...,https://www.ndtv.com/world-news/cant-confirm-o...,Agence France-Presse,841,en,IN


In [4]:
# Column dtypes and non-null counts, used to spot columns with missing values.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9346 entries, 0 to 9345
Data columns (total 9 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   Дата                  9346 non-null   object
 1   Джерело               9339 non-null   object
 2   Заголовок             9346 non-null   object
 3   Опис                  7614 non-null   object
 4   Посилання             9346 non-null   object
 5   Автор                 8170 non-null   object
 6   Популярність джерела  9346 non-null   int64 
 7   Мова                  9346 non-null   object
 8   Країна                9346 non-null   object
dtypes: int64(1), object(8)
memory usage: 657.3+ KB


Some news items have no description (`Опис`), but every item has a headline (`Заголовок`).

In [5]:
# Count the articles per language code to decide which embedding model fits the corpus.
data["Мова"].value_counts()

Мова
en    9346
Name: count, dtype: int64

All international news items are in English, so we can use BERTopic with an English-specific embedding model to achieve better results.

# Train BERTopic model

In [6]:
# Every article is in English, so an English sentence-embedding model is used.
# NOTE(review): no random seed is set anywhere in this notebook; presumably the
# topic ids and counts can change between runs — confirm and consider seeding.
topic_model = BERTopic(
    embedding_model="all-MiniLM-L6-v2",
)

Create texts for BERTopic as: 'Заголовок' + 'Опис'

In [7]:
# Build the documents passed to BERTopic; presumably one text per row of `data`
# (headline plus description) — verify against `preprocess_data`.
texts = preprocess_data(data)

Create timestamps for BERTopic as date from 'Дата'

In [8]:
# Day-level date strings used as BERTopic timestamps; presumably one per row of
# `data`, in the same order as `texts` — verify against `extract_date_days`.
timestamps = extract_date_days(data)

In [9]:
# Distinct dates covered by the corpus.
set(timestamps)

{'2025-02-09',
 '2025-02-10',
 '2025-02-11',
 '2025-02-12',
 '2025-02-13',
 '2025-02-14',
 '2025-02-15',
 '2025-02-16'}

The news spans only 8 days (2025-02-09 to 2025-02-16).

Train model

In [10]:
# Fit the model on all documents. Per the BERTopic API, `topics` is the topic id
# assigned to each document and `probs` the corresponding probabilities.
topics, probs = topic_model.fit_transform(texts)

Output results

In [11]:
# First rows of the per-topic summary right after fitting (topic -1 holds the outliers).
initial_topic_info = topic_model.get_topic_info()
initial_topic_info.head(n=5)

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,2106,-1_ukraine_putin_the_to,"[ukraine, putin, the, to, and, in, trump, with...",[Hegseth rules out NATO membership for Ukraine...
1,0,323,0_zelensky_zelenskyy_volodymyr_putin,"[zelensky, zelenskyy, volodymyr, putin, accept...",[Ukraine says it will not accept US-Russia pea...
2,1,176,1_plant_nuclear_radiation_chernobyl,"[plant, nuclear, radiation, chernobyl, drone, ...",[Ukraine says Russia drone attack hits Chernob...
3,2,151,2_gabbard_tulsi_intelligence_senate,"[gabbard, tulsi, intelligence, senate, directo...",[Senate confirms Gabbard as Trump's director o...
4,3,148,3_speech_vance_jd_free,"[speech, vance, jd, free, vice, threat, europe...",[JD Vance attacks Europe over free speech and ...


More than 2,000 documents were classified as outliers (topic -1).

In [12]:
# Plot the fitted topics to check whether any of them overlap.
topic_model.visualize_topics()

Some topics are near-duplicates of each other, for example the ones about the drone strike on Chernobyl.

**Reduce the number of topics.**

In [13]:
# Target number of topics after merging similar ones.
NR_TOPICS_REDUCED = 50

# Merges topics in place on `topic_model`. NOTE(review): `topics` / `probs`
# returned by `fit_transform` earlier are presumably not updated by this call —
# read the reduced assignment from the model itself if it is needed.
# The trailing semicolon stops the cell from echoing the returned model object.
topic_model.reduce_topics(texts, nr_topics=NR_TOPICS_REDUCED);

<bertopic._bertopic.BERTopic at 0x1b81fae7d40>

# Results

In [14]:
# First rows of the per-topic summary after the topic reduction.
reduced_topic_info = topic_model.get_topic_info()
reduced_topic_info.head(n=5)

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,2106,-1_the_ukraine_to_in,"[the, ukraine, to, in, and, trump, of, putin, ...",[Trump says he trusts Russia's Putin wants pea...
1,0,2469,0_ukraine_trump_to_war,"[ukraine, trump, to, war, putin, president, ru...",[Trump Says Russia Agreed To 'Immediately' Beg...
2,1,426,1_drones_russia_russian_ukrainian,"[drones, russia, russian, ukrainian, ukraine, ...",[Ukraine and Russia trade long-range attacks a...
3,2,391,2_fogel_marc_custody_teacher,"[fogel, marc, custody, teacher, american, rele...",[American School Teacher Marc Fogel Released F...
4,3,314,3_gaza_aluminum_tariffs_the,"[gaza, aluminum, tariffs, the, trump, is, of, ...",[US President Trump doubles down on plan to ta...


Assign custom names to the topics.

In [15]:
# Topic id under which BERTopic groups the outlier documents.
OUTLIER_TOPIC_ID = -1

# Every topic id currently known to the model, outliers included.
topic_info = topic_model.get_topic_info()
all_topic_ids = topic_info["Topic"].tolist()

# Custom labels are generated for the real topics only, not for the outlier bucket.
topic_ids = [tid for tid in all_topic_ids if tid != OUTLIER_TOPIC_ID]

labels = generate_topic_labels(topic_model, topic_ids)
topic_model.set_topic_labels(labels)

**Plot interconnections between topics**

In [16]:
# Same topic plot as before, now asking for the custom labels instead of the default names.
topic_model.visualize_topics(custom_labels=True)

**Topics over time**

**Plot the most important topics over time**

In [17]:
# The timestamps are 'YYYY-MM-DD' strings spanning only 8 distinct days, so every
# day is used as its own time step. `nr_bins` is deliberately not passed: it
# cuts the date range into equal-width intervals and labels each step with the
# interval's left edge, which is not one of the actual publication dates.
# An explicit `datetime_format` avoids relying on date-format inference.
topics_over_time = topic_model.topics_over_time(
    texts,
    timestamps,
    datetime_format="%Y-%m-%d",
)

In [18]:
# Number of most frequent topics to draw.
TOP_N_TOPICS = 5

# Show how those topics evolve across the time steps, using the custom labels.
topic_model.visualize_topics_over_time(
    topics_over_time,
    top_n_topics=TOP_N_TOPICS,
    custom_labels=True,
)

**Save results**

In [22]:
path_to_save = os.path.join("..", "output", "res_international.csv")

# Create the output directory up front so saving does not depend on it already
# existing (e.g. on a fresh checkout); this is a no-op when it is already there.
os.makedirs(os.path.dirname(path_to_save), exist_ok=True)

save_result_topics(topic_model, path_to_save)