## Importing & installing libs, loading the dataset

In [None]:
import pandas as pd
import numpy as np
import nltk
import matplotlib.pyplot as plt
import seaborn as sns
import os
from nltk.corpus import stopwords
from sklearn.manifold import TSNE
from sklearn.feature_extraction.text import CountVectorizer
from tqdm import tqdm

import torch

!pip install umap-learn
!pip install hdbscan
!pip install bertopic
!pip install sentence-transformers

from umap import UMAP
from hdbscan import HDBSCAN
from bertopic import BERTopic
from sentence_transformers import SentenceTransformer

# Fetch the NLTK stop-word corpus; `stopwords.words(...)` is called later when
# the CountVectorizer is configured and needs this data to be present locally.
nltk.download('stopwords')

In [None]:
# Load the live-news dataset from the Kaggle input directory.
news_df = pd.read_csv('/kaggle/input/russian-invasion-of-ukraine-live-news-dataset/news.csv')

# Drop rows with missing text and parse the `date` column.
# `assign` returns a new frame, so no column is written into a filtered slice
# (writing into a slice can trigger pandas' SettingWithCopyWarning).
news_df = (
    news_df
    .dropna(subset=['text'])
    .assign(date=lambda frame: pd.to_datetime(frame['date']))
)

# Keep only news dated 2023-01-01 or later.
news_df = news_df[news_df['date'] >= pd.Timestamp('2023-01-01')]

## Fitting BERTopic model

In [None]:
# One document per retained news row.
docs = news_df['text'].tolist()

# Token counter used for the topic representations; Russian and English
# NLTK stop words are excluded.
vectorizer_model = CountVectorizer(
    stop_words=[*stopwords.words('russian'), *stopwords.words('english')],
)

# UMAP configuration: 5 output components, cosine metric, 12 neighbours.
umap_model = UMAP(
    n_neighbors=12,
    n_components=5,
    metric='cosine',
    low_memory=False,
)

# HDBSCAN configuration: Euclidean metric; prediction data is kept.
hdbscan_model = HDBSCAN(
    min_cluster_size=35,
    min_samples=20,
    metric='euclidean',
    prediction_data=True,
)

# Assemble the multilingual BERTopic pipeline from the components above,
# then fit it on the documents.
topic_model = BERTopic(
    language='multilingual',
    top_n_words=10,
    nr_topics=150,
    umap_model=umap_model,
    hdbscan_model=hdbscan_model,
    vectorizer_model=vectorizer_model,
    verbose=True,
)
topic_model = topic_model.fit(docs)

## Visualizing the results

In [None]:
# Show the fitted model's topic info; as the last expression in the cell,
# the returned object is rendered by the notebook.
topic_model.get_topic_info()

In [None]:
# Render the fitted model's topic-hierarchy visualization as the cell output.
topic_model.visualize_hierarchy()

In [None]:
# Render the fitted model's topics visualization as the cell output.
topic_model.visualize_topics()

In [None]:
nan