In [None]:
# Importing required libraries
# Standard library
import re
from datetime import datetime

# Third-party
import pandas as pd
from bertopic import BERTopic
from bertopic.representation import KeyBERTInspired
from hdbscan import HDBSCAN
from sentence_transformers import SentenceTransformer
# The scikit-learn class is spelled `CountVectorizer`; importing `CountVectorize`
# raises ImportError and stops the notebook in its very first cell.
from sklearn.feature_extraction.text import CountVectorizer
from umap import UMAP

In [None]:
# Input file
# The source is a .csv file, so it is parsed with read_csv; read_excel expects an
# Excel workbook and cannot open plain-text CSV.
# NOTE(review): assumes the default comma delimiter and UTF-8 encoding — confirm
# against the actual export. The absolute cluster path must be adjusted when the
# notebook is run elsewhere.
df = pd.read_csv('/scratch/project_2004147/visions/tweets.csv')



In [None]:
# Keep only the rows whose `text` value is present
has_text = df['text'].notna()
df = df[has_text]
# Cell output: number of distinct values in every column
df.nunique()

In [None]:
# Filter text
def clean_tweet(text):
    """Normalise one tweet for topic modelling.

    Steps, in order:
      1. drop URLs (anything starting with ``http`` up to the next whitespace)
         and lower-case the text;
      2. drop whitespace-separated tokens that start with ``@`` (mentions);
      3. replace every run of non-letter characters with a single space and
         collapse surrounding whitespace.

    Parameters
    ----------
    text : str
        Raw tweet text.

    Returns
    -------
    str
        The cleaned tweet; an empty string when nothing but URLs, mentions,
        digits or punctuation was present.
    """
    without_urls = re.sub(r"http\S+", "", text).lower()
    without_mentions = " ".join(
        token for token in without_urls.split() if not token.startswith("@")
    )
    # Unicode-aware "not a letter" class: non-word characters, digits and the
    # underscore. On pure-ASCII text this matches exactly what "[^a-zA-Z]+"
    # matches, but accented letters (á, ñ, ü, ...) are kept instead of being
    # turned into spaces, which would split Spanish words into fragments.
    return " ".join(re.sub(r"[\W\d_]+", " ", without_mentions).split())


# One pass over the column instead of three row-wise DataFrame.apply calls;
# assign() returns a new frame, so no value is written into a filtered slice.
df = df.assign(text=df["text"].map(clean_tweet))
# Tweets that are empty after cleaning carry no signal for the model
df = df.loc[df["text"] != ""]
tweets = df["text"].to_list()

In [None]:
# Spot check: display the first entry of the cleaned tweet list
first_tweet = tweets[0]
first_tweet

In [None]:
# Stop-word removal for the topic representations.
# Gotchas handled here:
#   * ngram_range has to be a (min_n, max_n) tuple; `(1)` is only the integer 1.
#   * stop_words takes either the string "english" or an explicit list of words.
#     A ("english", "spanish") tuple is not a pair of language names to look up.
# scikit-learn bundles only an English list, so a compact Spanish list is spelled
# out below; swap in a fuller list if one is available in your environment.
SPANISH_STOP_WORDS = [
    "a", "al", "algo", "algunas", "algunos", "ante", "antes", "como", "con",
    "contra", "cual", "cuando", "de", "del", "desde", "donde", "durante", "e",
    "el", "él", "ella", "ellas", "ellos", "en", "entre", "era", "eran", "es",
    "esa", "esas", "ese", "eso", "esos", "esta", "está", "están", "estan",
    "estas", "este", "esto", "estos", "fue", "fueron", "ha", "han", "hasta",
    "hay", "la", "las", "le", "les", "lo", "los", "más", "mas", "me", "mi",
    "mis", "mucho", "muy", "nada", "ni", "no", "nos", "nosotros", "o", "os",
    "otra", "otras", "otro", "otros", "para", "pero", "poco", "por", "porque",
    "que", "qué", "quien", "quienes", "se", "sea", "ser", "si", "sí", "sin",
    "sobre", "son", "su", "sus", "también", "tambien", "tanto", "te", "tiene",
    "tienen", "todo", "todos", "tu", "tus", "un", "una", "unas", "uno", "unos",
    "y", "ya", "yo",
]
# get_stop_words() on an "english" vectorizer yields scikit-learn's built-in list
english_stop_words = CountVectorizer(stop_words="english").get_stop_words()
vectorizer_model = CountVectorizer(
    ngram_range=(1, 1),  # unigrams only
    stop_words=sorted(set(english_stop_words) | set(SPANISH_STOP_WORDS)),
)

In [None]:
# Main topic modelling settings

# Smallest cluster that may become a topic. One constant feeds both HDBSCAN and
# BERTopic so the two settings cannot drift apart. The value 50 is what the
# pipeline effectively used before: the custom HDBSCAN instance was never handed
# to BERTopic, so BERTopic clustered with its own default driven by
# min_topic_size=50.
# NOTE(review): if a finer granularity (e.g. 15) was intended, change it here.
MIN_TOPIC_SIZE = 50

umap_model = UMAP(
    n_neighbors=15,
    n_components=5,
    min_dist=0.0,
    metric='cosine',
    random_state=100,  # fixed seed so repeated runs give the same reduction
)
hdbscan_model = HDBSCAN(
    min_cluster_size=MIN_TOPIC_SIZE,
    metric='euclidean',
    cluster_selection_method='eom',
    prediction_data=True,
)
# No explicit device: sentence-transformers then uses a GPU when one is
# available and falls back to CPU otherwise, instead of failing on hosts
# without CUDA.
sentence_model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")

# Fine-tune your topic representations
representation_model = KeyBERTInspired()
# NOTE(review): per the BERTopic docs `language` is not used once an explicit
# embedding_model is supplied, so "multilingual" has no effect on the embeddings
# here — confirm the chosen sentence model suits the non-English tweets.
topic_model = BERTopic(
    min_topic_size=MIN_TOPIC_SIZE,
    vectorizer_model=vectorizer_model,
    calculate_probabilities=True,
    verbose=True,
    nr_topics="auto",
    embedding_model=sentence_model,
    representation_model=representation_model,
    umap_model=umap_model,
    hdbscan_model=hdbscan_model,
    language="multilingual",
    top_n_words=10,
)
# topics: one topic id per tweet; probs: per-tweet topic probabilities
# (available because calculate_probabilities=True)
topics, probs = topic_model.fit_transform(tweets)

In [None]:
# From here on: various outputs of the fitted topic model.
# Plot the topic probabilities of the tweet stored at position 1 of `probs`.
doc_position = 1
topic_model.visualize_distribution(probs[doc_position])

In [None]:
# Build the topic-hierarchy figure, then show it as the cell output
fig_hierarchy = topic_model.visualize_hierarchy()
fig_hierarchy

In [None]:
# Build the topic overview figure, then show it as the cell output
fig = topic_model.visualize_topics()
fig

In [None]:
# approximate_distribution returns a pair: the per-document topic distributions
# and the token-level distributions. The second element gets its own name so it
# does not overwrite `probs` from fit_transform, which the
# visualize_distribution cell indexes; reusing `probs` here would break that
# cell whenever it is re-run after this one.
topic_distr, topic_token_distr = topic_model.approximate_distribution(tweets)

In [None]:
# Build the topic heatmap figure, then show it as the cell output
heatmap = topic_model.visualize_heatmap()
heatmap

In [None]:
# Display the representation stored on the fitted model for every topic
topic_representations = topic_model.topic_representations_
topic_representations

In [None]:
# Retrieving the topic information for every tweet
df_tweets = topic_model.get_document_info(tweets)
# Preview the first rows of the resulting table
df_tweets.head(n=5)

In [None]:
# Save the per-tweet topic table as CSV in the current working directory
OUTPUT_CSV = 'df_tweets.csv'
df_tweets.to_csv(OUTPUT_CSV)