# Libraries

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import sys
from bertopic import BERTopic
import gensim.corpora as corpora
from gensim.models.coherencemodel import CoherenceModel

from bertopic.vectorizers import ClassTfidfTransformer

# import torch

from octis.evaluation_metrics.diversity_metrics import TopicDiversity
from octis.evaluation_metrics.coherence_metrics import Coherence


sys.path.insert(0, "../")
from utils.quarter_label import labelQuarters

In [2]:
def get_topic_diversity(topic_model, topk=10, include_outliers=False):
    """Compute the OCTIS topic-diversity score of a fitted BERTopic model.

    Topic diversity is the share of unique words among the top-``topk``
    words of all topics (1.0 = no word is shared between topics).

    Parameters
    ----------
    topic_model : BERTopic
        A fitted model; only its ``get_topics()`` method is used.
    topk : int, default 10
        Number of top words per topic that OCTIS takes into account.
    include_outliers : bool, default False
        Whether the BERTopic outlier topic (id ``-1``) is scored as if it
        were a regular topic.

    Returns
    -------
    float
        The diversity score, or ``0.0`` when there is no topic to score.
    """
    # `get_topics()` maps topic id -> [(word, score), ...]. OCTIS de-duplicates
    # the items it is given, so it must receive bare words: with the tuples,
    # the same word carrying two different scores would count as two unique
    # items and inflate the result.
    topic_words = [
        [word for word, _score in word_scores]
        for topic_id, word_scores in topic_model.get_topics().items()
        if include_outliers or topic_id != -1
    ]
    if not topic_words:
        # Nothing left to score (e.g. every document ended up as an outlier).
        return 0.0
    return TopicDiversity(topk=topk).score({'topics': topic_words})

In [3]:
# Load the cleaned tweets. Only the two columns used downstream are read
# (`usecols`), which keeps memory down, and the frame is built in a single
# chain so no column is ever assigned on a slice of another frame
# (avoids pandas' SettingWithCopyWarning).
tweet = (
    pd.read_csv('../../src/cleaned_combined.csv', usecols=['created_at', 'full_text'])
    .dropna(subset=['full_text'])              # tweets without text cannot be modelled
    .loc[:, ['created_at', 'full_text']]       # fix the column order
    .assign(created_at=lambda df: pd.to_datetime(df['created_at']))
)
tweet

Unnamed: 0,created_at,full_text
0,2019-11-07 16:20:08+00:00,suka bikin gaduh lebih orang teken petisi ui p...
1,2019-11-08 19:22:05+00:00,nadiem makarim pesan rektor itb kampus merdeka...
2,2019-11-09 01:31:35+00:00,kalau merdeka kampus potensi asai pikir berang...
3,2019-11-09 04:16:02+00:00,nadiem makarim minta kampus merdeka
4,2019-11-12 06:27:24+00:00,enggak punya kenang apaapa pertama kali lihat ...
...,...,...
82993,2024-04-29 15:25:47+00:00,demo mahasiswa minta tuukt turun unsoed nyata ...
82994,2024-04-29 15:53:37+00:00,sumpah adikadik moga kalian kuatkuat mental am...
82995,2024-04-29 16:33:36+00:00,amanat konstitusi cerdas hidup bangsa bukan ma...
82996,2024-04-29 22:58:27+00:00,gue heran kadang biaya setinggi sekarang gaji...


# Grouping Dataset

In [4]:
# Label a copy of the tweets (so `tweet` itself is never handed to the helper)
# and group the labelled rows by period.
# NOTE(review): `labelQuarters` is a project helper; it presumably adds the
# 'quarter' column used below, and the meaning of the `6` argument is not
# visible here -- confirm against utils/quarter_label.py.
tweet_quarter = labelQuarters(tweet.copy(), 'created_at', 6).groupby('quarter')

# Model

In [5]:
# Per-period outputs, keyed by the period label and filled in by the training loop.
results = dict()

In [6]:
# c-TF-IDF transformer to be handed to BERTopic through its `ctfidf_model` argument.
# Per the BERTopic docs, `reduce_frequent_words=True` takes the square root of the
# normalised term frequencies to dampen very frequent words.
# NOTE(review): the transformer keeps fitted state once a model has been trained
# with it, so one instance should not be shared by several BERTopic models.
ctfidf_model = ClassTfidfTransformer(reduce_frequent_words=True)

In [7]:
%%time
for period, group in tweet_quarter:
    topic_model = BERTopic(verbose=True,
                            calculate_probabilities=True,
                            language='indonesian',
                            nr_topics=3,
                            ctfidf_model = ctfidf_model,
                            n_gram_range=(1, 2),
                            min_topic_size = 100
                            )
    text_data = group['full_text'].tolist()
    timestamps = group['created_at'].tolist()
    topics, probs = topic_model.fit_transform(text_data)
    results[period] = {
        'model' : topic_model,
            'text_data': text_data,
        'topics_over_time' : topic_model.topics_over_time(text_data, timestamps, nr_bins=3),
        'topics': topics,
        'probs': probs,
        'topic_info': topic_model.get_topic_info(),
    }

2024-05-25 22:48:31,242 - BERTopic - Embedding - Transforming documents to embeddings.


Batches:   0%|          | 0/263 [00:00<?, ?it/s]

2024-05-25 22:48:45,163 - BERTopic - Embedding - Completed ✓
2024-05-25 22:48:45,166 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-05-25 22:49:13,306 - BERTopic - Dimensionality - Completed ✓
2024-05-25 22:49:13,309 - BERTopic - Cluster - Start clustering the reduced embeddings
2024-05-25 22:49:14,479 - BERTopic - Cluster - Completed ✓
2024-05-25 22:49:14,480 - BERTopic - Representation - Extracting topics from clusters using representation models.
2024-05-25 22:49:14,887 - BERTopic - Representation - Completed ✓
2024-05-25 22:49:14,888 - BERTopic - Topic reduction - Reducing number of topics
2024-05-25 22:49:15,326 - BERTopic - Topic reduction - Reduced number of topics from 13 to 4
3it [00:01,  1.93it/s]
2024-05-25 22:49:17,416 - BERTopic - Embedding - Transforming documents to embeddings.


Batches:   0%|          | 0/373 [00:00<?, ?it/s]

2024-05-25 22:49:36,119 - BERTopic - Embedding - Completed ✓
2024-05-25 22:49:36,123 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-05-25 22:49:47,578 - BERTopic - Dimensionality - Completed ✓
2024-05-25 22:49:47,580 - BERTopic - Cluster - Start clustering the reduced embeddings
2024-05-25 22:49:49,148 - BERTopic - Cluster - Completed ✓
2024-05-25 22:49:49,150 - BERTopic - Representation - Extracting topics from clusters using representation models.
2024-05-25 22:49:50,016 - BERTopic - Representation - Completed ✓
2024-05-25 22:49:50,019 - BERTopic - Topic reduction - Reducing number of topics
2024-05-25 22:49:50,964 - BERTopic - Topic reduction - Reduced number of topics from 4 to 4
3it [00:02,  1.03it/s]
2024-05-25 22:49:54,249 - BERTopic - Embedding - Transforming documents to embeddings.


Batches:   0%|          | 0/202 [00:00<?, ?it/s]

2024-05-25 22:50:21,684 - BERTopic - Embedding - Completed ✓
2024-05-25 22:50:21,685 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-05-25 22:50:30,140 - BERTopic - Dimensionality - Completed ✓
2024-05-25 22:50:30,142 - BERTopic - Cluster - Start clustering the reduced embeddings
2024-05-25 22:50:30,557 - BERTopic - Cluster - Completed ✓
2024-05-25 22:50:30,558 - BERTopic - Representation - Extracting topics from clusters using representation models.
2024-05-25 22:50:32,570 - BERTopic - Representation - Completed ✓
2024-05-25 22:50:32,571 - BERTopic - Topic reduction - Reducing number of topics
2024-05-25 22:50:34,896 - BERTopic - Topic reduction - Reduced number of topics from 5 to 5
3it [00:01,  1.69it/s]
2024-05-25 22:50:41,460 - BERTopic - Embedding - Transforming documents to embeddings.


Batches:   0%|          | 0/320 [00:00<?, ?it/s]

2024-05-25 22:51:07,795 - BERTopic - Embedding - Completed ✓
2024-05-25 22:51:07,798 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-05-25 22:51:17,879 - BERTopic - Dimensionality - Completed ✓
2024-05-25 22:51:17,881 - BERTopic - Cluster - Start clustering the reduced embeddings
2024-05-25 22:51:19,197 - BERTopic - Cluster - Completed ✓
2024-05-25 22:51:19,198 - BERTopic - Representation - Extracting topics from clusters using representation models.
2024-05-25 22:51:19,905 - BERTopic - Representation - Completed ✓
2024-05-25 22:51:19,907 - BERTopic - Topic reduction - Reducing number of topics
2024-05-25 22:51:20,463 - BERTopic - Topic reduction - Reduced number of topics from 7 to 7
3it [00:02,  1.20it/s]
2024-05-25 22:51:23,308 - BERTopic - Embedding - Transforming documents to embeddings.


Batches:   0%|          | 0/220 [00:00<?, ?it/s]

2024-05-25 22:51:40,559 - BERTopic - Embedding - Completed ✓
2024-05-25 22:51:40,568 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-05-25 22:51:56,170 - BERTopic - Dimensionality - Completed ✓
2024-05-25 22:51:56,172 - BERTopic - Cluster - Start clustering the reduced embeddings
2024-05-25 22:51:57,050 - BERTopic - Cluster - Completed ✓
2024-05-25 22:51:57,051 - BERTopic - Representation - Extracting topics from clusters using representation models.
2024-05-25 22:51:57,657 - BERTopic - Representation - Completed ✓
2024-05-25 22:51:57,660 - BERTopic - Topic reduction - Reducing number of topics
2024-05-25 22:51:58,312 - BERTopic - Topic reduction - Reduced number of topics from 6 to 6
3it [00:01,  2.05it/s]
2024-05-25 22:52:00,101 - BERTopic - Embedding - Transforming documents to embeddings.


Batches:   0%|          | 0/269 [00:00<?, ?it/s]

2024-05-25 22:52:17,226 - BERTopic - Embedding - Completed ✓
2024-05-25 22:52:17,228 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-05-25 22:52:33,946 - BERTopic - Dimensionality - Completed ✓
2024-05-25 22:52:33,948 - BERTopic - Cluster - Start clustering the reduced embeddings
2024-05-25 22:52:35,144 - BERTopic - Cluster - Completed ✓
2024-05-25 22:52:35,147 - BERTopic - Representation - Extracting topics from clusters using representation models.
2024-05-25 22:52:36,387 - BERTopic - Representation - Completed ✓
2024-05-25 22:52:36,392 - BERTopic - Topic reduction - Reducing number of topics
2024-05-25 22:52:37,162 - BERTopic - Topic reduction - Reduced number of topics from 5 to 5
3it [00:02,  1.44it/s]
2024-05-25 22:52:39,769 - BERTopic - Embedding - Transforming documents to embeddings.


Batches:   0%|          | 0/268 [00:00<?, ?it/s]

2024-05-25 22:52:59,484 - BERTopic - Embedding - Completed ✓
2024-05-25 22:52:59,486 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-05-25 22:53:14,697 - BERTopic - Dimensionality - Completed ✓
2024-05-25 22:53:14,700 - BERTopic - Cluster - Start clustering the reduced embeddings
2024-05-25 22:53:15,818 - BERTopic - Cluster - Completed ✓
2024-05-25 22:53:15,819 - BERTopic - Representation - Extracting topics from clusters using representation models.
2024-05-25 22:53:16,550 - BERTopic - Representation - Completed ✓
2024-05-25 22:53:16,556 - BERTopic - Topic reduction - Reducing number of topics
2024-05-25 22:53:17,246 - BERTopic - Topic reduction - Reduced number of topics from 4 to 4
3it [00:02,  1.33it/s]
2024-05-25 22:53:20,138 - BERTopic - Embedding - Transforming documents to embeddings.


Batches:   0%|          | 0/362 [00:00<?, ?it/s]

2024-05-25 22:53:44,073 - BERTopic - Embedding - Completed ✓
2024-05-25 22:53:44,079 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-05-25 22:53:55,773 - BERTopic - Dimensionality - Completed ✓
2024-05-25 22:53:55,775 - BERTopic - Cluster - Start clustering the reduced embeddings
2024-05-25 22:53:57,472 - BERTopic - Cluster - Completed ✓
2024-05-25 22:53:57,474 - BERTopic - Representation - Extracting topics from clusters using representation models.


MemoryError: Unable to allocate 15.8 MiB for an array with shape (10792, 384) and data type float32

# Result

In [None]:
# One interactive topics-over-time chart per fitted period.
for period, entry in results.items():
    over_time_fig = entry['model'].visualize_topics_over_time(
        entry['topics_over_time'],
        top_n_topics=10,
    )
    over_time_fig.update_layout(title_text=f"Topics Over Time for {period}").show()

In [None]:
# Show the raw topics-over-time table of every period, in insertion order.
for entry in results.values():
    display(entry['topics_over_time'])

In [None]:
# Print each period's topic mapping exactly as returned by `get_topics()`.
for entry in results.values():
    period_model = entry['model']
    print(period_model.get_topics())

In [None]:
# One `visualize_topics()` figure per period, titled with the period label.
for period in results:
    topic_map_fig = results[period]['model'].visualize_topics()
    topic_map_fig.update_layout(title_text=f"Period {period}").show()

In [None]:
# Document-level plot for every period, built from the texts the model was fitted on.
for period, entry in results.items():
    period_docs = entry['text_data']
    documents_fig = entry['model'].visualize_documents(period_docs)
    documents_fig.update_layout(title_text=f"Period {period}").show()

In [None]:
# Build the topic hierarchy of each period from its documents, then plot it.
for period, entry in results.items():
    period_model = entry['model']
    hierarchy = period_model.hierarchical_topics(entry['text_data'])
    hierarchy_fig = period_model.visualize_hierarchy(hierarchical_topics=hierarchy)
    hierarchy_fig.update_layout(title_text=f"Period {period}").show()


In [None]:
# Report the topic-diversity score of every period's model.
for period, entry in results.items():
    diversity = get_topic_diversity(entry["model"])
    print(f'{period} : {diversity}')