# Libraries

In [10]:
import pandas as pd
import matplotlib.pyplot as plt
import sys
from bertopic import BERTopic
import gensim.corpora as corpora
from gensim.models.coherencemodel import CoherenceModel

sys.path.insert(0, "../")
from utils.quarter_label import labelQuarters

In [2]:
# Load the cleaned tweet dump, discard rows with no text, keep only the two
# columns used below, and parse the timestamp column into datetimes.
tweet = (
    pd.read_csv('../../src/cleaned_combined.csv')
    .dropna(subset=['full_text'])
    [['created_at', 'full_text']]
    .assign(created_at=lambda frame: pd.to_datetime(frame['created_at']))
)
tweet

Unnamed: 0,created_at,full_text
0,2019-11-07 16:20:08+00:00,suka bikin gaduh lebih orang teken petisi ui p...
1,2019-11-08 19:22:05+00:00,nadiem makarim pesan rektor itb kampus merdeka...
2,2019-11-09 01:31:35+00:00,kalau merdeka kampus potensi asai pikir berang...
3,2019-11-09 04:16:02+00:00,nadiem makarim minta kampus merdeka
4,2019-11-12 06:27:24+00:00,enggak punya kenang apaapa pertama kali lihat ...
...,...,...
65319,2024-04-29 13:43:31+00:00,wakatobi siap gelar konferensi internasional s...
65320,2024-04-29 14:23:23+00:00,konsorsium baterai listrik nasional
65321,2024-04-29 15:50:14+00:00,joki essay tema merdeka ajar
65322,2024-04-29 18:23:19+00:00,tim pkm pnup terima dana pkm vokasi tahun


In [14]:
%%time
topic_model = BERTopic(verbose=True,
                        calculate_probabilities=True,
                        language='indonesian',
                        nr_topics="auto",
                        )
text_data = tweet['full_text'].tolist()
timestamps = tweet['created_at'].tolist()

topics, probs = topic_model.fit_transform(text_data)

results = {
        'model' : topic_model,
        'topics_over_time' : topic_model.topics_over_time(text_data, timestamps, nr_bins=18),
        'probs': probs,
        'topic_info': topic_model.get_topic_info(),
        
        'topics': topics,
        'text': text_data,
    }

2024-05-23 14:46:02,185 - BERTopic - Embedding - Transforming documents to embeddings.


Batches:   0%|          | 0/2041 [00:00<?, ?it/s]

2024-05-23 15:00:37,309 - BERTopic - Embedding - Completed ✓
2024-05-23 15:00:37,312 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-05-23 15:01:36,982 - BERTopic - Dimensionality - Completed ✓
2024-05-23 15:01:36,995 - BERTopic - Cluster - Start clustering the reduced embeddings


In [None]:
# Plot the topics-over-time table stored under results['topics_over_time'],
# using the model stored under results['model'], limited via top_n_topics=20.
fig = results['model'].visualize_topics_over_time(results['topics_over_time'], top_n_topics=20)
# Disabled title override: `period` is only bound by the per-quarter loop
# further down, so enabling this line here would fail on a fresh kernel.
# fig.update_layout(title_text=f"Topics Over Time for {period}")
fig.show()

In [None]:
# Display the topics of whatever model `topic_model` currently refers to.
# NOTE(review): the per-quarter loop later in this notebook rebinds
# `topic_model`, so the output depends on which cell ran last.
topic_model.get_topics()

In [None]:
# Display the per-topic frequency table reported by the current `topic_model`.
topic_model.get_topic_freq()

In [None]:
# Display per-document information for `text_data` from the current model.
# `text_data` is presumably the same list the model was fitted on — verify
# if the per-quarter loop has been run in between.
topic_model.get_document_info(text_data)

In [None]:
# Render BERTopic's topic visualisation for the current `topic_model`.
topic_model.visualize_topics()

In [None]:
# Render BERTopic's document-level visualisation for the texts in `text_data`.
topic_model.visualize_documents(text_data)

In [None]:
# Compute the topic hierarchy from `text_data`, then render it; the computed
# hierarchy is passed explicitly to the visualisation call.
hierarchical_topics = topic_model.hierarchical_topics(text_data)
topic_model.visualize_hierarchy(hierarchical_topics=hierarchical_topics)

In [None]:
from octis.evaluation_metrics.diversity_metrics import TopicDiversity
from octis.evaluation_metrics.coherence_metrics import Coherence

# Grouping Dataset

In [6]:
def get_custom_quarter(date):
    """Return a three-month window label that starts at ``date``'s own month.

    The window is not aligned to calendar quarters: it begins in the month of
    ``date`` and ends two months later, rolling into the next year when needed.

    Args:
        date: Any object exposing integer ``year`` and ``month`` attributes
            (for example a ``pd.Timestamp``).

    Returns:
        A string of the form ``"YYYY_MM-YYYY_MM"`` (first month, last month).
    """
    first_day = pd.Timestamp(year=date.year, month=date.month, day=1)

    # Zero-based month arithmetic: the quotient is the number of years to
    # carry, the remainder is the zero-based index of the final month.
    year_carry, last_month_index = divmod(date.month - 1 + 2, 12)
    last_month_start = pd.Timestamp(
        year=date.year + year_carry, month=last_month_index + 1, day=1
    )
    # Move to the last day of the final month; only year and month are printed.
    last_day = last_month_start + pd.offsets.MonthEnd(1)

    return "-".join(stamp.strftime('%Y_%m') for stamp in (first_day, last_day))

In [13]:
# Label a copy of the tweets by quarter (so `tweet` itself is not handed to
# the helper) and group on the 'quarter' column, which labelQuarters is
# presumably responsible for adding — confirm in utils.quarter_label.
tweet_quarter = labelQuarters(tweet.copy(), 'created_at').groupby('quarter')

In [None]:
%%time
for period, group in tweet_quarter:
    topic_model = BERTopic(verbose=True,
                           language='indonesian', calculate_probabilities=True,
                           nr_topics=10,
                           )
    text_data = group['full_text'].tolist()
    timestamps = group['created_at'].tolist()
    topics, probs = topic_model.fit_transform(text_data)
    results[period] = {
        'model' : topic_model,
        'topics_over_time' : topic_model.topics_over_time(text_data, timestamps, nr_bins=18),
        'topics': topics,
        'probs': probs,
        'topic_info': topic_model.get_topic_info(),
    }