# Libraries

In [2]:
import pandas as pd
import matplotlib.pyplot as plt
import sys
from bertopic import BERTopic
import gensim.corpora as corpora
from gensim.models.coherencemodel import CoherenceModel

# Load Data

In [3]:
tweet = pd.read_csv('../../src/cleaned_kp.csv')
tweet = tweet.dropna(subset=['full_text'])
tweet = tweet[['created_at', 'full_text']]
tweet['created_at'] = pd.to_datetime(tweet['created_at'])
tweet

Unnamed: 0,created_at,full_text
0,2019-11-01 00:43:08+00:00,by lewat
1,2019-11-01 02:42:52+00:00,kakanwil kemenag provinsi papua pdt amsal yowe...
2,2019-11-01 08:53:32+00:00,politik baik anggota dprd betul psi libat raky...
3,2019-11-01 09:15:05+00:00,politik baik anggota dprd betul psi libat raky...
4,2019-11-01 11:16:16+00:00,kerja institusi izin alami baru masuk temu bi...
...,...,...
8431,2024-04-29 09:26:46+00:00,pintas masa depan agama indonesia lihat lebih ...
8432,2024-04-29 12:00:01+00:00,hitung tahun ajar perintah lalu kemendikbud ja...
8433,2024-04-29 13:45:36+00:00,bersikukuh tinggi komersialisasi semenjak pt...
8434,2024-04-29 16:48:01+00:00,kagak ngotak asli dahh sbnrnya salah sekaran...


# Preprocess

In [5]:
# tweet_quarter = tweet.copy()

# tweet_quarter['quarter'] = tweet_quarter['created_at'].dt.to_period('Q')

# tweet_quarter = tweet_quarter.groupby('quarter')

In [6]:
tweet_time = tweet.copy()

# Modeling

In [9]:
from transformers import AutoTokenizer, AutoModel
tokenizer = AutoTokenizer.from_pretrained("indobenchmark/indobert-base-p1")
model = AutoModel.from_pretrained("indobenchmark/indobert-base-p1")

In [10]:
from transformers import pipeline
embedding_model = pipeline("feature-extraction", model=model, tokenizer=tokenizer, device=0)
# indolem/indobert-base-uncased

In [11]:
results = {}

In [12]:
%%time
topic_model = BERTopic(verbose=True,
                        embedding_model=embedding_model, calculate_probabilities=True,
                        nr_topics="auto",
                        )
text_data = tweet_time['full_text'].tolist()
timestamps = tweet_time['created_at'].tolist()

topics, probs = topic_model.fit_transform(text_data)

results = {
        'model' : topic_model,
        'topics_over_time' : topic_model.topics_over_time(text_data, timestamps, nr_bins=18),
        'probs': probs,
        'topic_info': topic_model.get_topic_info(),
        
        'topics': topics,
        'text': text_data,
    }

2024-05-22 14:23:39,461 - BERTopic - Embedding - Transforming documents to embeddings.
  0%|          | 0/8434 [00:00<?, ?it/s]Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
100%|██████████| 8434/8434 [01:23<00:00, 101.13it/s]
2024-05-22 14:25:02,938 - BERTopic - Embedding - Completed ✓
2024-05-22 14:25:02,939 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-05-22 14:25:31,036 - BERTopic - Dimensionality - Completed ✓
2024-05-22 14:25:31,040 - BERTopic - Cluster - Start clustering the reduced embeddings
2024-05-22 14:25:37,543 - BERTopic - Cluster - Completed ✓
2024-05-22 14:25:37,544 - BERTopic - Representation - Extracting topics from clusters using representation models.
2024-05-22 14:25:37,821 - BERTopic - Representation - Completed ✓
2024-05-22 14:25:37,822 - BERTopic - Topic reduction - Reducing number of topics
2024-05-22 14:25:37,994 - BERTopic - Topic 

CPU times: total: 16.2 s
Wall time: 1min 59s





In [13]:
# %%time
# for period, group in tweet_quarter:
#     topic_model = BERTopic(verbose=False,
#                            embedding_model=embedding_model, calculate_probabilities=True,
#                            nr_topics=10,
#                            )
#     text_data = group['full_text'].tolist()
#     timestamps = group['created_at'].tolist()
#     topics, probs = topic_model.fit_transform(text_data)
#     results[period] = {
#         'model' : topic_model,
#         'topics_over_time' : topic_model.topics_over_time(text_data, timestamps, nr_bins=30),
#         'probs': probs,
#         'topic_info': topic_model.get_topic_info(),
        
#         'topics': topics,
#         'text': text_data,
#     }

In [14]:
fig = results['model'].visualize_topics_over_time(results['topics_over_time'], top_n_topics=20)
# fig.update_layout(title_text=f"Topics Over Time for {period}")
fig.show()

In [15]:
topic_model.get_topics()

{-1: [('ajar', 0.010333911544481147),
  ('guru', 0.009972549550894525),
  ('jadi', 0.009873304360747556),
  ('menteri', 0.009153458341474324),
  ('indonesia', 0.008708544781860431),
  ('buat', 0.008671941401943533),
  ('perintah', 0.008459818290397035),
  ('merdeka', 0.007947857002706695),
  ('tinggi', 0.007525988147508964),
  ('sekolah', 0.007426509096659935)],
 0: [('papua', 0.01089720551890739),
  ('menteri', 0.01054701098815258),
  ('enggak', 0.010387914083789319),
  ('ganjar', 0.009905746362025227),
  ('nadiem', 0.009686842300713146),
  ('budaya', 0.009487460692810241),
  ('ajar', 0.009095880752425177),
  ('sekolah', 0.008921952547956122),
  ('makarim', 0.008852026030296582),
  ('jadi', 0.008715285438800327)],
 1: [('prabowo', 0.08481580860751176),
  ('visisubianto', 0.06582648345375938),
  ('nih', 0.05472371748086696),
  ('bapak', 0.05234032329267939),
  ('delapansmangat', 0.051160089868006346),
  ('utkmasadepan', 0.051160089868006346),
  ('semakindekat', 0.05016004011641012),
  

In [16]:
topic_model.get_topic_freq()

Unnamed: 0,Topic,Count
2,-1,3678
1,0,2651
50,1,377
12,2,136
0,3,115
...,...,...
35,60,11
61,61,11
54,62,10
7,63,10


In [17]:
topic_model.get_document_info(text_data)

Unnamed: 0,Document,Topic,Name,Representation,Representative_Docs,Top_n_words,Probability,Representative_document
0,by lewat,3,3_sinkron_integrasi_disruptif_publik,"[sinkron, integrasi, disruptif, publik, jalur,...","[sinkron kemendikbud integrasi jalur, sinkron...",sinkron - integrasi - disruptif - publik - jal...,1.000000,False
1,kakanwil kemenag provinsi papua pdt amsal yowe...,0,0_papua_menteri_enggak_ganjar,"[papua, menteri, enggak, ganjar, nadiem, buday...",[masyarakat papua dukung otonomi khusus papua...,papua - menteri - enggak - ganjar - nadiem - b...,1.000000,False
2,politik baik anggota dprd betul psi libat raky...,-1,-1_ajar_guru_jadi_menteri,"[ajar, guru, jadi, menteri, indonesia, buat, p...",[selamat hari guru semua guru hebat moga kuali...,ajar - guru - jadi - menteri - indonesia - bua...,0.948286,False
3,politik baik anggota dprd betul psi libat raky...,-1,-1_ajar_guru_jadi_menteri,"[ajar, guru, jadi, menteri, indonesia, buat, p...",[selamat hari guru semua guru hebat moga kuali...,ajar - guru - jadi - menteri - indonesia - bua...,0.949651,False
4,kerja institusi izin alami baru masuk temu bi...,-1,-1_ajar_guru_jadi_menteri,"[ajar, guru, jadi, menteri, indonesia, buat, p...",[selamat hari guru semua guru hebat moga kuali...,ajar - guru - jadi - menteri - indonesia - bua...,0.947349,False
...,...,...,...,...,...,...,...,...
8429,pintas masa depan agama indonesia lihat lebih ...,0,0_papua_menteri_enggak_ganjar,"[papua, menteri, enggak, ganjar, nadiem, buday...",[masyarakat papua dukung otonomi khusus papua...,papua - menteri - enggak - ganjar - nadiem - b...,0.664921,False
8430,hitung tahun ajar perintah lalu kemendikbud ja...,-1,-1_ajar_guru_jadi_menteri,"[ajar, guru, jadi, menteri, indonesia, buat, p...",[selamat hari guru semua guru hebat moga kuali...,ajar - guru - jadi - menteri - indonesia - bua...,0.950007,False
8431,bersikukuh tinggi komersialisasi semenjak pt...,-1,-1_ajar_guru_jadi_menteri,"[ajar, guru, jadi, menteri, indonesia, buat, p...",[selamat hari guru semua guru hebat moga kuali...,ajar - guru - jadi - menteri - indonesia - bua...,0.350397,False
8432,kagak ngotak asli dahh sbnrnya salah sekaran...,0,0_papua_menteri_enggak_ganjar,"[papua, menteri, enggak, ganjar, nadiem, buday...",[masyarakat papua dukung otonomi khusus papua...,papua - menteri - enggak - ganjar - nadiem - b...,0.402309,False


In [18]:
# topic_model.get_representative_docs()

In [19]:
topic_model.visualize_topics()

In [20]:
topic_model.visualize_documents(text_data)

In [21]:
hierarchical_topics = topic_model.hierarchical_topics(text_data)
topic_model.visualize_hierarchy(hierarchical_topics=hierarchical_topics)

100%|██████████| 64/64 [00:00<00:00, 391.46it/s]


In [22]:
# for period, result in results.items():
#     fig = result['model'].visualize_topics_over_time(result['topics_over_time'], top_n_topics=10)
#     fig.update_layout(title_text=f"Topics Over Time for {period}")
#     fig.show()

In [23]:
# top_topics = []


# for period, result in results.items():
#     top_10_topics = result['topic_info'][0:11]
#     # top_10_topics['Name'] = top_10_topics['Name'].str.replace(r'^\d+_', '', regex=True)
#     topic_descriptions = top_10_topics['Name'].tolist()
#     top_topics.append({'period': period, 'topics': topic_descriptions})


# top_topics_df = pd.DataFrame(top_topics)
# display(top_topics_df)

In [24]:
# for quarter in top_topics:
#     print(f"{quarter['period']}: {quarter['topics']}")

# Evaluation

In [65]:
# coherence_obj = {}

# for period, result in results.items():
#     documents = pd.DataFrame({"Document": result['text'],
#                               "ID": range(len(result['text'])),
#                               "Topic": result['topics'],
#                               })

#     documents_per_topic = documents.groupby(['Topic'], as_index=False).agg({'Document': ' '.join})
#     cleaned_docs = result['model']._preprocess_text(documents_per_topic.Document.values)

#     vectorizer = result['model'].vectorizer_model
#     analyzer = vectorizer.build_analyzer()

#     words = vectorizer.get_feature_names_out()
#     tokens = [analyzer(doc) for doc in cleaned_docs]
#     dictionary = corpora.Dictionary(tokens)
#     corpus = [dictionary.doc2bow(token) for token in tokens]
#     topic_words = [[words for words, _ in result['model'].get_topic(topic)] for topic in range(len(set(result['topics']))-1)]

#     coherence_model_c_v = CoherenceModel(topics=topic_words,
#                                     texts=tokens,
#                                     corpus=corpus,
#                                     dictionary=dictionary,
#                                     coherence='c_v')
#     coherence_model_u_mass = CoherenceModel(topics=topic_words,
#                                     texts=tokens,
#                                     corpus=corpus,
#                                     dictionary=dictionary,
#                                     coherence='u_mass')
#     coherence_c_v = coherence_model_c_v.get_coherence()
#     coherence_u_mass = coherence_model_u_mass.get_coherence()
#     coherence_obj[period] = {
#         'coherence_c_v': coherence_c_v,
#         'coherence_u_mass': coherence_u_mass,
#     }

In [26]:
# coherence_obj

In [66]:
from octis.evaluation_metrics.diversity_metrics import TopicDiversity
from octis.evaluation_metrics.coherence_metrics import Coherence

In [37]:
topic_list = list(topic_model.get_topics().values())

In [67]:
td = TopicDiversity(topk=10)
td_score = td.score({'topics': topic_list})
td_score

0.9924242424242424