# Libraries

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import sys
from bertopic import BERTopic
import gensim.corpora as corpora
from gensim.models.coherencemodel import CoherenceModel

# Load Data

In [2]:
tweet = pd.read_csv('../../src/cleaned_kp.csv')
tweet = tweet.dropna(subset=['full_text'])
tweet = tweet[['created_at', 'full_text']]
tweet['created_at'] = pd.to_datetime(tweet['created_at'])
tweet

Unnamed: 0,created_at,full_text
0,2019-11-01 00:43:08+00:00,by lewat
1,2019-11-01 02:42:52+00:00,kakanwil kemenag provinsi papua pdt amsal yowe...
2,2019-11-01 08:53:32+00:00,politik baik anggota dprd betul psi libat raky...
3,2019-11-01 09:15:05+00:00,politik baik anggota dprd betul psi libat raky...
4,2019-11-01 11:16:16+00:00,kerja institusi izin alami baru masuk temu bi...
...,...,...
8431,2024-04-29 09:26:46+00:00,pintas masa depan agama indonesia lihat lebih ...
8432,2024-04-29 12:00:01+00:00,hitung tahun ajar perintah lalu kemendikbud ja...
8433,2024-04-29 13:45:36+00:00,bersikukuh tinggi komersialisasi semenjak pt...
8434,2024-04-29 16:48:01+00:00,kagak ngotak asli dahh sbnrnya salah sekaran...


# Preprocess

In [3]:
# tweet_quarter = tweet.copy()

# tweet_quarter['quarter'] = tweet_quarter['created_at'].dt.to_period('Q')

# tweet_quarter = tweet_quarter.groupby('quarter')

In [4]:
tweet_time = tweet.copy()

# Modeling

In [5]:
from transformers import AutoTokenizer, AutoModel
tokenizer = AutoTokenizer.from_pretrained("indobenchmark/indobert-base-p1")
model = AutoModel.from_pretrained("indobenchmark/indobert-base-p1")

In [6]:
from transformers import pipeline
embedding_model = pipeline("feature-extraction", model=model, tokenizer=tokenizer, device=0)
# indolem/indobert-base-uncased

In [7]:
results = {}

In [8]:
%%time
topic_model = BERTopic(verbose=True,
                        embedding_model=embedding_model, calculate_probabilities=True,
                        nr_topics="auto",
                        )
text_data = tweet_time['full_text'].tolist()
timestamps = tweet_time['created_at'].tolist()

topics, probs = topic_model.fit_transform(text_data)

results = {
        'model' : topic_model,
        'topics_over_time' : topic_model.topics_over_time(text_data, timestamps, nr_bins=18),
        'probs': probs,
        'topic_info': topic_model.get_topic_info(),
        
        'topics': topics,
        'text': text_data,
    }

2024-05-22 16:11:52,114 - BERTopic - Embedding - Transforming documents to embeddings.
  0%|          | 0/8434 [00:00<?, ?it/s]Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
100%|██████████| 8434/8434 [01:23<00:00, 101.00it/s]
2024-05-22 16:13:17,462 - BERTopic - Embedding - Completed ✓
2024-05-22 16:13:17,464 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-05-22 16:13:41,208 - BERTopic - Dimensionality - Completed ✓
2024-05-22 16:13:41,209 - BERTopic - Cluster - Start clustering the reduced embeddings
2024-05-22 16:13:46,397 - BERTopic - Cluster - Completed ✓
2024-05-22 16:13:46,398 - BERTopic - Representation - Extracting topics from clusters using representation models.
2024-05-22 16:13:46,600 - BERTopic - Representation - Completed ✓
2024-05-22 16:13:46,601 - BERTopic - Topic reduction - Reducing number of topics
2024-05-22 16:13:46,742 - BERTopic - Topic 

CPU times: total: 25 s
Wall time: 1min 58s





In [9]:
# %%time
# for period, group in tweet_quarter:
#     topic_model = BERTopic(verbose=False,
#                            embedding_model=embedding_model, calculate_probabilities=True,
#                            nr_topics=10,
#                            )
#     text_data = group['full_text'].tolist()
#     timestamps = group['created_at'].tolist()
#     topics, probs = topic_model.fit_transform(text_data)
#     results[period] = {
#         'model' : topic_model,
#         'topics_over_time' : topic_model.topics_over_time(text_data, timestamps, nr_bins=30),
#         'probs': probs,
#         'topic_info': topic_model.get_topic_info(),
        
#         'topics': topics,
#         'text': text_data,
#     }

In [10]:
fig = results['model'].visualize_topics_over_time(results['topics_over_time'], top_n_topics=20)
# fig.update_layout(title_text=f"Topics Over Time for {period}")
fig.show()

In [11]:
topic_model.get_topics()

{-1: [('ajar', 0.011395432034527),
  ('menteri', 0.009563176508836684),
  ('jadi', 0.009304856522588912),
  ('perintah', 0.009052756678343994),
  ('guru', 0.008826320222420431),
  ('indonesia', 0.008749064549098757),
  ('buat', 0.007761744750278137),
  ('merdeka', 0.007563420219792927),
  ('sekolah', 0.007400091681408836),
  ('tinggi', 0.00724698469140529)],
 0: [('enggak', 0.01178517148902593),
  ('ganjar', 0.011273098650458944),
  ('menteri', 0.010371185958137956),
  ('jadi', 0.009899507268161634),
  ('nadiem', 0.00882676962382643),
  ('sekolah', 0.008741463863141325),
  ('indonesia', 0.008653326683482709),
  ('guru', 0.008617016284121879),
  ('budaya', 0.008556327501086765),
  ('merdeka', 0.008409454407972822)],
 1: [('prabowo', 0.08201809983263758),
  ('visisubianto', 0.06551809709006114),
  ('nih', 0.05615692170721413),
  ('bapak', 0.05163639467105243),
  ('semakinpakbowo', 0.05132391877906781),
  ('semakindekat', 0.05132391877906781),
  ('jelasarahnya', 0.05100063124205645),
  ('

In [12]:
topic_model.get_topic_freq()

Unnamed: 0,Topic,Count
2,-1,3140
4,0,3083
51,1,400
1,2,140
9,3,134
...,...,...
26,61,11
60,62,11
59,63,11
11,64,11


In [13]:
topic_model.get_document_info(text_data)

Unnamed: 0,Document,Topic,Name,Representation,Representative_Docs,Top_n_words,Probability,Representative_document
0,by lewat,4,4_sinkron_integrasi_disruptif_publik,"[sinkron, integrasi, disruptif, publik, jalur,...","[sinkron kemendikbud integrasi jalur, sinkron...",sinkron - integrasi - disruptif - publik - jal...,0.670896,False
1,kakanwil kemenag provinsi papua pdt amsal yowe...,2,2_giat_kepala_provinsi_dinas,"[giat, kepala, provinsi, dinas, rapat, laksana...",[kasih penkum kejat aceh h munawal had sh mh l...,giat - kepala - provinsi - dinas - rapat - lak...,0.257977,False
2,politik baik anggota dprd betul psi libat raky...,-1,-1_ajar_menteri_jadi_perintah,"[ajar, menteri, jadi, perintah, guru, indonesi...",[menteri budaya riset teknologi republik indo...,ajar - menteri - jadi - perintah - guru - indo...,0.443967,False
3,politik baik anggota dprd betul psi libat raky...,-1,-1_ajar_menteri_jadi_perintah,"[ajar, menteri, jadi, perintah, guru, indonesi...",[menteri budaya riset teknologi republik indo...,ajar - menteri - jadi - perintah - guru - indo...,0.351337,False
4,kerja institusi izin alami baru masuk temu bi...,-1,-1_ajar_menteri_jadi_perintah,"[ajar, menteri, jadi, perintah, guru, indonesi...",[menteri budaya riset teknologi republik indo...,ajar - menteri - jadi - perintah - guru - indo...,0.635553,False
...,...,...,...,...,...,...,...,...
8429,pintas masa depan agama indonesia lihat lebih ...,0,0_enggak_ganjar_menteri_jadi,"[enggak, ganjar, menteri, jadi, nadiem, sekola...",[mendikbud nadiem makarim sampai empat program...,enggak - ganjar - menteri - jadi - nadiem - se...,1.000000,False
8430,hitung tahun ajar perintah lalu kemendikbud ja...,0,0_enggak_ganjar_menteri_jadi,"[enggak, ganjar, menteri, jadi, nadiem, sekola...",[mendikbud nadiem makarim sampai empat program...,enggak - ganjar - menteri - jadi - nadiem - se...,0.571317,False
8431,bersikukuh tinggi komersialisasi semenjak pt...,0,0_enggak_ganjar_menteri_jadi,"[enggak, ganjar, menteri, jadi, nadiem, sekola...",[mendikbud nadiem makarim sampai empat program...,enggak - ganjar - menteri - jadi - nadiem - se...,0.398559,False
8432,kagak ngotak asli dahh sbnrnya salah sekaran...,0,0_enggak_ganjar_menteri_jadi,"[enggak, ganjar, menteri, jadi, nadiem, sekola...",[mendikbud nadiem makarim sampai empat program...,enggak - ganjar - menteri - jadi - nadiem - se...,0.674620,False


In [14]:
# topic_model.get_representative_docs()

In [15]:
topic_model.visualize_topics()

In [16]:
topic_model.visualize_documents(text_data)

In [17]:
hierarchical_topics = topic_model.hierarchical_topics(text_data)
topic_model.visualize_hierarchy(hierarchical_topics=hierarchical_topics)

100%|██████████| 65/65 [00:00<00:00, 352.88it/s]


In [18]:
# for period, result in results.items():
#     fig = result['model'].visualize_topics_over_time(result['topics_over_time'], top_n_topics=10)
#     fig.update_layout(title_text=f"Topics Over Time for {period}")
#     fig.show()

In [19]:
# top_topics = []


# for period, result in results.items():
#     top_10_topics = result['topic_info'][0:11]
#     # top_10_topics['Name'] = top_10_topics['Name'].str.replace(r'^\d+_', '', regex=True)
#     topic_descriptions = top_10_topics['Name'].tolist()
#     top_topics.append({'period': period, 'topics': topic_descriptions})


# top_topics_df = pd.DataFrame(top_topics)
# display(top_topics_df)

In [20]:
# for quarter in top_topics:
#     print(f"{quarter['period']}: {quarter['topics']}")

# Evaluation

In [21]:
# coherence_obj = {}

# for period, result in results.items():
#     documents = pd.DataFrame({"Document": result['text'],
#                               "ID": range(len(result['text'])),
#                               "Topic": result['topics'],
#                               })

#     documents_per_topic = documents.groupby(['Topic'], as_index=False).agg({'Document': ' '.join})
#     cleaned_docs = result['model']._preprocess_text(documents_per_topic.Document.values)

#     vectorizer = result['model'].vectorizer_model
#     analyzer = vectorizer.build_analyzer()

#     words = vectorizer.get_feature_names_out()
#     tokens = [analyzer(doc) for doc in cleaned_docs]
#     dictionary = corpora.Dictionary(tokens)
#     corpus = [dictionary.doc2bow(token) for token in tokens]
#     topic_words = [[words for words, _ in result['model'].get_topic(topic)] for topic in range(len(set(result['topics']))-1)]

#     coherence_model_c_v = CoherenceModel(topics=topic_words,
#                                     texts=tokens,
#                                     corpus=corpus,
#                                     dictionary=dictionary,
#                                     coherence='c_v')
#     coherence_model_u_mass = CoherenceModel(topics=topic_words,
#                                     texts=tokens,
#                                     corpus=corpus,
#                                     dictionary=dictionary,
#                                     coherence='u_mass')
#     coherence_c_v = coherence_model_c_v.get_coherence()
#     coherence_u_mass = coherence_model_u_mass.get_coherence()
#     coherence_obj[period] = {
#         'coherence_c_v': coherence_c_v,
#         'coherence_u_mass': coherence_u_mass,
#     }

In [22]:
# coherence_obj

In [23]:
from octis.evaluation_metrics.diversity_metrics import TopicDiversity
from octis.evaluation_metrics.coherence_metrics import Coherence

In [24]:
topic_list = list(topic_model.get_topics().values())

In [25]:
td = TopicDiversity(topk=10)
td_score = td.score({'topics': topic_list})
td_score

0.9925373134328358

In [None]:
def get_topic_diversity(topic_model, topk=10):
    td = TopicDiversity(topk=topk)
    topic_list = list(topic_model.get_topics().values())
    td_score = td.score({'topics': topic_list})
    return td_score