# Libraries

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import sys
from bertopic import BERTopic
import gensim.corpora as corpora
from gensim.models.coherencemodel import CoherenceModel

# Load Data

In [2]:
# Load the cleaned tweets, drop rows with missing text, and keep only the
# timestamp and text columns.
# Written as one method chain so each step returns a new frame: assigning a
# column onto the result of `tweet[[...]]` is the pattern that triggers pandas'
# SettingWithCopyWarning. The original row labels are kept (no index reset).
tweet = (
    pd.read_csv('../../src/cleaned_kp.csv')
    .dropna(subset=['full_text'])
    .loc[:, ['created_at', 'full_text']]
    # Parse the timestamp strings into pandas datetimes, in place of the raw column.
    .assign(created_at=lambda frame: pd.to_datetime(frame['created_at']))
)
tweet

Unnamed: 0,created_at,full_text
0,2019-11-01 00:43:08+00:00,by lewat
1,2019-11-01 02:42:52+00:00,kakanwil kemenag provinsi papua pdt amsal yowe...
2,2019-11-01 08:53:32+00:00,politik baik anggota dprd betul psi libat raky...
3,2019-11-01 09:15:05+00:00,politik baik anggota dprd betul psi libat raky...
4,2019-11-01 11:16:16+00:00,kerja institusi izin alami baru masuk temu bi...
...,...,...
8431,2024-04-29 09:26:46+00:00,pintas masa depan agama indonesia lihat lebih ...
8432,2024-04-29 12:00:01+00:00,hitung tahun ajar perintah lalu kemendikbud ja...
8433,2024-04-29 13:45:36+00:00,bersikukuh tinggi komersialisasi semenjak pt...
8434,2024-04-29 16:48:01+00:00,kagak ngotak asli dahh sbnrnya salah sekaran...


In [3]:
from torch.utils.data import DataLoader, Dataset

# Preprocess

In [3]:
# tweet_quarter = tweet.copy()

# tweet_quarter['quarter'] = tweet_quarter['created_at'].dt.to_period('Q')

# tweet_quarter = tweet_quarter.groupby('quarter')

In [4]:
# Independent copy of the loaded frame; the modeling cell reads its
# `full_text` and `created_at` columns, leaving `tweet` itself untouched.
tweet_time = tweet.copy()

In [5]:
class TextDataset(Dataset):
    """Dataset that exposes a collection of texts by 0-based position.

    Parameters
    ----------
    texts
        Any sized, indexable collection (e.g. a list). pandas objects are also
        accepted and are always read positionally, regardless of their index
        labels.
    """

    def __init__(self, texts):
        self.texts = texts

    def __len__(self):
        """Return the number of stored items."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Return the item at position ``idx``.

        Plain ``obj[idx]`` on a pandas Series is a *label* lookup, so a Series
        whose index has gaps (for instance after ``dropna`` without
        ``reset_index``) would raise ``KeyError`` for valid positions.
        Positional ``.iloc`` access is used whenever the container offers it.
        """
        positional = getattr(self.texts, "iloc", None)
        if positional is not None:
            return positional[idx]
        return self.texts[idx]

In [6]:
# Wrap the tweet texts rather than the whole DataFrame: `TextDataset` indexes
# its container with an integer, and integer-indexing a DataFrame is a column
# lookup, so `text_dataset[i]` could not return the i-th tweet.
# A plain list also sidesteps the gaps left in the row labels by `dropna`.
text_dataset = TextDataset(tweet['full_text'].tolist())

# Modeling

In [7]:
from transformers import AutoModel, AutoTokenizer

# Single source of truth for the checkpoint so tokenizer and encoder cannot drift apart.
INDOBERT_CHECKPOINT = "indobenchmark/indobert-base-p1"

tokenizer = AutoTokenizer.from_pretrained(INDOBERT_CHECKPOINT)
model = AutoModel.from_pretrained(INDOBERT_CHECKPOINT)

In [8]:
import torch
from transformers import pipeline

# Run on the first CUDA device when one is available; otherwise fall back to
# CPU (-1) instead of failing on a hard-coded `device=0`.
pipeline_device = 0 if torch.cuda.is_available() else -1

# Feature-extraction pipeline built from the tokenizer and encoder loaded
# above; it is handed to BERTopic as its embedding model.
embedding_model = pipeline("feature-extraction", model=model, tokenizer=tokenizer, device=pipeline_device)
# Presumably an alternative checkpoint that was considered: indolem/indobert-base-uncased

In [14]:
# Container for modeling outputs; the modeling cell rebinds this name to a
# dict holding the fitted model and its derived artifacts.
results = {}

In [21]:
# Inputs for the fit: one text and one timestamp per tweet, in the same order.
text_data = tweet_time['full_text'].tolist()
timestamps = tweet_time['created_at'].tolist()

# One BERTopic model over the whole corpus, using the IndoBERT pipeline for
# embeddings and `nr_topics=10`.
topic_model = BERTopic(
    embedding_model=embedding_model,
    nr_topics=10,
    calculate_probabilities=True,
    verbose=False,
)

topics, probs = topic_model.fit_transform(text_data)

# Derived artifacts, computed in the same order as before:
# topic representation per time bin (18 bins), then the topic summary table.
topics_over_time = topic_model.topics_over_time(text_data, timestamps, nr_bins=18)
topic_info = topic_model.get_topic_info()

# Everything the later inspection / plotting cells read, gathered in one place.
results = {
    'model': topic_model,
    'topics_over_time': topics_over_time,
    'probs': probs,
    'topic_info': topic_info,
    'topics': topics,
    'text': text_data,
}

In [7]:
# %%time
# for period, group in tweet_quarter:
#     topic_model = BERTopic(verbose=False,
#                            embedding_model=embedding_model, calculate_probabilities=True,
#                            nr_topics=10,
#                            )
#     text_data = group['full_text'].tolist()
#     timestamps = group['created_at'].tolist()
#     topics, probs = topic_model.fit_transform(text_data)
#     results[period] = {
#         'model' : topic_model,
#         'topics_over_time' : topic_model.topics_over_time(text_data, timestamps, nr_bins=30),
#         'probs': probs,
#         'topic_info': topic_model.get_topic_info(),
        
#         'topics': topics,
#         'text': text_data,
#     }

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset


CPU times: total: 28.4 s
Wall time: 2min 51s


In [22]:
# Plot the stored topics-over-time table with the fitted model
# (`top_n_topics=18` caps how many topics are drawn).
fig = results['model'].visualize_topics_over_time(results['topics_over_time'], top_n_topics=18)
# Leftover from the per-period loop; `period` is not defined by any active cell shown here.
# fig.update_layout(title_text=f"Topics Over Time for {period}")
fig.show()

In [23]:
# Mapping of topic id -> list of (word, score) pairs; topic -1 is listed first.
topic_model.get_topics()

{-1: [('ajar', 0.029005994866241714),
  ('menteri', 0.02463909496290714),
  ('indonesia', 0.022507745762977108),
  ('perintah', 0.022170892118884813),
  ('guru', 0.02160703029739906),
  ('jadi', 0.02124994202303235),
  ('sekolah', 0.01967137222505872),
  ('merdeka', 0.018508966863769894),
  ('budaya', 0.01633341158012421),
  ('baru', 0.016328123805663973)],
 0: [('sekolah', 0.023824331026173005),
  ('baik', 0.02264441104229927),
  ('indonesia', 0.021277387834932316),
  ('jadi', 0.020937464546624728),
  ('buat', 0.02010130975504663),
  ('tingkat', 0.019822356957325607),
  ('enggak', 0.019658322187683673),
  ('menteri', 0.01931530430842517),
  ('ajar', 0.019014111916979423),
  ('lebih', 0.017753256720399464)],
 1: [('pandemi', 0.1428694886118967),
  ('masa', 0.09722004022816408),
  ('covid', 0.08593744082290117),
  ('tengah', 0.05574149677021915),
  ('perintah', 0.054103793722644206),
  ('lama', 0.046952279655162216),
  ('inews', 0.037819538510577164),
  ('pesantren', 0.03580985368153277

In [24]:
# Table of document counts per topic (columns: Topic, Count).
topic_model.get_topic_freq()

Unnamed: 0,Topic,Count
1,0,4695
2,-1,3114
0,1,374
5,2,91
3,3,48
8,4,36
9,5,28
6,6,20
7,7,16
4,8,12


In [25]:
# Per-document table: assigned topic, topic name/representation and probability.
topic_model.get_document_info(text_data)

Unnamed: 0,Document,Topic,Name,Representation,Representative_Docs,Top_n_words,Probability,Representative_document
0,by lewat,1,1_pandemi_masa_covid_tengah,"[pandemi, masa, covid, tengah, perintah, lama,...","[tinggi masa pandemi covid, ubah lama masa p...",pandemi - masa - covid - tengah - perintah - l...,0.405194,False
1,kakanwil kemenag provinsi papua pdt amsal yowe...,0,0_sekolah_baik_indonesia_jadi,"[sekolah, baik, indonesia, jadi, buat, tingkat...",[info nih tuips visi bapak prabowo kuat standa...,sekolah - baik - indonesia - jadi - buat - tin...,0.488641,False
2,politik baik anggota dprd betul psi libat raky...,-1,-1_ajar_menteri_indonesia_perintah,"[ajar, menteri, indonesia, perintah, guru, jad...",[menteri budaya kemendikbud umum ada sesuai ...,ajar - menteri - indonesia - perintah - guru -...,0.645905,False
3,politik baik anggota dprd betul psi libat raky...,-1,-1_ajar_menteri_indonesia_perintah,"[ajar, menteri, indonesia, perintah, guru, jad...",[menteri budaya kemendikbud umum ada sesuai ...,ajar - menteri - indonesia - perintah - guru -...,0.619910,False
4,kerja institusi izin alami baru masuk temu bi...,-1,-1_ajar_menteri_indonesia_perintah,"[ajar, menteri, indonesia, perintah, guru, jad...",[menteri budaya kemendikbud umum ada sesuai ...,ajar - menteri - indonesia - perintah - guru -...,0.214271,False
...,...,...,...,...,...,...,...,...
8429,pintas masa depan agama indonesia lihat lebih ...,0,0_sekolah_baik_indonesia_jadi,"[sekolah, baik, indonesia, jadi, buat, tingkat...",[info nih tuips visi bapak prabowo kuat standa...,sekolah - baik - indonesia - jadi - buat - tin...,0.911461,False
8430,hitung tahun ajar perintah lalu kemendikbud ja...,0,0_sekolah_baik_indonesia_jadi,"[sekolah, baik, indonesia, jadi, buat, tingkat...",[info nih tuips visi bapak prabowo kuat standa...,sekolah - baik - indonesia - jadi - buat - tin...,0.467127,False
8431,bersikukuh tinggi komersialisasi semenjak pt...,-1,-1_ajar_menteri_indonesia_perintah,"[ajar, menteri, indonesia, perintah, guru, jad...",[menteri budaya kemendikbud umum ada sesuai ...,ajar - menteri - indonesia - perintah - guru -...,0.833745,False
8432,kagak ngotak asli dahh sbnrnya salah sekaran...,0,0_sekolah_baik_indonesia_jadi,"[sekolah, baik, indonesia, jadi, buat, tingkat...",[info nih tuips visi bapak prabowo kuat standa...,sekolah - baik - indonesia - jadi - buat - tin...,0.888564,False


In [26]:
# Mapping of topic id -> list of example documents for that topic.
topic_model.get_representative_docs()

{-1: ['menteri  budaya kemendikbud umum ada sesuai  ajar masa pandemi covid mendikbud nadiem makarim kata laksana kurikulum kondisi khusus',
  'menteri  budaya riset teknologi mendikbudristek nadiem anwar makarim kata kurikulum merdeka rupa bagi  merdeka ajar mampu kurang dampak hilang ajar akibat pandemi covid',
  'dekat ajar jarak jauh menteri  budaya putus  ajar ajar ajar tahun ajar baru pandemi covid jadi ancam sistem ajar'],
 0: ['info nih tuips visi bapak prabowo kuat standar sekolah cipta  lebih baik  baik semua lapis semakindekat semakinpakbowo visisubianto jelasarahnya',
  'info nih bro harap bapak prabowo tingkat sistem  masuk  lebih baik  kualitas semua lapis semakindekat semakinpakbowo visisubianto jelasarahnya',
  'tarik nih sis visi bapak prabowo kuat sistem sekolah buat  solutif  baik semua rakyat'],
 1: ['tinggi masa pandemi covid',
  'ubah   lama masa pandemi covid',
  'ubah   lama masa pandemi covid'],
 2: ['ptn bantu pilih mending pil   uny pil teknologi  unnes pil t

In [27]:
# One label per topic, formatted as "<topic id>_<word>_<word>_<word>".
topic_model.generate_topic_labels()

['-1_ajar_menteri_indonesia',
 '0_sekolah_baik_indonesia',
 '1_pandemi_masa_covid',
 '2_uny_pil_ptn',
 '3_dewantara_ki_belanda',
 '4_pokok_empat_saran',
 '5_musik_tradisional_literasi',
 '6_nadiem_kembang_mendikbud',
 '7_mahasiswa_hari_nasional',
 '8_akurat_keliru_data']

In [28]:
# Interactive overview figure of the fitted topics (rendered output not kept in this export).
topic_model.visualize_topics()

In [30]:
# Interactive figure of the documents passed to the fit (rendered output not kept in this export).
topic_model.visualize_documents(text_data)

In [38]:
# Hierarchy figure of the topics using the model's defaults (no precomputed hierarchy passed).
topic_model.visualize_hierarchy()

In [36]:
# Compute the topic hierarchy from the documents once, then draw the hierarchy
# figure from that precomputed result.
hierarchical_topics = topic_model.hierarchical_topics(text_data)
topic_model.visualize_hierarchy(hierarchical_topics=hierarchical_topics)

100%|██████████| 8/8 [00:00<00:00, 72.81it/s]


In [11]:
# for period, result in results.items():
#     fig = result['model'].visualize_topics_over_time(result['topics_over_time'], top_n_topics=10)
#     fig.update_layout(title_text=f"Topics Over Time for {period}")
#     fig.show()

In [16]:
# top_topics = []


# for period, result in results.items():
#     top_10_topics = result['topic_info'][0:11]
#     # top_10_topics['Name'] = top_10_topics['Name'].str.replace(r'^\d+_', '', regex=True)
#     topic_descriptions = top_10_topics['Name'].tolist()
#     top_topics.append({'period': period, 'topics': topic_descriptions})


# top_topics_df = pd.DataFrame(top_topics)
# display(top_topics_df)

In [17]:
# for quarter in top_topics:
#     print(f"{quarter['period']}: {quarter['topics']}")

# Evaluation

In [11]:
# coherence_obj = {}

# for period, result in results.items():
#     documents = pd.DataFrame({"Document": result['text'],
#                               "ID": range(len(result['text'])),
#                               "Topic": result['topics'],
#                               })

#     documents_per_topic = documents.groupby(['Topic'], as_index=False).agg({'Document': ' '.join})
#     cleaned_docs = result['model']._preprocess_text(documents_per_topic.Document.values)

#     vectorizer = result['model'].vectorizer_model
#     analyzer = vectorizer.build_analyzer()

#     words = vectorizer.get_feature_names_out()
#     tokens = [analyzer(doc) for doc in cleaned_docs]
#     dictionary = corpora.Dictionary(tokens)
#     corpus = [dictionary.doc2bow(token) for token in tokens]
#     topic_words = [[words for words, _ in result['model'].get_topic(topic)] for topic in range(len(set(result['topics']))-1)]

#     coherence_model_c_v = CoherenceModel(topics=topic_words,
#                                     texts=tokens,
#                                     corpus=corpus,
#                                     dictionary=dictionary,
#                                     coherence='c_v')
#     coherence_model_u_mass = CoherenceModel(topics=topic_words,
#                                     texts=tokens,
#                                     corpus=corpus,
#                                     dictionary=dictionary,
#                                     coherence='u_mass')
#     coherence_c_v = coherence_model_c_v.get_coherence()
#     coherence_u_mass = coherence_model_u_mass.get_coherence()
#     coherence_obj[period] = {
#         'coherence_c_v': coherence_c_v,
#         'coherence_u_mass': coherence_u_mass,
#     }

In [12]:
# coherence_obj

In [37]:
from octis.evaluation_metrics.diversity_metrics import TopicDiversity
from octis.evaluation_metrics.coherence_metrics import Coherence