# Libraries

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import sys
from bertopic import BERTopic
import gensim.corpora as corpora
from gensim.models.coherencemodel import CoherenceModel

# Load Data

In [2]:
# Load the cleaned tweets and keep only what the analysis needs: rows that
# have text, restricted to the timestamp and text columns, with `created_at`
# parsed to datetimes.
# Built as a single chain so `tweet` is a fresh frame; assigning a column onto
# a column-sliced frame (as this cell did before) can trigger pandas'
# SettingWithCopyWarning.
tweet = (
    pd.read_csv('../../src/cleaned_kp.csv')
    .dropna(subset=['full_text'])
    .loc[:, ['created_at', 'full_text']]
    .assign(created_at=lambda frame: pd.to_datetime(frame['created_at']))
)
tweet

Unnamed: 0,created_at,full_text
0,2019-11-01 00:43:08+00:00,by lewat
1,2019-11-01 02:42:52+00:00,kakanwil kemenag provinsi papua pdt amsal yowe...
2,2019-11-01 08:53:32+00:00,politik baik anggota dprd betul psi libat raky...
3,2019-11-01 09:15:05+00:00,politik baik anggota dprd betul psi libat raky...
4,2019-11-01 11:16:16+00:00,kerja institusi izin alami baru masuk temu bi...
...,...,...
8431,2024-04-29 09:26:46+00:00,pintas masa depan agama indonesia lihat lebih ...
8432,2024-04-29 12:00:01+00:00,hitung tahun ajar perintah lalu kemendikbud ja...
8433,2024-04-29 13:45:36+00:00,bersikukuh tinggi komersialisasi semenjak pt...
8434,2024-04-29 16:48:01+00:00,kagak ngotak asli dahh sbnrnya salah sekaran...


# Preprocess

In [3]:
# Bucket tweets by calendar quarter. `tweet_quarter` ends up as a GroupBy that
# yields (quarter, rows) pairs; the intermediate frame gets its own name so
# one variable is not a DataFrame first and a GroupBy later.
tweet_by_quarter = tweet.copy()

# `created_at` is timezone-aware (the loaded data shows +00:00 offsets), and
# converting tz-aware datetimes to periods emits a "will drop timezone
# information" UserWarning. Dropping the timezone explicitly first keeps the
# same wall-clock quarter without the warning.
tweet_by_quarter['quarter'] = (
    tweet_by_quarter['created_at'].dt.tz_localize(None).dt.to_period('Q')
)

tweet_quarter = tweet_by_quarter.groupby('quarter')

# Modeling

In [4]:
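# Unused alternative: load the IndoBERTweet tokenizer and model explicitly
# (the pipeline cell below loads the same checkpoint by name instead).
# from transformers import AutoTokenizer, AutoModel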
# tokenizer = AutoTokenizer.from_pretrained("indolem/indobertweet-base-uncased")
# model = AutoModel.from_pretrained("indolem/indobertweet-base-uncased")

In [5]:
from transformers import pipeline
import torch

# Use the first GPU when CUDA is available, otherwise fall back to CPU (-1),
# so the notebook also runs on machines without a GPU.
embedding_device = 0 if torch.cuda.is_available() else -1

# Feature-extraction pipeline that BERTopic uses as its embedding model.
# embedding_model = pipeline("feature-extraction", model=model, tokenizer=tokenizer, device=0)
embedding_model = pipeline("feature-extraction", model='indolem/indobertweet-base-uncased', device=embedding_device)
# Alternative checkpoint: indolem/indobert-base-uncased

# The modeling cell's log warns that the tokenizer has no predefined maximum
# length, so truncation requests are ignored. Cap it at the model's
# position-embedding limit (when the config exposes one) so over-long texts
# can actually be truncated.
_tokenizer = embedding_model.tokenizer
_position_limit = getattr(embedding_model.model.config, 'max_position_embeddings', None)
if _position_limit is not None and _tokenizer.model_max_length > _position_limit:
    _tokenizer.model_max_length = _position_limit

In [6]:
# Per-quarter outputs, keyed by quarter period; filled in by the modeling loop.
results = dict()

In [7]:
%%time
for period, group in tweet_quarter:
    topic_model = BERTopic(verbose=False,
                           embedding_model=embedding_model, calculate_probabilities=True,
                           nr_topics=10,
                           )
    text_data = group['full_text'].tolist()
    timestamps = group['created_at'].tolist()
    topics, probs = topic_model.fit_transform(text_data)
    results[period] = {
        'model' : topic_model,
        'topics_over_time' : topic_model.topics_over_time(text_data, timestamps, nr_bins=30),
        'probs': probs,
        'topic_info': topic_model.get_topic_info(),
        
        'topics': topics,
        'text': text_data,
    }

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset


CPU times: total: 28.4 s
Wall time: 2min 51s


In [8]:
# One interactive topics-over-time figure per quarter, titled with the quarter.
for period, result in results.items():
    quarter_model, quarter_timeline = result['model'], result['topics_over_time']
    fig = quarter_model.visualize_topics_over_time(quarter_timeline, top_n_topics=10)
    fig.update_layout(title_text=f"Topics Over Time for {period}")
    fig.show()

In [9]:
# For every quarter, collect the topic names from the first 11 rows of its
# topic-info table (presumably the outlier row, when present, plus up to ten
# topics — confirm against the intended cut-off).
# Optional clean-up that was left disabled: strip a leading "<digits>_" prefix
# from the names with .str.replace(r'^\d+_', '', regex=True).
top_topics = [
    {'period': period, 'topics': result['topic_info'].head(11)['Name'].tolist()}
    for period, result in results.items()
]

# One row per quarter: the period and its list of topic names.
top_topics_df = pd.DataFrame(top_topics)
display(top_topics_df)

Unnamed: 0,period,topics
0,2019Q4,"[-1_keliru_akurat_data_sampai, 0_kembang_mendi..."
1,2020Q1,"[0_minta_islam_cegah_pemda, 1_merdeka_ajar_men..."
2,2020Q2,"[-1_ajar_anak_perintah_buat, 0_perintah_mahasi..."
3,2020Q3,"[0_kuat_prakerja_kartu_efektivitas, 1_papua_ot..."
4,2020Q4,"[-1_pandemi_ajar_menteri_perintah, 0_cecep_upi..."
5,2021Q1,"[0_kualitas_mampu_otsus_papua, 1_audiensi_teng..."
6,2021Q2,"[-1_lanny_jaya_otsus_papua, 0_papua_otsus_infr..."
7,2021Q3,"[-1_pandemi_ajar_sekolah_jadi, 0_kualitas_ting..."
8,2021Q4,"[-1_materi_antikorupsi_mesti_libat, 0_jokowi_s..."
9,2022Q1,"[-1_tingkat_milu_bawaslu_perintah, 0_enggak_ne..."


In [10]:
# Print each quarter's topic names in full (the DataFrame display above
# truncates the lists).
for quarter in top_topics:
    label, names = quarter['period'], quarter['topics']
    print(f"{label}: {names}")

2019Q4: ['-1_keliru_akurat_data_sampai', '0_kembang_mendikbud_nadiem_tempo', '1_menteri_nadiem_jadi_guru', '2_merdeka_ajar_pokok_empat', '3_tetap_empat_pokok_mendikbud']
2020Q1: ['0_minta_islam_cegah_pemda', '1_merdeka_ajar_menteri_nadiem']
2020Q2: ['-1_ajar_anak_perintah_buat', '0_perintah_mahasiswa_menteri_baru', '1_masa_pandemi_ubah_lama', '2_timur_jawa_darurat_masa', '3_dpr_kali_komisi_kemendikbud', '4_saran_hendak_bila_implementasi', '5_afirmasi_kuat_sektor_pesantren', '6_ajar_jarak_tahun_ancam', '7_era_dokter_normal_siap', '8_pondok_menko_umum_pmk']
2020Q3: ['0_kuat_prakerja_kartu_efektivitas', '1_papua_otonomi_khusus_otsus', '2_ajar_menteri_pandemi_covid']
2020Q4: ['-1_pandemi_ajar_menteri_perintah', '0_cecep_upi_darmawan_universitas', '1_jadi_buat_ptnbh_baik', '2_kurikulum_kembang_buku_politik', '3_buat_satu_program_kalian', '4_muka_tatap_sekolah_ajar', '5_merdeka_menteri_kampus_nadiem']
2021Q1: ['0_kualitas_mampu_otsus_papua', '1_audiensi_tengah_pandemi_langsung', '2_musik_tra

# Evaluation

In [11]:
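# Disabled: per-quarter topic coherence (c_v and u_mass) via gensim's CoherenceModel.
# NOTE(review): `range(len(set(result['topics']))-1)` below assumes an outlier
# topic (-1) is present; for a quarter without outliers it would skip the last
# real topic — confirm before re-enabling.
# coherence_obj = {}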

# for period, result in results.items():
#     documents = pd.DataFrame({"Document": result['text'],
#                               "ID": range(len(result['text'])),
#                               "Topic": result['topics'],
#                               })

#     documents_per_topic = documents.groupby(['Topic'], as_index=False).agg({'Document': ' '.join})
#     cleaned_docs = result['model']._preprocess_text(documents_per_topic.Document.values)

#     vectorizer = result['model'].vectorizer_model
#     analyzer = vectorizer.build_analyzer()

#     words = vectorizer.get_feature_names_out()
#     tokens = [analyzer(doc) for doc in cleaned_docs]
#     dictionary = corpora.Dictionary(tokens)
#     corpus = [dictionary.doc2bow(token) for token in tokens]
#     topic_words = [[words for words, _ in result['model'].get_topic(topic)] for topic in range(len(set(result['topics']))-1)]

#     coherence_model_c_v = CoherenceModel(topics=topic_words,
#                                     texts=tokens,
#                                     corpus=corpus,
#                                     dictionary=dictionary,
#                                     coherence='c_v')
#     coherence_model_u_mass = CoherenceModel(topics=topic_words,
#                                     texts=tokens,
#                                     corpus=corpus,
#                                     dictionary=dictionary,
#                                     coherence='u_mass')
#     coherence_c_v = coherence_model_c_v.get_coherence()
#     coherence_u_mass = coherence_model_u_mass.get_coherence()
#     coherence_obj[period] = {
#         'coherence_c_v': coherence_c_v,
#         'coherence_u_mass': coherence_u_mass,
#     }

In [12]:
# coherence_obj