# Libraries

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import sys
from bertopic import BERTopic
import gensim.corpora as corpora
from gensim.models.coherencemodel import CoherenceModel

# Load Data

In [2]:
tweet = pd.read_csv('../../src/cleaned_mb.csv')
tweet = tweet.dropna(subset=['full_text'])
tweet = tweet[['created_at', 'full_text']]
tweet['created_at'] = pd.to_datetime(tweet['created_at'])
tweet

Unnamed: 0,created_at,full_text
0,2019-11-01 02:51:51+00:00,kerja ajar senang senang penuh semangat merdeka
1,2019-11-02 04:27:12+00:00,tpn guruguru merdeka ajar moga tahun depan gab...
2,2019-11-03 10:46:27+00:00,kbm ambengan batu admin mau share beberapa dok...
3,2019-11-03 10:50:12+00:00,kbm tl ambengan admin mau share beberapa dokum...
4,2019-11-03 14:43:27+00:00,harus ajar loyal nurani pikir sendiri my heart...
...,...,...
34593,2024-04-29 13:43:31+00:00,wakatobi siap gelar konferensi internasional s...
34594,2024-04-29 14:23:23+00:00,konsorsium baterai listrik nasional
34595,2024-04-29 15:50:14+00:00,joki essay tema merdeka ajar
34596,2024-04-29 18:23:19+00:00,tim pkm pnup terima dana pkm vokasi tahun


# Preprocess

In [3]:
tweet_quarter = tweet.copy()

tweet_quarter['quarter'] = tweet_quarter['created_at'].dt.to_period('Q')

tweet_quarter = tweet_quarter.groupby('quarter')

# Modeling

In [4]:
results = {}

In [5]:
%%time
for period, group in tweet_quarter:
    topic_model = BERTopic(verbose=False,
                           language='indonesian', calculate_probabilities=True,
                           nr_topics=10,
                           )
    text_data = group['full_text'].tolist()
    timestamps = group['created_at'].tolist()
    topics, probs = topic_model.fit_transform(text_data)
    results[period] = {
        'model' : topic_model,
        'topics_over_time' : topic_model.topics_over_time(text_data, timestamps, nr_bins=30),
        'probs': probs,
        'topic_info': topic_model.get_topic_info(),
        
        'topics': topics,
        'text': text_data,
    }

KeyboardInterrupt: 

In [None]:
for period, result in results.items():
    fig = result['model'].visualize_topics_over_time(result['topics_over_time'], top_n_topics=10)
    fig.update_layout(title_text=f"Topics Over Time for {period}")
    fig.show()

In [None]:
top_topics = []


for period, result in results.items():
    top_10_topics = result['topic_info'][0:11]
    # top_10_topics['Name'] = top_10_topics['Name'].str.replace(r'^\d+_', '', regex=True)
    topic_descriptions = top_10_topics['Name'].tolist()
    top_topics.append({'period': period, 'topics': topic_descriptions})


top_topics_df = pd.DataFrame(top_topics)
display(top_topics_df)

Unnamed: 0,period,topics
0,2019Q4,"[-1_menteri_nadiem_ajar_merdeka, 0_empat_pokok..."
1,2020Q1,"[0_islam_minta_cegah_pemda, 1_merdeka_menteri_..."
2,2020Q2,"[-1_papua_ajar_perintah_covid, 0_mahasiswa_sek..."
3,2020Q3,"[0_kuat_kartu_prakerja_efektivitas, 1_papua_ot..."
4,2020Q4,"[0_islam_gtgt_cukup_seluruh, 1_menteri_ajar_pe..."
5,2021Q1,"[-1_akmrtv_sarah_radio_tv, 0_musik_tradisional..."
6,2021Q2,"[-1_hak_awal_anak_milik, 0_papua_otsus_infrast..."
7,2021Q3,"[-1_masyarakat_sehat_sanggau_papua, 0_pandemi_..."
8,2021Q4,"[0_tara_disabilitas_dorong_risma, 1_kota_narko..."
9,2022Q1,"[-1_ajar_bawaslu_milu_ptm, 0_tingkat_sehat_per..."


In [None]:
for quarter in top_topics:
    print(f"{quarter['period']}: {quarter['topics']}")

2019Q4: ['-1_menteri_nadiem_ajar_merdeka', '0_empat_pokok_tetap_merdeka', '1_indonesia_nadiem_makarim_presiden', '2_kembang_mendikbud_nadiem_jangan', '3_tri_baru_tingkat_kritik', '4_analisis_negara_suatu_rakyat', '5_guru_anak_sekolah_jadi', '6_budaya_menteri_program_makarim', '7_menteri_un_enggak_pak']
2020Q1: ['0_islam_minta_cegah_pemda', '1_merdeka_menteri_ajar_nadiem', '2_disease_coronavirus_darurat_masa', '3_virus_corona_sebar_covid']
2020Q2: ['-1_papua_ajar_perintah_covid', '0_mahasiswa_sekolah_jadi_buat', '1_menteri_indonesia_ajar_budaya', '2_pandemi_masa_covid_indonesia', '3_darurat_sebar_timur_jawa', '4_new_normal_hadap_jabar', '5_dpr_apresiasi_kali_komisi', '6_islam_normal_pesantren_agama', '7_saran_hendak_bila_implementasi', '8_dokter_siap_normal_era']
2020Q3: ['0_kuat_kartu_prakerja_efektivitas', '1_papua_otonomi_khusus_barat', '2_ajar_pjj_nilai_jarak', '3_ajar_menteri_pandemi_covid']
2020Q4: ['0_islam_gtgt_cukup_seluruh', '1_menteri_ajar_perintah_indonesia']
2021Q1: ['-1_ak

# Evaluation

In [None]:
coherence_obj = {}

for period, result in results.items():
    documents = pd.DataFrame({"Document": result['text'],
                              "ID": range(len(result['text'])),
                              "Topic": result['topics'],
                              })

    documents_per_topic = documents.groupby(['Topic'], as_index=False).agg({'Document': ' '.join})
    cleaned_docs = result['model']._preprocess_text(documents_per_topic.Document.values)

    vectorizer = result['model'].vectorizer_model
    analyzer = vectorizer.build_analyzer()

    words = vectorizer.get_feature_names_out()
    tokens = [analyzer(doc) for doc in cleaned_docs]
    dictionary = corpora.Dictionary(tokens)
    corpus = [dictionary.doc2bow(token) for token in tokens]
    topic_words = [[words for words, _ in result['model'].get_topic(topic)] for topic in range(len(set(result['topics']))-1)]

    coherence_model_c_v = CoherenceModel(topics=topic_words,
                                    texts=tokens,
                                    corpus=corpus,
                                    dictionary=dictionary,
                                    coherence='c_v')
    coherence_model_u_mass = CoherenceModel(topics=topic_words,
                                    texts=tokens,
                                    corpus=corpus,
                                    dictionary=dictionary,
                                    coherence='u_mass')
    coherence_c_v = coherence_model_c_v.get_coherence()
    coherence_u_mass = coherence_model_u_mass.get_coherence()
    coherence_obj[period] = {
        'coherence_c_v': coherence_c_v,
        'coherence_u_mass': coherence_u_mass,
    }

In [None]:
coherence_obj

{Period('2019Q4', 'Q-DEC'): {'coherence_c_v': 0.5586950531695594,
  'coherence_u_mass': -0.5607027727226086},
 Period('2020Q1', 'Q-DEC'): {'coherence_c_v': 0.7348581618075546,
  'coherence_u_mass': -0.16566044331697286},
 Period('2020Q2', 'Q-DEC'): {'coherence_c_v': 0.6084942290321547,
  'coherence_u_mass': -0.40894767149817063},
 Period('2020Q3', 'Q-DEC'): {'coherence_c_v': 0.6559972028358545,
  'coherence_u_mass': -0.07837524280155266},
 Period('2020Q4', 'Q-DEC'): {'coherence_c_v': 0.7809663259471452,
  'coherence_u_mass': -0.015403270677732079},
 Period('2021Q1', 'Q-DEC'): {'coherence_c_v': 0.7049338235698431,
  'coherence_u_mass': -0.42268877879561967},
 Period('2021Q2', 'Q-DEC'): {'coherence_c_v': 0.3430110023832821,
  'coherence_u_mass': -0.09053185699700497},
 Period('2021Q3', 'Q-DEC'): {'coherence_c_v': 0.7318491948466705,
  'coherence_u_mass': -0.5081136275309767},
 Period('2021Q4', 'Q-DEC'): {'coherence_c_v': 0.7626448785739149,
  'coherence_u_mass': -0.2946078328114052},
 Pe