# Libraries

In [12]:
import pandas as pd
import matplotlib.pyplot as plt
import sys
from bertopic import BERTopic
from gensim.corpora import Dictionary

sys.path.insert(0, "../")
from utils.text_preprocessing import preprocessTextDataFrame, filterWordLengthTextDataFrame
from utils.quarter_labeling import labelQuarters
from utils.evaluation import calculate_coherence_score

# Load Data

In [2]:
tweet = pd.read_csv('../../src/cleaned_kp.csv')
tweet = tweet.dropna(subset=['full_text'])
tweet = tweet[['created_at', 'full_text']]
tweet

Unnamed: 0,created_at,full_text
0,2019-11-01 00:43:08+00:00,by lewat
1,2019-11-01 02:42:52+00:00,kakanwil kemenag provinsi papua pdt amsal yowe...
2,2019-11-01 08:53:32+00:00,politik baik anggota dprd betul psi libat raky...
3,2019-11-01 09:15:05+00:00,politik baik anggota dprd betul psi libat raky...
4,2019-11-01 11:16:16+00:00,kerja institusi izin alami baru masuk temu bi...
...,...,...
8431,2024-04-29 09:26:46+00:00,pintas masa depan agama indonesia lihat lebih ...
8432,2024-04-29 12:00:01+00:00,hitung tahun ajar perintah lalu kemendikbud ja...
8433,2024-04-29 13:45:36+00:00,bersikukuh tinggi komersialisasi semenjak pt...
8434,2024-04-29 16:48:01+00:00,kagak ngotak asli dahh sbnrnya salah sekaran...


# Preprocess

In [3]:
tweet_preprocessed = tweet.copy()
tweet_preprocessed = preprocessTextDataFrame(df=tweet_preprocessed, dt_column='created_at', text_column='full_text')
tweet_preprocessed = filterWordLengthTextDataFrame(tweet_preprocessed, 'full_text', 3)
tweet_preprocessed

Unnamed: 0,created_at,full_text
1,2019-11-01 02:42:52+00:00,"[kakanwil, kemenag, provinsi, papua, pdt, amsa..."
2,2019-11-01 08:53:32+00:00,"[politik, baik, anggota, dprd, betul, psi, lib..."
3,2019-11-01 09:15:05+00:00,"[politik, baik, anggota, dprd, betul, psi, lib..."
4,2019-11-01 11:16:16+00:00,"[kerja, institusi, izin, alami, baru, masuk, t..."
6,2019-11-02 02:40:05+00:00,"[penting, aspekaspek, publik]"
...,...,...
8431,2024-04-29 09:26:46+00:00,"[pintas, masa, depan, agama, indonesia, lihat,..."
8432,2024-04-29 12:00:01+00:00,"[hitung, tahun, ajar, perintah, lalu, kemendik..."
8433,2024-04-29 13:45:36+00:00,"[bersikukuh, tinggi, komersialisasi, semenjak,..."
8434,2024-04-29 16:48:01+00:00,"[kagak, ngotak, asli, dahh, sbnrnya, salah, se..."


In [4]:
tweet_quarter = tweet_preprocessed.copy()

tweet_quarter = labelQuarters(tweet_quarter, 'created_at')
tweet_quarter['full_text'] = tweet_quarter['full_text'].str.join(' ')

tweet_quarter = tweet_quarter.groupby('quarter')

# Modeling

In [37]:
results = {}

In [38]:
%%time
for period, group in tweet_quarter:
    topic_model = BERTopic(verbose=False,
                           language='indonesian', calculate_probabilities=True,
                           nr_topics=10,
                           )
    text_data = group['full_text'].tolist()
    timestamps = group['created_at'].tolist()
    topics, probs = topic_model.fit_transform(text_data)
    results[period] = {
        'model' : topic_model,
        'topics_over_time' : topic_model.topics_over_time(text_data, timestamps, nr_bins=30),
        'topics': topics,
        'probs': probs,
        'topic_info': topic_model.get_topic_info(),
    }

CPU times: total: 22min 10s
Wall time: 6min 47s


In [39]:
for period, result in results.items():
    fig = result['model'].visualize_topics_over_time(result['topics_over_time'], top_n_topics=10)
    fig.update_layout(title_text=f"Topics Over Time for {period}")
    fig.show()

In [40]:
tweet_preprocessed['full_text']

1       [kakanwil, kemenag, provinsi, papua, pdt, amsa...
2       [politik, baik, anggota, dprd, betul, psi, lib...
3       [politik, baik, anggota, dprd, betul, psi, lib...
4       [kerja, institusi, izin, alami, baru, masuk, t...
6                           [penting, aspekaspek, publik]
                              ...                        
8431    [pintas, masa, depan, agama, indonesia, lihat,...
8432    [hitung, tahun, ajar, perintah, lalu, kemendik...
8433    [bersikukuh, tinggi, komersialisasi, semenjak,...
8434    [kagak, ngotak, asli, dahh, sbnrnya, salah, se...
8435               [bahasa, inggris, sd, tantang, imbang]
Name: full_text, Length: 8391, dtype: object

In [41]:
top_topics = []


for period, result in results.items():
    top_10_topics = result['topic_info'][1:11]
    # top_10_topics['Name'] = top_10_topics['Name'].str.replace(r'^\d+_', '', regex=True)
    topic_descriptions = top_10_topics['Name'].tolist()
    top_topics.append({'period': period, 'topics': topic_descriptions})


top_topics_df = pd.DataFrame(top_topics)
display(top_topics_df)

Unnamed: 0,period,topics
0,2019_11-2020_01,"[0_nadiem_menteri_merdeka_guru, 1_tetap_pokok_..."
1,2020_02-2020_04,"[1_virus_corona_sebar_disease, 2_ajar_merdeka_..."
2,2020_05-2020_07,"[0_menteri_ajar_mahasiswa_sekolah, 1_normal_ne..."
3,2020_08-2020_10,"[0_kuat_efektivitas_prakerja_salur, 1_papua_ot..."
4,2020_11-2021_01,[1_ajar_perintah_indonesia_menteri]
5,2021_02-2021_04,"[0_musik_tradisional_apresiasi_karya, 1_pandem..."
6,2021_05-2021_07,[1_ajar_menteri_merdeka_sekolah]
7,2021_08-2021_10,"[0_kota_perintah_jalan_narkoba, 1_merdeka_indo..."
8,2021_11-2022_01,"[0_disabilitas_dorong_tara_risma, 1_student_lo..."
9,2022_02-2022_04,"[0_tingkat_sehat_perintah_daya, 1_pakai_guru_s..."


In [8]:
for quarter in top_topics:
    print(f"{quarter['period']}: {quarter['topics']}")

2019_11-2020_01: ['0_nadiem_menteri_merdeka_guru', '1_tetap_pokok_empat_mendikbud', '2_pokok_ajar_empat_merdeka']
2020_02-2020_04: ['0_ajar_menteri_sekolah_covid', '1_virus_corona_sebar_surat', '2_islam_minta_cegah_pemda', '3_disease_coronavirus_tahun_laksana']
2020_05-2020_07: ['0_pandemi_masa_covid_prinsip', '1_sosial_ekonomi_ribu_online', '2_pandemi_mahasiswa_covid_dampak', '3_ajar_budaya_guru_menteri', '4_indonesia_kualitas_tak_baik', '5_menteri_buat_pikir_presiden', '6_manajemen_perintah_afirmasi_sektor', '7_nadiem_insan_tinggi_gulir', '8_normal_masyarakat_kontra_new', '9_kampus_demo_mahasiswa_jadi']
2020_08-2020_10: ['0_ajar_menteri_pandemi_covid', '1_kuat_efektivitas_prakerja_selenggara', '2_papua_otsus_otonomi_barat']
2020_11-2021_01: ['1_gtgt_islam_seluruh_rakyat']
2021_02-2021_04: ['1_pandemi_biaya_audiensi_tengah', '2_musik_tradisional_apresiasi_karya']
2021_05-2021_07: ['1_pandemi_biaya_mahasiswa_ui']
2021_08-2021_10: ['0_vokasi_tinggi_merdeka_mahasiswa', '1_merdeka_indones

# Evaluation

In [35]:
result['model'].get_topic()

False

In [21]:
texts = tweet_preprocessed['full_text'].tolist()
dictionary = Dictionary(texts)


for period, result in results.items():
    coherence_score_cv = calculate_coherence_score(result['model'], texts, dictionary, coherence='c_v')
    coherence_score_umass = calculate_coherence_score(result['model'], texts, dictionary, coherence='u_mass')
    print(f"Period: {period}")
    print(f"Coherence Score CV: {coherence_score_cv}")
    print(f"Coherence Score UMass: {coherence_score_umass}")

TypeError: 'int' object is not subscriptable