In [2]:
import pandas as pd
import matplotlib.pyplot as plt
import sys
from bertopic import BERTopic
import gensim.corpora as corpora
from gensim.models.coherencemodel import CoherenceModel
import torch

# Put the parent directory (relative to the kernel's working directory) at the
# front of the module search path so the project-local `utils` package imported
# on the next line can be resolved from this notebook.
sys.path.insert(0, "../")
from utils.quarter_label import labelQuarters

In [24]:
# Load the cleaned tweets and reduce them to what the topic model needs:
#   1. discard rows without any text,
#   2. keep only the timestamp and text columns,
#   3. parse the timestamp column into pandas datetimes,
#   4. keep a single row per distinct text.
# Written as one chain so the cell always starts from the CSV and yields the
# same frame on every re-run.
tweet = (
    pd.read_csv('../../src/cleaned_kp_bp.csv')
    .dropna(subset=['full_text'])
    .loc[:, ['created_at', 'full_text']]
    .assign(created_at=lambda frame: pd.to_datetime(frame['created_at']))
    .drop_duplicates(subset='full_text')
)
tweet

Unnamed: 0,created_at,full_text
0,2019-11-01 00:43:08+00:00,by lewat
1,2019-11-01 02:42:52+00:00,kakanwil kemenag provinsi papua pdt amsal yowe...
2,2019-11-01 08:53:32+00:00,politik baik anggota dprd betul psi libat raky...
4,2019-11-01 11:16:16+00:00,kerja institusi izin alami baru masuk temu bi...
5,2019-11-02 02:34:01+00:00,ajar publik
...,...,...
25773,2024-04-29 15:25:47+00:00,demo mahasiswa minta tuukt turun unsoed nyata ...
25774,2024-04-29 15:53:37+00:00,sumpah adikadik moga kalian kuatkuat mental am...
25775,2024-04-29 16:33:36+00:00,amanat konstitusi cerdas hidup bangsa bukan ma...
25776,2024-04-29 22:58:27+00:00,gue heran kadang biaya setinggi sekarang gaji...


In [25]:
from bertopic.vectorizers import ClassTfidfTransformer

# c-TF-IDF transformer that is handed to BERTopic in the model cell.
# `reduce_frequent_words=True` asks it to dampen the weight of very frequent
# terms (see the ClassTfidfTransformer documentation for the exact formula).
ctfidf_model = ClassTfidfTransformer(
    reduce_frequent_words=True,
)

In [26]:
from transformers import pipeline
from bertopic.representation import TextGeneration

# Indonesian prompt used to label each topic. Rough English gloss:
# "I have the following keywords: [KEYWORDS]. Based on these keywords, what is
# this topic about?"  `[KEYWORDS]` is the placeholder BERTopic substitutes with
# the topic's keywords.
prompt = "Aku memiliki kata kunci berikut: [KEYWORDS]. Berdasarkan kata kunci berikut, tentang apa topik ini?"

# Text-to-text generation pipeline that produces the topic labels.
generator = pipeline('text2text-generation', model='google/flan-t5-base')

# Pass the prompt explicitly: without `prompt=...` TextGeneration falls back to
# its built-in default prompt and the custom prompt defined above is ignored.
representation_model = TextGeneration(generator, prompt=prompt)

In [27]:
%%time
topic_model = BERTopic(verbose=True,
                        calculate_probabilities=True,
                        language='indonesian',
                        nr_topics="auto",
                        ctfidf_model = ctfidf_model,
                        n_gram_range=(1, 2),
                        min_topic_size = 50, 
                        # representation_model = representation_model
                        )
text_data = tweet['full_text'].tolist()
timestamps = tweet['created_at'].tolist()

topics, probs = topic_model.fit_transform(text_data)

results = {
        'model' : topic_model,
        'topics_over_time' : topic_model.topics_over_time(text_data, timestamps, nr_bins=18),
        'probs': probs,
        'topic_info': topic_model.get_topic_info(),
        
        'topics': topics,
        'text': text_data,
    }

2024-05-27 17:04:12,289 - BERTopic - Embedding - Transforming documents to embeddings.


Batches:   0%|          | 0/634 [00:00<?, ?it/s]

2024-05-27 17:04:34,941 - BERTopic - Embedding - Completed ✓
2024-05-27 17:04:34,942 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-05-27 17:04:45,489 - BERTopic - Dimensionality - Completed ✓
2024-05-27 17:04:45,649 - BERTopic - Cluster - Start clustering the reduced embeddings
2024-05-27 17:04:59,765 - BERTopic - Cluster - Completed ✓
2024-05-27 17:04:59,766 - BERTopic - Representation - Extracting topics from clusters using representation models.
2024-05-27 17:05:00,910 - BERTopic - Representation - Completed ✓
2024-05-27 17:05:00,910 - BERTopic - Topic reduction - Reducing number of topics
2024-05-27 17:05:02,012 - BERTopic - Topic reduction - Reduced number of topics from 49 to 27
18it [00:15,  1.18it/s]


CPU times: total: 13.3 s
Wall time: 1min 9s


In [28]:
# Plot the per-bin topic data stored in `results`, limited to 20 topics.
fitted_model = results['model']
fig = fitted_model.visualize_topics_over_time(
    results['topics_over_time'],
    top_n_topics=20,
)
# A title can be set with fig.update_layout(title_text=...) before showing.
fig.show()

In [21]:
# Show the representative documents BERTopic keeps for each topic id
# (the output below is keyed by topic id, with -1 appearing first).
representative_docs = topic_model.get_representative_docs()
representative_docs

{-1: ['cover nasional pinjam dana tunai modal usaha biaya renovasi rumah biaya  anak gadai bpkb mobil minimal tahun take over proses cepat mudah tuju leasing company go publik ayo guys dm wa langsung',
  'cover nasional tiap daerah kantor cabang pinjam dana tunai modal usaha biaya  anak biaya renovasi rumah dana talang jamin bpkb mobil minimal tahun take over kalau mobil kredit ayo guys butuh pinjam dm',
  'cover nasional pinjam dana tunai modal usaha biaya renovasi rumah biaya  anak gadai bpkb mobil minimal tahun take over proses cepat mudah tuju leasing company go publik ayo guys dm wa langsung'],
 0: ['baik datang pustaka umum biaya sekolah karcis bis sekal lima sen hari baca buku bekas kau perlu tahu bagaimana mula kau tahu cara pergi pustaka umum',
  'baik datang pustaka umum biaya sekolah karcis bis sekal lima sen hari baca buku bekas kau perlu tahu bagaimana mula kau tahu cara pergi pustaka umum',
  'baik datang pustaka umum biaya sekolah karcis bis sekal lima sen hari baca buku

In [6]:
# Display the fitted model's topics. This needs `topic_model` from the model
# cell, so that cell must have been run first in the current kernel.
topic_model.get_topics()

NameError: name 'topic_model' is not defined