# *Libraries*

In [1]:
!pip install sentence_transformers --quiet
!pip install bertopic --quiet

In [2]:
import pandas as pd
import numpy as np
import nltk
import re

In [3]:
from IPython.display import clear_output
from tqdm import tqdm
from sklearn.metrics.pairwise import cosine_similarity
from umap import UMAP
from hdbscan  import HDBSCAN
from sentence_transformers import SentenceTransformer
from sklearn.feature_extraction.text import CountVectorizer
from transformers import pipeline
from nltk.corpus import stopwords
from bertopic import BERTopic
from bertopic.vectorizers import ClassTfidfTransformer
from bertopic.representation import KeyBERTInspired
from scipy.sparse import csr_matrix

# Wipe whatever the imports above wrote to this cell's output area.
clear_output()

# *Data*

In [4]:
# Location of the input CSV that is read into `df` further down (a path under /kaggle/input).
DATASETS = '/kaggle/input/category-president-tweet-satria-data-sf/classify_president_cat_1.csv'

In [5]:
def clean_tweet(tweet):
    """Strip hashtags, "[re ...]" markers and redundant whitespace from a tweet.

    Args:
        tweet: Raw tweet text. Any non-string value (for example the NaN pandas
            produces for an empty CSV cell) is treated as an empty tweet.

    Returns:
        The cleaned text: no hashtags, no "[re ...]" markers, words separated by
        exactly one space and no leading/trailing whitespace.
    """
    if not isinstance(tweet, str):
        return ""

    # Remove hashtags
    tweet = re.sub(r"#\w+", "", tweet)
    # Remove "[re ...]" markers
    tweet = re.sub(r'\[re[^\]]*\]', '', tweet)
    # Collapse whitespace (newlines included) only AFTER the removals above,
    # otherwise deleting a marker in the middle of a tweet leaves a double space.
    tweet = re.sub(r'\s+', ' ', tweet)

    return tweet.strip()

In [6]:
# Load the CSV referenced by DATASETS into a DataFrame.
df = pd.read_csv(DATASETS)

In [7]:
# Keep the first row for every distinct raw `content` value, then renumber the rows from 0.
df = df.drop_duplicates(subset=['content']).reset_index(drop=True)

In [8]:
# Run every tweet through clean_tweet, overwriting the `content` column in place.
df['content'] = df['content'].map(clean_tweet)

In [9]:
# Cleaning can make previously different tweets identical, so de-duplicate once more;
# ignore_index=True renumbers the surviving rows from 0.
df = df.drop_duplicates(subset=['content'], ignore_index=True)

In [10]:
# Preview the first rows of the cleaned, de-duplicated frame.
df.head()

Unnamed: 0,created_at,tcode,num_retweets,frn_cnt,flw_cnt,sts_cnt,lst_cnt,content,is_paslon_1,is_paslon_2,is_paslon_3
0,2024-01-04T09:57:09Z,rt,1248.0,266.0,107.0,9687.0,0.0,k-popers berencana kirim food truck untuk anie...,1,0,0
1,2024-01-04T09:57:09Z,rt,195.0,564.0,303.0,12461.0,2.0,bapak pendeta yusak ini dari magetan ke ponoro...,1,0,0
2,2024-01-04T09:57:10Z,rt,116.0,376.0,156.0,7488.0,1.0,"viral , gimana mak mak di jakarta tidak ter an...",1,0,0
3,2024-01-04T09:57:10Z,rt,2264.0,163.0,203.0,2065.0,0.0,"mendengar pak anies disini, rasanya saya sudah...",1,0,0
4,2024-01-04T09:57:11Z,rt,1157.0,1.0,1.0,798.0,0.0,media asing soroti cara anies gaet pemilih mud...,1,0,0


In [11]:
# Report how many rows remain after cleaning and de-duplication.
print(f"Total Banyak Data : {df.shape[0]}")

Total Banyak Data : 355740


In [12]:
# Work on one fixed-size window of the data: rows [6 * max_data, 7 * max_data).
# NOTE: this rebinds `df`, so re-running the cell without reloading slices the already-sliced frame.
max_data = 35000
window_start, window_stop = 6 * max_data, 7 * max_data
df = df[window_start:window_stop].reset_index(drop=True)

In [13]:
# The documents fed to the embedding and topic model: the cleaned tweet texts.
tweets = df['content']

In [14]:
# Size of the selected window (duplicates on `content` were already removed above).
print(f"Number of unique tweets : {len(tweets)}")

Number of unique tweets : 35000


# *Clean Tweet*

In [15]:
# TODO: remove spammy trending keywords, if feasible

# *Pre-calculate Embeddings*

In [16]:
# Load the sentence-embedding backbone by its model identifier.
model_id = "indolem/indobertweet-base-uncased"
embedding_model = SentenceTransformer(model_id)

# Drop the loading messages from the cell output.
clear_output()

In [17]:
# Embed every tweet once up front; the vectors are reused for near-duplicate
# filtering and handed to BERTopic so it does not have to re-encode.
# A plain list is passed (not the pandas Series) so encoding never depends on
# the Series' index labels.
embeddings = embedding_model.encode(tweets.tolist(), show_progress_bar=True)

Batches:   0%|          | 0/1094 [00:00<?, ?it/s]

In [18]:
# Pairwise cosine similarity between all tweet embeddings.
# NOTE(review): this is a dense n x n matrix (n = number of tweets), so memory grows
# quadratically with the window size — presumably why only `max_data` rows are processed; confirm.
similarity_matrix = cosine_similarity(embeddings)

In [19]:
# Two tweets whose cosine similarity exceeds this value are considered near-duplicates.
threshold_similarity = 0.92

# Sparse boolean matrix flagging every pair above the threshold.
similarity_sparse = csr_matrix(similarity_matrix > threshold_similarity, dtype=bool)
row_indices, col_indices = similarity_sparse.nonzero()

# Keep only the strict upper triangle (r < c): this skips self-matches and counts each pair once.
# Done with a vectorised mask instead of a Python loop over all flagged pairs.
upper_triangle = row_indices < col_indices
kept_rows = row_indices[upper_triangle]
kept_cols = col_indices[upper_triangle]
filtered_indices = list(zip(kept_rows.tolist(), kept_cols.tolist()))

# For every flagged pair the later tweet (larger index) is dropped.
# Note: a tweet is dropped when it resembles ANY earlier tweet, even one that is itself dropped.
indices_to_drop = np.unique(kept_cols).tolist()

In [20]:
# Remove the near-duplicate rows from the embedding matrix (axis 0 = one row per tweet) ...
final_embeddings = np.delete(embeddings, indices_to_drop, axis=0)

# ... and the same positions from the tweet Series (its labels are 0..n-1 at this point).
tweets = tweets.drop(labels=indices_to_drop, axis=0)

In [21]:
# Renumber the remaining tweets 0..n-1 so positions line up with the rows of final_embeddings.
tweets = tweets.reset_index(drop=True)

# *Dimensionality Reduction*

In [22]:
# Dimensionality-reduction step handed to BERTopic; random_state is fixed for repeatable runs.
umap_model = UMAP(
    n_neighbors=13,
    n_components=7,
    min_dist=0.0,
    metric='cosine',
    random_state=42,
)

# *Clustering*

In [23]:
# Clustering step handed to BERTopic; it runs on the UMAP-reduced vectors.
hdbscan_model = HDBSCAN(
    min_cluster_size=200,
    metric='euclidean',
    prediction_data=True,
)

# *Vectorizer*

In [24]:
# Make sure the NLTK stop-word corpus is available locally before reading from it.
nltk.download('stopwords')

# NLTK's built-in Indonesian stop-word list.
indonesian_stop_words = stopwords.words('indonesian')

[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [25]:
# NLTK's built-in English stop-word list (topic 1 in the captured output below is English-language).
english_stop_words = stopwords.words('english')

In [26]:
# Extra stop words specific to this corpus: general election vocabulary plus candidate
# names and nicknames, so they do not dominate the topic keywords.
# The list contains repeated entries (e.g. 'suara', 'tps', 'paslon').
stopwords_pemilu = [
    'presiden', 'capres', 'cawapres', 'pilpres', 'pemilu', 
    'vote', 'pilih', 'calon', 'partai', 'politik', 'suara', 'nyoblos', 'coblos',
    'kotak', 'suara', 'pemilihan', 'tps', 'pemungutan', 'hitung',
    'parpol', 'caleg', 'nomor', 'urut', 'kandidat', 'coblos', 'tps', 'caleg', 'pileg',
    'pemilihan', 'umum','nomor',  'jokowi', 'ganjar', 'prabowo', 'anies',
    'nasional', 'strategi', 'politik', 'pemimpin', 'negeri', 'bangsa', 'persatuan', 'kesatuan','anis','baswedan',
    'muhaimin','iskandar','imin','cak','subianto','bowo','gibran','paslon','mahfud','md','baswedan','pragib','amin','01','02','03',
    'paslon','rakabuming','raka','pranowo','2024'
]

In [27]:
# Extra stop words for informal text: slang, abbreviations, interjections, laughter tokens
# and social-media boilerplate ('share', 'follback', 'thread', ...).
# The list contains repeated entries (e.g. 'aja', 'dong', 'nih').
stopwords_gaul = [
    'gue', 'lu', 'elo', 'gw', 'loe', 'gue', 'bro', 'sis', 'sob', 'wkwk', 'haha', 'hihi',
    'hehe', 'hoho', 'kalo', 'gitu', 'gini', 'nih', 'tuh', 'dong', 'deh', 'loh', 'yah',
    'aja', 'sih', 'kok', 'lah', 'kan', 'kan', 'yuk', 'yok', 'nih', 'tuh', 'dah', 'udah', 
    'nggak', 'ga', 'gak', 'enggak', 'ntar', 'nanti', 'aja', 'doang', 'dong', 'deh', 'nih', 
    'eh', 'ya', 'yuk', 'yok', 'lagi', 'cuma', 'aja', 'lah', 'kan', 'pake','beliau','pas', 'abis', 
    'abis', 'banget', 'parah', 'amat', 'kenapa', 'tau', 'dong', 'plis', 'please', 'bang', 
    'sis', 'gan', 'min', 'btw', 'gimana', 'gmn', 'gini', 'gitu', 'trs', 'terus', 'tp', 
    'tapi', 'jd', 'jadi', 'cm', 'cuma', 'cmn', 'aja', 'yg', 'yang', 'krn', 'karena', 
    'gmna', 'ngapain', 'apa', 'siapa', 'dimana', 'mana', 'kok', 'dong', 'banget', 'pol', 
    'abis', 'abisin', 'kek', 'like','wowo','share', 'komen', 'comment', 'subscribe', 'subrek',
    'follback', 'foll', 'dm', 'cek', 'ngecek', 'ngepost', 'el','nyebokin','masbowogbran','posting', 'up', 'update', 'thread', 
    'viral', 'story', 'status', 'bio', 'link', 'dm','mas','gemoy','abah','nya','klo','jg','bgt','si','sorry','yee','ye','yeee',
    'tdk','org','sdh','dr','kah','eti','dih','wkwkwk','mba','huuu','wah','wahh','wih','awok','awokwokwok','awokwokwokwok','awokawokawok','cebok','cebokin',
    'bikin','kayak','bener','dgn'
]

In [28]:
# One combined stop-word list: generic Indonesian and English words first,
# then the slang and election-specific additions defined above.
custom_stop_words = [
    *indonesian_stop_words,
    *english_stop_words,
    *stopwords_gaul,
    *stopwords_pemilu,
]

In [29]:
# Bag-of-words step for the topic keywords: unigrams and bigrams, ignoring the custom
# stop words and any term that appears in fewer than two documents.
vectorizer_model = CountVectorizer(
    stop_words=custom_stop_words,
    min_df=2,
    ngram_range=(1, 2),
)

In [30]:
# Class-based TF-IDF weighting step for BERTopic, with the reduce_frequent_words option enabled.
ctfidf_model = ClassTfidfTransformer(reduce_frequent_words=True)

# *Topic Representation*

In [31]:
# Additional keyword representation; registered under the "KeyBERT" aspect below.
keybert_model = KeyBERTInspired()

In [32]:
# Aspect name -> representation model; the key is used again later to look up topic_aspects_["KeyBERT"].
representation_model = {'KeyBERT': keybert_model}

# *Training*

In [33]:
# Assemble the BERTopic pipeline from the components configured in the cells above.
topic_model = BERTopic(
    # --- pipeline components -------------------------------------------
    embedding_model=embedding_model,            # sentence encoder
    umap_model=umap_model,                      # dimensionality reduction
    hdbscan_model=hdbscan_model,                # clustering
    vectorizer_model=vectorizer_model,          # token / n-gram counting
    ctfidf_model=ctfidf_model,                  # class-based TF-IDF weighting
    representation_model=representation_model,  # extra "KeyBERT" keyword aspect
    # --- hyperparameters -----------------------------------------------
    top_n_words=10,
    verbose=True,
)

In [34]:
# Train the model on the de-duplicated tweets, passing the pre-computed embeddings
# (final_embeddings has had the same near-duplicate rows removed as `tweets`).
topics, probs = topic_model.fit_transform(tweets, final_embeddings)

2024-07-11 10:45:11,645 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-07-11 10:46:14,101 - BERTopic - Dimensionality - Completed ✓
2024-07-11 10:46:14,103 - BERTopic - Cluster - Start clustering the reduced embeddings
  pid = os.fork()
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `

# *Result and Custom Label*

In [35]:
# Show the per-topic summary table (topic id, size, name, keywords, representative documents).
topic_model.get_topic_info()

Unnamed: 0,Topic,Count,Name,Representation,KeyBERT,Representative_Docs
0,-1,1060,-1_tabrak_tabrak prof_towel_jepang,"[tabrak, tabrak prof, towel, jepang, prof, kar...","[tabrak prof, desak tabrak, tabrak, prof desak...","[setelah desak anies, ada tabrak prof 😂, sudah..."
1,0,31496,0_desak_orang_indonesia_jakarta,"[desak, orang, indonesia, jakarta, pendukung, ...","[beda, emang, keren, emg, coba, lo, acara desa...","[kesimpulannya, ya , sekarang kerja tom lembon..."
2,1,935,1_people_way_rent_free,"[people, way, rent, free, really, rent free, o...","[anything, father figure, rent free, things, p...",[ugh the more i know ab how dirty that candida...
3,2,284,2_pengungsi_rohingya_pengungsi rohingya_imigran,"[pengungsi, rohingya, pengungsi rohingya, imig...","[suka ngurusin, imigran rohingya, cari simpati...",[presiden anies pasti bikin pengungsi rohingya...


In [36]:
# Persist the topic summary table as CSV (written to the current working directory, without the index column).
info_df = topic_model.get_topic_info()

info_df.to_csv('topic-information.csv',index=False)

In [37]:
# Build a custom label per topic from the first three keywords of its "KeyBERT" aspect,
# joined with " | ". Each entry of `values` is unpacked as a (keyword, score) pair.
# Slicing before unpacking keeps this safe when a topic has fewer than three
# keywords, or none at all (the label is then an empty string instead of an IndexError).
pos_topic_labels = {
    topic: " | ".join(keyword for keyword, _ in values[:3])
    for topic, values in topic_model.topic_aspects_["KeyBERT"].items()
}
topic_model.set_topic_labels(pos_topic_labels)

# *Visualize Topic*

In [38]:
# Keyword bar chart per topic. Plotting is best-effort: a failure must not stop the
# notebook, but the reason is reported instead of being silently swallowed.
try:
    fig = topic_model.visualize_barchart()
    fig.show()
except Exception as exc:  # not a bare `except`, so KeyboardInterrupt still propagates
    print(f"Gagal: {exc!r}")

In [39]:
# Topic map using the custom labels. Plotting is best-effort: a failure must not stop
# the notebook, but the reason is reported instead of being silently swallowed.
try:
    fig = topic_model.visualize_topics(custom_labels=True)
    fig.show()
except Exception as exc:  # not a bare `except`, so KeyboardInterrupt still propagates
    print(f"Gagal: {exc!r}")

Gagal



k >= N for N * N square matrix. Attempting to use scipy.linalg.eigh instead.



In [40]:
# Topic hierarchy using the custom labels. Plotting is best-effort: a failure must not
# stop the notebook, but the reason is reported instead of being silently swallowed.
try:
    fig = topic_model.visualize_hierarchy(custom_labels=True)
    fig.show()
except Exception as exc:  # not a bare `except`, so KeyboardInterrupt still propagates
    print(f"Gagal: {exc!r}")

In [41]:
# Document scatter plot using the custom labels. Plotting is best-effort: a failure must
# not stop the notebook, but the reason is reported instead of being silently swallowed.
try:
    # Separate 2-D projection used only for this plot. random_state is fixed (same seed as
    # the model's UMAP) so the layout is reproducible between runs.
    reduced_embeddings = UMAP(
        n_neighbors=10,
        n_components=2,
        min_dist=0.0,
        metric='cosine',
        random_state=42,
    ).fit_transform(final_embeddings)
    fig = topic_model.visualize_documents(tweets, reduced_embeddings=reduced_embeddings, custom_labels=True)
    fig.show()
except Exception as exc:  # not a bare `except`, so KeyboardInterrupt still propagates
    print(f"Gagal: {exc!r}")

In [42]:
# Topic similarity heatmap. Plotting is best-effort: a failure must not stop the
# notebook, but the reason is reported instead of being silently swallowed.
try:
    fig = topic_model.visualize_heatmap()
    fig.show()
except Exception as exc:  # not a bare `except`, so KeyboardInterrupt still propagates
    print(f"Gagal: {exc!r}")

In [43]:
# Topic frequency over time. Best-effort: a failure must not stop the notebook, but
# the reason is reported instead of being silently swallowed.
try:
    # Rebuild the frame that matches `tweets` row-for-row by dropping the same
    # near-duplicate positions that were removed from the tweets and embeddings.
    final_df = df.drop(index=indices_to_drop).reset_index(drop=True)
    final_df['created_at'] = pd.to_datetime(final_df['created_at'])

    timestamps = final_df['created_at']

    topics_over_time = topic_model.topics_over_time(tweets, timestamps, nr_bins=20)
    fig = topic_model.visualize_topics_over_time(topics_over_time, top_n_topics=20)
    fig.show()
except Exception as exc:  # not a bare `except`, so KeyboardInterrupt still propagates
    print(f"Gagal: {exc!r}")

20it [00:01, 13.68it/s]


# *Save Model*

In [44]:
# Save the fitted model to the "saved_model" directory in safetensors format, including the c-TF-IDF data.
# The embedding model is passed by its identifier string. A dedicated name is used for that
# string so the `embedding_model` variable keeps pointing at the loaded SentenceTransformer
# object and the earlier cells can still be re-run after saving.
embedding_model_name = "indolem/indobertweet-base-uncased"
topic_model.save(
    "saved_model",
    serialization="safetensors",
    save_ctfidf=True,
    save_embedding_model=embedding_model_name,
)