# Libraries

In [1]:
!pip install bertopic
from IPython.display import clear_output
clear_output()

In [2]:
# Core
import pandas as pd
import numpy as np

# Topic Modeling
from bertopic import BERTopic
import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer
from umap import UMAP
from sentence_transformers import SentenceTransformer
from bertopic.representation import KeyBERTInspired
from bertopic.representation import MaximalMarginalRelevance
from bertopic.representation import TextGeneration
import random

  @numba.jit()
  @numba.jit()
  @numba.jit()
  @numba.jit()


# Data Preparation

**1. Data Collection**

In [3]:
train = pd.read_csv('/kaggle/input/dac-itfest-processed/train-processed.csv')
test = pd.read_csv('/kaggle/input/dac-ifest-predicted/test-predicted-slang-indobertweet.csv')

In [4]:
data = pd.concat([train,test],ignore_index=True)

# EDA

# BERTopic Analysis

In [5]:
# Mengelompokkan semua data
fear_df = data[data['label'] == 'fear']
joy_df = data[data['label'] == 'joy']
anger_df = data[data['label'] == 'anger']
love_df = data[data['label'] == 'love']
sad_df = data[data['label'] == 'sadness']

# Mengelompokkan test data
fear_test = test[test['label'] == 'fear']
joy_test = test[test['label'] == 'joy']
anger_test = test[test['label'] == 'anger']
love_test = test[test['label'] == 'love']
sad_test = test[test['label'] == 'sadness']

### Fear Analysis

**Define Model**

In [6]:
docs = fear_df['tweet']

In [7]:
# Prevent Stochastic Behavior (Mencegah Hasil Berubah-ubah setiap Menjalankan)
umap_model = UMAP(random_state = 42)

# Model representasi untuk topik
representation_model = KeyBERTInspired()

In [8]:
# Define topic model
topic_model = BERTopic(
    umap_model=umap_model,
    language="indonesian", 
    representation_model=representation_model,
    verbose = True, 
    nr_topics="auto",
    min_topic_size = 3
)

**Clustering**

In [9]:
# Train
topics, probs = topic_model.fit_transform(docs)

Downloading (…)0fe39/.gitattributes:   0%|          | 0.00/968 [00:00<?, ?B/s]

Downloading (…)_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Downloading (…)83e900fe39/README.md:   0%|          | 0.00/3.79k [00:00<?, ?B/s]

Downloading (…)e900fe39/config.json:   0%|          | 0.00/645 [00:00<?, ?B/s]

Downloading (…)ce_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/471M [00:00<?, ?B/s]

Downloading (…)nce_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading (…)tencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

Downloading tokenizer.json:   0%|          | 0.00/9.08M [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/480 [00:00<?, ?B/s]

Downloading unigram.json:   0%|          | 0.00/14.8M [00:00<?, ?B/s]

Downloading (…)900fe39/modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

Batches:   0%|          | 0/26 [00:00<?, ?it/s]

2023-09-04 01:14:41,764 - BERTopic - Transformed documents to Embeddings
2023-09-04 01:14:54,149 - BERTopic - Reduced dimensionality
2023-09-04 01:14:54,205 - BERTopic - Clustered reduced embeddings
2023-09-04 01:14:56,577 - BERTopic - Reduced number of topics from 3 to 3


In [10]:
topic_model.get_topic_info()

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,0,785,0_takut_kenapa_perasaan_sendiri,"[takut, kenapa, perasaan, sendiri, merasa, tap...","[teman-teman aku lagi deg degan parah, takut t..."
1,1,11,1_sholat_gereja_setan_hadist,"[sholat, gereja, setan, hadist, paroki, godaan...",[tidak disemua tempat kita boleh melaksanakan ...
2,2,7,2_pasien_pasiennya_medis_mohon,"[pasien, pasiennya, medis, mohon, perintahkan,...","[aku tidak salah bila mohon pada , segera peri..."


In [11]:
topic_model.get_document_info(docs)

Unnamed: 0,Document,Topic,Name,Representation,Representative_Docs,Top_n_words,Probability,Representative_document
0,lebih menyeramkan kalau punya grup wa keluarga...,1,1_sholat_gereja_setan_hadist,"[sholat, gereja, setan, hadist, paroki, godaan...",[tidak disemua tempat kita boleh melaksanakan ...,sholat - gereja - setan - hadist - paroki - go...,1.0,False
1,iya bagaimana. diajak jalan saja gatau kenapa ...,0,0_takut_kenapa_perasaan_sendiri,"[takut, kenapa, perasaan, sendiri, merasa, tap...","[teman-teman aku lagi deg degan parah, takut t...",takut - kenapa - perasaan - sendiri - merasa -...,1.0,False
2,aku dulu nemu ig nya dia. pas dia lagi lihat c...,0,0_takut_kenapa_perasaan_sendiri,"[takut, kenapa, perasaan, sendiri, merasa, tap...","[teman-teman aku lagi deg degan parah, takut t...",takut - kenapa - perasaan - sendiri - merasa -...,1.0,False
3,"entah gatau kenapa, semakin bertambahnya umur ...",0,0_takut_kenapa_perasaan_sendiri,"[takut, kenapa, perasaan, sendiri, merasa, tap...","[teman-teman aku lagi deg degan parah, takut t...",takut - kenapa - perasaan - sendiri - merasa -...,1.0,False
4,tidak disemua tempat kita boleh melaksanakan s...,1,1_sholat_gereja_setan_hadist,"[sholat, gereja, setan, hadist, paroki, godaan...",[tidak disemua tempat kita boleh melaksanakan ...,sholat - gereja - setan - hadist - paroki - go...,1.0,True
...,...,...,...,...,...,...,...,...
798,bicara-bicara doa restu orang tua itu emang gi...,0,0_takut_kenapa_perasaan_sendiri,"[takut, kenapa, perasaan, sendiri, merasa, tap...","[teman-teman aku lagi deg degan parah, takut t...",takut - kenapa - perasaan - sendiri - merasa -...,1.0,False
799,desain razer mirip banget mbp. bahkan lbh mode...,0,0_takut_kenapa_perasaan_sendiri,"[takut, kenapa, perasaan, sendiri, merasa, tap...","[teman-teman aku lagi deg degan parah, takut t...",takut - kenapa - perasaan - sendiri - merasa -...,1.0,False
800,ketika kamu merasa seperti kamu terjebak dalam...,0,0_takut_kenapa_perasaan_sendiri,"[takut, kenapa, perasaan, sendiri, merasa, tap...","[teman-teman aku lagi deg degan parah, takut t...",takut - kenapa - perasaan - sendiri - merasa -...,1.0,False
801,saat pintu berderik-derik terbuka sendiri,0,0_takut_kenapa_perasaan_sendiri,"[takut, kenapa, perasaan, sendiri, merasa, tap...","[teman-teman aku lagi deg degan parah, takut t...",takut - kenapa - perasaan - sendiri - merasa -...,1.0,False


In [12]:
topic_model.visualize_barchart(title='Topic pada Undang-Undang')