# LDA Topic Modelling

Latent Dirichlet Allocation (LDA) merupakan model probabilistik generatif yang digunakan dalam pemrosesan bahasa alami untuk menemukan topik-topik tersembunyi (laten) dalam kumpulan dokumen. Model ini mengasumsikan bahwa setiap dokumen merupakan campuran dari beberapa topik, dan setiap topik merupakan distribusi probabilitas atas kata-kata: setiap kata dalam dokumen dihasilkan dengan terlebih dahulu memilih sebuah topik dari campuran topik dokumen tersebut, lalu memilih kata dari distribusi kata milik topik itu. LDA umumnya digunakan untuk melakukan analisis topik pada koleksi dokumen berskala besar.

In [None]:
from sklearn.decomposition import LatentDirichletAllocation
import pandas as pd

Membaca data term frequency

In [None]:
# Mount Google Drive into the Colab runtime at /content/drive so the dataset
# stored there can be read by the following cells.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Load the term-frequency table from the mounted Drive folder and display it.
TERM_FREQUENCY_CSV = '/content/drive/MyDrive/penambangan_web/Data/TermFrequensi.csv'
df = pd.read_csv(TERM_FREQUENCY_CSV)
df

Unnamed: 0,Dokumen,aalysis,aam,ab,abad,abadi,ability,abjad,absensi,absolut,...,zat,zcz,zf,zona,zone,zoning,zoom,zucara,zungu,Label
0,sistem informasi akademik siakad sistem inform...,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,RPL
1,berjalannya koneksi jaringan komputer lancar g...,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,RPL
2,web server perangkat lunak server berfungsi me...,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,RPL
3,penjadwalan kuliah perguruan kompleks permasal...,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,KK
4,seiring perkembangan teknologi didunia muncul ...,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,KK
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
823,kurangnya pemahaman gejala penyakit saluran pe...,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,KK
824,data set hilang utama studi bersifat substansi...,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,KK
825,proses seleksi penerimaan tenaga kerja faktor ...,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,KK
826,sapi salah hewan ternak komoditi utama bahan p...,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,KK


In [None]:
import numpy as np

# Encode the class label as an integer: 'RPL' -> 0, every other class -> 1.
kelas_dataset = df['Label']

if pd.api.types.is_numeric_dtype(kelas_dataset):
    # The column is already encoded (this cell was run before). Re-applying the
    # mapping would turn every label into 1 because no integer equals 'RPL',
    # so keep the existing values untouched.
    kelas_dataset_binary = kelas_dataset.tolist()
else:
    kelas_dataset_binary = [0 if kelas == 'RPL' else 1 for kelas in kelas_dataset]
    # Overwrite the textual labels with their binary encoding.
    df['Label'] = kelas_dataset_binary


In [None]:
# Target vector for the classifiers: the 'Label' column of df.
y = df['Label']
y

0      0
1      0
2      0
3      1
4      1
      ..
823    1
824    1
825    1
826    1
827    1
Name: Label, Length: 828, dtype: int64

Vektorisasi kolom Dokumen menjadi matriks term frequency

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# Build the document-term count matrix directly from the raw text column.
# The earlier `X = df.drop('Dokumen', axis=1)` was removed: its result was
# overwritten below without ever being read, so it only cost a large copy.
# NOTE(review): stop_words='english' filters English stop words only, while the
# documents appear to be Indonesian - confirm whether an Indonesian stop-word
# list should be used instead (changing it alters the vocabulary).
vectorizer = CountVectorizer(stop_words='english')
# astype('U') coerces every cell to a unicode string so non-string values
# (e.g. missing entries) do not break the tokenizer.
X = vectorizer.fit_transform(df['Dokumen'].values.astype('U'))
X

<828x8818 sparse matrix of type '<class 'numpy.int64'>'
	with 49245 stored elements in Compressed Sparse Row format>

Split data sebelum LDA

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the vectorized documents as the test set; the fixed seed
# keeps the partition identical between runs.
split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts



Pemberian nilai K, Alpha dan Beta

In [None]:
# LDA hyper-parameters: number of topics and the two Dirichlet priors.
k = 3
alpha = 0.1  # document-topic prior
beta = 0.2   # topic-word prior

# random_state is fixed so the learned topics are reproducible, consistent with
# the seeded train/test split, KMeans and decision tree in this notebook.
lda = LatentDirichletAllocation(
    n_components=k,
    doc_topic_prior=alpha,
    topic_word_prior=beta,
    random_state=42,
)

# Fit the LDA model on the vectorized training set and get its topic mixtures.
proporsi_topik_dokumen_train = lda.fit_transform(X_train)

# Project the test documents into the topic space learned from the training set.
proporsi_topik_dokumen_test = lda.transform(X_test)


Hasil LDA

In [None]:
dokumen = df['Dokumen']
label = df['Label']

# Rows of proporsi_topik_dokumen_test follow the shuffled order of X_test/y_test,
# not the first rows of df. Pairing them with df['Dokumen'] / df['Label'] by
# position attached the wrong document and label to each topic mixture, so the
# matching rows are looked up through y_test's index instead.
indeks_uji = y_test.index

# One column per topic, derived from the array width rather than hard-coded for k = 3.
kolom_topik = [f'Topik {i + 1}' for i in range(proporsi_topik_dokumen_test.shape[1])]
output_proporsi_TD = pd.DataFrame(proporsi_topik_dokumen_test, columns=kolom_topik)

# .to_numpy() discards the original row labels so values are placed positionally
# and the frame keeps its 0..n-1 index.
output_proporsi_TD.insert(0, 'Dokumen', dokumen.loc[indeks_uji].to_numpy())
output_proporsi_TD.insert(len(output_proporsi_TD.columns), 'Label', y_test.to_numpy())
output_proporsi_TD

Unnamed: 0,Dokumen,Topik 1,Topik 2,Topik 3,Label
0,sistem informasi akademik siakad sistem inform...,0.076923,0.846154,0.076923,0
1,berjalannya koneksi jaringan komputer lancar g...,0.000950,0.000950,0.998100,0
2,web server perangkat lunak server berfungsi me...,0.997308,0.001346,0.001346,0
3,penjadwalan kuliah perguruan kompleks permasal...,0.191258,0.552358,0.256383,1
4,seiring perkembangan teknologi didunia muncul ...,0.462809,0.000860,0.536330,1
...,...,...,...,...,...
161,perkembangan zaman era globalisasi melepaskan ...,0.099644,0.001311,0.899045,1
162,visualisasi animasi meberikan informasi intera...,0.012445,0.018674,0.968881,0
163,tes prosedur penilaian tespilihan ganda tersed...,0.439170,0.441509,0.119322,1
164,ujian esai evaluasi pembelajaran bentuk esai b...,0.000978,0.473269,0.525753,1


In [None]:
# Number of columns in the loaded table.
df.columns.shape

(9050,)

Distribusi kata pada topik

In [None]:
# Topic-word matrix of the fitted LDA model: one row per topic, one column per
# vocabulary index of the vectorizer.
# NOTE(review): components_ holds unnormalized weights (rows do not sum to 1);
# divide each row by its sum if a probability distribution is needed - confirm.
distribusi_kata_topik = pd.DataFrame(lda.components_)
distribusi_kata_topik

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,8808,8809,8810,8811,8812,8813,8814,8815,8816,8817
0,0.2,0.2,0.2,0.204091,0.2,1.2,0.2,0.203344,0.2,0.204667,...,3.383576,0.2,0.2,14.199755,0.201534,0.2,0.2,3.2,1.2,1.2
1,0.2,0.2,1.197756,1.195909,2.2,0.2,0.2,0.200284,1.2,0.2,...,2.355627,0.2,0.2,0.200245,0.2,0.2,0.2,0.2,0.2,0.2
2,0.2,0.2,0.202244,0.2,0.2,0.2,3.2,5.196372,0.2,2.195333,...,1.860797,1.2,1.2,0.2,8.198466,3.2,4.2,0.2,0.2,0.2


Cluster

In [None]:
from sklearn.cluster import KMeans

# Cluster the test documents by their topic mixtures using K-Means.
n_clusters = 2
X_clustering = proporsi_topik_dokumen_test

kmeans = KMeans(n_clusters=n_clusters, random_state=0).fit(X_clustering)
# labels_ holds the cluster assigned to each row during fitting.
clusters = kmeans.labels_

# Attach the cluster assignment to the result table and show it.
output_proporsi_TD['Cluster'] = clusters
print(output_proporsi_TD)

                                               Dokumen   Topik 1   Topik 2  \
0    sistem informasi akademik siakad sistem inform...  0.076923  0.846154   
1    berjalannya koneksi jaringan komputer lancar g...  0.000950  0.000950   
2    web server perangkat lunak server berfungsi me...  0.997308  0.001346   
3    penjadwalan kuliah perguruan kompleks permasal...  0.191258  0.552358   
4    seiring perkembangan teknologi didunia muncul ...  0.462809  0.000860   
..                                                 ...       ...       ...   
161  perkembangan zaman era globalisasi melepaskan ...  0.099644  0.001311   
162  visualisasi animasi meberikan informasi intera...  0.012445  0.018674   
163  tes prosedur penilaian tespilihan ganda tersed...  0.439170  0.441509   
164  ujian esai evaluasi pembelajaran bentuk esai b...  0.000978  0.473269   
165  kemampuan menulis alphanumerik bekal utama ana...  0.133336  0.713484   

      Topik 3  Label  Cluster  
0    0.076923      0        0  



Menyalin DataFrame hasil LDA (yang sudah memuat kolom Cluster) sebagai DataFrame hasil akhir

In [None]:
# The cluster column already lives in output_proporsi_TD, so the final table is
# simply an independent copy of it.
output_final_df = output_proporsi_TD.copy()

output_final_df

Unnamed: 0,Dokumen,Topik 1,Topik 2,Topik 3,Label,Cluster
0,sistem informasi akademik siakad sistem inform...,0.076923,0.846154,0.076923,0,0
1,berjalannya koneksi jaringan komputer lancar g...,0.000950,0.000950,0.998100,0,0
2,web server perangkat lunak server berfungsi me...,0.997308,0.001346,0.001346,0,1
3,penjadwalan kuliah perguruan kompleks permasal...,0.191258,0.552358,0.256383,1,0
4,seiring perkembangan teknologi didunia muncul ...,0.462809,0.000860,0.536330,1,0
...,...,...,...,...,...,...
161,perkembangan zaman era globalisasi melepaskan ...,0.099644,0.001311,0.899045,1,0
162,visualisasi animasi meberikan informasi intera...,0.012445,0.018674,0.968881,0,0
163,tes prosedur penilaian tespilihan ganda tersed...,0.439170,0.441509,0.119322,1,1
164,ujian esai evaluasi pembelajaran bentuk esai b...,0.000978,0.473269,0.525753,1,0


Modelling klasifikasi

In [None]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier

Naive Bayes

In [None]:
# Multinomial Naive Bayes classifier.
# NOTE(review): the model is fit on the term-count matrix X_train, not on the
# LDA topic proportions - confirm this is intended.
naive_bayes = MultinomialNB().fit(X_train, y_train)
predictions = naive_bayes.predict(X_test)

# accnb: accuracy on the training data; accuracy: accuracy on the held-out test data.
accnb = round(100 * naive_bayes.score(X_train, y_train), 2)
accuracy = round(100 * accuracy_score(y_test, predictions), 2)

print("Akurasi Naive Bayes:", accuracy)


Akurasi Naive Bayes: 83.73


KNN

In [None]:
# k-nearest-neighbours classifier with 3 neighbours.
knn = KNeighborsClassifier(n_neighbors=3)
knn.fit(X_train, y_train)
predict = knn.predict(X_test)

# accuracyknn: accuracy on the held-out test set; accknn: accuracy on the training set.
accuracyknn = round(accuracy_score(y_test, predict) * 100, 2)
accknn = round(knn.score(X_train, y_train) * 100, 2)

# Report the test accuracy, like the Naive Bayes and Decision Tree cells do.
# Previously the training accuracy (accknn) was printed under this label, which
# made the three models' numbers incomparable.
print("Akurasi KNN :", accuracyknn)


Akurasi KNN : 75.83


Decision tree

In [None]:
# Decision tree classifier; the seed makes the fitted tree reproducible.
decision_tree = DecisionTreeClassifier(random_state=42).fit(X_train, y_train)
predictions_dt = decision_tree.predict(X_test)

# acc_dt: accuracy on the training data; accuracy_dt: accuracy on the test data.
acc_dt = round(100 * decision_tree.score(X_train, y_train), 2)
accuracy_dt = round(100 * accuracy_score(y_test, predictions_dt), 2)

print("Akurasi Decision Tree:", accuracy_dt)

Akurasi Decision Tree: 70.48
