# Topic Modelling menggunakan LDA dengan klasifikasi KMeans

In [37]:
import warnings
warnings.filterwarnings('ignore')

import matplotlib.pyplot as plt
import re
import numpy as np
import pandas as pd
import gensim
import nltk
import pyLDAvis
# import spacy
import pyLDAvis.gensim_models

from gensim import corpora

from nltk.stem.snowball import SnowballStemmer

from gensim.corpora import Dictionary, MmCorpus
from gensim.models import ldamodel, lsimodel

from sklearn.cluster import KMeans
from sklearn.manifold import MDS
from sklearn.decomposition import PCA

In [2]:
def load_dataset(filename):
    file = open(filename, 'r')

    acc_names = []
    tweets = []

    for line in file:
        line = line.strip()
        parts = line.split('###')
        acc_names.append(parts[0])
        tweets.append(parts[1])

    return acc_names, tweets

In [3]:
# config variables
num_topics=4

In [4]:
# definisikan beberapa fungsi untuk kebutuhkan pre-processing, pre-processing yang dilakukan adalah
# 1. lowercasing
# 2. stopword removal
# 3. stemming

stemmer = SnowballStemmer("english")
stopwords = nltk.corpus.stopwords.words('english')

def preprocess(text):

    # tokenizing and lowercasing
    tokens = [word.lower() for word in text.split()]
    filtered_tokens = []

    # buat yang bukan terdiri dari alfabet, dan merupakan stopword
    for token in tokens:
        if re.search('[a-zA-Z]', token) and (token not in stopwords):
            filtered_tokens.append(token)

    # lakukan stemming dengan snowball stemmer
    stems = [stemmer.stem(t) for t in filtered_tokens]
    return stems

In [5]:
# Kita load dokumen twitter, dan lakukan preprocessing terhadap tweet yang sudah di-load
acc_names, tweets = load_dataset("twitter.txt")

# Lakukan pre-process untuk setiap tweet pada koleksi "tweets" kita
# Gunakan List Comprehension untuk mempermudah hidup kita
tweets_label = tweets
tweets = [preprocess(tweet) for tweet in tweets]

In [40]:
print(tweets)

[['kardashian', 'yr', 'anniversary,', 'iphon', 'yr', 'anniversary,', 'so,', 'kardashian', 'made', 'iphon'], ['iphon', 'year', 'old.', 'appl', 'watch', 'year', 'old.', 'feel', 'peopl', 'forget', 'small', 'fact.'], ["can't", 'save', 'make,', 'live', 'beyond', 'means.', 'ditch', 'starbucks,', 'eat', 'less,', 'need', 'new', 'iphone,', 'save', 'money!'], ['time', 'year!', 'iphon', 'vs.', 'samsung', 'galaxi', 's8', 'smackdown:'], ['sell', 'yeezi', 'samsung', 'galaxi', 's8', 'anyon', 'interest', 'show', 'proof', 'trust', '@devilishrt', '@alienrt', '@bear_retweet', '@flyrt'], ['iphon', '16gb', 'spacegray', 'peso', 'only!', 'complet', 'full', 'packag', 'guys!', 'dm'], ['swear', 'even', 'iphon', 'dress', 'clown,', 'reach', 'pillow', '&choke', 'slept,', 'still', 'buy', 'samsung'], ['iphon', '8', 'a11', 'bionic', 'chip', 'lost', 'samsung', 'galaxi', 'note', 'app', 'launch', 'time', 'multitask', 'speeds.'], ['confus', 'post', 'dedic', 'camera', 'review', 'samsung', 'galaxi', 'note', 'includ', 'came

In [6]:
# membuat term dictionary dari korpus kita, dimana setiap kata unik akan diberikan sebuah index
dictionary = Dictionary(tweets)

# buang term yang:
# 1. muncul di kurang dari 2 dokumen
# 2. muncul di lebih dari 0.9*(total_dok) dokumen
dictionary.filter_extremes(no_below=2, no_above=0.9)

# ubah dictionary menjadi object bag-of-words reference
# ingat bahwa dalama LDA, dokumen diasumsikan dengan bag-of-words model
corpus = [dictionary.doc2bow(tweet) for tweet in tweets]

In [38]:
print(corpus)

[[(0, 2), (1, 1)], [(0, 1), (2, 1), (3, 2)], [(4, 1)], [(0, 1), (5, 1), (6, 1), (7, 1), (8, 1)], [(5, 1), (6, 1), (7, 1)], [(0, 1), (9, 1), (10, 1)], [(0, 1), (7, 1), (11, 1), (12, 1)], [(0, 1), (5, 1), (7, 1), (8, 1), (13, 1), (14, 1)], [(5, 1), (7, 1), (10, 1), (14, 1), (15, 1), (16, 1)], [(5, 1), (7, 1), (9, 1)], [(0, 1)], [(0, 1)], [(5, 1), (7, 1), (11, 1), (12, 1), (14, 1), (17, 1), (18, 1)], [(1, 1), (3, 1), (5, 1), (7, 1), (14, 1)], [(0, 1)], [(5, 1), (7, 2), (14, 2)], [(0, 1), (5, 1), (7, 1), (19, 1)], [(0, 1), (4, 1)], [(20, 1), (21, 1)], [(20, 1), (21, 1), (22, 1), (23, 1)], [(13, 1), (20, 1), (24, 1)], [(25, 1), (26, 1), (27, 1)], [(20, 1), (21, 1), (23, 1)], [(20, 1), (21, 1), (26, 1)], [(20, 1), (28, 1)], [(20, 1), (21, 1), (29, 1)], [(20, 1), (21, 1), (30, 1), (31, 1), (32, 1), (33, 1)], [(20, 1), (21, 1), (30, 1), (31, 1)], [(3, 1), (18, 1), (20, 1), (21, 1)], [(19, 1), (20, 1), (21, 2), (33, 1), (34, 1)], [(20, 1), (21, 1), (22, 1), (29, 1), (32, 1), (35, 1)], [(10, 1),

## LDA Model

In [7]:
# Run the LDA
lda = ldamodel.LdaModel(corpus, num_topics=num_topics, id2word=dictionary, random_state=42, iterations=5000)

In [20]:
# tampilkan topic matrix
topics_matrix = lda.show_topics(formatted=False)

for topic_no, topic_words in topics_matrix:

    print ('topic number: {}'.format(topic_no))

    # default: top-10 kata yang paling tinggi probabilitasnya
    for word, prob in topic_words:
        print (word, prob)

topic number: 0
samsung 0.1279757
galaxi 0.09727413
note 0.081398584
iphon 0.06606686
referendum 0.052219264
time 0.034927897
still 0.034894504
even 0.03485522
lost 0.034776494
tomorrow 0.03473522
topic number: 1
iphon 0.101325
catalunya 0.09613534
referendum 0.08747995
support 0.046080384
champion 0.032856427
leagu 0.03263743
referendum, 0.032056753
block 0.03204547
watch 0.032028172
year 0.03198766
topic number: 2
leagu 0.09808679
champion 0.09804178
referendum 0.051687572
catalan 0.051005125
chelsea 0.05093845
samsung 0.035849955
year 0.035549145
galaxi 0.03548846
spanish 0.0354884
atletico 0.035458796
topic number: 3
leagu 0.09576264
champion 0.095440865
catalunya 0.05143575
premier 0.05020862
kane 0.050193615
harri 0.05016487
viva 0.049813174
iphon 0.030219596
referendum 0.029424462
catalan 0.028665666


In [32]:
def get_lda_topics(model):
    word_dict = {};
    for i in range(num_topics):
        words = model.show_topic(i, topn = 10);
        word_dict['Topic # ' + '{:02d}'.format(i+1)] = [i[0] for i in words];
    return pd.DataFrame(word_dict);

get_lda_topics(lda)

Unnamed: 0,Topic # 01,Topic # 02,Topic # 03,Topic # 04
0,samsung,iphon,leagu,leagu
1,galaxi,catalunya,champion,champion
2,note,referendum,referendum,catalunya
3,iphon,support,catalan,premier
4,referendum,champion,chelsea,kane
5,time,leagu,samsung,harri
6,still,"referendum,",year,viva
7,even,block,galaxi,iphon
8,lost,watch,spanish,referendum
9,tomorrow,year,atletico,catalan


In [43]:
# bentuk terlebih dahulu vektor dokumen/tweet
# vektor tweet/dokumen = vektor probabilitas terhadap masing-masing topik
tweet_vectors = []
for tweet in tweets:
    probs = [prob for (_,prob) in lda.get_document_topics(dictionary.doc2bow(tweet))]
    tweet_vectors.append(probs)
tweet_vectors = np.array(tweet_vectors)

# kita set banyaknya cluster = banyaknya topik
num_clusters = num_topics

# gunakan algoritma K-Means, dan lakukan clustering !
km = KMeans(n_clusters=num_clusters)
km.fit(tweet_vectors)

# jika kita ingin melihat indeks cluster untuk setiap tweet/dokumen
clusters = km.labels_.tolist()

print(clusters)

[0, 0, 2, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 2, 3, 1, 3, 3, 3, 2, 3, 2, 2, 3, 2, 0, 3, 2, 0, 0, 2, 0, 0, 1, 1, 0, 3, 3, 2, 3, 3, 0, 0, 0, 0]


In [44]:
print(tweet_vectors)

[[0.06440544 0.80826455 0.06419452 0.06313552]
 [0.05095474 0.8467538  0.05208589 0.05020561]
 [0.13270642 0.12515919 0.1251967  0.61693764]
 [0.8734298  0.04273069 0.04197443 0.04186513]
 [0.8113674  0.06282119 0.06327626 0.06253513]
 [0.06514984 0.8065115  0.06546419 0.06287444]
 [0.84794307 0.05154614 0.05021422 0.05029653]
 [0.89169335 0.03645258 0.03599778 0.03585628]
 [0.8909706  0.03657869 0.03669829 0.03575238]
 [0.48663488 0.06704279 0.38368675 0.06263558]
 [0.13245341 0.61535233 0.12509492 0.12709935]
 [0.1322999  0.61550957 0.1250948  0.12709571]
 [0.9053029  0.03132921 0.03208936 0.03127854]
 [0.4460133  0.04428927 0.46796685 0.0417306 ]
 [0.13234101 0.6154674  0.12509483 0.12709673]
 [0.87418264 0.04184069 0.04229535 0.04168131]
 [0.5289394  0.3649977  0.05079589 0.05526695]
 [0.7362744  0.08936641 0.08347072 0.09088844]
 [0.08378638 0.08517474 0.09240819 0.73863065]
 [0.05014256 0.05129726 0.8467969  0.05176331]
 [0.45691472 0.06338743 0.06600412 0.41369376]
 [0.06254972 

In [23]:
# untuk setiap cluster center, kita sort argumen/index berdasarkan nilai probabilitasnya
# karena index/argumen adalah id topik.
#
# jadi, secara intuisi, ini adalah cara untuk mencari topik major yang dibicarakan di sebuah cluster
# nantinya, wakil kata cluster akan diambil dari 2 topik major di setiap cluster
#
# ::-1 artinya reverse list

order_centroids = km.cluster_centers_.argsort()[:, ::-1]

cluster_names = {}
for i in range(num_clusters):
    print ("cluster %d words:" % i)
    
    # ambil 2 topik major untuk setiap cluster
    topic_words = []
    for ind in order_centroids[i, :1]:
        topic_words += [dictionary.get(word_id) for (word_id, prob) in lda.get_topic_terms(ind, topn=2)]
    
    cluster_names[i] = ','.join(topic_words)

    print (cluster_names[i])

cluster 0 words:
iphon,catalunya
cluster 1 words:
leagu,champion
cluster 2 words:
samsung,galaxi
cluster 3 words:
leagu,champion


In [77]:
print(tweet_vectors)

[[0.80618477 0.06746034 0.06336243 0.06299248]
 [0.05259623 0.8414018  0.05436617 0.0516358 ]
 [0.6244938  0.12520477 0.1251576  0.12514387]
 [0.04627742 0.04668002 0.8641156  0.04292697]
 [0.06562262 0.31141058 0.5585325  0.06443434]
 [0.1388234  0.06630667 0.06642415 0.72844577]
 [0.0520206  0.84520745 0.05082975 0.0519422 ]
 [0.03801076 0.41396883 0.51126695 0.03675345]
 [0.03630483 0.0404459  0.03735645 0.88589287]
 [0.06312862 0.42790595 0.06880143 0.44016403]
 [0.61494297 0.13042061 0.12802364 0.12661281]
 [0.614936   0.13042894 0.12802224 0.12661284]
 [0.03177203 0.3859676  0.0318071  0.5504533 ]
 [0.0421292  0.87191015 0.04315391 0.04280671]
 [0.6150019  0.13036804 0.1280188  0.12661128]
 [0.04186383 0.87270033 0.04261672 0.04281909]
 [0.05220068 0.8448282  0.05203149 0.05093963]
 [0.747111   0.08481783 0.08422901 0.08384211]
 [0.08335732 0.08499608 0.74552023 0.08612632]
 [0.05003056 0.05209616 0.8471128  0.05076052]
 [0.06254663 0.06308518 0.8091954  0.06517283]
 [0.06367312 

In [11]:
arr = np.stack((tweets_label, clusters), axis=1)

# print(arr)

df = pd.DataFrame(arr)
df

Unnamed: 0,0,1
0,"So it's the Kardashians' 10 yr anniversary, an...",2
1,iPhone is 10 years old. Apple Watch is 2 years...,2
2,"If you can't save 10% of what you make, you ar...",0
3,It's that time of year! My iPhone 8 vs. Samsun...,1
4,Selling Yeezys and a Samsung Galaxy S8 is anyo...,1
5,"iPhone 6 16gb Spacegray 10,200 pesos only! Com...",2
6,I swear even if my iPhone dressed up as a clow...,1
7,iPhone 8's A11 Bionic chip lost against the Sa...,1
8,Confused should I post a dedicated camera revi...,1
9,You guys i have a used iphone6& Samsung galaxy...,1


## LSA Model

In [73]:
# Run the LSA
lsa = lsimodel.LsiModel(corpus, num_topics=num_topics, id2word=dictionary)

In [74]:
# tampilkan topic matrix
topics_matrix_lsa = lsa.show_topics(formatted=False)

for topic_no, topic_words in topics_matrix_lsa:

    print ('topic number: {}'.format(topic_no))

    # default: top-10 kata yang paling tinggi probabilitasnya
    for word, prob in topic_words:
        print (word, prob)

topic number: 0
samsung 0.5694504832291907
galaxi 0.46000822404892383
note 0.3697781992228932
iphon 0.3058372721112972
leagu 0.2521851117614065
champion 0.2416746513195237
time 0.11026118624489052
year 0.10798110206520731
full 0.09514996018605527
still 0.08925825364615346
topic number: 1
leagu -0.6161956881090622
champion -0.5871219620209108
samsung 0.24933798093875167
galaxi 0.19947051924854084
note 0.16167142423147549
premier -0.1276878308319734
club -0.12614770428359942
iphon 0.12558737335257536
league. -0.10866317305149653
chelsea -0.10417389455442691
topic number: 2
referendum 0.6705314663999484
catalunya 0.5553513130271784
catalan 0.2239633781177419
support 0.21550103319547442
spanish 0.14628019942545026
block 0.14075350826826646
referendum, 0.12437357871861839
barcelona 0.10199862319150926
watch 0.10103118357056613
due 0.10081745606983565
topic number: 3
iphon 0.869873583097004
note -0.2824839483472342
samsung -0.1962560506204438
year 0.18683490082167384
galaxi -0.16177379814925

In [76]:
# bentuk terlebih dahulu vektor dokumen/tweet
# vektor tweet/dokumen = vektor probabilitas terhadap masing-masing topik
tweet_vectors_lsa = []
for tweet in tweets:
    probs = [prob for (_,prob) in lsa.get_topics(dictionary.doc2bow(tweet))]
    tweet_vectors.append(probs)
tweet_vectors = np.array(tweet_vectors)

# kita set banyaknya cluster = banyaknya topik
num_clusters_lsa = num_topics

# gunakan algoritma K-Means, dan lakukan clustering !
km = KMeans(n_clusters=num_clusters_lsa)
km.fit(tweet_vectors_lsa)

# jika kita ingin melihat indeks cluster untuk setiap tweet/dokumen
clusters_lsa = km.labels_.tolist()

print(clusters_lsa)

TypeError: get_topics() takes 1 positional argument but 2 were given