In [5]:
import warnings
warnings.filterwarnings('ignore')

import matplotlib.pyplot as plt
import re
import numpy as np
import pandas as pd
import gensim
import nltk
import pyLDAvis
# import spacy
import pyLDAvis.gensim_models

from gensim import corpora

from nltk.stem.snowball import SnowballStemmer

from gensim.corpora import Dictionary, MmCorpus
from gensim.models import ldamodel, lsimodel

from sklearn.cluster import KMeans
from sklearn.manifold import MDS
from sklearn.decomposition import PCA

In [62]:
df= pd.read_csv("jurnal2.csv")
print(df['abstraksi'])

0      In two studies, we intend to investigate wheth...
1      The academic achievement of students in school...
2      Mobile devices have been utilized as an emergi...
3      The purpose of this research is to develop the...
4      The school literacy movement is the first step...
                             ...                        
696    Adolescents (10-19 years old) are at high risk...
697    The use of information technology in education...
698    This study was conducted in order to investiga...
699    It is possible to track multiple state reforms...
700    The objective of the research was to find out ...
Name: abstraksi, Length: 701, dtype: object


In [16]:
stemmer = SnowballStemmer("english")
stopwords = nltk.corpus.stopwords.words('english')

def preprocess(text):

    # tokenizing and lowercasing
    tokens = [word.lower() for word in text.split()]
    filtered_tokens = []

    # buat yang bukan terdiri dari alfabet, dan merupakan stopword
    for token in tokens:
        if re.search('[a-zA-Z]', token) and (token not in stopwords):
            filtered_tokens.append(token)

    # lakukan stemming dengan snowball stemmer
    stems = [stemmer.stem(t) for t in filtered_tokens]
    return stems

In [64]:
dt = [preprocess(x) for x in df['abstraksi']]

In [58]:
print(dt)

[['two', 'studies,', 'intend', 'investig', 'whether', 'spiritu', 'explain', 'relationship', 'intrins', 'religi', 'orient', '(iro)', 'emot', 'intellig', '(ei).', 'seventy-thre', 'worship', 'houses-go', 'adults,', 'age', 'particip', 'study.', 'data', 'collect', 'employ', 'intrins', 'scale', 'religi', 'orient', 'scale,', 'spiritu', 'assess', 'scale,', 'schutt', 'self-report', 'emot', 'intellig', 'test.', 'first', 'studi', 'discov', 'iro', 'signific', 'predictor', 'ei.', 'nevertheless,', 'bootstrap', 'analysi', 'sampl', 'interv', 'confid', 'indic', 'spiritu', 'fulli', 'mediat', 'link', 'intrins', 'religi', 'orient', 'emot', 'intellig', 'second', 'study.', 'words,', 'without', 'high', 'level', 'spiritu', 'one', 'religi', 'orient', 'signific', 'predict', 'emot', 'intelligence.', 'limit', 'suggest', 'discuss', 'end', 'paper.'], ['academ', 'achiev', 'student', 'school', 'subject', 'varieti', 'factors,', 'mani', 'beyond', 'control', 'student.', 'factor', 'parent', 'involvement,', 'parent', 'lev

In [65]:
# membuat term dictionary dari korpus kita, dimana setiap kata unik akan diberikan sebuah index
dictionary = Dictionary(dt)

# buang term yang:
# 1. muncul di kurang dari 2 dokumen
# 2. muncul di lebih dari 0.9*(total_dok) dokumen
dictionary.filter_extremes(no_below=2, no_above=0.9)

# ubah dictionary menjadi object bag-of-words reference
# ingat bahwa dalama LDA, dokumen diasumsikan dengan bag-of-words model
corpus = [dictionary.doc2bow(x) for x in dt]

In [26]:
print(corpus)

[[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1), (6, 1), (7, 1), (8, 1), (9, 1), (10, 4), (11, 1), (12, 1), (13, 1), (14, 1), (15, 1), (16, 1), (17, 1), (18, 3), (19, 1), (20, 1), (21, 1), (22, 3), (23, 1), (24, 1), (25, 1), (26, 1), (27, 1), (28, 1), (29, 1), (30, 4), (31, 1), (32, 1), (33, 1), (34, 1), (35, 1), (36, 4), (37, 1), (38, 1), (39, 2), (40, 1), (41, 1), (42, 2), (43, 4), (44, 1), (45, 1), (46, 2), (47, 1), (48, 1), (49, 1), (50, 1), (51, 1), (52, 1)], [(7, 1), (17, 1), (23, 1), (24, 1), (35, 1), (44, 1), (53, 1), (54, 2), (55, 3), (56, 1), (57, 1), (58, 1), (59, 1), (60, 1), (61, 1), (62, 1), (63, 1), (64, 1), (65, 1), (66, 1), (67, 1), (68, 1), (69, 1), (70, 1), (71, 1), (72, 1), (73, 1), (74, 1), (75, 4), (76, 1), (77, 2), (78, 1), (79, 1), (80, 3), (81, 1), (82, 1), (83, 1), (84, 1), (85, 1), (86, 1), (87, 1), (88, 1), (89, 1), (90, 1)], [(11, 1), (44, 4), (46, 1), (71, 13), (80, 5), (91, 1), (92, 1), (93, 1), (94, 1), (95, 1), (96, 1), (97, 1), (98, 1), (99, 1), (100,

In [66]:
# Run the LDA
num_topics = 10
lda = ldamodel.LdaModel(corpus, num_topics=num_topics, id2word=dictionary, random_state=42, iterations=5000)

In [67]:
# tampilkan topic matrix
topics_matrix = lda.show_topics(formatted=False)

for topic_no, topic_words in topics_matrix:

    print ('topic number: {}'.format(topic_no))

    # default: top-10 kata yang paling tinggi probabilitasnya
    for word, prob in topic_words:
        print (word, prob)

topic number: 0
research 0.017520063
use 0.012892762
educ 0.011312022
studi 0.010810363
social 0.010387476
support 0.007353417
school 0.007090401
signific 0.0070052133
data 0.006551143
student 0.0062632766
topic number: 1
studi 0.022859702
student 0.020215394
use 0.015922386
educ 0.012023332
data 0.00969088
teacher 0.009013005
research 0.008861508
result 0.0076656346
show 0.007591507
academ 0.007435824
topic number: 2
teacher 0.025600137
studi 0.023343964
learn 0.019322256
use 0.018591683
student 0.0151891
school 0.0109929405
research 0.009490697
educ 0.008813405
data 0.00879802
result 0.008448487
topic number: 3
student 0.018317714
studi 0.01667507
use 0.014347049
research 0.0116807455
educ 0.008446876
teacher 0.007462689
result 0.0069771726
aim 0.006085851
academ 0.005864923
find 0.0058554383
topic number: 4
student 0.019951213
research 0.0148213515
learn 0.014290534
educ 0.011845667
studi 0.0115707945
result 0.008943797
use 0.008913901
signific 0.008737356
effect 0.008272272
develop

In [68]:
def get_lda_topics(model):
    word_dict = {};
    for i in range(num_topics):
        words = model.show_topic(i, topn = 10);
        word_dict['Topic # ' + '{:02d}'.format(i+1)] = [i[0] for i in words];
    return pd.DataFrame(word_dict);

get_lda_topics(lda)

AttributeError: 'NotebookFormatter' object has no attribute 'get_result'

  Topic # 01 Topic # 02 Topic # 03 Topic # 04 Topic # 05 Topic # 06  \
0   research      studi    teacher    student    student    student   
1        use    student      studi      studi   research      studi   
2       educ        use      learn        use      learn        use   
3      studi       educ        use   research       educ   research   
4     social       data    student       educ      studi      learn   
5    support    teacher     school    teacher     result     school   
6     school   research   research     result        use       educ   
7   signific     result       educ        aim   signific       data   
8       data       show       data     academ     effect     result   
9    student     academ     result       find    develop    teacher   

  Topic # 07 Topic # 08 Topic # 09 Topic # 10  
0    student    student      skill   research  
1      studi      learn      teach        use  
2        use     school        use      studi  
3      learn        use   

In [69]:
dt_vectors = []
for x in dt:
    probs = [prob for (_,prob) in lda.get_document_topics(dictionary.doc2bow(x))]
    dt_vectors.append(probs)
dt_vectors = np.array(dt_vectors)

# kita set banyaknya cluster = banyaknya topik
num_clusters = num_topics

# gunakan algoritma K-Means, dan lakukan clustering !
km = KMeans(n_clusters=num_clusters)
# km.fit(dt_vectors)

# jika kita ingin melihat indeks cluster untuk setiap abs/dokumen
# clusters = km.labels_.tolist()

print(dt_vectors)

[list([0.2223068, 0.76672536]) list([0.9833253]) list([0.9922374])
 list([0.9816237]) list([0.9894046])
 list([0.15598096, 0.61319757, 0.22421248]) list([0.8183458, 0.17152326])
 list([0.9919599]) list([0.98888445]) list([0.8978878, 0.09157721])
 list([0.18528464, 0.11042669, 0.17435059, 0.523037]) list([0.98964643])
 list([0.16071786, 0.8333952]) list([0.8006926, 0.19382446])
 list([0.98830706]) list([0.9854768]) list([0.98524064]) list([0.9908123])
 list([0.99099684]) list([0.9909041]) list([0.8696171, 0.1250447])
 list([0.98731846]) list([0.99267983]) list([0.9909944]) list([0.9909052])
 list([0.12430964, 0.8678432]) list([0.9923682])
 list([0.13215163, 0.73190516, 0.12939686]) list([0.99031764])
 list([0.9909053]) list([0.993568]) list([0.99134123]) list([0.9884562])
 list([0.80592954, 0.18704651]) list([0.98888177]) list([0.99134046])
 list([0.99188644]) list([0.1712567, 0.7428176, 0.077781044])
 list([0.9896512]) list([0.12987435, 0.8594495]) list([0.9933298])
 list([0.98953074])

In [None]:
order_centroids = km.cluster_centers_.argsort()[:, ::-1]

cluster_names = {}
for i in range(num_clusters):
    print ("cluster %d words:" % i)
    
    # ambil 2 topik major untuk setiap cluster
    topic_words = []
    for ind in order_centroids[i, :1]:
        topic_words += [dictionary.get(word_id) for (word_id, prob) in lda.get_topic_terms(ind, topn=2)]
    
    cluster_names[i] = ','.join(topic_words)

    print (cluster_names[i])