# Import Data

In [219]:
import pandas as pd

In [220]:
# Load the review texts; only the 'reviews' column is used by the rest of the notebook.
reviews_raw = pd.read_csv('data.csv')['reviews']

# Later cells call str methods on every entry and overwrite entries in place, so:
#   * drop missing reviews (NaN is a float and has no .lower()),
#   * coerce everything else to str,
#   * take a writable object-dtype copy (under pandas copy-on-write, .values / to_numpy()
#     may return a read-only view, which would make `df[i] = ...` fail).
df = reviews_raw.dropna().astype(str).to_numpy(dtype=object, copy=True)
df[:3]

array(['barang original', 'brg diterima dgn baik, smoga awet',
       'tv berkerja dengan baik, produk bagus dan berkualitas, hebat'],
      dtype=object)

# Preprocessing

In [202]:
import re
import string
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory, StopWordRemover, ArrayDictionary
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
from spacy.lang.id import Indonesian
from nltk.tag import CRFTagger
from tqdm import tqdm
from spacy.lang.id import Indonesian
from spellchecker import correction

In [203]:
# Case folding: normalise every review in place.
# Order matters only in that stripping happens last: lowercase -> remove digits -> remove punctuation -> strip.

digit_run = re.compile(r"\d+")
# Deletion table for every character in string.punctuation (built once, reused for each review).
punct_table = str.maketrans("", "", string.punctuation)

for pos, text in enumerate(df):
    lowered = text.lower()
    without_digits = digit_run.sub("", lowered)
    # Punctuation is deleted, not replaced by a space, so "a,b" becomes "ab".
    df[pos] = without_digits.translate(punct_table).strip()

In [204]:
# Stop-word removal, done in two passes.

# Pass 1: Sastrawi's default stop-word list, extended with a few extra words.
stop_factory = StopWordRemoverFactory().get_stop_words()  # default stop words
more_stopword = ['mantap', 'bagus', 'kan']  # additional stop words
data = stop_factory + more_stopword  # combined list (note: `data` is reassigned later in the notebook)

dictionary = ArrayDictionary(data)
str_ = StopWordRemover(dictionary)

# The remover is given the raw string; its result is tokenised, so from here on
# every entry of `df` is a list of tokens instead of a string.
for i, text in enumerate(df):
    df[i] = word_tokenize(str_.remove(text))

# Pass 2: drop every token found in NLTK's 'indonesian' stop-word list.
listStopword = set(stopwords.words('indonesian'))
for i, tokens in enumerate(df):
    df[i] = [token for token in tokens if token not in listStopword]

In [205]:
# Stemming: replace every token with the output of the Sastrawi stemmer.
# (A leftover `stemmer.stem('bekerja')` call whose result was discarded has been removed.)
factory = StemmerFactory()
stemmer = factory.create_stemmer()

for i, tokens in enumerate(df):
    df[i] = [stemmer.stem(kata) for kata in tokens]

In [206]:
# Spell checking: pass each token through `correction` (imported from `spellchecker`)
# and keep only the corrected tokens that are longer than two characters.
for i, tokens in enumerate(df):
    corrected = (correction(kata) for kata in tokens)
    df[i] = [kata_asli for kata_asli in corrected if len(kata_asli) > 2]

# Show the first three documents after correction.
df[:3]

array([list(['barang', 'orisinal']),
       list(['bug', 'terima', 'dan', 'moga', 'awet']),
       list(['kerja', 'produk', 'kualitas', 'hebat'])], dtype=object)

In [207]:
# POS-tag filtering: tag every document with the CRF tagger and keep only tokens
# whose tag is in `filters` (presumably noun and adjective tags - confirm against the model's tag set).
ct = CRFTagger()
ct.set_model_file('data/all_indo_man_tag_corpus_model.crf.tagger')

filters = ['NN', 'NNP', 'NNS', 'NNPS', 'JJ']
tagged = ct.tag_sents(df)
# `tagged` holds one list of (token, tag) pairs per document, in the same order as `df`.
for i, tagged_sent in enumerate(tagged):
    df[i] = [kata for kata, tag in tagged_sent if tag in filters]
print(df[:2])

[list(['barang', 'orisinal']) list(['bug', 'moga', 'awet'])]


# Fit Transform

In [208]:
from gensim.models import Phrases
from gensim.corpora.dictionary import Dictionary
from gensim.models.ldamulticore import LdaMulticore

In [209]:
# Work on a copy of the token lists: the phrase-detection cell extends the documents in `data`,
# and a plain `data = df` alias would make those changes show up in `df` as well.
data = [list(doc) for doc in df]

In [210]:
# Learn phrase models: bigrams from the plain documents, then a second pass over the
# bigram-transformed documents so that a bigram plus a neighbour can merge into a longer phrase.
bigram_t = Phrases(data, min_count=5)
trigram_t = Phrases(bigram_t[data], min_count=5)


def _append_phrases(doc):
    """Return a new token list: the tokens of `doc` followed by the phrases detected in it.

    Detected phrases are the tokens containing '_' (the joiner Phrases puts between merged words).
    The second-pass model is applied to the first-pass output, which is what it was trained on.
    First-pass bigrams that survive the second pass unchanged are not added a second time.
    """
    unigrams = list(doc)
    bigram_doc = bigram_t[unigrams]
    trigram_doc = trigram_t[bigram_doc]

    bigrams = [token for token in bigram_doc if '_' in token]
    already_added = set(bigrams)
    longer_phrases = [
        token for token in trigram_doc
        if '_' in token and token not in already_added
    ]
    return unigrams + bigrams + longer_phrases


# Build new lists instead of appending to the existing ones, so the input documents are left untouched.
data = [_append_phrases(doc) for doc in data]

# Create a dictionary representation of the documents.
# Remove rare & common tokens: keep tokens that occur in at least 2 documents
# and in no more than 90% of the documents.
dictionary_t = Dictionary(data)
dictionary_t.filter_extremes(no_below=2, no_above=0.90)

# Bag-of-words corpus for topic modelling; documents left empty after filtering are dropped.
corpus_t = [dictionary_t.doc2bow(doc) for doc in data]
corpus_t = [bow for bow in corpus_t if bow]
print('Number of unique tokens: %d' % len(dictionary_t))
print('Number of documents: %d' % len(corpus_t))
print(corpus_t[:1])

Number of unique tokens: 46
Number of documents: 58
[[(0, 1), (1, 1)]]


In [211]:
# Fit a 3-topic LDA model on the bag-of-words corpus.
# LDA training is stochastic; a fixed random_state keeps the topics comparable between runs
# (NOTE(review): with several worker processes small run-to-run differences may remain - confirm).
LDAmodel_ = LdaMulticore(corpus=corpus_t, id2word=dictionary_t, num_topics=3, random_state=42)
LDAmodel_.show_topics()

[(0,
  '0.105*"barang" + 0.095*"cepat" + 0.051*"aman" + 0.044*"fungsi" + 0.044*"pokok" + 0.042*"selamat" + 0.041*"mudah" + 0.039*"kirim" + 0.035*"harga" + 0.033*"awet"'),
 (1,
  '0.078*"kayu" + 0.069*"sesuai" + 0.067*"kualitas" + 0.062*"barang" + 0.056*"deskripsi" + 0.053*"paking" + 0.050*"gudang" + 0.049*"cepat" + 0.036*"mulus" + 0.034*"aman"'),
 (2,
  '0.116*"cepat" + 0.087*"barang" + 0.070*"kirim" + 0.059*"awet" + 0.042*"moga" + 0.042*"gambar" + 0.042*"sesuai" + 0.041*"kualitas" + 0.027*"digital" + 0.026*"ramah"')]

# Coherence Value

In [212]:
import numpy as np
import matplotlib.pyplot as plt
from gensim.models.ldamulticore import LdaMulticore
from gensim.models.coherencemodel import CoherenceModel
import gensim.corpora as corpora

In [213]:
def compute_coherence_values(id2word, corpus, texts, limit=1, start=2, step=1, random_state=None):
    """Train one LDA model per topic count and return the c_v coherence of each.

    Topic counts are taken from ``range(start, limit, step)``. With the default
    ``start=2`` and ``limit=1`` that range is empty and an empty list is returned,
    so callers are expected to pass ``limit`` explicitly.

    Parameters
    ----------
    id2word : mapping passed to ``LdaMulticore`` as ``id2word``.
    corpus : bag-of-words corpus passed to both ``LdaMulticore`` and ``CoherenceModel``.
    texts : tokenised documents passed to ``CoherenceModel`` as ``texts``.
    limit, start, step : bounds and stride of the topic-count range (``limit`` is exclusive).
    random_state : optional seed forwarded to ``LdaMulticore``; ``None`` (the default)
        leaves training unseeded, exactly as before this parameter existed.

    Returns
    -------
    list
        One coherence score per topic count, in range order.
    """
    coherence_values = []
    for num_topics in range(start, limit, step):
        lda_model = LdaMulticore(
            corpus=corpus,
            id2word=id2word,
            num_topics=num_topics,
            random_state=random_state,
        )
        cm = CoherenceModel(model=lda_model, texts=texts, corpus=corpus, coherence='c_v')
        # Silence NumPy "invalid value" floating-point warnings raised while scoring.
        with np.errstate(invalid='ignore'):
            coherence_values.append(cm.get_coherence())
    return coherence_values

In [214]:
# start, step, limit = 2, 1, 10 # Change this to however many topics you want to compute/explore
# coh_t, kCV = [], 5 # careful: this is very SLOW, because it cross-validates a method that is inherently inefficient (LDA)
# id2word = corpora.Dictionary(data)

# print('iterasi ke: ', end = ' ')
# for i in range(kCV):
#     print(i+1, end = ', ', flush=True)
#     c = compute_coherence_values(id2word, corpus_t, data, limit=limit, start=start, step=step)
#     coh_t.append(c)
    
# coherence_t = np.mean(np.array(coh_t), axis=0)
# 'Done'

In [215]:
# # Show graph
# x = range(start, limit, step)
# plt.figure(figsize=(12,10))
# for c in coh_t:
#     plt.plot(x, c, '--', color = 'lawngreen', linewidth = 2)
# plt.plot(x, coherence_t, '-', color = 'black', linewidth = 5)
# plt.xlabel("Num Topics")
# plt.ylabel("Coherence score")
# plt.legend(("coherence_values"), loc='best')
# plt.show()

# pyLDAvis

In [216]:
import pyLDAvis
from pyLDAvis.gensim_models import prepare as LDAvis

In [1]:
# Render the interactive topic view inline in the notebook.
# This cell needs the pyLDAvis import cell and the fitted model, corpus and dictionary
# from the earlier cells; run on a fresh kernel it fails with NameError.
pyLDAvis.enable_notebook()
p = LDAvis(
    topic_model=LDAmodel_,
    corpus=corpus_t,
    dictionary=dictionary_t,
)
p

NameError: name 'pyLDAvis' is not defined

# Archived

In [218]:
# Display the first ten entries of `df` (the token lists produced by the preprocessing cells).
df[:10]

array([list(['barang', 'orisinal']), list(['bug', 'moga', 'awet']),
       list(['kerja', 'produk', 'kualitas', 'hebat']),
       list(['barang', 'harga', 'saing', 'cepat']),
       list(['barang', 'orisinal', 'garansi', 'pasang', 'gampang', 'fungsi']),
       list([]), list([]),
       list(['proses', 'mik', 'kan', 'cepat', 'mengerang', 'tapis', 'hari', 'gratis', 'barang', 'mulus', 'gambar']),
       list(['alhamdulillah', 'barang', 'kurir', 'payah']),
       list(['mantapppp', 'proses', 'cepat', 'kirim', 'cepat'])],
      dtype=object)