In [35]:
import nltk
import pandas as pd
import re
import pprint
import operator
import csv
import logging
from stop_words import get_stop_words
from collections import defaultdict
from gensim import corpora
from gensim.models import ldamodel
from nltk.stem import WordNetLemmatizer
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\novin\AppData\Roaming\nltk_data...


True

In [75]:
STOPWORDS = set(get_stop_words('indonesian'))
CUSTOM_STOPWORDS = {'yang'}
pp = pprint.PrettyPrinter(indent=4)
regex_filter = re.compile('[a-z]{2,}')
# put your custom path here if you so choose
nltk.data.path.append('')
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

In [76]:
def tokenize_and_clean(document, stopwords=(STOPWORDS), regex=[], lemmatizer=WordNetLemmatizer()):
    """
    :param document: a string representing a single document
    :param stopwords: a set of stopwords
    :param regex: additional regular expressions to use as a filter. Assuming these are compiled prior
    :param lemmatizer: an instance of an nltk lemmatizer
    :return: a tokenized and filtered document
    """
    raw_tokenized = nltk.tokenize.wordpunct_tokenize(document)

    tokenized = []
    for word in raw_tokenized:
        w = word.lower()
        if w not in stopwords:
            for exp in regex:
                if re.match(exp,w):
                    if lemmatizer:
                        tokenized.append(lemmatizer.lemmatize(w))
                    else:
                        tokenized.append(w)

    return tokenized

In [77]:
def word_frequency(corpus=[[]]):
    """
    :param corpus: a list of lists representing tokenized documents
    :return: a dict containing the frequency of each word in the corpus
    """
    frequency = defaultdict(int)
    for doc in corpus:
        for w in doc:
            frequency[w] += 1
    return dict(sorted(frequency.items(), key=operator.itemgetter(1), reverse=True))

In [78]:
def write_dict_to_csv(data, filepath):
    """
    Encapsulating this in a function - writes an object to a csv
    :param data: a dict containing your data
    :param filepath: the filepath for your csv file
    """
    with open(filepath, 'wb') as csv_file:
        writer = csv.writer(csv_file)
        for key, value in data:
            writer.writerow([key, value])

In [79]:
raw = pd.read_csv('./data/perdata.csv', usecols=['document'])
raw['document']


0      Skripsi ini adalah hasil penelitian lapangan (...
1      Skripsi yang ditulis dengan judul “Implementas...
2      Jenis penelitian ini adalah penelitian lapanga...
3      Skripsi ini adalah hasil penilitian lapangan (...
4      Skripsi ini adalah penelitian lapangan dengan ...
                             ...                        
169    Skripsi ini merupakan penelitian kepustakaan (...
170    Skripsi ini adalah hasil dari penelitian pusta...
171    Bagaimana cara menyelesaikan wanprestasi kerja...
172    Skripsi ini merupakan hasil penelitian dengan ...
173    Skripsi ini berjudul “Analisis Hukum Islam ter...
Name: document, Length: 174, dtype: object

In [67]:
# from sklearn.feature_extraction.text import CountVectorizer
# 
# vectorizer = CountVectorizer(stop_words='english')
# dtm = vectorizer.fit_transform(raw['document'].dropna())
# print(dtm.shape)


(174, 5138)


In [80]:
corpus = []

for i, row in raw.iterrows():
    corpus.append(row[0])
corpus


  corpus.append(row[0])


['Skripsi ini adalah hasil penelitian lapangan (field research) dengan judul “Analisis Hukum Islam dan Hukum Positif Tentang Hak Perwalian Nikah Anggota Lembaga Dakwah Islam Indonesia (LDII) Di Desa Medaeng Kecamatan Waru Kabupaten Sidoarjo”. Penelitian ini bertujuan untuk menjawab dua rumusan masalaha yaitu: Bagaimana penetapan hak perwalian nikah anggota LDII bagi calon pengantin perempuan di desa Medaeng kecamatan Waru kabupaten Sidoarjo dan bagaimana analisis hukum Islam dan hukum positif tentang hak perwalian nikah anggota LDII bagi calon pengantin perempuan. Data penelitian disajikan dengan teknik deskriptif analisis dengan pola pikir deduktif, sehingga memberikan pemahaman yang konkrit dan dapat ditarik kesimpulan. Dalam hal ini, berangkat dari teori-teori tentang wali dalam sebuah pernikahan kemudian melihat data dan fakta penetapan wali nikah anggota LDII yang memperbolehkan nasab jalur dari ibu menjadi wali nikah. Menganalisisnya yaitu mengaitkan fakta dengan dalil-dalil yang

In [82]:
tokenized_corpus = []
for doc in corpus:
    try:
        tokenized_corpus.append(tokenize_and_clean(document=doc, stopwords=STOPWORDS.union(CUSTOM_STOPWORDS), regex=[regex_filter]))
    except Exception as e:
        print(f"Error processing document: {doc}")
        print(e)
    
tokenized_corpus

[['skripsi',
  'hasil',
  'penelitian',
  'lapangan',
  'field',
  'research',
  'judul',
  'analisis',
  'hukum',
  'islam',
  'hukum',
  'positif',
  'hak',
  'perwalian',
  'nikah',
  'anggota',
  'lembaga',
  'dakwah',
  'islam',
  'indonesia',
  'ldii',
  'desa',
  'medaeng',
  'kecamatan',
  'waru',
  'kabupaten',
  'sidoarjo',
  'penelitian',
  'bertujuan',
  'untuk',
  'menjawab',
  'dua',
  'rumusan',
  'masalaha',
  'penetapan',
  'hak',
  'perwalian',
  'nikah',
  'anggota',
  'ldii',
  'calon',
  'pengantin',
  'perempuan',
  'desa',
  'medaeng',
  'kecamatan',
  'waru',
  'kabupaten',
  'sidoarjo',
  'analisis',
  'hukum',
  'islam',
  'hukum',
  'positif',
  'hak',
  'perwalian',
  'nikah',
  'anggota',
  'ldii',
  'calon',
  'pengantin',
  'perempuan',
  'data',
  'penelitian',
  'disajikan',
  'teknik',
  'deskriptif',
  'analisis',
  'pola',
  'pikir',
  'deduktif',
  'memberikan',
  'pemahaman',
  'konkrit',
  'ditarik',
  'kesimpulan',
  'berangkat',
  'teori',
  'te

In [91]:
freq = word_frequency(tokenized_corpus)

tokenized_final = [[token for token in doc if freq[token] > 10] for doc in tokenized_corpus]

vocabulary = corpora.Dictionary(tokenized_final)

vocabulary.save('data/vocabulary.dict')

print(vocabulary)

corpus = [vocabulary.doc2bow(text) for text in tokenized_final]
corpora.MmCorpus.serialize('data/ufo.mm', corpus)

ufo_corpus = corpora.MmCorpus('data/ufo.mm')

lda = ldamodel.LdaModel(corpus=ufo_corpus,alpha='auto', id2word=vocabulary, num_topics=40, update_every=0, passes=40)

2023-11-06 19:20:27,750 : INFO : adding document #0 to Dictionary<0 unique tokens: []>
2023-11-06 19:20:27,767 : INFO : built Dictionary<613 unique tokens: ['analisis', 'anggota', 'ayah', 'baik', 'berada']...> from 174 documents (total 25340 corpus positions)
2023-11-06 19:20:27,768 : INFO : Dictionary lifecycle event {'msg': "built Dictionary<613 unique tokens: ['analisis', 'anggota', 'ayah', 'baik', 'berada']...> from 174 documents (total 25340 corpus positions)", 'datetime': '2023-11-06T19:20:27.768255', 'gensim': '4.3.2', 'python': '3.11.0 (main, Oct 24 2022, 18:26:48) [MSC v.1933 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.22635-SP0', 'event': 'created'}
2023-11-06 19:20:27,768 : INFO : Dictionary lifecycle event {'fname_or_handle': 'data/vocabulary.dict', 'separately': 'None', 'sep_limit': 10485760, 'ignore': frozenset(), 'datetime': '2023-11-06T19:20:27.768255', 'gensim': '4.3.2', 'python': '3.11.0 (main, Oct 24 2022, 18:26:48) [MSC v.1933 64 bit (AMD64)]', 'platform': 'Windo

Dictionary<613 unique tokens: ['analisis', 'anggota', 'ayah', 'baik', 'berada']...>


2023-11-06 19:20:28,018 : INFO : -9.281 per-word bound, 622.1 perplexity estimate based on a held-out corpus of 174 documents with 25340 words
2023-11-06 19:20:28,020 : INFO : PROGRESS: pass 0, at document #174/174
  sstats[:, ids] += np.outer(expElogthetad.T, cts / phinorm)
  b = np.sum(gradf / q) / (1 / c + np.sum(1 / q))
  dprior = -(gradf - b) / q
2023-11-06 19:20:28,170 : INFO : optimized alpha [0.02605074, 0.026499102, 0.02587526, 0.025438296, 0.024757765, 0.026997596, 0.025615908, 0.029859789, 0.025819764, 0.02423483, 0.023910655, 0.029051933, 0.024489848, 0.033455037, 0.021874849, 0.03008391, 0.024176147, 0.024836482, 0.025105413, 0.024208717, 0.023583308, 0.024379667, 0.024845535, 0.025842996, 0.025198074, 0.02327335, 0.02206888, 0.025388092, 0.025012381, 0.023903169, 0.025099337, 0.023600597, 0.026094187, 0.025797673, 0.027966509, 0.0250818, 0.024632707, 0.024183951, 0.024208007, 0.02684204]
2023-11-06 19:20:28,174 : INFO : topic #14 (0.022): 0.003*"perkawinan" + 0.003*"putus

In [105]:
with open('data/lda_topics', 'w') as file:
    file.write(str(lda.print_topics(-1)))

lda.print_topics(-1)

2023-11-06 19:28:10,686 : INFO : topic #0 (0.014): 0.037*"sapi" + 0.033*"penelitian" + 0.033*"hukum" + 0.021*"islam" + 0.020*"sm" + 0.018*"mahar" + 0.018*"penggunaan" + 0.018*"analisis" + 0.017*"kab" + 0.017*"alat"
2023-11-06 19:28:10,688 : INFO : topic #1 (0.015): 0.029*"pelayanan" + 0.027*"mediasi" + 0.026*"hak" + 0.025*"penelitian" + 0.023*"narapidana" + 0.022*"untuk" + 0.021*"pihak" + 0.021*"hamil" + 0.021*"kesehatan" + 0.019*"sekolah"
2023-11-06 19:28:10,689 : INFO : topic #2 (0.016): 0.076*"wali" + 0.045*"pernikahan" + 0.032*"kua" + 0.027*"hakim" + 0.023*"mantan" + 0.022*"penelitian" + 0.022*"islam" + 0.021*"masa" + 0.021*"iddah" + 0.019*"mazhab"
2023-11-06 19:28:10,690 : INFO : topic #3 (0.014): 0.045*"apotek" + 0.037*"perjanjian" + 0.031*"apoteker" + 0.027*"pemilik" + 0.025*"hotel" + 0.024*"penelitian" + 0.023*"gresik" + 0.023*"khas" + 0.021*"pencegahan" + 0.020*"untuk"
2023-11-06 19:28:10,691 : INFO : topic #4 (0.012): 0.041*"metode" + 0.040*"hari" + 0.038*"kematian" + 0.034*"

UnicodeEncodeError: 'charmap' codec can't encode character '\u012b' in position 3285: character maps to <undefined>

In [106]:
topics = lda.print_topics(num_words=20)
for topic in topics:
    print(topic)

2023-11-06 19:28:15,572 : INFO : topic #14 (0.009): 0.002*"perkawinan" + 0.002*"putusan" + 0.002*"pertimbangan" + 0.002*"hukum" + 0.002*"nomor" + 0.002*"pa" + 0.002*"hakim" + 0.002*"undang" + 0.002*"pdt" + 0.002*"terjadi" + 0.002*"analisis" + 0.002*"agama" + 0.002*"sesuai" + 0.002*"dilakukan" + 0.002*"pengadilan" + 0.002*"berlaku" + 0.002*"untuk" + 0.002*"hati" + 0.002*"perkara" + 0.002*"poligami"
2023-11-06 19:28:15,573 : INFO : topic #16 (0.010): 0.002*"penelitian" + 0.002*"go" + 0.002*"analisis" + 0.002*"hukum" + 0.002*"islam" + 0.002*"akad" + 0.002*"hasil" + 0.002*"untuk" + 0.002*"desa" + 0.002*"poligami" + 0.002*"kabupaten" + 0.002*"praktik" + 0.002*"pihak" + 0.002*"berdasarkan" + 0.002*"pa" + 0.002*"jek" + 0.002*"beli" + 0.002*"menggunakan" + 0.002*"hakim" + 0.002*"pdt"
2023-11-06 19:28:15,574 : INFO : topic #30 (0.011): 0.038*"penelitian" + 0.033*"al" + 0.028*"mui" + 0.028*"dsn" + 0.028*"ijarah" + 0.028*"kjks" + 0.024*"akad" + 0.024*"nomor" + 0.024*"melakukan" + 0.019*"data" + 0

(14, '0.002*"perkawinan" + 0.002*"putusan" + 0.002*"pertimbangan" + 0.002*"hukum" + 0.002*"nomor" + 0.002*"pa" + 0.002*"hakim" + 0.002*"undang" + 0.002*"pdt" + 0.002*"terjadi" + 0.002*"analisis" + 0.002*"agama" + 0.002*"sesuai" + 0.002*"dilakukan" + 0.002*"pengadilan" + 0.002*"berlaku" + 0.002*"untuk" + 0.002*"hati" + 0.002*"perkara" + 0.002*"poligami"')
(16, '0.002*"penelitian" + 0.002*"go" + 0.002*"analisis" + 0.002*"hukum" + 0.002*"islam" + 0.002*"akad" + 0.002*"hasil" + 0.002*"untuk" + 0.002*"desa" + 0.002*"poligami" + 0.002*"kabupaten" + 0.002*"praktik" + 0.002*"pihak" + 0.002*"berdasarkan" + 0.002*"pa" + 0.002*"jek" + 0.002*"beli" + 0.002*"menggunakan" + 0.002*"hakim" + 0.002*"pdt"')
(30, '0.038*"penelitian" + 0.033*"al" + 0.028*"mui" + 0.028*"dsn" + 0.028*"ijarah" + 0.028*"kjks" + 0.024*"akad" + 0.024*"nomor" + 0.024*"melakukan" + 0.019*"data" + 0.019*"syariah" + 0.019*"jasa" + 0.019*"peraturan" + 0.014*"untuk" + 0.014*"menggunakan" + 0.014*"praktik" + 0.014*"lapangan" + 0.014*"

In [107]:
import pyLDAvis.gensim_models as gensimvis
import pyLDAvis

# Mempersiapkan visualisasi untuk model LDA
pyLDAvis.enable_notebook()
vis = gensimvis.prepare(lda, ufo_corpus, dictionary=vocabulary)

# Menampilkan visualisasi
pyLDAvis.display(vis)


  gammad = self.alpha + expElogthetad * np.dot(cts / phinorm, expElogbetad.T)
  doc_topic_dists = gamma / gamma.sum(axis=1)[:, None]
  topic = topic / topic.sum(axis=1)[:, None]
