In [4]:
import nltk
from gensim.models import Word2Vec
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from nltk.tokenize import word_tokenize
import pandas as pd
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation

nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\novin\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [19]:
abstract_a = pd.read_csv("./data/perdata.csv", index_col=1)
abstract_b = pd.read_csv("./data/pidana.csv")

abstracts_prodi_a = []
abstracts_prodi_b = []

for index, row in abstract_a.iterrows():
    abstract = ' '.join([str(row['document'])])
    abstracts_prodi_a.append(abstract)

for index, row in abstract_b.iterrows():
    abstract = ' '.join([str(row['document'])])
    abstracts_prodi_b.append(abstract)

In [20]:
stemmer = StemmerFactory().create_stemmer()

In [22]:
def tokenize_and_stem(text):
    words = word_tokenize(text)
    stemmed_words = [stemmer.stem(word) for word in words]
    return stemmed_words

In [17]:
tokens_prodi_a = [tokenize_and_stem(abstract) for abstract in abstracts_prodi_a]
tokens_prodi_b = [tokenize_and_stem(abstract) for abstract in abstracts_prodi_b]

In [24]:
tf_vectorizer = CountVectorizer(max_df=0.95, min_df=2, stop_words='english')
tf = tf_vectorizer.fit_transform(abstracts_prodi_a)

lda = LatentDirichletAllocation(n_components=10, random_state=0)
lda.fit(tf)

def print_top_words(model, feature_names, n_top_words):
    for topic_idx, topic in enumerate(model.components_):
        message = "Topic #%d: " % topic_idx
        message += " ".join([feature_names[i]
                             for i in topic.argsort()[:-n_top_words - 1:-1]])
        print(message)

print_top_words(lda, tf_vectorizer.get_feature_names_out(), n_top_words=10)

Topic #0: di hukum untuk penelitian islam tidak pasar dari desa anak
Topic #1: di penelitian dari untuk perempuan hasil hukum analisis bagaimana kepada
Topic #2: istri suami penelitian kewajiban hukum untuk tentang tidak di islam
Topic #3: di jual beli penelitian tidak praktik untuk islam surabaya hukum
Topic #4: hukum anak penelitian tahun data undang dari nomor perkawinan tentang
Topic #5: nikah wali penelitian di perkawinan agama data pengadilan hukum dari
Topic #6: di pada hukum perjanjian islam penelitian akad tidak untuk menggunakan
Topic #7: hukum tidak di penelitian islam perkawinan undang untuk tentang atau
Topic #8: penelitian di desa kecamatan tidak untuk sidoarjo pihak ibu hasil
Topic #9: hakim putusan pa pdt perkara nomor pengadilan untuk agama pertimbangan


In [30]:

model_prodi_a = Word2Vec(tokens_prodi_a, min_count=1)
model_prodi_b = Word2Vec(tokens_prodi_b, min_count=1)


similarity = model_prodi_a.wv.n_similarity(tokens_prodi_a[0], tokens_prodi_b[0])
print("Kesamaan antara abstrak Prodi A dan Prodi B:", similarity)


Kesamaan antara abstrak Prodi A dan Prodi B: 0.99935174


In [31]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity


all_abstracts = abstracts_prodi_a + abstracts_prodi_b

stemmer = StemmerFactory().create_stemmer()


def tokenize_and_stem(text):
    words = text.split()  
    stemmed_words = [stemmer.stem(word) for word in words]
    return " ".join(stemmed_words)


preprocessed_abstracts = [tokenize_and_stem(abstract) for abstract in all_abstracts]

tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(all_abstracts)

similarity_matrix = cosine_similarity(tfidf_matrix, tfidf_matrix)

print("Matriks Kesamaan:")
print(similarity_matrix)


Matriks Kesamaan:
[[1.         0.04829927 0.0513468  ... 0.06355455 0.07393398 0.05388955]
 [0.04829927 1.         0.0468768  ... 0.07890107 0.09665612 0.11880174]
 [0.0513468  0.0468768  1.         ... 0.06904285 0.06141292 0.05744038]
 ...
 [0.06355455 0.07890107 0.06904285 ... 1.         0.17798232 0.18330189]
 [0.07393398 0.09665612 0.06141292 ... 0.17798232 1.         0.21488878]
 [0.05388955 0.11880174 0.05744038 ... 0.18330189 0.21488878 1.        ]]


In [None]:
from textblob import TextBlob



stemmer = StemmerFactory().create_stemmer()

def tokenize_and_stem(text):
    words = text.split()  
    stemmed_words = [stemmer.stem(word) for word in words]
    return " ".join(stemmed_words)

stemmed_abstracts_prodi_a = [tokenize_and_stem(abstract) for abstract in abstracts_prodi_a]
stemmed_abstracts_prodi_b = [tokenize_and_stem(abstract) for abstract in abstracts_prodi_b]


sentiments_prodi_a = [TextBlob(abstract).sentiment.polarity for abstract in stemmed_abstracts_prodi_a]

sentiments_prodi_b = [TextBlob(abstract).sentiment.polarity for abstract in stemmed_abstracts_prodi_b]

print("Sentimen rata-rata Prodi A:", sum(sentiments_prodi_a) / len(sentiments_prodi_a))
print("Sentimen rata-rata Prodi B:", sum(sentiments_prodi_b) / len(sentiments_prodi_b))
