In [None]:
# Import Library
from nltk.tokenize import RegexpTokenizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import euclidean_distances
from sklearn.metrics.pairwise import cosine_similarity
from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
import pandas as pd
import operator

In [None]:
# Dataset: raw texts keyed by identifier. The "q" entry is the text that the
# ranking cells compare every other entry against.
dataset = {
    "d1": "Kampus STMIK AKBA berlokasi di Jalan Perintis Kemerdekaan No.75 Makassar.",
    "d2": "STMIK AKBA menyelenggarakan Wisuda Program Sarjana Ke-20 pada tahun ini. Sebanyak 100 mahasiswa STMIK AKBA akan mengikuti kegiatan wisuda ini.",
    "d3": "Dosen-dosen STMIK AKBA mengikuti kegiatan vaksinasi Covid-19 di aula LLDIKTI IX",
    "d4": "Walikota Makassar bertekad membawa Kota Makassar menjadi kota dunia.",
    "d5": "Pemerintah kota Makassar mendukung pendirian beberapa perguruan tinggi untuk memberikan layanan pendidikan tinggi kepada masyarakat luas.",
    "q": "stmik akba makassar",
}

In [None]:
# Case folding: replace every text in `dataset` with its lowercase form so
# that later term matching does not depend on capitalisation.
dataset.update({key: text.lower() for key, text in dataset.items()})
print(dataset)

In [None]:
# Tokenizing: turn each text into the list of substrings matching r"\w+"
# (runs of word characters), as produced by the RegexpTokenizer.
tokenizer = RegexpTokenizer(r"\w+")
for key, text in dataset.items():
    # Only values of existing keys change, so iterating items() is safe.
    dataset[key] = tokenizer.tokenize(text)
print(dataset)

In [None]:
# Number removal: keep only the tokens for which str.isnumeric() is false.
for key, words in dataset.items():
    dataset[key] = [word for word in words if not word.isnumeric()]
print(dataset)

In [None]:
# Stopword removal with Sastrawi: drop every token that appears in the
# stop-word list returned by StopWordRemoverFactory.get_stop_words().
factory = StopWordRemoverFactory()
stopword_list = factory.get_stop_words()
# A set gives constant-time membership tests; testing against the list would
# rescan it for every token. `stopword_list` itself is left untouched.
stopword_set = set(stopword_list)
for k in dataset.keys():
    dataset[k] = [t for t in dataset[k] if t not in stopword_set]
print(dataset)

In [None]:
# Stemming with Sastrawi: replace each token with the value returned by
# stemmer.stem() for it, keeping token order.
factory = StemmerFactory()
stemmer = factory.create_stemmer()
dataset.update(
    {key: [stemmer.stem(word) for word in words] for key, words in dataset.items()}
)
print(dataset)

In [None]:
# Restore the original dataset format: join each token list back into one
# space-separated string, which is the input the vectorizers receive.
dataset.update({key: " ".join(words) for key, words in dataset.items()})
print(dataset)

In [None]:
# Term frequency: count the occurrences of each term in each text and show
# the counts as a table (rows = dataset keys, columns = vocabulary terms).
tf = CountVectorizer()
term_doc_matrix = tf.fit_transform(dataset.values())
# get_feature_names() was removed in scikit-learn 1.2. Prefer
# get_feature_names_out() and fall back only on releases that predate it.
if hasattr(tf, "get_feature_names_out"):
    tf_terms = tf.get_feature_names_out()
else:
    tf_terms = tf.get_feature_names()
pd.DataFrame(term_doc_matrix.toarray(), index=dataset.keys(), columns=tf_terms)

In [None]:
# Euclidean distance between every pair of rows of the term-count matrix,
# labelled on both axes with the dataset keys.
ed = euclidean_distances(term_doc_matrix, term_doc_matrix)
ed_labels = list(dataset)
df_ed = pd.DataFrame(data=ed, index=ed_labels, columns=ed_labels)
df_ed

In [None]:
# Rank by Euclidean distance: collect each entry's distance to "q" (skipping
# "q" itself) and sort ascending, so the smallest distance comes first.
rank_ed = {key: df_ed.at[key, "q"] for key in dataset if key != "q"}
top_rank_ed = dict(sorted(rank_ed.items(), key=lambda pair: pair[1]))
pd.DataFrame(
    top_rank_ed.values(),
    index=top_rank_ed.keys(),
    columns=["Euclidean Distance"],
)

In [None]:
# TF-IDF weighting: fit a TfidfVectorizer on all texts in `dataset` and show
# the weights as a table (rows = dataset keys, columns = vocabulary terms).
tfidf = TfidfVectorizer()
inverted_index = tfidf.fit_transform(dataset.values())
# get_feature_names() was removed in scikit-learn 1.2. Prefer
# get_feature_names_out() and fall back only on releases that predate it.
if hasattr(tfidf, "get_feature_names_out"):
    tfidf_terms = tfidf.get_feature_names_out()
else:
    tfidf_terms = tfidf.get_feature_names()
pd.DataFrame(inverted_index.toarray(), index=dataset.keys(), columns=tfidf_terms)

In [None]:
# Cosine similarity between every pair of rows of the TF-IDF matrix,
# labelled on both axes with the dataset keys.
cs = cosine_similarity(inverted_index, inverted_index)
cs_labels = list(dataset)
df_cs = pd.DataFrame(data=cs, index=cs_labels, columns=cs_labels)
df_cs

In [None]:
# Rank by cosine similarity: collect each entry's similarity to "q" (skipping
# "q" itself) and sort descending, so the highest similarity comes first.
rank_cs = {key: df_cs.at[key, "q"] for key in dataset if key != "q"}
top_rank_cs = dict(
    sorted(rank_cs.items(), key=lambda pair: pair[1], reverse=True)
)
pd.DataFrame(
    top_rank_cs.values(),
    index=top_rank_cs.keys(),
    columns=["Cosine Similarity"],
)