In [2]:
""""""
import math
import sys
from collections import Counter, defaultdict
from functools import reduce
from pathlib import Path

import PyPDF2
from docx import Document

# Suppose we have a document database keyed by integer ids (0, 2, 5, ...);
# the ids do not have to be contiguous or sorted.
document_filenames = {
    0: "documents/bapak perjuangan.txt",
    1: "documents/cara cepat pintar.txt",
    2: "documents/manusia.txt",
    5: "documents/dok1.docx",
    7: "documents/jago coding.pdf",
    3: "documents/tomat buah.txt"
}

# Corpus size: the number of documents in the database.
N = len(document_filenames)

# dictionary: every term (word) found in the document corpus.
dictionary = set()

# postings: for each term, a mapping of document id -> number of occurrences
# of that term in the document.
postings = defaultdict(dict)

# document_frequency: for each term, the number of documents that contain it.
document_frequency = defaultdict(int)

# length: for each document id, the Euclidean length of its term-weight vector.
length = defaultdict(float)

# Characters stripped from both ends of every token during tokenization.
characters = " .,!#$%^&*();:\n\t\\\"?!{}[]<>"

def main():
    """Build the in-memory index, then answer search queries forever.

    The setup steps run in dependency order: postings first, then document
    frequencies (derived from the postings), then document vector lengths
    (derived from both). The query loop has no exit condition of its own;
    do_search() ends the process through sys.exit() on an empty query.
    """
    setup_steps = (
        initialize_terms_and_postings,
        initialize_document_frequencies,
        initialize_lengths,
    )
    for step in setup_steps:
        step()
    while True:
        do_search()

def initialize_terms_and_postings():
    """Index every document listed in ``document_filenames``.

    Each document is read and tokenized. Every distinct term is added to the
    module-level ``dictionary`` set, and the number of times the term occurs
    in the document is stored in ``postings[term][doc_id]``.

    Both ``dictionary`` and ``postings`` are updated in place.
    """
    for doc_id, filepath in document_filenames.items():
        # Counter tallies all terms in a single pass over the token list,
        # instead of rescanning the whole list with list.count() once per
        # distinct term.
        term_counts = Counter(tokenize(read_document(filepath)))
        dictionary.update(term_counts)
        for term, count in term_counts.items():
            postings[term][doc_id] = count

def read_document(filepath):
    """Return the text of ``filepath``, picking a reader by file extension.

    The extension is compared case-insensitively. ``.pdf`` and ``.docx``
    files are delegated to read_pdf() and read_docx(); ``.txt`` files are
    read as UTF-8. Any other extension prints a notice and yields an empty
    string.
    """
    suffix = Path(filepath).suffix.lower()
    if suffix == ".pdf":
        return read_pdf(filepath)
    if suffix == ".docx":
        return read_docx(filepath)
    if suffix != ".txt":
        print(f"Format file {suffix} tidak didukung.")
        return ""
    with open(filepath, 'r', encoding='utf-8') as handle:
        return handle.read()

def read_pdf(filepath):
    """Return the text extracted from every page of a PDF, in page order.

    The per-page texts are concatenated directly, with no separator
    between pages.
    """
    with open(filepath, 'rb') as stream:
        pdf = PyPDF2.PdfReader(stream)
        # Extract while the file is still open; the reader pulls page data
        # from the underlying stream.
        page_texts = [page.extract_text() for page in pdf.pages]
    return "".join(page_texts)

def read_docx(filepath):
    """Return the text of a DOCX file, one line per paragraph.

    Only the paragraphs exposed by ``Document.paragraphs`` are included.
    """
    lines = []
    for paragraph in Document(filepath).paragraphs:
        lines.append(paragraph.text)
    return "\n".join(lines)

def tokenize(document):
    """Split a text into normalized, non-empty terms.

    The text is lower-cased and split on whitespace, then the characters in
    the module-level ``characters`` string are stripped from both ends of
    each token. A token made up only of those characters (for example a
    stand-alone "..." or "!?") is empty after stripping and is dropped, so
    the empty string is never indexed or searched as a term.

    Args:
        document: The text to tokenize.

    Returns:
        A list of terms in their original order, duplicates included.
        Empty when the text holds no term characters at all.
    """
    stripped = (token.strip(characters) for token in document.lower().split())
    return [term for term in stripped if term]

def initialize_document_frequencies():
    """Record, for every term in the dictionary, how many documents hold it.

    The count is the number of entries in the term's posting mapping, so
    the postings must already be populated when this runs.
    """
    document_frequency.update(
        (term, len(postings[term])) for term in dictionary
    )

def initialize_lengths():
    """Store the Euclidean length of every document's term-weight vector.

    For each document id, the weights returned by imp() for all dictionary
    terms are squared and accumulated; the square root of that total is
    written to ``length[doc_id]``.
    """
    for doc_id in document_filenames:
        squared_norm = 0
        for weight in (imp(term, doc_id) for term in dictionary):
            squared_norm += weight ** 2
        length[doc_id] = math.sqrt(squared_norm)

def imp(term, id):
    """Return the weight of ``term`` in document ``id``.

    The weight is the term's occurrence count in the document multiplied by
    the term's inverse document frequency. A term that does not occur in
    the document has weight 0.0.
    """
    try:
        occurrences = postings[term][id]
    except KeyError:
        # The document is not in this term's posting mapping.
        return 0.0
    return occurrences * inverse_document_frequency(term)

def inverse_document_frequency(term):
    """Return the inverse document frequency (IDF) of ``term``.

    For a term in the dictionary this is the base-2 logarithm of the corpus
    size divided by the number of documents containing the term. Terms
    outside the dictionary get 0.0.
    """
    if term not in dictionary:
        return 0.0
    containing_docs = document_frequency[term]
    return math.log(N / containing_docs, 2)

def do_search():
    """Prompt for one query and print the matching documents, best first.

    An empty query terminates the program via sys.exit(). Query terms that
    are not present in the postings are ignored when selecting candidates;
    a candidate document must contain every remaining query term. Matches
    are printed as "<score>: <filename>" in descending score order.
    """
    query = tokenize(input("Search query >> "))
    if not query:
        sys.exit()

    # One set of document ids per query term known to the index.
    posting_sets = []
    for term in query:
        if term in postings:
            posting_sets.append(set(postings[term].keys()))
    relevant_document_ids = intersection(posting_sets)

    if not relevant_document_ids:
        print("Tidak ada dokumen yang cocok dengan kueri.")
        return

    ranked = [
        (doc_id, similarity(query, doc_id)) for doc_id in relevant_document_ids
    ]
    ranked.sort(key=lambda pair: pair[1], reverse=True)
    print("Skor kemiripan dengan query - Nama file:")
    for doc_id, score in ranked:
        print(f"{score}: {document_filenames[doc_id]}")

def intersection(sets):
    """Return the elements common to all of ``sets``.

    An empty collection of sets yields an empty set.
    """
    if not sets:
        return set()
    remaining = iter(sets)
    common = next(remaining)
    for other in remaining:
        common = set.intersection(common, other)
    return common

def similarity(query, id):
    """Score document ``id`` against the tokenized ``query``.

    For every query term found in the dictionary, the term's IDF is
    multiplied by the term's weight in the document, and the products are
    summed. The total is divided by the document's vector length. The query
    vector itself is not normalized. A document whose length is not
    positive scores 0.0.
    """
    dot_product = 0.0
    for term in query:
        if term not in dictionary:
            continue
        dot_product += inverse_document_frequency(term) * imp(term, id)
    norm = length[id]
    if norm > 0:
        return dot_product / norm
    return 0.0

# Build the index and start the interactive search loop only when this file
# is executed directly, not when it is imported.
if __name__ == "__main__":
    main()

Skor kemiripan dengan query - Nama file:
1.780818812544124: documents/manusia.txt


SystemExit: 