# **1. Impor Modul Dasar & Persiapan Data (Corpus)**

In [1]:
import math
import string
import urllib.request
import pandas as pd
from typing import List, Dict, Union

# Raw gist URL of the dataset.txt corpus, and the local file name it is saved under.
url_dataset = "https://gist.githubusercontent.com/pisondev/41b34953411fb0b5b3d1aa3d4d9b34e3/raw/4791a0f7298be592b3a75f2099940c5d0840f694/dataset.txt"
file_name = "dataset.txt"

# Fetch the file automatically so the notebook runs without a manual upload.
# A failed download is only reported; the read below still uses whatever file
# is already present under `file_name`.
try:
    urllib.request.urlretrieve(url_dataset, file_name)
except Exception as download_error:
    print(f"Gagal mengunduh dataset: {download_error}")
else:
    print("Dataset berhasil diunduh ke sesi Colab.")

# Load the corpus: one document per line, surrounding whitespace removed,
# blank lines skipped.
with open(file_name, "r", encoding="utf-8") as dataset_file:
    stripped_lines = (raw_line.strip() for raw_line in dataset_file)
    documents = [text for text in stripped_lines if text]

# Minimal stopword set used for token filtering
STOPWORDS = {"a", "is", "to", "has", "the", "in", "and"}

print(f"Berhasil memuat {len(documents)} dokumen untuk diproses.")

Dataset berhasil diunduh ke sesi Colab.
Berhasil memuat 3 dokumen untuk diproses.


# **2. Fungsi Preprocessing & N-Gram**

In [2]:
def preprocess_text(text: str, n_gram: int = 1) -> List[str]:
    """
    Normalize a text and tokenize it, optionally appending word n-grams.

    The text is lower-cased, every character listed in ``string.punctuation``
    is deleted, the result is split on whitespace, and tokens present in the
    module-level ``STOPWORDS`` set are dropped.

    Args:
        text: Raw input text.
        n_gram: Size of the n-grams to append. Any value of 1 or less yields
            unigrams only, the same rule ``TFIDFProcessor._preprocess`` uses.

    Returns:
        The filtered unigrams. When ``n_gram > 1`` they are followed by every
        run of ``n_gram`` consecutive filtered unigrams joined with ``"_"``.
    """
    # 1. Case folding and punctuation removal in a single pass over the string
    cleaned = text.lower().translate(str.maketrans("", "", string.punctuation))

    # 2. Tokenization and stopword filtering (punctuation is already gone)
    filtered_words = [word for word in cleaned.split() if word not in STOPWORDS]

    # 3. N-gram handling. The guard is `<= 1` rather than `== 1`: with
    #    n_gram <= 0 the slicing below would emit empty or wrong-sized tokens.
    if n_gram <= 1:
        return filtered_words

    ngrams = [
        "_".join(filtered_words[start:start + n_gram])
        for start in range(len(filtered_words) - n_gram + 1)
    ]
    return filtered_words + ngrams  # unigrams first, then the n-grams

# Trial run: preprocess every document with bigrams enabled
print("Hasil Preprocessing Semua Dokumen (dengan Bigram):\n")

for doc_number, doc in enumerate(documents, start=1):
    tokens_with_bigrams = preprocess_text(doc, n_gram=2)
    # Header, token list and separator each go on their own line
    print(f"Dokumen {doc_number}:", tokens_with_bigrams, "-" * 40, sep="\n")

Hasil Preprocessing Semua Dokumen (dengan Bigram):

Dokumen 1:
['jazz', 'music', 'swing', 'rhythm', 'jazz_music', 'music_swing', 'swing_rhythm']
----------------------------------------
Dokumen 2:
['swing', 'hard', 'explain', 'swing_hard', 'hard_explain']
----------------------------------------
Dokumen 3:
['swing', 'rhythm', 'natural', 'rhythm', 'swing_rhythm', 'rhythm_natural', 'natural_rhythm']
----------------------------------------


# **3. Membangun Vocabulary & Fungsi Term Frequency (TF)**

In [3]:
def build_vocabulary(corpus_tokens: List[List[str]]) -> List[str]:
    """Return every distinct token found in the corpus, sorted ascending."""
    unique_terms = {token for tokens in corpus_tokens for token in tokens}
    return sorted(unique_terms)

def compute_tf(document_tokens: List[str], vocab: List[str]) -> Dict[str, float]:
    """
    Length-normalized term frequency of one document over the vocabulary.

    Tokens that are not in ``vocab`` are not counted, but they still contribute
    to the document length used as the divisor. An empty document yields 0.0
    for every vocabulary term.
    """
    occurrences = {term: 0.0 for term in vocab}
    for token in document_tokens:
        if token in occurrences:
            occurrences[token] += 1

    n_tokens = len(document_tokens)
    if n_tokens == 0:
        # Nothing to normalize by; every count is already 0.0
        return occurrences

    return {term: hits / n_tokens for term, hits in occurrences.items()}

# Apply to every document: tokenize as unigrams only, then build the
# vocabulary from the resulting token lists.
corpus_tokens = [preprocess_text(doc, n_gram=1) for doc in documents]
vocabulary = build_vocabulary(corpus_tokens)

print("Vocabulary:", vocabulary)

Vocabulary: ['explain', 'hard', 'jazz', 'music', 'natural', 'rhythm', 'swing']


# **4. Perhitungan Inverse Document Frequency (IDF)**

In [4]:
def compute_idf(corpus_tokens: List[List[str]], vocab: List[str]) -> Dict[str, float]:
    """
    Inverse Document Frequency for every vocabulary term.

    Uses ``idf(t) = 1 + log10(N / df(t))`` where ``N`` is the number of
    documents and ``df(t)`` the number of documents containing ``t``.

    Args:
        corpus_tokens: One token list per document.
        vocab: Terms to compute the IDF for.

    Returns:
        Mapping term -> IDF, in ``vocab`` order. A term that occurs in no
        document gets 0.0.
    """
    total_documents = len(corpus_tokens)

    # Single pass over the corpus: each document contributes at most once per
    # term, so de-duplicate its tokens with a set before counting. This avoids
    # re-scanning every token list for every vocabulary word.
    doc_frequency: Dict[str, int] = {}
    for tokens in corpus_tokens:
        for term in set(tokens):
            doc_frequency[term] = doc_frequency.get(term, 0) + 1

    idf_dict: Dict[str, float] = {}
    for word in vocab:
        doc_count = doc_frequency.get(word, 0)
        if doc_count > 0:
            idf_dict[word] = 1 + math.log10(total_documents / doc_count)
        else:
            # Term never seen in the corpus: no weight, and no division by zero
            idf_dict[word] = 0.0

    return idf_dict

# Corpus-wide IDF weights; passed to search_query() further down to weight the query terms.
idf_values = compute_idf(corpus_tokens, vocabulary)

# **5. Kalkulasi TF-IDF Akhir & Visualisasi Data Frame**

In [5]:
def compute_tfidf(corpus_tokens: List[List[str]], vocab: List[str]) -> List[Dict[str, float]]:
    """
    Build the TF-IDF representation of the whole corpus.

    Returns one dict per document (in corpus order) mapping each vocabulary
    term to ``tf * idf``, with TF from ``compute_tf`` and IDF from
    ``compute_idf``.
    """
    idf_by_term = compute_idf(corpus_tokens, vocab)

    def weigh(tokens: List[str]) -> Dict[str, float]:
        # TF is per document; IDF is shared across the corpus
        tf_by_term = compute_tf(tokens, vocab)
        return {term: tf_by_term[term] * idf_by_term[term] for term in vocab}

    return [weigh(tokens) for tokens in corpus_tokens]

# Compute the TF-IDF matrix
tfidf_matrix = compute_tfidf(corpus_tokens, vocabulary)

# Show the result as a pandas DataFrame, one labelled row per document
doc_labels = [f"Doc_{position}" for position in range(1, len(documents) + 1)]
df_tfidf = pd.DataFrame(tfidf_matrix, index=doc_labels)
print("\n--- Matriks TF-IDF ---")
display(df_tfidf)


--- Matriks TF-IDF ---


Unnamed: 0,explain,hard,jazz,music,natural,rhythm,swing
Doc_1,0.0,0.0,0.36928,0.36928,0.0,0.294023,0.25
Doc_2,0.492374,0.492374,0.0,0.0,0.0,0.0,0.333333
Doc_3,0.0,0.0,0.0,0.0,0.36928,0.588046,0.25


# **6. Cosine Similarity & Fitur Search**

In [6]:
def cosine_similarity(vec1: Dict[str, float], vec2: Dict[str, float]) -> float:
    """
    Cosine of the angle between two sparse vectors stored as dicts.

    Keys of ``vec1`` missing from ``vec2`` count as 0.0. Returns 0.0 when
    either vector has zero magnitude.
    """
    norm1 = math.sqrt(sum(weight ** 2 for weight in vec1.values()))
    norm2 = math.sqrt(sum(weight ** 2 for weight in vec2.values()))
    if not (norm1 and norm2):
        # A zero vector has no direction; define the similarity as 0
        return 0.0

    overlap = sum(weight * vec2.get(term, 0.0) for term, weight in vec1.items())
    return overlap / (norm1 * norm2)

def search_query(query: str, documents: List[str], tfidf_matrix: List[Dict[str, float]], vocab: List[str], idf_values: Dict[str, float]):
    """
    Rank the documents against a free-text query and print the ranking.

    The query is preprocessed as unigrams, weighted with TF-IDF over ``vocab``
    and compared to each row of ``tfidf_matrix`` by cosine similarity. Query
    tokens missing from ``vocab`` (OOV) are reported in a warning line.
    Nothing is returned; the result is printed.
    """
    print(f"Mencari query: '{query}'\n")
    query_tokens = preprocess_text(query, n_gram=1)

    # OOV handling: report the tokens that fall outside the vocabulary
    unknown_terms = list(filter(lambda token: token not in vocab, query_tokens))
    if unknown_terms:
        print(f"[WARN] Kata berikut tidak ada di kamus (OOV) dan diabaikan: {unknown_terms}\n")

    # TF-IDF vector of the query over the same vocabulary as the documents
    query_tf = compute_tf(query_tokens, vocab)
    query_tfidf = {}
    for term in vocab:
        query_tfidf[term] = query_tf[term] * idf_values.get(term, 0.0)

    # Score every document, then order by descending similarity
    # (the sort is stable, so ties keep corpus order)
    scored = [
        (cosine_similarity(query_tfidf, doc_vector), documents[position])
        for position, doc_vector in enumerate(tfidf_matrix)
    ]
    ranked = sorted(scored, key=lambda pair: pair[0], reverse=True)

    rank = 0
    for score, doc in ranked:
        rank += 1
        print(f"Rank {rank} (Skor: {score:.4f}): {doc}")

# Search trial using words that do not occur in the documents (e.g. 'guitar' and 'pop')
search_query("natural rhythm with pop guitar", documents, tfidf_matrix, vocabulary, idf_values)

Mencari query: 'natural rhythm with pop guitar'

[WARN] Kata berikut tidak ada di kamus (OOV) dan diabaikan: ['with', 'pop', 'guitar']

Rank 1 (Skor: 0.8878): Swing rhythm is a natural rhythm.
Rank 2 (Skor: 0.2820): Jazz music has a swing rhythm.
Rank 3 (Skor: 0.0000): Swing is hard to explain.


# **7. Keseluruhan Kode dalam Bentuk Versi Class OOP**

In [7]:
class TFIDFProcessor:
    """
    Text preprocessing, TF-IDF feature extraction and document search.

    State filled in by ``fit_transform``:
        vocab: sorted list of distinct tokens seen in the corpus.
        idf_values: term -> ``1 + log10(N / df)``.
        tfidf_matrix: one term -> weight dict per document, in corpus order.
        documents: private copy of the fitted documents, aligned with
            ``tfidf_matrix`` row by row.
    """
    def __init__(self, stopwords: set, n_gram: int = 1):
        """
        Args:
            stopwords: Tokens dropped during preprocessing.
            n_gram: When greater than 1, n-grams of this size are appended to
                the unigrams; otherwise only unigrams are produced.
        """
        self.stopwords = stopwords
        self.n_gram = n_gram
        self.vocab = []
        self.idf_values = {}
        self.tfidf_matrix = []
        self.documents = []

    def _preprocess(self, text: str) -> List[str]:
        """Lower-case, strip punctuation, drop stopwords, optionally add n-grams."""
        # Delete all punctuation in one pass instead of one replace() per character
        cleaned = text.lower().translate(str.maketrans("", "", string.punctuation))
        filtered = [word for word in cleaned.split() if word not in self.stopwords]

        if self.n_gram > 1:
            ngrams = ["_".join(filtered[i:i+self.n_gram]) for i in range(len(filtered) - self.n_gram + 1)]
            return filtered + ngrams
        return filtered

    def _tfidf_vector(self, tokens: List[str]) -> Dict[str, float]:
        """Weight one token list over the fitted vocabulary (used for documents and queries)."""
        total_words = len(tokens)
        counts: Dict[str, int] = {}
        for token in tokens:
            counts[token] = counts.get(token, 0) + 1

        vector = {}
        for word in self.vocab:
            # Out-of-vocabulary tokens are not counted but still add to the length
            tf = (counts.get(word, 0) / total_words) if total_words > 0 else 0.0
            vector[word] = tf * self.idf_values.get(word, 0.0)
        return vector

    def fit_transform(self, documents: List[str]) -> pd.DataFrame:
        """
        Fit the model on ``documents`` and return the TF-IDF matrix.

        Args:
            documents: Raw document strings (any iterable is accepted).

        Returns:
            DataFrame with one row per document (index ``Doc_1`` .. ``Doc_N``)
            and one column per vocabulary term.
        """
        # Snapshot the input so later mutation of the caller's list cannot
        # desynchronize self.documents from self.tfidf_matrix.
        self.documents = list(documents)
        corpus_tokens = [self._preprocess(doc) for doc in self.documents]

        # Build Vocabulary
        token_sets = [set(tokens) for tokens in corpus_tokens]
        self.vocab = sorted(set().union(*token_sets))

        # Compute IDF: count each term once per document via the token sets
        total_docs = len(corpus_tokens)
        doc_frequency: Dict[str, int] = {}
        for token_set in token_sets:
            for word in token_set:
                doc_frequency[word] = doc_frequency.get(word, 0) + 1
        # Every vocabulary term comes from the corpus, so its frequency is >= 1
        self.idf_values = {
            word: 1 + math.log10(total_docs / doc_frequency[word])
            for word in self.vocab
        }

        # Compute TF-IDF Matrix
        self.tfidf_matrix = [self._tfidf_vector(tokens) for tokens in corpus_tokens]

        df = pd.DataFrame(self.tfidf_matrix)
        df.index = [f"Doc_{i+1}" for i in range(len(self.documents))]
        return df

    def search(self, query: str):
        """
        Print the fitted documents ranked by cosine similarity to ``query``.

        Prints a notice and returns early when no TF-IDF matrix is available.
        Query tokens outside the vocabulary are reported as OOV. Returns None.
        """
        if not self.tfidf_matrix:
            print("Model belum dilatih. Jalankan fit_transform() terlebih dahulu.")
            return

        print(f"\n--- Hasil Pencarian untuk: '{query}' ---")
        query_tokens = self._preprocess(query)

        # OOV Handling (set lookup; the reported order follows the query)
        known_terms = set(self.vocab)
        oov_words = [w for w in query_tokens if w not in known_terms]
        if oov_words:
            print(f"[Peringatan OOV] Kata diabaikan: {oov_words}")

        # Query TF-IDF, computed exactly like a document row
        query_tfidf = self._tfidf_vector(query_tokens)

        # Cosine similarity against every document
        results = []
        mag_query = math.sqrt(sum(val**2 for val in query_tfidf.values()))

        for i, doc_tfidf in enumerate(self.tfidf_matrix):
            dot_product = sum(query_tfidf[key] * doc_tfidf.get(key, 0.0) for key in query_tfidf)
            mag_doc = math.sqrt(sum(val**2 for val in doc_tfidf.values()))
            # A zero-magnitude vector has no direction; score it 0
            score = (dot_product / (mag_query * mag_doc)) if (mag_query > 0 and mag_doc > 0) else 0.0
            results.append((score, self.documents[i]))

        # Stable sort: ties keep corpus order
        results.sort(reverse=True, key=lambda x: x[0])
        for rank, (score, doc) in enumerate(results, start=1):
            print(f"Rank {rank} (Skor: {score:.4f}): {doc}")

# Trial run of the OOP class
print("CLASS OOP:")
# Instantiate the processor with the shared stopword set, unigrams only
model = TFIDFProcessor(stopwords=STOPWORDS, n_gram=1)

# Fit the model and show the resulting matrix
df_result = model.fit_transform(documents)
display(df_result)

# Run a search
model.search("Explain the jazz rhythm to me")

CLASS OOP:


Unnamed: 0,explain,hard,jazz,music,natural,rhythm,swing
Doc_1,0.0,0.0,0.36928,0.36928,0.0,0.294023,0.25
Doc_2,0.492374,0.492374,0.0,0.0,0.0,0.0,0.333333
Doc_3,0.0,0.0,0.0,0.0,0.36928,0.588046,0.25



--- Hasil Pencarian untuk: 'Explain the jazz rhythm to me' ---
[Peringatan OOV] Kata diabaikan: ['me']
Rank 1 (Skor: 0.5725): Jazz music has a swing rhythm.
Rank 2 (Skor: 0.3930): Swing is hard to explain.
Rank 3 (Skor: 0.3909): Swing rhythm is a natural rhythm.
