In [3]:
# ============================
# 📦 Manual Text Encoding Utils
# ============================

def tokenize_documents(raw_documents):
    return [doc.lower().split() for doc in raw_documents]

def build_vocab(tokenized_documents):
    return sorted(set(word for doc in tokenized_documents for word in doc))

# 1️⃣ Manual Bag of Words (BoW)
def manual_bow(tokenized_documents, vocab):
    vectors = []
    for doc in tokenized_documents:
        vector = [doc.count(word) for word in vocab]
        vectors.append(vector)
    return vectors

# 2️⃣ Manual One-Hot Encoding (Word Level)
def word_to_one_hot(word, vocab_dict):
    vec = [0] * len(vocab_dict)
    if word in vocab_dict:
        vec[vocab_dict[word]] = 1
    return vec

def document_word_level_one_hot(tokenized_document, vocab_dict):
    return [word_to_one_hot(word, vocab_dict) for word in tokenized_document]

# 3️⃣ Manual One-Hot (Binary BoW / Doc Level)
def manual_binary_bow(tokenized_documents, vocab):
    vectors = []
    for doc in tokenized_documents:
        vector = [1 if word in doc else 0 for word in vocab]
        vectors.append(vector)
    return vectors

# 4️⃣ Gabungan Word-Level + Dokumen
def combined_word_doc_one_hot(tokenized_documents, vocab_dict):
    result = []
    for doc in tokenized_documents:
        one_hot_words = []
        for word in doc:
            one_hot_words.append((word, word_to_one_hot(word, vocab_dict)))
        result.append(one_hot_words)
    return result

# ============================
# 🧪 Contoh Penggunaan
# ============================

if __name__ == "__main__":
    documents = [
        "saya suka belajar machine learning",
        "machine learning sangat menarik",
        "saya suka belajar"
    ]

    tokenized_docs = tokenize_documents(documents)
    vocab = build_vocab(tokenized_docs)
    vocab_dict = {word: i for i, word in enumerate(vocab)}

    print("📌 Vocab:", vocab)

    print("\n🔢 Bag of Words (BoW):")
    for vec in manual_bow(tokenized_docs, vocab):
        print(vec)

    print("\n🔢 Binary BoW (Document Level):")
    for vec in manual_binary_bow(tokenized_docs, vocab):
        print(vec)

    print("\n🔠 Word-Level One-Hot (Dokumen 1):")
    for word, vec in combined_word_doc_one_hot(tokenized_docs, vocab_dict)[0]:
        print(f"{word} → {vec}")


📌 Vocab: ['belajar', 'learning', 'machine', 'menarik', 'sangat', 'saya', 'suka']

🔢 Bag of Words (BoW):
[1, 1, 1, 0, 0, 1, 1]
[0, 1, 1, 1, 1, 0, 0]
[1, 0, 0, 0, 0, 1, 1]

🔢 Binary BoW (Document Level):
[1, 1, 1, 0, 0, 1, 1]
[0, 1, 1, 1, 1, 0, 0]
[1, 0, 0, 0, 0, 1, 1]

🔠 Word-Level One-Hot (Dokumen 1):
saya → [0, 0, 0, 0, 0, 1, 0]
suka → [0, 0, 0, 0, 0, 0, 1]
belajar → [1, 0, 0, 0, 0, 0, 0]
machine → [0, 0, 1, 0, 0, 0, 0]
learning → [0, 1, 0, 0, 0, 0, 0]


In [None]:
# bag of words
from sklearn.feature_extraction.text import CountVectorizer

docs = [
    "saya suka belajar machine learning",
    "machine learning sangat menarik",
    "saya suka belajar"
]

vectorizer = CountVectorizer()
X = vectorizer.fit_transform(docs)

print("Vocabulary:", vectorizer.get_feature_names_out())
print("BoW Matrix:\n", X.toarray())

Vocabulary: ['belajar' 'learning' 'machine' 'menarik' 'sangat' 'saya' 'suka']
BoW Matrix:
 [[1 1 1 0 0 1 1]
 [0 1 1 1 1 0 0]
 [1 0 0 0 0 1 1]]


In [6]:
# one hot doc
vectorizer_bin=CountVectorizer(binary=True)

X_bin=vectorizer_bin.fit_transform(docs)


print("Vocabulary:", vectorizer_bin.get_feature_names_out())
print("One-Hot (Binary) Matrix:\n", X_bin.toarray())

Vocabulary: ['belajar' 'learning' 'machine' 'menarik' 'sangat' 'saya' 'suka']
One-Hot (Binary) Matrix:
 [[1 1 1 0 0 1 1]
 [0 1 1 1 1 0 0]
 [1 0 0 0 0 1 1]]


In [8]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer()
X_tfidf = tfidf.fit_transform(docs)

print("Vocabulary:", tfidf.get_feature_names_out())
print("TF-IDF Matrix:\n", X_tfidf.toarray())


Vocabulary: ['belajar' 'learning' 'machine' 'menarik' 'sangat' 'saya' 'suka']
TF-IDF Matrix:
 [[0.4472136  0.4472136  0.4472136  0.         0.         0.4472136
  0.4472136 ]
 [0.         0.42804604 0.42804604 0.5628291  0.5628291  0.
  0.        ]
 [0.57735027 0.         0.         0.         0.         0.57735027
  0.57735027]]


In [9]:
# n gram text representation for handling message
from sklearn.feature_extraction.text import CountVectorizer

sentences = ["saya suka belajar NLP"]

vectorizer = CountVectorizer(ngram_range=(2, 2))  # bigram
X = vectorizer.fit_transform(sentences)

print("N-gram:", vectorizer.get_feature_names_out())
print(X.toarray())

from nltk import ngrams
from nltk.tokenize import word_tokenize

text = "saya suka belajar NLP"
tokens = word_tokenize(text)

bigrams = list(ngrams(tokens, 2))
print(bigrams)



N-gram: ['belajar nlp' 'saya suka' 'suka belajar']
[[1 1 1]]
[('saya', 'suka'), ('suka', 'belajar'), ('belajar', 'NLP')]


In [3]:
# ==========================================
# 📄 PDF ke Word2Vec Embedding - Gensim
# ==========================================

# ✅ Install package (jalankan di terminal sebelum pakai):
# pip install pymupdf gensim nltk

import fitz  # PyMuPDF untuk membaca isi PDF
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
import string
import nltk
import numpy as np
from gensim.models import Word2Vec

# 📥 Unduh resource NLTK yang dibutuhkan (tokenizer dan stopwords)
# Cukup jalankan sekali saja, setelah itu bisa diberi komentar
try:
    nltk.data.find('tokenizers/punkt')
except nltk.downloader.DownloadError:
    nltk.download('punkt')

try:
    nltk.data.find('corpora/stopwords')
except nltk.downloader.DownloadError:
    nltk.download('stopwords')


# ------------------------------------------
# 1️⃣ Fungsi: Baca PDF dan ubah ke string
# ------------------------------------------
def pdf_to_text(path):
    """
    Membaca file PDF dan menggabungkan semua teks dari setiap halaman menjadi satu string.
    """
    doc = fitz.open(path)
    text = ""
    for page in doc:
        text += page.get_text()
    return text


# ------------------------------------------
# 2️⃣ Fungsi: Preprocessing Teks
# ------------------------------------------
def preprocess(text):
    """
    Lowercase, tokenisasi, hapus stopwords dan tanda baca.
    """
    # Menggunakan stopwords bahasa Inggris sesuai dengan default NLTK
    stop_words = set(stopwords.words('english'))
    
    # Tokenisasi
    tokens = word_tokenize(text.lower())
    
    # Hapus stopwords dan tanda baca
    tokens = [t for t in tokens if t.isalpha() and t not in stop_words]
    return tokens


# ------------------------------------------
# 3️⃣ Fungsi: Split ke kalimat dan tokenize
# ------------------------------------------
def get_sentences(text):
    """
    Memecah teks menjadi kalimat, lalu memproses tiap kalimat menjadi token.
    Dibutuhkan untuk input Word2Vec (list of list).
    """
    sentences = sent_tokenize(text)
    # Memastikan tidak ada list kosong yang masuk ke model Word2Vec
    return [tokens for tokens in (preprocess(sentence) for sentence in sentences) if tokens]


# ------------------------------------------
# 4️⃣ Fungsi: Konversi dokumen ke vektor
# ------------------------------------------
def document_vector(model, doc_tokens):
    """
    Mendapatkan representasi vektor dari dokumen dengan menghitung
    rata-rata dari semua vektor kata yang ada di dalam model Word2Vec.
    """
    # Filter token yang ada di vocabulary model saja
    tokens = [token for token in doc_tokens if token in model.wv]
    
    # Jika tidak ada token dari dokumen yang ada di model, kembalikan vektor nol
    if not tokens:
        return np.zeros(model.vector_size)
        
    return np.mean(model.wv[tokens], axis=0)


# ==========================================
# 🚀 EKSEKUSI
# ==========================================

if __name__ == "__main__":
    # 🔹 Masukkan nama file PDF
    # Pastikan path ini benar sesuai lokasi file Anda
    file_path = "../text/text-1.pdf"  # ganti dengan path PDF kamu

    try:
        # 1. Ekstraksi teks dari PDF
        print(f"📄 Membaca PDF dari '{file_path}'...")
        pdf_text = pdf_to_text(file_path)

        # 2. Tokenisasi kalimat dan kata
        print("🧹 Preprocessing teks...")
        sentences = get_sentences(pdf_text)
        
        if not sentences:
            print("❌ Tidak ada teks yang bisa diproses setelah preprocessing. Cek isi PDF Anda.")
        else:
            # 3. Latih model Word2Vec
            print("🧠 Training Word2Vec...")
            # min_count=1 agar model tetap terlatih meskipun dokumen pendek
            model = Word2Vec(sentences, vector_size=100, window=5, min_count=1, workers=4)

            # 4. Ambil vektor dokumen
            print("🔢 Menghitung vektor dokumen...")
            full_tokens = preprocess(pdf_text)
            doc_vector = document_vector(model, full_tokens)

            print("\n✅ Proses Selesai!")
            print("   Shape vektor dokumen:", doc_vector.shape)
            print("   Vektor dokumen (10 elemen pertama):", doc_vector)

            # 5. Simpan model (opsional)
            # model.save("word2vec_pdf.model")
            # print("💾 Model disimpan sebagai word2vec_pdf.model")

    except FileNotFoundError:
        print(f"❌ ERROR: File tidak ditemukan di path '{file_path}'. Pastikan nama dan lokasi file sudah benar.")
    except Exception as e:
        print(f"❌ Terjadi error: {e}")

📄 Membaca PDF dari '../text/text-1.pdf'...
🧹 Preprocessing teks...
🧠 Training Word2Vec...
🔢 Menghitung vektor dokumen...

✅ Proses Selesai!
   Shape vektor dokumen: (100,)
   Vektor dokumen (10 elemen pertama): [-9.86640691e-04  1.57771318e-03  6.12215896e-04  6.41604362e-04
  4.66454978e-04 -2.80584698e-03  1.25432049e-03  4.56800032e-03
 -2.55997456e-03 -1.33377558e-03 -1.07330538e-03 -3.26242903e-03
 -2.95098725e-04  1.30282994e-03  1.17111404e-03 -1.22856896e-03
  1.27894466e-03 -1.38315524e-03 -1.38783734e-03 -4.60860692e-03
  9.04934772e-04  5.94044977e-04  1.87951420e-03 -9.64887266e-04
 -1.27456238e-04  2.48710858e-04 -1.75537600e-03 -6.22847700e-04
 -1.40363164e-03  6.93301437e-04  1.95686566e-03 -2.54198705e-04
  6.96307514e-04 -1.93546771e-03 -1.65105704e-03  2.27620057e-03
  8.87568807e-04 -1.94619247e-03 -1.12640450e-03 -2.92021688e-03
  2.39968547e-04 -1.77604274e-03 -9.66460328e-04 -1.04185296e-04
  1.78332918e-03 -4.98769223e-04 -1.39121234e-03 -2.83539935e-04
  1.29950