In [1]:
import os
import pandas as pd
from tqdm import tqdm
from sentence_transformers import SentenceTransformer
from qdrant_client import QdrantClient, models
from qdrant_client.models import VectorParams, Distance
import uuid

In [2]:
# Location of the plain-text file produced by the PDF extraction step.
data_path = "/Users/mhdfarhanali/Documents/InstruRAG/data/raw/instrumentation_sensors.txt"

# Read the whole document into memory as a single UTF-8 string.
with open(data_path, mode="r", encoding="utf-8") as source_file:
    raw_text = source_file.read()

# Report the total character count, then preview the first 500 characters.
print(f"Panjang teks (karakter): {len(raw_text)}")
print(f"Cuplikan awal teks:\n {raw_text[:500]}")

Panjang teks (karakter): 670951
Cuplikan awal teks:
 --- [HALAMAN 1] --- This E-Book and More From http://ali-almukhtar.blogspot.com --- [HALAMAN 2] --- Introduction to Instrumentation, Sensors, and Process Control --- [HALAMAN 3] --- For a listing of related titles from Artech House, turn to the back of this book --- [HALAMAN 4] --- Introduction to Instrumentation, Sensors, and Process Control William C. Dunn artechhouse.com --- [HALAMAN 5] --- Library of Congress Cataloging-in-Publication Data Dunn, William C. Introduction to instrumentation, se


In [3]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Split the raw document into chunks using chunk_size=500 and
# chunk_overlap=100 as configured on the splitter below.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=100)
chunks = text_splitter.split_text(raw_text)

# Show how many chunks were produced and the first 300 characters of the first one.
print("Jumlah chunks: {}".format(len(chunks)))
first_chunk_preview = chunks[0][:300]
print("Contoh chunk:\n", first_chunk_preview)

Jumlah chunks: 1677
Contoh chunk:
 --- [HALAMAN 1] --- This E-Book and More From http://ali-almukhtar.blogspot.com --- [HALAMAN 2] --- Introduction to Instrumentation, Sensors, and Process Control --- [HALAMAN 3] --- For a listing of related titles from Artech House, turn to the back of this book --- [HALAMAN 4] --- Introduction to I


In [4]:
from sentence_transformers import SentenceTransformer
import numpy as np

# Load the sentence-embedding model by its hub identifier.
embedding_model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")

# Encode every chunk in one call; the result is requested as a NumPy array
# with normalized embeddings, and a progress bar is shown while encoding.
embeddings = embedding_model.encode(
    chunks, show_progress_bar=True, convert_to_numpy=True, normalize_embeddings=True
)

# Summarize the result: number of chunks and the embedding dimensionality.
print("Embedding selesai!")
print("Jumlah chunks: {}".format(len(chunks)))
print("Dimensi vektor: {}".format(embeddings.shape[1]))


Batches:   0%|          | 0/53 [00:00<?, ?it/s]

Embedding selesai!
Jumlah chunks: 1677
Dimensi vektor: 384


In [5]:
# Connect to Qdrant
from qdrant_client import QdrantClient, models

qdrant = QdrantClient(host="localhost", port=6333)
collection_name = "instru_collection"

# Create the collection if it does not exist yet.
# `except Exception` (rather than a bare `except:`) so that KeyboardInterrupt
# and SystemExit are not swallowed while probing for the collection.
try:
    qdrant.get_collection(collection_name)
    print(f"Koleksi '{collection_name}' sudah ada")
except Exception:
    qdrant.create_collection(
        collection_name=collection_name,
        vectors_config=models.VectorParams(
            size=embeddings.shape[1],
            distance=models.Distance.COSINE
        )
    )
    print(f"Koleksi '{collection_name}' berhasil dibuat")

# `embeddings` was computed from the unfiltered `chunks`, so both sequences
# must still line up one-to-one before anything is dropped.
if len(chunks) != len(embeddings):
    raise ValueError(
        f"chunks ({len(chunks)}) and embeddings ({len(embeddings)}) are out of sync; "
        "re-run the embedding cell before uploading"
    )

# Drop empty / whitespace-only chunks TOGETHER with their embedding rows.
# Filtering only `chunks` would shift every later index and pair each text
# with the vector of a different chunk.
keep_indices = [i for i, chunk in enumerate(chunks) if chunk and chunk.strip()]
chunks = [chunks[i] for i in keep_indices]
embeddings = embeddings[keep_indices]

# Build the points: the id is the position in the filtered list, the vector is
# passed as a plain list of floats, and the chunk text is stored in the payload.
points = [
    models.PointStruct(
        id=i,
        vector=embeddings[i].tolist(),
        payload={"page_content": str(chunks[i])}
    )
    for i in range(len(chunks))
]

# Upload the points to Qdrant
qdrant.upsert(
    collection_name=collection_name,
    points=points
)
print("Data berhasil diunggah ke Qdrant")

Koleksi 'instru_collection' sudah ada
Data berhasil diunggah ke Qdrant


In [6]:
import os

import pandas as pd

# Destination of the exported chunk/embedding table.
output_path = "/Users/mhdfarhanali/Documents/InstruRAG/data/processed/instru_chunks.csv"

# Each text needs exactly one embedding row; fail early with a clear message
# instead of relying on the DataFrame constructor's length error.
if len(chunks) != len(embeddings):
    raise ValueError(
        f"chunks ({len(chunks)}) and embeddings ({len(embeddings)}) have different lengths; "
        "re-run the embedding cell before exporting"
    )

# to_csv does not create missing parent directories, so make sure it exists.
os.makedirs(os.path.dirname(output_path), exist_ok=True)

# One row per chunk: the text and its embedding converted to a Python list
# (the CSV writer stores that list as its string representation).
df = pd.DataFrame({
    "text": chunks,
    "embedding": embeddings.tolist()
})
df.to_csv(output_path, index=False)
print("Data disimpan di data/processed/instru_chunks.csv")

Data disimpan di data/processed/instru_chunks.csv
