### Extract Document

In [13]:

# Source PDFs (ITK rector regulations), given as paths relative to this notebook.
# Later cells select a document by its position in this list.
file_paths = [
  "../document/PERATURAN REKTOR NO 11 TAHUN 2020 TENTANG PANDUAN MAGANG ITK.pdf",
  "../document/PERATURAN REKTOR NO 12 TAHUN 2020 TENTANG PANDUAN TUGAS AKHIR.pdf",
  "../document/PERATURAN REKTOR NO 13 TAHUN 2020 TENTANG PANDUAN KERJA PRAKTIK.pdf",
  "../document/Peraturan Rektor Nomor 10 Tahun 2021 Tentang PENYELENGGARAAN KEGIATAN MERDEKA BELAJAR - KAMPUS MERDEKA.pdf"]



In [14]:
from langchain_community.document_loaders import PyMuPDFLoader
import fitz
import os
from PIL import Image
def extract_text_image_from_pdf(file_path) -> list:
  """Extract page text and embedded images from a PDF.

  Text is loaded with ``PyMuPDFLoader``; every image embedded in a page is
  written as a PNG to ``extracted_images/<pdf file name without extension>/``
  and the list of PNG paths for that page is stored (as a string) under the
  ``'images'`` key of the matching text document's metadata.

  Args:
    file_path: Path to the PDF file.

  Returns:
    The list of documents produced by the loader, with ``metadata['images']``
    set for each document that has a page at the same index.
  """
  loader = PyMuPDFLoader(file_path)
  # Assumes the loader yields one document per PDF page, in page order, so
  # that list indices line up with fitz page indices — TODO confirm.
  text_pages = list(loader.lazy_load())

  pdf_name = os.path.splitext(os.path.basename(file_path))[0]
  output_folder = os.path.join("extracted_images", pdf_name)
  # makedirs creates the parent "extracted_images" folder as well.
  os.makedirs(output_folder, exist_ok=True)

  # Context manager guarantees the PDF handle is released, even on error.
  with fitz.open(file_path) as doc:
    for page_index, page in enumerate(doc):
      image_paths = []

      for img_index, image_info in enumerate(page.get_images(full=True)):
        xref = image_info[0]
        img_path = f"{output_folder}/page{page.number}_{img_index+1}.png"
        pix = fitz.Pixmap(doc, xref)

        # PNG can only hold gray or RGB data. Four or more colour components
        # (alpha excluded) means CMYK, with or without alpha: convert to RGB.
        if pix.n - pix.alpha >= 4:
          pix = fitz.Pixmap(fitz.csRGB, pix)

        if pix.alpha:
          # Drop the alpha channel via PIL. After the conversion above a
          # pixmap with alpha is either RGB+alpha (n == 4) or gray+alpha.
          pil_mode = "RGBA" if pix.n == 4 else "LA"
          pil_image = Image.frombytes(pil_mode, [pix.width, pix.height], pix.samples)
          pil_image.convert("RGB").save(img_path)
        else:
          pix.save(img_path)

        pix = None  # release the pixmap buffer before the next image
        image_paths.append(img_path)

      if page_index < len(text_pages):
        # Stored as a string rather than a list; kept as in the original.
        text_pages[page_index].metadata['images'] = str(image_paths)
  return text_pages
# Index 2 of file_paths is the "PANDUAN KERJA PRAKTIK" (KP) regulation PDF.
kp_documents = extract_text_image_from_pdf(file_paths[2])

In [15]:
# Inspect the first page's metadata, including the 'images' entry added during extraction.
kp_documents[0].metadata

{'producer': 'www.ilovepdf.com',
 'creator': '',
 'creationdate': '',
 'source': '../document/PERATURAN REKTOR NO 13 TAHUN 2020 TENTANG PANDUAN KERJA PRAKTIK.pdf',
 'file_path': '../document/PERATURAN REKTOR NO 13 TAHUN 2020 TENTANG PANDUAN KERJA PRAKTIK.pdf',
 'total_pages': 110,
 'format': 'PDF 1.7',
 'title': '',
 'author': '',
 'subject': '',
 'keywords': '',
 'moddate': '2020-11-23T08:42:12+00:00',
 'trapped': '',
 'modDate': 'D:20201123084212Z',
 'creationDate': '',
 'page': 0,
 'images': "['extracted_images/PERATURAN REKTOR NO 13 TAHUN 2020 TENTANG PANDUAN KERJA PRAKTIK/page0_1.png']"}

In [16]:
# Preview only the first pages; displaying the whole list floods the notebook output.
kp_documents[:2]

[Document(metadata={'producer': 'www.ilovepdf.com', 'creator': '', 'creationdate': '', 'source': '../document/PERATURAN REKTOR NO 13 TAHUN 2020 TENTANG PANDUAN KERJA PRAKTIK.pdf', 'file_path': '../document/PERATURAN REKTOR NO 13 TAHUN 2020 TENTANG PANDUAN KERJA PRAKTIK.pdf', 'total_pages': 110, 'format': 'PDF 1.7', 'title': '', 'author': '', 'subject': '', 'keywords': '', 'moddate': '2020-11-23T08:42:12+00:00', 'trapped': '', 'modDate': 'D:20201123084212Z', 'creationDate': '', 'page': 0, 'images': "['extracted_images/PERATURAN REKTOR NO 13 TAHUN 2020 TENTANG PANDUAN KERJA PRAKTIK/page0_1.png']"}, page_content=''),
 Document(metadata={'producer': 'www.ilovepdf.com', 'creator': '', 'creationdate': '', 'source': '../document/PERATURAN REKTOR NO 13 TAHUN 2020 TENTANG PANDUAN KERJA PRAKTIK.pdf', 'file_path': '../document/PERATURAN REKTOR NO 13 TAHUN 2020 TENTANG PANDUAN KERJA PRAKTIK.pdf', 'total_pages': 110, 'format': 'PDF 1.7', 'title': '', 'author': '', 'subject': '', 'keywords': '', 'mo

### Chunking Documents

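Metode pembuatan chunk dokumen: teks dipecah dengan `RecursiveCharacterTextSplitter` menjadi potongan berukuran maksimal 300 karakter dengan overlap 50 karakter. Pemotongan mengikuti batas ukuran karakter, sehingga satu chunk tidak selalu tepat satu kalimat.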

In [17]:
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain.schema import Document
def chunk_by_sentences(
  documents: list[Document],
  chunk_size: int = 300,
  chunk_overlap: int = 50,
) -> list[Document]:
  """Split documents into overlapping chunks.

  Despite the name, chunks are bounded by a character budget using
  ``RecursiveCharacterTextSplitter`` with its default separators; they are
  not guaranteed to align with sentence boundaries.

  Args:
    documents: Documents to split.
    chunk_size: Maximum chunk length passed to the splitter (default 300).
    chunk_overlap: Overlap between consecutive chunks passed to the splitter
      (default 50).

  Returns:
    The chunked documents returned by ``split_documents``.
  """
  text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=chunk_size,
    chunk_overlap=chunk_overlap,
  )
  return text_splitter.split_documents(documents)

In [18]:
chunk_kp = chunk_by_sentences(kp_documents)
# Preview a few chunks only; displaying every chunk floods the notebook output.
chunk_kp[:3]

[Document(metadata={'producer': 'www.ilovepdf.com', 'creator': '', 'creationdate': '', 'source': '../document/PERATURAN REKTOR NO 13 TAHUN 2020 TENTANG PANDUAN KERJA PRAKTIK.pdf', 'file_path': '../document/PERATURAN REKTOR NO 13 TAHUN 2020 TENTANG PANDUAN KERJA PRAKTIK.pdf', 'total_pages': 110, 'format': 'PDF 1.7', 'title': '', 'author': '', 'subject': '', 'keywords': '', 'moddate': '2020-11-23T08:42:12+00:00', 'trapped': '', 'modDate': 'D:20201123084212Z', 'creationDate': '', 'page': 3, 'images': "['extracted_images/PERATURAN REKTOR NO 13 TAHUN 2020 TENTANG PANDUAN KERJA PRAKTIK/page3_1.png']"}, page_content='PANDUAN \nKERJA PRAKTIK \n2020'),
 Document(metadata={'producer': 'www.ilovepdf.com', 'creator': '', 'creationdate': '', 'source': '../document/PERATURAN REKTOR NO 13 TAHUN 2020 TENTANG PANDUAN KERJA PRAKTIK.pdf', 'file_path': '../document/PERATURAN REKTOR NO 13 TAHUN 2020 TENTANG PANDUAN KERJA PRAKTIK.pdf', 'total_pages': 110, 'format': 'PDF 1.7', 'title': '', 'author': '', 'sub

### Storing & Indexing Vector Databases

In [19]:
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_ollama import OllamaEmbeddings
from langchain_ollama import OllamaLLM
from langchain_chroma import Chroma
import logging

# Logging configuration: emit INFO-level messages and above.
logging.basicConfig(level=logging.INFO)

def init_embedding(model_name="firqaaa/indo-sentence-bert-base", base_url="http://localhost:11434"):
    """Create the embedding model used for indexing and retrieval.

    Model names containing "llama" are served through Ollama; every other
    name is loaded as a Hugging Face embedding model.

    Args:
        model_name: Name of the embedding model.
        base_url: Ollama server URL; only used for Ollama-served models.

    Returns:
        An ``OllamaEmbeddings`` or ``HuggingFaceEmbeddings`` instance.

    Raises:
        Exception: Any error raised while constructing the embedding model is
            logged and then re-raised.
    """
    try:
        if "llama" in model_name:
            logging.info(f"Initializing LLaMA model: {model_name}")
            return OllamaEmbeddings(model=model_name, base_url=base_url)
        logging.info(f"Initializing Hugging Face model: {model_name}")
        return HuggingFaceEmbeddings(model_name=model_name)
    except Exception as e:
        logging.error(f"Error occurred while initializing embedding: {e}", exc_info=True)
        # Re-raise: silently returning None would hand callers an unusable
        # embedding function and surface the failure far from its cause.
        raise

    
def init_llm(model="llama3.2:1b", base_url="http://localhost:11434"):
  """Create the Ollama-served LLM used to generate answers.

  Args:
    model: Ollama model tag.
    base_url: Ollama server URL; defaults to the local server.

  Returns:
    An ``OllamaLLM`` instance.
  """
  return OllamaLLM(model=model, base_url=base_url)

def init_vector_db(embedding_func,name):
  """Open the Chroma collection "chat_bot_itk" for the given embedding.

  The store is persisted in a directory named "./vector_db" followed by
  ``name``, so each suffix gets its own on-disk store.
  """
  storage_dir = "./vector_db" + name
  return Chroma(
    collection_name="chat_bot_itk",
    persist_directory=storage_dir,
    embedding_function=embedding_func,
  )

In [20]:
# Instantiate the LLM with init_llm's default model.
llm = init_llm()


In [21]:

from uuid import uuid4
from langchain.schema import Document
from langchain_chroma import Chroma

def storing_to_vector_db(vector_db: "Chroma", documents: "list[Document]"):
  """Add documents to the vector store under reproducible IDs.

  Each ID is derived from the document's ``source`` and ``page`` metadata
  and its text, plus an occurrence counter so identical chunks within one
  call still get distinct IDs. The same documents therefore map to the same
  IDs on every run, instead of receiving fresh random IDs each time.

  Args:
    vector_db: Vector store exposing ``add_documents(documents=..., ids=...)``.
    documents: Documents to add; an empty list is a no-op.
  """
  # Local import keeps this cell self-contained; the notebook only imports uuid4.
  from uuid import NAMESPACE_URL, uuid5

  if not documents:
    print("No documents to add to the vector database.")
    return

  occurrences = {}
  ids = []
  for doc in documents:
    meta = doc.metadata or {}
    fingerprint = f"{meta.get('source', '')}|{meta.get('page', '')}|{doc.page_content}"
    nth = occurrences.get(fingerprint, 0)
    occurrences[fingerprint] = nth + 1
    ids.append(str(uuid5(NAMESPACE_URL, f"{fingerprint}|{nth}")))

  print(f"Adding {len(documents)} documents to the vector database.")
  # NOTE(review): relies on the store treating an already-known ID as an
  # update rather than a new record — confirm for the Chroma version in use.
  vector_db.add_documents(documents=documents, ids=ids)
  print(f"Successfully added {len(documents)} documents to the vector database with {len(ids)} unique IDs.")


In [22]:
# Number of chunks produced for the KP document.
len(chunk_kp)

452

In [23]:
# Build the default embedding model, open the Chroma store persisted under
# "./vector_db_indo-sentence-bert", and index the KP chunks into it.
embedding_func = init_embedding()
vector_db = init_vector_db(embedding_func,"_indo-sentence-bert")
storing_to_vector_db(vector_db,chunk_kp)

INFO:root:Initializing Hugging Face model: firqaaa/indo-sentence-bert-base
INFO:sentence_transformers.SentenceTransformer:Use pytorch device_name: cuda:0
INFO:sentence_transformers.SentenceTransformer:Load pretrained SentenceTransformer: firqaaa/indo-sentence-bert-base
INFO:chromadb.telemetry.product.posthog:Anonymized telemetry enabled. See                     https://docs.trychroma.com/telemetry for more information.


Adding 452 documents to the vector database.
Successfully added 452 documents to the vector database with 452 unique IDs.


In [None]:
from langchain_chroma import Chroma
from langchain_ollama import OllamaLLM
from langchain_core.prompts import PromptTemplate

def prompt_template(question, context):
  """Build the filled-in prompt for the academic assistant.

  Args:
    question: The user's question.
    context: Retrieved document text to ground the answer in.

  Returns:
    The prompt value produced by filling the template with ``question`` and
    ``context``, ready to be passed to ``llm.invoke``.
  """
  # from_template infers the {question} and {context} variables from the
  # text; the PromptTemplate constructor does not take the template text
  # positionally.
  template = PromptTemplate.from_template("""
   Anda adalah asisten akademik digital yang membantu mahasiswa memahami informasi kegiatan seperti magang, kerja praktik, atau seminar. Jawablah pertanyaan berdasarkan dokumen berikut. Jika jawabannya tidak tersedia, katakan dengan sopan bahwa informasi tidak ditemukan.

  Pertanyaan: {question}

  Dokumen Akademik:
  {context}

  Jawaban:
  """)
  # Return the filled-in prompt, not the unformatted template object.
  return template.invoke({"question":question, "context":context})

def retrieval_document(vector_db:Chroma,query):
  """Run a similarity search for ``query`` and return the five best matches.

  Returns:
    A dict whose "context" key holds the documents returned by the search.
  """
  return dict(context=vector_db.similarity_search(query, k=5))
  
def generate_response(llm: OllamaLLM, vector_db:Chroma,query):
  """Answer ``query`` using documents retrieved from the vector store.

  Args:
    llm: Language model used to generate the answer.
    vector_db: Vector store searched for supporting documents.
    query: The user's question.

  Returns:
    A dict with "question" (the query), "answer" (the generated text) and
    "context" (the retrieved documents).
  """
  retrieved_docs = retrieval_document(vector_db, query)["context"]
  docs_content = "\n\n".join(doc.page_content for doc in retrieved_docs)
  messages = prompt_template(query, docs_content)
  response = llm.invoke(messages)
  # OllamaLLM is a text-completion model whose invoke() returns a plain
  # string, which has no .content attribute. Fall back to the response
  # itself, while still accepting message objects that do expose .content.
  answer = getattr(response, "content", response)
  return {"question": query, "answer": answer, "context": retrieved_docs}




