In [8]:
from langchain_community.document_loaders import (
    PyPDFLoader,
    PDFPlumberLoader,
    AzureAIDocumentIntelligenceLoader,
)

from langchain_core.documents import Document

from langchain_upstage import UpstageLayoutAnalysisLoader
from dotenv import load_dotenv

from sentence_transformers import SentenceTransformer

from qdrant_client import QdrantClient
from qdrant_client.http import models
from qdrant_client.models import PointStruct, VectorParams, Distance

import pickle
import re
import os

# Load API keys and endpoints from the .env file one directory above the notebook.
load_dotenv("../.env")

# os.environ only accepts strings: assigning the None returned by os.getenv for a
# missing variable raises an opaque TypeError. Re-export the key only when it is
# actually defined, and report the missing configuration explicitly otherwise.
upstage_api_key = os.getenv("UPSTAGE_API_KEY")
if upstage_api_key is not None:
    os.environ["UPSTAGE_API_KEY"] = upstage_api_key
else:
    print("UPSTAGE_API_KEY is not set; Upstage loaders may fail to authenticate.")

In [43]:
filename = "Peraturan_Kemahasiswaan_2022.pdf"
# filename = "Peraturan_Akademik_2021.pdf"

# Extract the PDF text through Azure AI Document Intelligence using the
# "prebuilt-read" model; endpoint and key are read from the environment.
loader = AzureAIDocumentIntelligenceLoader(
    api_endpoint=os.getenv("AZURE_ENDPOINT"),
    api_key=os.getenv("AZURE_API_KEY"),
    file_path="pdf/" + filename,
    api_model="prebuilt-read",
)

data = loader.load()

# Cache the loader output on disk so later cells can reload it without calling
# the Azure service again. The with-block guarantees the file is flushed and
# closed even if pickling fails.
with open("pickle_pdf/" + filename + ".pkl", "wb") as pickle_file:
    pickle.dump(data, pickle_file)

In [48]:
filename = "Peraturan_Akademik_2021.pdf"

# Reload the cached loader output for the selected PDF.
# NOTE: pickle.load can execute arbitrary code; only load files written by this
# notebook, never pickles from an untrusted source.
with open("pickle_pdf/" + filename + ".pkl", "rb") as pickle_file:
    data = pickle.load(pickle_file)

In [49]:
def parse_documents(data: Document, judul: "str | None" = None) -> list[Document]:
    """Split one regulation text into a front-page document plus one document per "Pasal".

    The text is first cut at every "BAB <roman numeral>" heading. Whatever
    precedes the first heading becomes the front-page document (bab 0,
    pasal 0). Each chapter is then cut at every line that starts with
    "Pasal <number>"; the text before a chapter's first "Pasal" heading (the
    chapter title) is discarded, and every remaining piece becomes one document.

    Args:
        data: Document whose ``page_content`` holds the full regulation text.
        judul: Value stored under the "judul" metadata key. When omitted, the
            module-level ``filename`` variable is used, which keeps existing
            calls working unchanged.

    Returns:
        The front-page document followed by one document per detected Pasal,
        in order of appearance. ``page_content`` has all whitespace runs
        collapsed to single spaces. Metadata keys are "judul", "bab" (index of
        the chapter in splitting order) and "pasal".

    Note:
        "pasal" is a running counter over detected sections, not the number
        parsed from the heading. NOTE(review): it matches the printed Pasal
        number only if every heading is detected exactly once - confirm
        against the source PDF.
    """
    if judul is None:
        # Backward-compatible fallback to the notebook-level variable.
        judul = filename

    all_text = data.page_content

    # Cut at every "BAB <roman numeral>" heading; element 0 is the front matter.
    chapters = re.split(r"BAB\s*[IVXLCDM]+\s*", all_text)
    print("Num of chapters:", len(chapters))

    # Front page: stored as-is, without a fabricated "BAB " prefix.
    documents = [
        Document(
            page_content=" ".join(chapters[0].split()),
            metadata={"judul": judul, "bab": 0, "pasal": 0},
        )
    ]
    total_sections = 0

    for i, chapter in enumerate(chapters):
        # Element 0 of the split is the text before the first "Pasal" heading
        # (chapter title / front matter). Dropping it by position, rather than
        # by searching for the substring "BAB", keeps any Pasal whose own text
        # happens to contain "BAB".
        sections = re.split(r"\nPasal \d+", chapter)[1:]

        print("Num of sections in Chapter", i, ":", len(sections))

        documents += [
            Document(
                # str.split() with no arguments collapses every whitespace run
                # (newlines, tabs, repeated spaces), so no further cleanup is needed.
                page_content=" ".join(section.split()),
                metadata={"judul": judul, "bab": i, "pasal": pasal_number},
            )
            for pasal_number, section in enumerate(sections, start=total_sections + 1)
        ]

        total_sections += len(sections)

    print("Total sections:", total_sections)

    return documents


# Parse the first (and, for this loader output, the only one used) document
# and cache the per-Pasal result next to the other parsed regulations.
res = parse_documents(data[0])

# The with-block closes the file handle deterministically after the dump.
with open("pickle_res/" + filename + ".pkl", "wb") as pickle_file:
    pickle.dump(res, pickle_file)

Num of chapters: 15
Num of sections in Chapter 0 : 0
Num of sections in Chapter 1 : 1
Num of sections in Chapter 2 : 10
Num of sections in Chapter 3 : 8
Num of sections in Chapter 4 : 7
Num of sections in Chapter 5 : 9
Num of sections in Chapter 6 : 16
Num of sections in Chapter 7 : 12
Num of sections in Chapter 8 : 2
Num of sections in Chapter 9 : 1
Num of sections in Chapter 10 : 1
Num of sections in Chapter 11 : 1
Num of sections in Chapter 12 : 1
Num of sections in Chapter 13 : 1
Num of sections in Chapter 14 : 1
Total sections: 71


In [5]:
# Load every cached parse result in pickle_res/ and concatenate the document lists.
# os.listdir returns entries in arbitrary order, so the names are sorted to keep
# the document order (and the point ids derived from it later) reproducible.
# Entries without the ".pkl" suffix are skipped so stray files cannot break
# pickle.load.
res_files = sorted(name for name in os.listdir("pickle_res") if name.endswith(".pkl"))
res = []

for file in res_files:
    # NOTE: pickle.load can execute arbitrary code; only load files this notebook wrote.
    with open("pickle_res/" + file, "rb") as pickle_file:
        res += pickle.load(pickle_file)

# embed results
# trust_remote_code=True runs code shipped with the model repository, and
# .cuda() moves the model to the GPU, so this cell needs a CUDA device.
model = SentenceTransformer("infgrad/stella_en_400M_v5", trust_remote_code=True).cuda()

  @torch.library.impl_abstract("xformers_flash::flash_fwd")
  @torch.library.impl_abstract("xformers_flash::flash_bwd")
Some weights of the model checkpoint at infgrad/stella_en_400M_v5 were not used when initializing NewModel: ['new.pooler.dense.bias', 'new.pooler.dense.weight']
- This IS expected if you are initializing NewModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing NewModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [6]:
# Gather the text of every parsed document, in the same order as `res`,
# so that embedding k corresponds to res[k].
input_texts = [document.page_content for document in res]

# Encode all texts at once; the vectors are kept un-normalized.
embeddings = model.encode(
    input_texts,
    normalize_embeddings=False,
)

# Report how many vectors were produced.
num_embeddings = len(embeddings)
print(num_embeddings)

103


In [9]:
COLLECTION_NAME = "peraturan_stella"

client = QdrantClient(os.getenv("QDRANT_URL"))

# recreate_collection is deprecated in qdrant-client; perform the same
# "drop if present, then create" sequence with the supported calls.
if client.collection_exists(COLLECTION_NAME):
    client.delete_collection(COLLECTION_NAME)

client.create_collection(
    collection_name=COLLECTION_NAME,
    # Vector size is taken from the first embedding, so `embeddings` must be non-empty.
    vectors_config=VectorParams(size=len(embeddings[0]), distance=Distance.COSINE),
)

# Upload one point per document. `embeddings` was produced from `res` in the
# same order, so zip pairs each vector with its source document; the point id
# is the document's position in `res`.
client.upsert(
    collection_name=COLLECTION_NAME,
    points=[
        PointStruct(
            id=i,
            vector=vector,
            payload={
                "page_content": doc.page_content,
                "metadata": doc.metadata,
            },
        )
        for i, (doc, vector) in enumerate(zip(res, embeddings))
    ],
)

  client.recreate_collection(


UpdateResult(operation_id=0, status=<UpdateStatus.COMPLETED: 'completed'>)