## Import Libraries, Initialize Models, etc


In [2]:
from langchain_community.embeddings import HuggingFaceInferenceAPIEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_core.documents import Document

from dotenv import load_dotenv

from qdrant_client import QdrantClient
from qdrant_client.models import PointStruct, VectorParams, Distance

import pandas as pd
import os

load_dotenv("../.env")

True

In [3]:
model_name = "BAAI/bge-base-en-v1.5"

embedding_model = HuggingFaceInferenceAPIEmbeddings(
    api_key=os.getenv("HUGGINGFACE_API_KEY"), model_name=model_name
)

# test if embedding model works
res = embedding_model.embed_query("The quick brown fox jumps over the lazy dog")
res[:3]

[-0.042558297514915466, -0.06038237363100052, 0.03893209621310234]

## Process Standard Sheets


In [4]:
filepath = "xlsx/[ANNOTATED] Informasi Umum ITB.xlsx"


# standard sheets: readable ones with one column
standard_sheets = [
    "Tentang ITB",
    "Penerimaan",
    "Pendidikan",
    "Penelitian",
    "Pengabdian",
    "Multikampus",
    "FAQ",
]


def read_dfs(sheet_names: list[str]) -> dict[str, pd.DataFrame]:
    # filepath = "xlsx/[ANNOTATED] Informasi Umum ITB.xlsx"
    # dfs = pd.read_excel(filepath, sheet_name=sheet_names)
    url = "https://docs.google.com/spreadsheets/d/1p0_lwcGKLP5NtsLx_cd5kCyc7i58VuTV/edit?usp=sharing&ouid=103581786644820929582&rtpof=true&sd=true"
    url_pandas = url.replace("/edit?usp=sharing", "/export?format=xlsx")
    dfs = pd.read_excel(url_pandas, sheet_name=sheet_names)

    for name in sheet_names:
        # for each df, drop a row if the entire row is null
        # also drop cells with the content "kembali ke halaman utama"
        dfs[name] = dfs[name].dropna(how="all").reset_index(drop=True)
        dfs[name] = dfs[name][
            ~dfs[name].iloc[:, 0].str.contains("Kembali ke Halaman Utama", na=False)
        ]

        # do not make the first row as the title of the column
        dfs[name].columns = range(dfs[name].shape[1])

    return dfs


# read standard sheets
standard_dfs = read_dfs(standard_sheets)


# print the first df for sanity check
print(standard_dfs[standard_sheets[0]].head())

                                                   0
0                                   [H1] TENTANG ITB
1  Institut Teknologi Bandung (ITB) merupakan sek...
2                                       [H1] Sejarah
3  Sejarah Pendidikan Tinggi Teknik di Indonesia ...
4  Sejak resmi dibuka untuk tahun kuliah 1920-192...


In [5]:
all_documents = []

In [6]:
def process_standard_sheets(sheet_name: str, df: pd.DataFrame):
    # we want to read cell by cell: there is only one column in this dataframe
    # essentially, we want to group the content between cells with the tag [H1]
    # get the indices of the cells with the tag [H1]
    documents = []
    h1_indices = df[df.iloc[:, 0].str.contains("\[H1\]")].index
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=1000,
        chunk_overlap=100,
        length_function=len,
        is_separator_regex=False,
    )

    for i in range(len(h1_indices)):
        index = h1_indices[i]
        next_idx = h1_indices[i + 1] if i + 1 < len(h1_indices) else len(df)

        # get the title and the content of the part between the [H1] tags
        title = df.iloc[index, 0]
        content = "".join(str(df.iloc[index + 1 : next_idx, 0].values))
        content = content.replace("\xa0", " ")

        title = title.replace("[H1] ", "")

        # documents.append(
        #     Document(
        #         page_content=content,
        #         metadata={"title": title, "sheet_name": sheet_name},
        #     )
        # )

        texts = text_splitter.create_documents([content])

        for i in range(len(texts)):
            texts[i].metadata = {"title": title, "sheet_name": sheet_name}

        documents.extend(texts)

    return documents


for sheet_name, df in standard_dfs.items():
    docs = process_standard_sheets(sheet_name, df)
    all_documents.extend(docs)

len(all_documents)

165

In [7]:
def process_sheets_with_tables(
    sheet_name: str, df: pd.DataFrame, process_text_only: bool = True
):
    documents = []
    h1_indices = df[df.iloc[:, 0].str.contains("\[H1\]", na=False)].index
    table_indices = df[df.iloc[:, 0].str.contains("\[TABLE\]", na=False)].index

    print(h1_indices)
    print(table_indices)
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=1000,
        chunk_overlap=100,
        length_function=len,
        is_separator_regex=False,
    )

    for i in range(len(h1_indices)):
        index = h1_indices[i]
        next_idx = h1_indices[i + 1] if i + 1 < len(h1_indices) else len(df)

        # find the table index that is more than index but less than next idx
        table_index = table_indices[
            (table_indices > index) & (table_indices < next_idx)
        ]

        # update the next_idx to be the table index if it exists
        if len(table_index) > 0:
            next_idx = table_index[0]

        # get the title and the content of the part between the [H1] tags
        title = df.iloc[index, 0]
        content = "".join(str(df.iloc[index + 1 : next_idx, 0].values))
        content = content.replace("\xa0", " ")

        title = title.replace("[H1] ", "")

        texts = text_splitter.create_documents([content])

        for i in range(len(texts)):
            texts[i].metadata = {"title": title, "sheet_name": sheet_name}

        documents.extend(texts)

    return documents


sheets_with_tables = ["Info Pendaftaran - S1 - Cleaned"]
sheets_with_tables_dfs = read_dfs(sheets_with_tables)

for sheet_name, df in sheets_with_tables_dfs.items():
    docs = process_sheets_with_tables(sheet_name, df)
    all_documents.extend(docs)

Index([  0,   8,  19,  33,  53, 124, 134, 147, 152, 164, 175, 182, 186, 197,
       211, 218, 236, 243, 247, 254, 265, 272, 276, 283, 307, 376, 382, 386,
       401, 441, 457, 461, 473, 477, 492, 505, 513, 540, 542, 544, 579, 602,
       671, 680, 699, 717, 752, 755],
      dtype='int64')
Index([362, 486, 650, 673, 759, 792, 800, 862], dtype='int64')


In [9]:
all_documents

[Document(metadata={'title': 'TENTANG ITB', 'sheet_name': 'Tentang ITB'}, page_content="['Institut Teknologi Bandung (ITB) merupakan sekolah tinggi teknik pertama di Indonesia yang didirikan pada tanggal 2 Maret 1959 di Jawa Barat yang mengemban misi pengabdian ilmu pengetahuan dan teknologi untuk memajukan Indonesia. Lahir dalam suasana penuh dinamika yang dilandasi dengan semangat perjuangan Proklamasi Kemerdekaan, ITB hadir untuk mengoptimalkan pembangunan bangsa yang maju dan bermartabat.']"),
 Document(metadata={'title': 'Sejarah', 'sheet_name': 'Tentang ITB'}, page_content="['Sejarah Pendidikan Tinggi Teknik di Indonesia berawal pada abad ke-20, ketika pemerintah kolonial Belanda mendirikan de Techniche Hoogeschool te Bandung (TH) pada 3 Juli 1920 di lahan seluas 30 hektar di Bandung. Saat itu hanya terdapat satu fakultas yaitu de Faculteit van Technische Wetenschap dan hanya satu jurusan yaitu de afdeeling der We gen Waterbouw. Pendirian perguruan tinggi ini dimaksudkan untuk me

## Embedding and Inserting to Vector Database


In [10]:
input_texts = [doc.page_content for doc in all_documents]

embeddings = []

for i, text in enumerate(input_texts):
    print(f"Processing document {i+1}/{len(input_texts)}")
    embeddings.append(embedding_model.embed_query(text))

Processing document 1/265
Processing document 2/265
Processing document 3/265
Processing document 4/265
Processing document 5/265
Processing document 6/265
Processing document 7/265
Processing document 8/265
Processing document 9/265
Processing document 10/265
Processing document 11/265
Processing document 12/265
Processing document 13/265
Processing document 14/265
Processing document 15/265
Processing document 16/265
Processing document 17/265
Processing document 18/265
Processing document 19/265
Processing document 20/265
Processing document 21/265
Processing document 22/265
Processing document 23/265
Processing document 24/265
Processing document 25/265
Processing document 26/265
Processing document 27/265
Processing document 28/265
Processing document 29/265
Processing document 30/265
Processing document 31/265
Processing document 32/265
Processing document 33/265
Processing document 34/265
Processing document 35/265
Processing document 36/265
Processing document 37/265
Processing

In [11]:
client = QdrantClient(os.getenv("VECTOR_DB_ENDPOINT"))


client.recreate_collection(
    collection_name="informasi_umum",
    vectors_config=VectorParams(size=len(embeddings[0]), distance=Distance.COSINE),
)

client.upsert(
    collection_name="informasi_umum",
    points=[
        PointStruct(
            id=i,
            vector=vector,
            payload={
                "page_content": all_documents[i].page_content,
                "metadata": all_documents[i].metadata,
            },
        )
        for i, vector in enumerate(embeddings)
    ],
)

  client.recreate_collection(


UpdateResult(operation_id=0, status=<UpdateStatus.COMPLETED: 'completed'>)