## Import Libraries, Initialize Models, etc


In [96]:
from langchain_community.embeddings import HuggingFaceInferenceAPIEmbeddings
from langchain_core.documents import Document

from dotenv import load_dotenv

from qdrant_client import QdrantClient
from qdrant_client.models import PointStruct, VectorParams, Distance

import pandas as pd
import os

load_dotenv("../.env")

True

In [97]:
# Hugging Face model id of the embedding model.
model_name = "BAAI/bge-base-en-v1.5"

# Embedding client. The API key is read from the HUGGINGFACE_API_KEY
# environment variable (os.getenv yields None when the variable is unset).
embedding_model = HuggingFaceInferenceAPIEmbeddings(
    api_key=os.getenv("HUGGINGFACE_API_KEY"), model_name=model_name
)

# Smoke test: embed one sentence and display the first three components
# of the returned vector.
res = embedding_model.embed_query("The quick brown fox jumps over the lazy dog")
res[:3]

[-0.042558297514915466, -0.06038237363100052, 0.03893209621310234]

## Process Standard Sheets


In [98]:
# Path of the annotated workbook, relative to the working directory.
filepath = "xlsx/[ANNOTATED] Informasi Umum ITB.xlsx"


# Standard sheets: the readable ones that consist of a single column.
standard_sheets = [
    "Tentang ITB",
    "Penerimaan",
    # "Pendidikan",  # currently excluded from the standard sheets
    "Penelitian",
    "Pengabdian",
    "Multikampus",
    "FAQ",
]


def read_dfs(
    sheet_names: list[str],
    filepath: str = "xlsx/[ANNOTATED] Informasi Umum ITB.xlsx",
) -> dict[str, pd.DataFrame]:
    """Load the requested sheets and strip the rows that carry no content.

    For every sheet, rows whose cells are all empty are dropped, then rows
    whose first column contains the navigation text
    "Kembali ke Halaman Utama" are dropped. Columns are relabelled
    0..n-1 and the row index is renumbered 0..m-1, so row labels and row
    positions always agree for downstream code.

    ``pd.read_excel`` is called with its default header handling, so the
    first row of each sheet is consumed as the header and is not part of the
    returned data.

    Args:
        sheet_names: Names of the sheets to load.
        filepath: Path of the workbook. Defaults to the annotated
            "Informasi Umum ITB" workbook.

    Returns:
        Mapping of sheet name to its cleaned DataFrame.
    """
    navigation_text = "Kembali ke Halaman Utama"
    dfs = pd.read_excel(filepath, sheet_name=sheet_names)

    for name in sheet_names:
        sheet = dfs[name].dropna(how="all")

        # Match on a string view of the first column so that empty or
        # non-text cells cannot break the filter (NaN simply becomes "nan").
        first_column = sheet.iloc[:, 0].astype(str)
        is_navigation = first_column.str.contains(navigation_text, regex=False)
        sheet = sheet[~is_navigation]

        # Renumber only after *all* filtering: otherwise the removed rows
        # leave gaps in the index and labels stop matching positions.
        sheet = sheet.reset_index(drop=True)

        # Replace the header taken from the sheet with plain positional labels.
        sheet.columns = range(sheet.shape[1])
        dfs[name] = sheet

    return dfs


# Load and clean every standard sheet; read_dfs returns a mapping of
# sheet name -> DataFrame.
standard_dfs = read_dfs(standard_sheets)


# Sanity check: print the first rows of the first standard sheet.
print(standard_dfs[standard_sheets[0]].head())

                                                   0
0                                   [H1] TENTANG ITB
1  Institut Teknologi Bandung (ITB) merupakan sek...
2                                       [H1] Sejarah
3  Sejarah Pendidikan Tinggi Teknik di Indonesia ...
4  Sejak resmi dibuka untuk tahun kuliah 1920-192...


In [99]:
def process_standard_sheets(sheet_name: str, df: pd.DataFrame) -> "list[Document]":
    """Split a single-column sheet into one Document per "[H1]" section.

    Only the first column of ``df`` is read. Every cell containing the tag
    "[H1]" starts a new section: that cell becomes the title, and the cells
    up to (not including) the next tagged cell are joined with single spaces
    to form the content. Cells that appear before the first "[H1]" cell are
    not part of any section and are therefore not emitted.

    Args:
        sheet_name: Name of the sheet, stored in each document's metadata.
        df: Sheet data; sections are taken from its first column.

    Returns:
        One Document per "[H1]" cell, in sheet order. The list is empty when
        the sheet has no "[H1]" cell.
    """
    column = df.iloc[:, 0]

    # Literal (non-regex) match on a string view of the column, so the
    # brackets need no escaping and empty/non-text cells cannot break it.
    is_heading = column.astype(str).str.contains("[H1]", regex=False)

    # Work with row *positions*, not index labels: the frame may have had
    # rows filtered out, in which case labels and positions differ.
    heading_positions = [pos for pos, hit in enumerate(is_heading) if hit]
    boundaries = heading_positions + [len(column)]

    documents = []
    for start, stop in zip(boundaries, boundaries[1:]):
        title = column.iloc[start]

        # Skip empty cells and stringify the rest so join never sees a
        # non-string value.
        body_cells = column.iloc[start + 1 : stop].dropna()
        content = " ".join(str(cell) for cell in body_cells)

        documents.append(
            Document(
                page_content=content,
                metadata={"title": title, "sheet_name": sheet_name},
            )
        )
    return documents


# Gather the documents of every standard sheet into one flat list,
# keeping the sheets in the order of `standard_dfs`.
all_documents = []

for sheet_name, df in standard_dfs.items():
    all_documents += process_standard_sheets(sheet_name, df)

## Embedding and Inserting to Vector Database


In [101]:
# Collect the text of every document, in the order of `all_documents`.
input_texts = [doc.page_content for doc in all_documents]
# Trial run on the first text only: at this point `embeddings` holds a single
# vector, not one vector per document.
embeddings = embedding_model.embed_query(input_texts[0])

In [102]:
# Texts to embed, in the same order as `all_documents`.
input_texts = [doc.page_content for doc in all_documents]

# Embed in small batches through `embed_documents` instead of issuing one
# `embed_query` call per document; the result is still one vector per text,
# in input order.
embeddings = []
for start in range(0, len(input_texts), 16):
    batch = input_texts[start : start + 16]
    batch_vectors = embedding_model.embed_documents(batch)

    # Fail loudly on anything that is not "one vector per text" (for example
    # an error payload), rather than silently storing garbage.
    if not isinstance(batch_vectors, list) or len(batch_vectors) != len(batch):
        raise RuntimeError(
            f"Unexpected embedding response for texts {start}-{start + len(batch) - 1}: "
            f"{batch_vectors!r}"
        )

    embeddings.extend(batch_vectors)

  client.recreate_collection(


ResponseHandlingException: [Errno 111] Connection refused

In [None]:
# Connect to the Qdrant instance named by the QDRANT_URL environment variable.
# NOTE(review): os.getenv yields None when the variable is unset — confirm the
# client's fallback target is what you want in that case.
client = QdrantClient(os.getenv("QDRANT_URL"))

# Each document must be paired with exactly one vector; refuse to upload
# when the two lists cannot be matched up one-to-one.
if not embeddings:
    raise ValueError("No embeddings to upload.")
if len(embeddings) != len(all_documents):
    raise ValueError(
        f"Got {len(embeddings)} embeddings for {len(all_documents)} documents."
    )

# (Re)create the collection; the vector size is taken from the first embedding.
client.recreate_collection(
    collection_name="informasi_umum",
    vectors_config=VectorParams(size=len(embeddings[0]), distance=Distance.COSINE),
)

# Upload one point per document. The payload comes from the document itself
# (`all_documents`), not from `res`, which holds the floats of the smoke-test
# embedding and has no `page_content` / `metadata`.
client.upsert(
    collection_name="informasi_umum",
    points=[
        PointStruct(
            id=i,
            vector=vector,
            payload={
                "page_content": doc.page_content,
                "metadata": doc.metadata,
            },
        )
        for i, (doc, vector) in enumerate(zip(all_documents, embeddings))
    ],
)