## Import Libraries, Initialize Models, etc


In [2]:
from langchain_community.embeddings import HuggingFaceInferenceAPIEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_core.documents import Document
from langchain_openai import OpenAIEmbeddings

from dotenv import load_dotenv

from qdrant_client import QdrantClient
from qdrant_client.models import PointStruct, VectorParams, Distance

import pandas as pd
import os

load_dotenv("../.env")

True

In [5]:
# model_name = "BAAI/bge-m3"

# embedding_model = HuggingFaceInferenceAPIEmbeddings(
#     api_key=os.getenv("HUGGINGFACE_API_KEY"), model_name=model_name
# )
embedding_model = OpenAIEmbeddings(model="text-embedding-3-large")

# test if embedding model works
res = embedding_model.embed_query("The quick brown fox jumps over the lazy dog")
res[:3]

[-0.009787405841052532, 0.002850267803296447, -0.0009010981302708387]

## Process Standard Sheets


In [8]:
filepath = "xlsx/[ANNOTATED] Informasi Umum ITB.xlsx"


# standard sheets: readable ones with one column
standard_sheets = [
    "Tentang ITB",
    "Penerimaan",
    "Pendidikan",
    "Penelitian",
    "Pengabdian",
    "Multikampus",
    "FAQ",
]


def read_dfs(sheet_names: list[str]) -> dict[str, pd.DataFrame]:
    # filepath = "xlsx/[ANNOTATED] Informasi Umum ITB.xlsx"
    # dfs = pd.read_excel(filepath, sheet_name=sheet_names)
    url = "https://docs.google.com/spreadsheets/d/1p0_lwcGKLP5NtsLx_cd5kCyc7i58VuTV/edit?usp=sharing&ouid=103581786644820929582&rtpof=true&sd=true"
    url_pandas = url.replace("/edit?usp=sharing", "/export?format=xlsx")
    dfs = pd.read_excel(url_pandas, sheet_name=sheet_names)

    for name in sheet_names:
        # for each df, drop a row if the entire row is null
        # also drop cells with the content "kembali ke halaman utama"
        dfs[name] = dfs[name].dropna(how="all").reset_index(drop=True)
        dfs[name] = dfs[name][
            ~dfs[name].iloc[:, 0].str.contains("Kembali ke Halaman Utama", na=False)
        ]

        # do not make the first row as the title of the column
        dfs[name].columns = range(dfs[name].shape[1])

    return dfs


# read standard sheets
standard_dfs = read_dfs(standard_sheets)


# print the first df for sanity check
print(standard_dfs[standard_sheets[0]].head())

                                                   0
0                                   [H1] TENTANG ITB
1  Institut Teknologi Bandung (ITB) merupakan sek...
2                                       [H1] Sejarah
3  Sejarah Pendidikan Tinggi Teknik di Indonesia ...
4  Sejak resmi dibuka untuk tahun kuliah 1920-192...


In [7]:
all_documents = []

In [9]:
def process_standard_sheets(sheet_name: str, df: pd.DataFrame):
    # we want to read cell by cell: there is only one column in this dataframe
    # essentially, we want to group the content between cells with the tag [H1]
    # get the indices of the cells with the tag [H1]
    documents = []
    h1_indices = df[df.iloc[:, 0].str.contains("\[H1\]")].index
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=2000,
        chunk_overlap=100,
        length_function=len,
        is_separator_regex=False,
    )

    for i in range(len(h1_indices)):
        index = h1_indices[i]
        next_idx = h1_indices[i + 1] if i + 1 < len(h1_indices) else len(df)

        # get the title and the content of the part between the [H1] tags
        title = df.iloc[index, 0]
        content = "".join(str(df.iloc[index + 1 : next_idx, 0].values))
        content = content.replace("\\xa0", " ").replace("\n", "")

        title = title.replace("[H1] ", "")

        # documents.append(
        #     Document(
        #         page_content=content,
        #         metadata={"title": title, "sheet_name": sheet_name},
        #     )
        # )

        texts = text_splitter.create_documents([content])

        for i in range(len(texts)):
            texts[i].metadata = {"title": title, "sheet_name": sheet_name}

        documents.extend(texts)

    return documents


for sheet_name, df in standard_dfs.items():
    docs = process_standard_sheets(sheet_name, df)
    all_documents.extend(docs)

len(all_documents)

78

In [15]:
def process_sheets_with_tables(
    sheet_name: str, df: pd.DataFrame, process_text_only: bool = True
):
    documents = []
    h1_indices = df[df.iloc[:, 0].str.contains("\[H1\]", na=False)].index
    table_indices = df[df.iloc[:, 0].str.contains("\[TABLE\]", na=False)].index

    print(h1_indices)
    print(table_indices)
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=2000,
        chunk_overlap=100,
        length_function=len,
        is_separator_regex=False,
    )

    for i in range(len(h1_indices)):
        index = h1_indices[i]
        next_idx = h1_indices[i + 1] if i + 1 < len(h1_indices) else len(df)

        # find the table index that is more than index but less than next idx
        table_index = table_indices[
            (table_indices > index) & (table_indices < next_idx)
        ]

        # update the next_idx to be the table index if it exists
        if len(table_index) > 0:
            next_idx = table_index[0]

        # get the title and the content of the part between the [H1] tags
        title = df.iloc[index, 0]
        content = "".join(str(df.iloc[index + 1 : next_idx, 0].values))
        content = content.replace("\\xa0", " ").replace("\n", "")

        title = title.replace("[H1] ", "")

        texts = text_splitter.create_documents([content])

        for i in range(len(texts)):
            texts[i].metadata = {"title": title, "sheet_name": sheet_name}

        documents.extend(texts)

    return documents


sheets_with_tables = ["Info Pendaftaran - S1 - Cleaned"]
sheets_with_tables_dfs = read_dfs(sheets_with_tables)

for sheet_name, df in sheets_with_tables_dfs.items():
    docs = process_sheets_with_tables(sheet_name, df)
    all_documents.extend(docs)

Index([  0,   8,  19,  33,  53, 124, 134, 147, 152, 164, 175, 182, 186, 197,
       211, 218, 236, 243, 247, 254, 265, 272, 276, 283, 307, 376, 382, 386,
       401, 441, 457, 461, 473, 477, 492, 505, 513, 540, 542, 544, 579, 602,
       671, 680, 699, 717, 752, 755],
      dtype='int64')
Index([362, 486, 650, 673, 759, 792, 800, 862], dtype='int64')


In [10]:
len(all_documents)

78

## Embedding and Inserting to Vector Database


In [11]:
input_texts = [doc.page_content for doc in all_documents]

# embeddings = [embedding_model.embed_query(text) for text in input_texts]
embeddings = embedding_model.embed_documents(input_texts)

# embeddings = []

# for i, text in enumerate(input_texts):
#     print(f"Processing document {i+1}/{len(input_texts)}")
#     embeddings.append(embedding_model.embed_query(text))

In [12]:
embeddings

[[0.009476748295128345,
  0.001023535500280559,
  -0.0014938716776669025,
  -0.03299589082598686,
  -0.011082660406827927,
  -0.02476092241704464,
  -0.02595601975917816,
  0.01147480122745037,
  0.012119033373892307,
  0.015788355842232704,
  0.002934057265520096,
  -0.0026796322781592607,
  0.06636524945497513,
  0.01971910521388054,
  -0.04638471454381943,
  0.006862472742795944,
  0.01532152108848095,
  -0.0006524017662741244,
  -0.0008980736020021141,
  -0.01189495250582695,
  -0.004556308500468731,
  -0.019037526100873947,
  -0.041044123470783234,
  -0.04970857873558998,
  -0.0025185742415487766,
  0.009476748295128345,
  -0.001707448624074459,
  0.0005491145420819521,
  -0.014135760255157948,
  0.022482767701148987,
  0.023042969405651093,
  -0.013342141173779964,
  -0.03441506624221802,
  -0.02265082858502865,
  0.022389400750398636,
  -0.013650251552462578,
  -0.024032659828662872,
  0.05837303400039673,
  -0.0850759893655777,
  -0.022277360782027245,
  0.003071773564442992,
 

In [13]:
client = QdrantClient(os.getenv("VECTOR_DB_ENDPOINT"))


client.recreate_collection(
    collection_name="informasi_umum",
    vectors_config=VectorParams(size=len(embeddings[0]), distance=Distance.COSINE),
)

client.upsert(
    collection_name="informasi_umum",
    points=[
        PointStruct(
            id=i,
            vector=vector,
            payload={
                "page_content": all_documents[i].page_content,
                "metadata": all_documents[i].metadata,
            },
        )
        for i, vector in enumerate(embeddings)
    ],
)

  client.recreate_collection(


UpdateResult(operation_id=0, status=<UpdateStatus.COMPLETED: 'completed'>)