## Creating an index and populating it with documents using Milvus and Nomic AI Embeddings

Simple example of how to ingest PDF, DOCX and PPTX documents into a Milvus vector store. In this example, the embeddings are the fully open source ones released by NomicAI, [nomic-embed-text-v1](https://huggingface.co/nomic-ai/nomic-embed-text-v1).

As described in [this blog post](https://blog.nomic.ai/posts/nomic-embed-text-v1), those embeddings feature a "8192 context-length that outperforms OpenAI Ada-002 and text-embedding-3-small on both short and long context tasks". In addition, they are:

- Open source
- Open data
- Open training code
- Fully reproducible and auditable

Requirements:
- A Milvus instance, either standalone or cluster.

### Needed packages and imports

In [None]:
!pip install -q einops==0.7.0 langchain==0.1.9 pypdf==4.0.2 pymilvus==2.3.6 sentence-transformers==2.4.0 python-docx unstructured[docx,pptx] python-pptx docx2txt

In [None]:
import requests
import os
from langchain.document_loaders import PyPDFDirectoryLoader, WebBaseLoader, Docx2txtLoader
from langchain_community.document_loaders import UnstructuredPowerPointLoader, PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings.huggingface import HuggingFaceEmbeddings
from langchain_community.vectorstores import Milvus
import docx2txt

### Base parameters, the Milvus connection info

In [None]:
# Connection settings for the Milvus instance used by the rest of the notebook.
MILVUS_HOST = "vectordb-milvus"
MILVUS_PORT = 19530
# Credentials can be supplied through the MILVUS_USERNAME / MILVUS_PASSWORD
# environment variables so that real secrets never have to be written into
# (and committed with) the notebook. The fallbacks are the values that were
# previously hard-coded here, so behaviour is unchanged when the variables
# are not set.
MILVUS_USERNAME = os.getenv("MILVUS_USERNAME", "root")
MILVUS_PASSWORD = os.getenv("MILVUS_PASSWORD", "Milvus")
# Name of the collection the documents are stored in.
MILVUS_COLLECTION = "redhat_notes"

## Initial index creation and document ingestion

#### Load PDFs

In [None]:
# Folder that is scanned (non-recursively) for ".pdf" files.
pdf_folder_path = "../../knowledge_base_data"
# One entry per successfully loaded file; each entry is whatever loader.load() returned.
all_data_pdfs = []
success_counter = 0

for filename in os.listdir(pdf_folder_path):
    # Anything that is not a PDF is left for the other loader cells.
    if not filename.endswith(".pdf"):
        continue
    file_path = os.path.join(pdf_folder_path, filename)
    try:
        loader = PyPDFLoader(file_path)
        pdf_items = loader.load()
    except Exception as e:
        # Best effort: report the failing file and carry on with the next one.
        print(f"Error loading file '{filename}': {e}")
    else:
        all_data_pdfs.append(pdf_items)
        success_counter += 1
print(f"Successfully loaded '{success_counter}' pdfs")

In [None]:
# Preview what was loaded for the first PDF (shown as the cell output).
# Raises IndexError when no PDF was loaded.
all_data_pdfs[0]

In [None]:
# Flatten the per-file PDF results into one flat list; the later cells keep
# adding the DOCX and PPTX items to this same list.
docs = [pdf_item for pdf_items in all_data_pdfs for pdf_item in pdf_items]

In [None]:
# Number of items collected so far (PDF content only at this point).
len(docs)

#### Load docx Files

In [None]:
# Folder that is scanned (non-recursively) for ".docx" files.
docx_folder_path = "../../knowledge_base_data"

# One entry per successfully loaded file; each entry is whatever loader.load() returned.
all_data_docx = []
success_counter = 0

for filename in os.listdir(docx_folder_path):
    # Only Word documents are handled in this cell.
    if not filename.endswith(".docx"):
        continue
    file_path = os.path.join(docx_folder_path, filename)
    try:
        loader = Docx2txtLoader(file_path)
        docx_items = loader.load()
    except Exception as e:
        # Best effort: report the failing file and carry on with the next one.
        print(f"Error loading file '{filename}': {e}")
    else:
        all_data_docx.append(docx_items)
        success_counter += 1
print(f"Successfully loaded '{success_counter}' documents")

In [None]:
# Add every item loaded from the .docx files to the shared docs list.
for docx_items in all_data_docx:
    docs.extend(docx_items)

In [None]:
# Number of items collected so far (PDF and DOCX content).
len(docs)

#### Load pptx files

In [None]:
# Folder that is scanned (non-recursively) for ".pptx" files.
pptx_folder_path = "../../knowledge_base_data"

# One entry per successfully loaded file; each entry is whatever loader.load() returned.
all_data_pptx = []
success_counter = 0

for filename in os.listdir(pptx_folder_path):
    # Only PowerPoint presentations are handled in this cell.
    if not filename.endswith(".pptx"):
        continue
    file_path = os.path.join(pptx_folder_path, filename)
    try:
        loader = UnstructuredPowerPointLoader(file_path)
        pptx_items = loader.load()
    except Exception as e:
        # Best effort: report the failing file and carry on with the next one.
        print(f"Error loading file '{filename}': {e}")
    else:
        all_data_pptx.append(pptx_items)
        success_counter += 1
print(f"Successfully loaded '{success_counter}' documents")

In [None]:
# Add every item loaded from the .pptx files to the shared docs list.
for pptx_items in all_data_pptx:
    docs.extend(pptx_items)

In [None]:
# docs holds the individual items returned by the loaders (a single file can
# contribute several of them), so the total is a document count, not a file count.
print(f"Loaded '{len(docs)}' documents in total")

#### Inject metadata

#### Merge both types of docs

#### Split documents into chunks with some overlap

In [None]:
# Splitter configuration: chunk_size=1024 with chunk_overlap=120, so that
# neighbouring chunks share some content.
splitter_settings = {"chunk_size": 1024, "chunk_overlap": 120}
text_splitter = RecursiveCharacterTextSplitter(**splitter_settings)

# Split every collected document into chunks.
all_splits = text_splitter.split_documents(docs)

# Show one sample chunk as the cell output (raises IndexError with fewer than 16 chunks).
all_splits[15]

len(docs)

#### Create the index and ingest the documents

In [None]:
# Keyword arguments handed to the embedding model. 'device': 'cuda' selects the GPU.
# If you don't want to use a GPU, drop the 'device' entry:
# model_kwargs = dict(trust_remote_code=True)
model_kwargs = dict(trust_remote_code=True, device="cuda")

# Embedding function backed by the nomic-embed-text-v1 model.
embeddings = HuggingFaceEmbeddings(
    model_name="nomic-ai/nomic-embed-text-v1",
    show_progress=True,
    model_kwargs=model_kwargs,
)

In [None]:
# Connection details are gathered first so the constructor call stays readable.
milvus_connection = {
    "host": MILVUS_HOST,
    "port": MILVUS_PORT,
    "user": MILVUS_USERNAME,
    "password": MILVUS_PASSWORD,
}

# NOTE(review): drop_old=True is passed here — presumably an existing collection
# with the same name is discarded and rebuilt; confirm before running against
# a collection whose content matters.
db = Milvus(
    embedding_function=embeddings,
    connection_args=milvus_connection,
    collection_name=MILVUS_COLLECTION,
    metadata_field="metadata",
    text_field="page_content",
    auto_id=True,
    drop_old=True,
)

In [None]:
# Add all chunks to the Milvus collection. As the last expression of the cell,
# the return value of add_documents is displayed as the cell output.
db.add_documents(all_splits)

#### Alternatively, add new documents

In [None]:
# Optional cell, intentionally left commented out. It mirrors the cells that
# create the embeddings and the Milvus store above, but passes drop_old=False
# instead of drop_old=True — presumably so that an existing collection is kept
# and the new chunks are added to it (confirm against the Milvus vector store
# documentation). Uncomment everything below to use it.

# If you don't want to use a GPU, you can remove the 'device': 'cuda' argument
# model_kwargs = {'trust_remote_code': True, 'device': 'cuda'}
# embeddings = HuggingFaceEmbeddings(
#     model_name="nomic-ai/nomic-embed-text-v1",
#     model_kwargs=model_kwargs,
#     show_progress=True
# )

# db = Milvus(
#     embedding_function=embeddings,
#     connection_args={"host": MILVUS_HOST, "port": MILVUS_PORT, "user": MILVUS_USERNAME, "password": MILVUS_PASSWORD},
#     collection_name=MILVUS_COLLECTION,
#     metadata_field="metadata",
#     text_field="page_content",
#     auto_id=True,
#     drop_old=False
#     )

# db.add_documents(all_splits)

#### Test query

In [None]:
# Sample question used to check that the collection can be searched.
query = "Who is someone?"
# Each result is unpacked as a (document, score) pair by the next cell.
docs_with_score = db.similarity_search_with_score(query=query)

In [None]:
# Print each hit framed by two 80-character rules: the score first, then the text.
separator = "-" * 80
for doc, score in docs_with_score:
    print(separator)
    print("Score: ", score)
    print(doc.page_content)
    print(separator)