pymilvus.exception.paramerror length of string exceeds max length. #36736
Replies: 2 comments
-
|
Milvus is currently not a good place to store large content. You can use an external database to store the large content and define a varchar field in Milvus that holds the remote path of that content. Run the vector search in Milvus; it returns the top-k items, and you can then use the remote path of each item to fetch the content from the external database. |
Beta Was this translation helpful? Give feedback.
-
|
@heisenbruhh Milvus is primarily designed for storing vectors; any additional data (of any datatype) functions as supplementary, attached information. Currently, Milvus does not offer a way to modify the size limits, though future updates may address this. Here are some possible solutions, including @yhmo's suggestions:
A few recommendations:
|
Beta Was this translation helpful? Give feedback.
Uh oh!
There was an error while loading. Please reload this page.
-
I noticed that the string type in Milvus currently has a limitation of 64K bytes as its maximum size. While this works for most cases, I’m dealing with scenarios where the data I need to store exceeds this size constraint.
What are the best practices for efficiently storing larger strings or text data in Milvus? Specifically, I’m interested in exploring alternatives to manually splitting the data into smaller chunks. Are there any recommended strategies, approaches, or Milvus-native solutions that handle large data more effectively? Additionally, are there any potential limitations, performance considerations, or trade-offs to be aware of when working with large text data in Milvus?
the code for chunking:
import os
import re
import logging
from tqdm import tqdm
from pdfbox import PDFBox
from pdf2image import convert_from_path
import pytesseract
import fitz
from langchain_core.documents import Document
from concurrent.futures import ThreadPoolExecutor, as_completed
from pptx import Presentation
from docx import Document as DocxDocument
import openpyxl
import csv
from langchain_community.vectorstores import Milvus
from langchain_huggingface import HuggingFaceEmbeddings
# Module-level embedding model, constructed once when the module is imported.
# NOTE(review): device 0 presumably selects the first GPU - confirm.
embeddings = HuggingFaceEmbeddings(
    model_name='sentence-transformers/all-MiniLM-L6-v2',
    model_kwargs={'device': 0},
)
# Send INFO-and-above records to a log file; console output is left disabled.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[
        logging.FileHandler("document_processing.log"),
        # logging.StreamHandler() # Output to console as well
    ],
)
# The bare identifier `name` is undefined at module level and raises NameError;
# `__name__` gives the logger the importing module's name.
logger = logging.getLogger(__name__)
def get_total_pages(pdf_path):
    """Return the number of pages in a PDF, read with PyMuPDF (``fitz``).

    Args:
        pdf_path: Path of the PDF file to open.

    Returns:
        The document's ``page_count``, or ``None`` if opening or reading the
        file raises (the error is logged).
    """
    try:
        pdf_document = fitz.open(pdf_path)
        try:
            total_pages = pdf_document.page_count
        finally:
            # Close the document even when reading the page count fails, so
            # the handle is not leaked on the error path.
            pdf_document.close()
        logger.info(f"Total pages in {pdf_path}: {total_pages}")
        return total_pages
    except Exception as e:
        logger.error(f"Error getting page count from {pdf_path}: {e}")
        return None
def extract_text_page_by_page(pdf_path, output_dir):
""" Extract text page by page using pdfbox-python """
p = PDFBox()
total_pages = get_total_pages(pdf_path)
if total_pages is None:
return []
def extract_text_with_ocr(pdf_path):
    """Run OCR over every page image of a PDF.

    Args:
        pdf_path: Path of the PDF to rasterise with ``convert_from_path``.

    Returns:
        A list of ``(page_number, text)`` tuples with 1-based page numbers,
        or an empty list if any step raises (the error is logged).
    """
    try:
        page_images = convert_from_path(pdf_path)
        ocr_pages = []
        for page_number, page_image in enumerate(page_images, start=1):
            page_text = pytesseract.image_to_string(page_image)
            ocr_pages.append((page_number, page_text))
            logger.info(f"Extracted OCR text from page {page_number} of {pdf_path}")
        return ocr_pages
    except Exception as e:
        logger.error(f"Error extracting OCR text from {pdf_path}: {e}")
        return []
def process_pptx(file_path):
    """Extract the text of a .pptx presentation, one entry per slide.

    Args:
        file_path: Path of the presentation to open.

    Returns:
        A list of ``(slide_number, text)`` tuples with 1-based slide numbers;
        each text is the newline-joined ``text`` of every shape that has one.
        Returns an empty list if processing raises (the traceback is logged).
    """
    try:
        presentation = Presentation(file_path)
        slides_text = []
        for slide_number, slide in enumerate(presentation.slides, start=1):
            # Only shapes exposing a ``text`` attribute contribute.
            shape_texts = [shape.text for shape in slide.shapes if hasattr(shape, "text")]
            slides_text.append((slide_number, "\n".join(shape_texts)))
        logger.info(f"Extracted text from {len(presentation.slides)} slides in {file_path}")
        return slides_text
    except Exception:
        logger.exception(f"Error processing PPTX file {file_path}")
        return []
def process_docx(file_path):
    """Extract the text of a .docx document, one entry per paragraph.

    Args:
        file_path: Path of the document to open.

    Returns:
        A list of ``(paragraph_number, text)`` tuples with 1-based numbering,
        or an empty list if processing raises (the traceback is logged).
    """
    try:
        document = DocxDocument(file_path)
        paragraphs_text = []
        for paragraph_number, paragraph in enumerate(document.paragraphs, start=1):
            paragraphs_text.append((paragraph_number, paragraph.text))
        logger.info(f"Extracted {len(document.paragraphs)} paragraphs from {file_path}")
        return paragraphs_text
    except Exception:
        logger.exception(f"Error processing DOCX file {file_path}")
        return []
def process_txt(file_path):
    """Read a .txt file and return its content as a single page.

    Args:
        file_path: Path of the text file to read.

    Returns:
        ``[(1, text)]`` with the whole file content, or an empty list if the
        file cannot be read or decoded (the error is logged).
    """
    try:
        # Decode explicitly as UTF-8 rather than with the platform's locale
        # encoding, so the same file reads identically on every machine.
        with open(file_path, 'r', encoding='utf-8') as f:
            text = f.read()
        return [(1, text)]  # Text file considered as one page
    except Exception as e:
        logger.error(f"Error processing TXT file {file_path}: {e}")
        return []
def process_xlsx(file_path):
    """Extract the text of an .xlsx workbook, one entry per sheet.

    Args:
        file_path: Path of the workbook to load.

    Returns:
        A list of ``(sheet_name, text)`` tuples. Each text has one line per
        row, with the ``str()`` of every cell value joined by commas.
        Returns an empty list if processing raises (the traceback is logged).
    """
    try:
        workbook = openpyxl.load_workbook(file_path)
        sheets_text = []
        for sheet_name in workbook.sheetnames:
            row_lines = []
            for row in workbook[sheet_name].iter_rows():
                # str() is applied to every value as-is, whatever its type.
                row_lines.append(",".join([str(cell.value) for cell in row]))
            sheets_text.append((sheet_name, "\n".join(row_lines)))
        logger.info(f"Extracted text from {len(workbook.sheetnames)} sheets in {file_path}")
        return sheets_text
    except Exception:
        logger.exception(f"Error processing XLSX file {file_path}")
        return []
def process_csv(file_path):
    """Read a .csv file and return its rows as a single page of text.

    Args:
        file_path: Path of the CSV file to read.

    Returns:
        ``[(1, text)]`` where ``text`` has one line per parsed row, with the
        row's fields joined by commas. Returns an empty list if the file
        cannot be read or parsed (the error is logged).
    """
    try:
        # The csv module requires newline='' so that line endings embedded in
        # quoted fields reach the parser untranslated; UTF-8 is set explicitly
        # instead of relying on the platform's locale encoding.
        with open(file_path, 'r', newline='', encoding='utf-8') as f:
            reader = csv.reader(f)
            text = "\n".join([",".join(row) for row in reader])
        return [(1, text)]  # Consider CSV as one page
    except Exception as e:
        logger.error(f"Error processing CSV file {file_path}: {e}")
        return []
def clean_text(text):
    """Clean extracted text by collapsing dot runs and dropping short lines.

    Args:
        text: Raw extracted text, possibly spanning several lines.

    Returns:
        The cleaned text: every run of three or more literal dots is replaced
        by a single dot, lines with fewer than four whitespace-separated
        words are removed, and the remaining lines are stripped and joined
        with newlines.
    """
    # The dot must be escaped: an unescaped `.{3,}` matches ANY three or more
    # characters, which turns every line into "." and leaves nothing behind
    # once the short-line filter runs.
    text = re.sub(r'\.{3,}', '.', text)
    cleaned_lines = [
        line.strip() for line in text.split('\n') if len(line.split()) >= 4
    ]
    # Every surviving line has at least four words, so no blank lines remain
    # and no separate blank-line removal pass is needed.
    return '\n'.join(cleaned_lines).strip()
def read_and_split_text(text_by_page, min_chunk_size=800, max_chunk_size=1200):
""" Split text into chunks, respecting page boundaries """
chunks = []
current_chunk = ""
current_page = 1
# Directory (relative to the working directory) that receives failed files.
ERROR_FOLDER = "error_files"


def move_file_to_error(file_path):
    """Move a file that failed processing into ``ERROR_FOLDER``.

    The folder is created if missing and the file keeps its base name.
    Any failure is logged and swallowed; nothing is returned.

    Args:
        file_path: Path of the file to move.
    """
    try:
        os.makedirs(ERROR_FOLDER, exist_ok=True)
        destination = os.path.join(ERROR_FOLDER, os.path.basename(file_path))
        os.rename(file_path, destination)
        # Logged at ERROR level: a move only happens after a failure.
        logger.error(f"Moved {file_path} to error folder: {ERROR_FOLDER}")
    except Exception as e:
        logger.error(f"Failed to move {file_path} to error folder: {e}")
def process_document(file_path):
""" Process a single document """
try:
text_by_page = []
if file_path.endswith('.pdf'):
output_dir = file_path.replace('.pdf', '_pages')
os.makedirs(output_dir, exist_ok=True)
text_by_page = extract_text_page_by_page(file_path, output_dir)
elif file_path.endswith('.pptx'):
text_by_page = process_pptx(file_path)
elif file_path.endswith('.docx'):
text_by_page = process_docx(file_path)
elif file_path.endswith('.txt'):
text_by_page = process_txt(file_path)
elif file_path.endswith('.xlsx'):
text_by_page = process_xlsx(file_path)
elif file_path.endswith('.csv'):
text_by_page = process_csv(file_path)
def process_ocr_document(file_path):
    """Extract OCR text from a document and tag each page with its file path.

    Args:
        file_path: Path of the document passed to ``extract_text_with_ocr``.

    Returns:
        A list of ``(page_num, text)`` tuples, where each text is the OCR
        output followed by a line naming ``file_path``. Returns an empty list
        when OCR yields nothing or when processing raises (the traceback is
        logged).
    """
    try:
        text_by_page = extract_text_with_ocr(file_path)
        if not text_by_page:
            # Return an empty list explicitly so callers always get a list,
            # never an implicit None from falling off the end of the function.
            return []
        logger.info(f"Successfully extracted OCR text from {file_path}")
        return [
            (page_num, f"{text}\nThe standard and the year of the standard is: {file_path}")
            for page_num, text in text_by_page
        ]
    except Exception:
        logger.exception(f"Error processing OCR document {file_path}")
        return []
def create_langchain_documents(folder_path):
"""
Create langchain documents from the extracted and processed text.
if __name__ == "__main__":
    # Run the demo only when this file is executed as a script. Unguarded,
    # these statements would run a full processing pass every time another
    # module imports a function from this one.
    source_folder = "/home/it-engg2/Desktop/Working-envirnment/LLM Dataset"
    print(create_langchain_documents(source_folder))
the code for uploading to milvus:
import logging
from datetime import datetime
from file_process import create_langchain_documents, process_document
from langchain_community.vectorstores import Milvus
from langchain_huggingface import HuggingFaceEmbeddings
# Module-level embedding model, constructed once when the module is imported.
# NOTE(review): device 0 presumably selects the first GPU - confirm.
embeddings = HuggingFaceEmbeddings(
    model_name='sentence-transformers/all-MiniLM-L6-v2',
    model_kwargs={'device': 0},
)
# Logging configuration
# Send INFO-and-above records to a log file.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[logging.FileHandler("file_selection.log")],
)
# The bare identifier `name` is undefined at module level and raises NameError;
# `__name__` gives the logger the importing module's name.
logger = logging.getLogger(__name__)
def main():
    """Build langchain documents from the source folder and log completion.

    The upload to Milvus is currently commented out, so the processed
    documents are built but not stored anywhere.
    """
    source_folder = "/home/it-engg2/Desktop/Working-envirnment/LLM Dataset"
    processed_documents = create_langchain_documents(source_folder)
    # NOTE(review): the disabled guard below tests `process_document`, an
    # imported function; it presumably meant `processed_documents` - confirm
    # before re-enabling.
    # print(len(processed_documents))
    # if process_document:
    #     Milvus.from_documents(processed_documents, embeddings, collection_name="SDC_all_2", connection_args={"uri": "http://localhost:19530"},drop_old=True)
    logger.info("Processing complete")
# Script entry point. The dunder names are required: the bare `name` is
# undefined and raises NameError, and "main" would never equal the module name.
if __name__ == "__main__":
    main()
Beta Was this translation helpful? Give feedback.
All reactions