Skip to content

Commit

Permalink
remove length limitation of embedding (#347)
Browse files Browse the repository at this point in the history
* remove length limitation of embedding

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* rm files

* update haystack code

* fix

* resolve dependency

* revert for llama_index

* limit version haystack

* fix ver

---------

Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
  • Loading branch information
Spycsh and pre-commit-ci[bot] authored Jul 30, 2024
1 parent 90e367e commit edcd1e8
Show file tree
Hide file tree
Showing 28 changed files with 47 additions and 158 deletions.
9 changes: 4 additions & 5 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -175,7 +175,7 @@ A `Microservices` can be created by using the decorator `register_microservice`.
from langchain_community.embeddings import HuggingFaceHubEmbeddings
from langsmith import traceable

from comps import register_microservice, EmbedDoc768, ServiceType, TextDoc
from comps import register_microservice, EmbedDoc, ServiceType, TextDoc


@register_microservice(
Expand All @@ -185,13 +185,12 @@ from comps import register_microservice, EmbedDoc768, ServiceType, TextDoc
host="0.0.0.0",
port=6000,
input_datatype=TextDoc,
output_datatype=EmbedDoc768,
output_datatype=EmbedDoc,
)
@traceable(run_type="embedding")
def embedding(input: TextDoc) -> EmbedDoc768:
def embedding(input: TextDoc) -> EmbedDoc:
embed_vector = embeddings.embed_query(input.text)
embed_vector = embed_vector[:768] # Keep only the first 768 elements
res = EmbedDoc768(text=input.text, embedding=embed_vector)
res = EmbedDoc(text=input.text, embedding=embed_vector)
return res
```

Expand Down
3 changes: 1 addition & 2 deletions comps/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,8 +8,7 @@
Audio2TextDoc,
Base64ByteStrDoc,
DocPath,
EmbedDoc768,
EmbedDoc1024,
EmbedDoc,
GeneratedDoc,
LLMParamsDoc,
SearchedDoc,
Expand Down
9 changes: 2 additions & 7 deletions comps/cores/proto/docarray.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,9 +32,9 @@ class DocPath(BaseDoc):
table_strategy: str = "fast"


class EmbedDoc768(BaseDoc):
class EmbedDoc(BaseDoc):
text: str
embedding: conlist(float, min_length=768, max_length=768)
embedding: conlist(float, min_length=0)
search_type: str = "similarity"
k: int = 4
distance_threshold: Optional[float] = None
Expand All @@ -58,11 +58,6 @@ class Audio2TextDoc(AudioDoc):
)


class EmbedDoc1024(BaseDoc):
text: str
embedding: conlist(float, min_length=1024, max_length=1024)


class SearchedDoc(BaseDoc):
retrieved_docs: DocList[TextDoc]
initial_query: str
Expand Down
2 changes: 1 addition & 1 deletion comps/dataprep/redis/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -64,7 +64,7 @@ python prepare_doc_redis_on_ray.py

## 2.1 Start Redis Stack Server

Please refer to this [readme](../../../vectorstores/langchain/redis/README.md).
Please refer to this [readme](../../vectorstores/langchain/redis/README.md).

## 2.2 Setup Environment Variables

Expand Down
7 changes: 1 addition & 6 deletions comps/dataprep/redis/langchain/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@

# Embedding model

EMBED_MODEL = os.getenv("EMBED_MODEL", "BAAI/bge-base-en-v1.5")
EMBED_MODEL = os.getenv("EMBED_MODEL", "BAAI/bge-large-en-v1.5")

# Redis Connection Information
REDIS_HOST = os.getenv("REDIS_HOST", "localhost")
Expand Down Expand Up @@ -61,9 +61,4 @@ def format_redis_conn_from_env():
INDEX_NAME = os.getenv("INDEX_NAME", "rag-redis")
KEY_INDEX_NAME = os.getenv("KEY_INDEX_NAME", "file-keys")

current_file_path = os.path.abspath(__file__)
parent_dir = os.path.dirname(current_file_path)
REDIS_SCHEMA = os.getenv("REDIS_SCHEMA", "schema_dim_768.yml")
TIMEOUT_SECONDS = int(os.getenv("TIMEOUT_SECONDS", 600))
schema_path = os.path.join(parent_dir, REDIS_SCHEMA)
INDEX_SCHEMA = schema_path
4 changes: 1 addition & 3 deletions comps/dataprep/redis/langchain/prepare_doc_redis.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@

# from pyspark import SparkConf, SparkContext
import redis
from config import EMBED_MODEL, INDEX_NAME, INDEX_SCHEMA, KEY_INDEX_NAME, REDIS_HOST, REDIS_PORT, REDIS_URL
from config import EMBED_MODEL, INDEX_NAME, KEY_INDEX_NAME, REDIS_HOST, REDIS_PORT, REDIS_URL
from fastapi import Body, File, Form, HTTPException, UploadFile
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.embeddings import HuggingFaceBgeEmbeddings, HuggingFaceHubEmbeddings
Expand Down Expand Up @@ -151,7 +151,6 @@ def ingest_data_to_redis(doc_path: DocPath):
texts=batch_texts,
embedding=embedder,
index_name=INDEX_NAME,
index_schema=INDEX_SCHEMA,
redis_url=REDIS_URL,
)
print(f"keys: {keys}")
Expand Down Expand Up @@ -195,7 +194,6 @@ async def ingest_link_to_redis(link_list: List[str]):
texts=content,
embedding=embedder,
index_name=INDEX_NAME,
index_schema=INDEX_SCHEMA,
redis_url=REDIS_URL,
)
print(f"keys: {keys}")
Expand Down
14 changes: 0 additions & 14 deletions comps/dataprep/redis/langchain/schema.yml

This file was deleted.

14 changes: 0 additions & 14 deletions comps/dataprep/redis/langchain/schema_dim_1024.yml

This file was deleted.

14 changes: 0 additions & 14 deletions comps/dataprep/redis/langchain/schema_dim_768.yml

This file was deleted.

18 changes: 0 additions & 18 deletions comps/dataprep/redis/langchain/schema_lcdocs_dim_768.yml

This file was deleted.

5 changes: 0 additions & 5 deletions comps/dataprep/redis/langchain_ray/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -60,9 +60,4 @@ def format_redis_conn_from_env():
# Vector Index Configuration
INDEX_NAME = os.getenv("INDEX_NAME", "rag-redis")

current_file_path = os.path.abspath(__file__)
parent_dir = os.path.dirname(current_file_path)
REDIS_SCHEMA = os.getenv("REDIS_SCHEMA", "schema_dim_768.yml")
TIMEOUT_SECONDS = int(os.getenv("TIMEOUT_SECONDS", 600))
schema_path = os.path.join(parent_dir, REDIS_SCHEMA)
INDEX_SCHEMA = schema_path
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@
from typing import Callable, List, Optional, Union

import pandas as pd
from config import EMBED_MODEL, INDEX_NAME, INDEX_SCHEMA, REDIS_URL, TIMEOUT_SECONDS
from config import EMBED_MODEL, INDEX_NAME, REDIS_URL, TIMEOUT_SECONDS
from fastapi import Body, File, Form, HTTPException, UploadFile
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.embeddings import HuggingFaceBgeEmbeddings, HuggingFaceHubEmbeddings
Expand Down Expand Up @@ -195,7 +195,6 @@ def data_to_redis(data):
texts=batch_texts,
embedding=embedder,
index_name=INDEX_NAME,
index_schema=INDEX_SCHEMA,
redis_url=REDIS_URL,
)
# print(f"Processed batch {i//batch_size + 1}/{(num_chunks-1)//batch_size + 1}")
Expand Down
9 changes: 4 additions & 5 deletions comps/embeddings/langchain-mosec/embedding_mosec.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
from langsmith import traceable

from comps import (
EmbedDoc768,
EmbedDoc,
ServiceType,
TextDoc,
opea_microservices,
Expand Down Expand Up @@ -51,15 +51,14 @@ def empty_embedding() -> List[float]:
host="0.0.0.0",
port=6000,
input_datatype=TextDoc,
output_datatype=EmbedDoc768,
output_datatype=EmbedDoc,
)
@traceable(run_type="embedding")
@register_statistics(names=["opea_service@embedding_mosec"])
def embedding(input: TextDoc) -> EmbedDoc768:
def embedding(input: TextDoc) -> EmbedDoc:
start = time.time()
embed_vector = embeddings.embed_query(input.text)
embed_vector = embed_vector[:768] # Keep only the first 768 elements
res = EmbedDoc768(text=input.text, embedding=embed_vector)
res = EmbedDoc(text=input.text, embedding=embed_vector)
statistics_dict["opea_service@embedding_mosec"].append_latency(time.time() - start, None)
return res

Expand Down
9 changes: 4 additions & 5 deletions comps/embeddings/langchain/embedding_tei.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
from langsmith import traceable

from comps import (
EmbedDoc768,
EmbedDoc,
ServiceType,
TextDoc,
opea_microservices,
Expand All @@ -25,15 +25,14 @@
host="0.0.0.0",
port=6000,
input_datatype=TextDoc,
output_datatype=EmbedDoc768,
output_datatype=EmbedDoc,
)
@traceable(run_type="embedding")
@register_statistics(names=["opea_service@embedding_tei_langchain"])
def embedding(input: TextDoc) -> EmbedDoc768:
def embedding(input: TextDoc) -> EmbedDoc:
start = time.time()
embed_vector = embeddings.embed_query(input.text)
embed_vector = embed_vector[:768] # Keep only the first 768 elements
res = EmbedDoc768(text=input.text, embedding=embed_vector)
res = EmbedDoc(text=input.text, embedding=embed_vector)
statistics_dict["opea_service@embedding_tei_langchain"].append_latency(time.time() - start, None)
return res

Expand Down
8 changes: 4 additions & 4 deletions comps/embeddings/langchain/local_embedding.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@

from langchain_community.embeddings import HuggingFaceBgeEmbeddings

from comps import EmbedDoc1024, ServiceType, TextDoc, opea_microservices, opea_telemetry, register_microservice
from comps import EmbedDoc, ServiceType, TextDoc, opea_microservices, opea_telemetry, register_microservice


@register_microservice(
Expand All @@ -13,12 +13,12 @@
host="0.0.0.0",
port=6000,
input_datatype=TextDoc,
output_datatype=EmbedDoc1024,
output_datatype=EmbedDoc,
)
@opea_telemetry
def embedding(input: TextDoc) -> EmbedDoc1024:
def embedding(input: TextDoc) -> EmbedDoc:
embed_vector = embeddings.embed_query(input.text)
res = EmbedDoc1024(text=input.text, embedding=embed_vector)
res = EmbedDoc(text=input.text, embedding=embed_vector)
return res


Expand Down
9 changes: 4 additions & 5 deletions comps/embeddings/llama_index/embedding_tei.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
from langsmith import traceable
from llama_index.embeddings.text_embeddings_inference import TextEmbeddingsInference

from comps import EmbedDoc768, ServiceType, TextDoc, opea_microservices, register_microservice
from comps import EmbedDoc, ServiceType, TextDoc, opea_microservices, register_microservice


@register_microservice(
Expand All @@ -16,13 +16,12 @@
host="0.0.0.0",
port=6000,
input_datatype=TextDoc,
output_datatype=EmbedDoc768,
output_datatype=EmbedDoc,
)
@traceable(run_type="embedding")
def embedding(input: TextDoc) -> EmbedDoc768:
def embedding(input: TextDoc) -> EmbedDoc:
embed_vector = embeddings._get_query_embedding(input.text)
embed_vector = embed_vector[:768] # Keep only the first 768 elements
res = EmbedDoc768(text=input.text, embedding=embed_vector)
res = EmbedDoc(text=input.text, embedding=embed_vector)
return res


Expand Down
8 changes: 4 additions & 4 deletions comps/embeddings/llama_index/local_embedding.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
from langsmith import traceable
from llama_index.embeddings.huggingface import HuggingFaceEmbedding

from comps import EmbedDoc1024, ServiceType, TextDoc, opea_microservices, register_microservice
from comps import EmbedDoc, ServiceType, TextDoc, opea_microservices, register_microservice


@register_microservice(
Expand All @@ -14,12 +14,12 @@
host="0.0.0.0",
port=6000,
input_datatype=TextDoc,
output_datatype=EmbedDoc1024,
output_datatype=EmbedDoc,
)
@traceable(run_type="embedding")
def embedding(input: TextDoc) -> EmbedDoc1024:
def embedding(input: TextDoc) -> EmbedDoc:
embed_vector = embeddings.get_text_embedding(input.text)
res = EmbedDoc1024(text=input.text, embedding=embed_vector)
res = EmbedDoc(text=input.text, embedding=embed_vector)
return res


Expand Down
3 changes: 2 additions & 1 deletion comps/retrievers/haystack/qdrant/requirements.txt
Original file line number Diff line number Diff line change
@@ -1,12 +1,13 @@
docarray[full]
easyocr
fastapi
haystack-ai
haystack-ai==2.2.4
langchain_community
langsmith
opentelemetry-api
opentelemetry-exporter-otlp
opentelemetry-sdk
prometheus_fastapi_instrumentator
pymupdf
qdrant-haystack
sentence_transformers
Expand Down
4 changes: 2 additions & 2 deletions comps/retrievers/haystack/qdrant/retriever_qdrant.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
from langsmith import traceable
from qdrant_config import EMBED_DIMENSION, EMBED_ENDPOINT, EMBED_MODEL, INDEX_NAME, QDRANT_HOST, QDRANT_PORT

from comps import EmbedDoc768, SearchedDoc, ServiceType, TextDoc, opea_microservices, register_microservice
from comps import EmbedDoc, SearchedDoc, ServiceType, TextDoc, opea_microservices, register_microservice


# Create a pipeline for querying a Qdrant document store
Expand All @@ -29,7 +29,7 @@ def initialize_qdrant_retriever() -> QdrantEmbeddingRetriever:
port=7000,
)
@traceable(run_type="retriever")
def retrieve(input: EmbedDoc768) -> SearchedDoc:
def retrieve(input: EmbedDoc) -> SearchedDoc:
search_res = retriever.run(query_embedding=input.embedding)["documents"]
searched_docs = [TextDoc(text=r.content) for r in search_res]
result = SearchedDoc(retrieved_docs=searched_docs, initial_query=input.text)
Expand Down
4 changes: 2 additions & 2 deletions comps/retrievers/langchain/milvus/retriever_milvus.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@
from langsmith import traceable

from comps import (
EmbedDoc768,
EmbedDoc,
SearchedDoc,
ServiceType,
TextDoc,
Expand Down Expand Up @@ -65,7 +65,7 @@ def empty_embedding() -> List[float]:
)
@traceable(run_type="retriever")
@register_statistics(names=["opea_service@retriever_milvus"])
def retrieve(input: EmbedDoc768) -> SearchedDoc:
def retrieve(input: EmbedDoc) -> SearchedDoc:
vector_db = Milvus(
embeddings,
connection_args={"host": MILVUS_HOST, "port": MILVUS_PORT},
Expand Down
Loading

0 comments on commit edcd1e8

Please sign in to comment.