Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Adding functionality for multi-column ingestion into vector databases and skills #8990

Open
wants to merge 4 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
Expand Up @@ -161,7 +161,7 @@ def predict(self, df: DataFrame, args) -> DataFrame:
embeddings = model.embed_documents(df_texts.tolist())

# create a new dataframe with the embeddings
df_embeddings = df.copy().assign(**{target: embeddings})
df_embeddings = df.copy().assign(**{'embedding_context': df_texts, target: embeddings})
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The sentence transformer is more transparent: the input to the model is just the document, so we can duplicate that entry in the dataframe.


return df_embeddings

Expand Down
Expand Up @@ -63,7 +63,10 @@ def predict(self, df, args=None):

embeddings = model.embed_documents(texts=content)

embeddings_df = pd.DataFrame(data={"content": content, "embeddings": embeddings, "metadata": metadata})
embeddings_df = pd.DataFrame(data={"content": content,
"embedding_context": content,
"embeddings": embeddings,
"metadata": metadata})

return embeddings_df

Expand Down
3 changes: 2 additions & 1 deletion mindsdb/integrations/libs/vectordatabase_handler.py
Expand Up @@ -35,6 +35,7 @@ class TableField(Enum):

ID = "id"
CONTENT = "content"
CONTEXT = "embedding_context"
EMBEDDINGS = "embeddings"
METADATA = "metadata"
SEARCH_VECTOR = "search_vector"
Expand Down Expand Up @@ -139,7 +140,7 @@ def _is_columns_allowed(self, columns: List[str]) -> bool:
return set(columns).issubset(allowed_columns)

def _is_condition_allowed(self, condition: FilterCondition) -> bool:
allowed_field_values = set([field.value for field in TableField])
allowed_field_values = set([field['name'] for field in self.SCHEMA])
if condition.column in allowed_field_values:
return True
else:
Expand Down
13 changes: 12 additions & 1 deletion mindsdb/interfaces/knowledge_base/controller.py
Expand Up @@ -19,6 +19,8 @@
from mindsdb.integrations.libs.vectordatabase_handler import TableField
from mindsdb.utilities.exception import EntityExistsError, EntityNotExistsError

from mindsdb.utilities import log
logger = log.getLogger(__name__)

class KnowledgeBaseTable:
"""
Expand Down Expand Up @@ -126,6 +128,15 @@ def insert(self, df: pd.DataFrame):
df_emb = self._df_to_embeddings(df)
df = pd.concat([df, df_emb], axis=1)

# rename the original 'content' column, if it exists, to 'original_context'
if TableField.CONTENT.value in df.columns:
df = df.rename(columns={TableField.CONTENT.value: "original_context"})

# rename model's 'embedding_context' column to 'content'
df = df.rename(
columns={TableField.CONTEXT.value: TableField.CONTENT.value}
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This currently only works with langchain_embedding_handler, because it is the only handler that adds this embedding_context column.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Gotcha. Adding it to the sentence transformer.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@QuantumPlumber perhaps it is worth creating a base Embedding class, like the one we have for vector stores.

)

# send to vector db
db_handler = self._get_vector_db()
db_handler.do_upsert(self._kb.vector_database_table, df)
Expand Down Expand Up @@ -185,7 +196,7 @@ def _df_to_embeddings(self, df: pd.DataFrame) -> pd.DataFrame:
if target != TableField.EMBEDDINGS.value:
# adapt output for vectordb
df_out = df_out.rename(columns={target: TableField.EMBEDDINGS.value})
df_out = df_out[[TableField.EMBEDDINGS.value]]
df_out = df_out[[TableField.CONTEXT.value, TableField.EMBEDDINGS.value]]

return df_out

Expand Down