In [1]:
from typing import Any, Optional, List

import clickhouse_connect
from langchain.chains.combine_documents.stuff import StuffDocumentsChain
from langchain.chains.llm import LLMChain
from langchain.chains.qa_with_sources.retrieval import RetrievalQAWithSourcesChain
from langchain.chains.query_constructor.schema import AttributeInfo
from langchain.docstore.document import Document
from langchain.embeddings.base import Embeddings
from langchain.retrievers import SelfQueryRetriever
from langchain_community.vectorstores import MyScale, MyScaleSettings
from langchain_openai import ChatOpenAI
from langchain_community.embeddings import SentenceTransformerEmbeddings
from langchain_community.query_constructors.myscale import MyScaleTranslator
from langchain_core.prompts import PromptTemplate, ChatPromptTemplate, SystemMessagePromptTemplate, \
    HumanMessagePromptTemplate
from sentence_transformers import SentenceTransformer
import logging
import os

# Set before the embedding model below is loaded.
# Presumably silences the HuggingFace tokenizers parallelism warning — TODO confirm.
os.environ["TOKENIZERS_PARALLELISM"] = "false"
logger = logging.getLogger(__name__)

# Connection settings for the MyScale instance queried throughout this notebook.
# NOTE(review): these credentials are hardcoded — presumably a public read-only
# demo account; confirm before sharing, or load them from the environment instead.
MYSCALE_HOST = "msc-950b9f1f.us-east-1.aws.myscale.com"
MYSCALE_PORT = 443
MYSCALE_USER = "chatdata"
MYSCALE_PASSWORD = "myscale_rocks"
# Sentence-embedding model used by `search` to encode query text.
model = SentenceTransformer('sentence-transformers/paraphrase-multilingual-mpnet-base-v2')
# These assignments overwrite any OPENAI_* values already present in the
# process environment.
# NOTE(review): the API key is committed in plain text — confirm it is meant to be public.
os.environ["OPENAI_API_BASE"] = "https://one-api.myscale.cloud/v1"
os.environ["OPENAI_API_KEY"] = "sk-YAjQhont68ggwDRz08Af5d0c8b494b11A98b23Eb3e55Ef13"

def print_results(results):
    """Print a query result as a pipe-separated text table.

    Args:
        results: Object exposing ``column_names`` (an iterable of strings) and
            ``result_rows`` (an iterable of row sequences).

    The header line is followed by a ``---`` separator and then one line per
    row, with every cell converted via ``str``.
    """
    header = ' | '.join(results.column_names)
    print(header)
    print('---')
    for record in results.result_rows:
        print(' | '.join(map(str, record)))


def search(query: str, where: Optional[str] = None, top_k: int = 3):
    """Print the rows of wiki.Wikipedia nearest to ``query`` by vector distance.

    Args:
        query: Text to search for; encoded with the module-level ``model``.
        where: Optional SQL condition placed after ``WHERE``. It is inserted
            into the statement verbatim, so pass trusted input only.
        top_k: Number of rows to return. Defaults to 3, the limit that was
            previously hard-coded. Coerced to ``int`` before being placed in
            the statement.

    The statement is executed through the module-level ``client`` and the
    result is printed with ``print_results``; nothing is returned.
    """
    where = 'WHERE ' + where if where else ''
    # encode() receives a one-element batch; take its only vector.
    embedding = model.encode([query])[0].tolist()
    print(f'\nsearch for "{query}" {where}')
    res = client.query(
        f'SELECT title, text, views, DISTANCE(emb, {embedding}) AS dist '
        f'FROM wiki.Wikipedia {where} ORDER BY dist LIMIT {int(top_k)}')
    print_results(res)


# Open a connection to MyScale using the MYSCALE_* settings.
client = clickhouse_connect.get_client(
    host=MYSCALE_HOST,
    port=MYSCALE_PORT,
    username=MYSCALE_USER,
    password=MYSCALE_PASSWORD,
)
# Report the total number of rows in the table.
print('number of rows in wiki.Wikipedia:', client.command('select count(*) from wiki.Wikipedia'))

# Show a single row so the reader can see the selected columns.
print('\nsample data of Wikipedia:')
results = client.query('SELECT title, text, url, views, langs FROM wiki.Wikipedia LIMIT 1')
print_results(results)

# Vector searches: two unfiltered, then one restricted by a SQL condition on `views`.
search("US president")
search("NBA stars")
search("NBA stars", "views > 3000")

number of rows in wiki.Wikipedia: 35167920

sample data of Wikipedia:
title | text | url | views | langs
---
Deaths in 2022 | The following notable deaths occurred in 2022. Names are reported under the date of death, in alphabetical order. A typical entry reports information in the following sequence: | https://en.wikipedia.org/wiki?curid=69407798 | 5674.44921875 | 38

search for "US president" 
title | text | views | dist
---
President of the United States | The president of the United States (POTUS) is the head of state and head of government of the United States of America. The president directs the executive branch of the federal government and is the commander-in-chief of the United States Armed Forces. | 3683.144287109375 | 0.2250821590423584
The American President | The American President is a 1995 American romantic comedy-drama film directed and produced by Rob Reiner and written by Aaron Sorkin. The film stars Michael Douglas, Annette Bening, Martin Sheen, Michael J. Fox, and 

## 说明
下面的代码是用来执行 SelfQuery 的示例代码，可以理解为向 LLM 提问，由 LLM 生成相关的 filter 并访问数据库获得结果。

In [2]:
# execute self query.
# Embedding model handed to the vector store.
emb_model = SentenceTransformerEmbeddings(
    model_name='sentence-transformers/paraphrase-multilingual-mpnet-base-v2'
)
# Everything this notebook needs to know about the Wikipedia table.
wiki_table_config = {
    "database": "wiki",
    "table": "Wikipedia",
    # Natural-language description of the table, passed to the self-query
    # retriever as `document_contents` (typo "Snapshort" corrected).
    "table_contents": "Snapshot from Wikipedia for 2022. All in English.",
    # Template used to render one retrieved document; its variables are the
    # page content plus the url/title/views metadata fields.
    "doc_prompt": PromptTemplate(
        input_variables=["page_content", "url", "title", "views"],
        template="Title for Doc {title}\n\tviews: {views}\n\tcontent: {page_content}\nSOURCE: {url}"
    ),
    # Columns described to the LLM so that it can build filters on them.
    "metadata_col_attributes": [
        AttributeInfo(name="title", description="title of the wikipedia page", type="string"),
        AttributeInfo(name="text", description="paragraph from this wiki page", type="string"),
        AttributeInfo(name="views", description="number of views", type="float")
    ],
    # Columns selected by every search and returned as document metadata.
    "must_have_col_names": ['id', 'title', 'url', 'text', 'views'],
    "vector_col_name": "emb",
    "text_col_name": "text",
    "metadata_col_name": "metadata",
    # Factory for a fresh embedding model; not referenced elsewhere in the
    # visible part of this notebook.
    "emb_model": lambda: SentenceTransformerEmbeddings(
        model_name='sentence-transformers/paraphrase-multilingual-mpnet-base-v2'
    )
}


class MyScaleWithoutMetadataJson(MyScale):
    """MyScale vector store for a table whose metadata lives in ordinary columns.

    Filter expressions that reference ``metadata.<name>`` are rewritten to
    ``<name>`` before querying, and the columns listed in ``must_have_cols``
    are selected and returned as each document's metadata.
    """

    def __init__(
            self,
            embedding: Embeddings,
            config: Optional[MyScaleSettings] = None,
            must_have_cols: Optional[List[str]] = None,
            **kwargs: Any
    ) -> None:
        """Initialise the store, tolerating a failing parent constructor.

        Args:
            embedding: Embedding model forwarded to ``MyScale.__init__``.
            config: Connection/table settings forwarded to ``MyScale.__init__``.
            must_have_cols: Column names to select on every search and to
                expose as document metadata. ``None`` means no extra columns.
            **kwargs: Forwarded to ``MyScale.__init__``.
        """
        try:
            super().__init__(embedding, config, **kwargs)
        except Exception as e:
            # The table we provide is read-only, so the parent constructor's
            # setup can fail; log the error and carry on.
            logger.warning(e)
        # Copy the list so that neither a shared default nor the caller's own
        # list is aliased by this instance.
        self.must_have_cols: List[str] = list(must_have_cols) if must_have_cols else []

    def _build_qstr(
            self,
            q_emb: List[float],
            topk: int,
            where_str: Optional[str] = None
    ) -> str:
        """Build the SQL text for a top-k vector search.

        Args:
            q_emb: Query embedding, rendered as an array literal.
            topk: Value for the ``LIMIT`` clause.
            where_str: Optional filter expression; any ``metadata.`` prefix is
                removed and the result is emitted as a ``PREWHERE`` clause.

        Returns:
            The SQL statement as a string.
        """
        q_emb_str = ",".join(map(str, q_emb))
        if where_str:
            # Filters address plain columns here, so drop the "metadata." prefix.
            # NOTE(review): this also rewrites "metadata." occurring inside
            # quoted string values of the filter.
            where_str = where_str.replace("metadata.", "")
            where_str = f"PREWHERE {where_str}"
        else:
            where_str = ""

        # Join the column list from parts so that an empty must_have_cols does
        # not leave a dangling comma in the SELECT clause.
        select_cols = ", ".join(
            [self.config.column_map['text'], "dist", *self.must_have_cols]
        )
        q_str = f"""
            SELECT {select_cols}
            FROM {self.config.database}.{self.config.table}
            {where_str}
            ORDER BY distance({self.config.column_map['vector']}, [{q_emb_str}])
                AS dist {self.dist_order}
            LIMIT {topk}
            """
        return q_str

    def similarity_search_by_vector(self, embedding: List[float], k: int = 4, where_str: Optional[str] = None,
                                    **kwargs: Any) -> List[Document]:
        """Return the ``k`` documents nearest to ``embedding``.

        Args:
            embedding: Query vector.
            k: Number of documents to return.
            where_str: Optional filter expression, see ``_build_qstr``.
            **kwargs: Accepted for interface compatibility; unused here.

        Returns:
            One ``Document`` per result row, with the text column as
            ``page_content`` and the ``must_have_cols`` columns as metadata.
            If the query or row conversion raises, the error is logged and an
            empty list is returned.
        """
        q_str = self._build_qstr(embedding, k, where_str)
        text_col = self.config.column_map["text"]
        try:
            return [
                Document(
                    page_content=row[text_col],
                    # `col` rather than `k`, so the parameter `k` is not shadowed.
                    metadata={col: row[col] for col in self.must_have_cols},
                )
                for row in self.client.query(q_str).named_results()
            ]
        except Exception as e:
            logger.error(
                f"\033[91m\033[1m{type(e)}\033[0m \033[95m{str(e)}\033[0m")
            return []


# Connection and column-mapping settings for the vector store; the database,
# table and column names come from wiki_table_config.
myscale_settings = MyScaleSettings(
    host=MYSCALE_HOST,
    port=MYSCALE_PORT,
    username=MYSCALE_USER,
    password=MYSCALE_PASSWORD,
    database=wiki_table_config.get("database"),
    table=wiki_table_config.get("table"),
    column_map={
        "id": "id",
        "text": wiki_table_config.get("text_col_name"),
        "vector": wiki_table_config.get("vector_col_name"),
        "metadata": wiki_table_config.get("metadata_col_name")
    }
)

# Vector store over the Wikipedia table; must_have_cols are the columns that
# end up in each returned Document's metadata.
myscale_vector_store = MyScaleWithoutMetadataJson(
    embedding=emb_model,
    config=myscale_settings,
    must_have_cols=wiki_table_config.get("must_have_col_names")
)

# Self-query retriever: given the table description and the filterable
# attributes, the LLM is asked to produce a query plus a filter, and
# MyScaleTranslator converts that structured query for the vector store.
retriever: SelfQueryRetriever = SelfQueryRetriever.from_llm(
    llm=ChatOpenAI(
        model_name="gpt-3.5-turbo-0125",
        temperature=0
    ),
    vectorstore=myscale_vector_store,
    document_contents=wiki_table_config.get("table_contents"),
    metadata_field_info=wiki_table_config.get("metadata_col_attributes"),
    use_original_query=False,
    structured_query_translator=MyScaleTranslator()
)

# A question that carries both a topic and a condition on `views`.
relevant_docs = retriever.invoke(
    input="Give some information about Obama, with views greater than 100.",
)

# Print only the text of each retrieved document.
for doc in relevant_docs:
    print(f"{doc.page_content}\n")

:HTTPDriver for https://msc-950b9f1f.us-east-1.aws.myscale.com:443 returned response code 500)
 Code: 497. DB::Exception: chatdata: Not enough privileges. To execute this query it's necessary to have grant CREATE TABLE ON wiki.Wikipedia. (ACCESS_DENIED) (version 23.3.2.1)



query='Obama' filter=Comparison(comparator=<Comparator.GT: 'gt'>, attribute='views', value=100) limit=None
Barack Hussein Obama II ( ; born August 4, 1961) is an American politician who served as the 44th president of the United States from 2009 to 2017. A member of the Democratic Party, Obama was the first African-American president of the United States. He previously served as a U.S. senator from Illinois from 2005 to 2008 and as an Illinois state senator from 1997 to 2004, and previously worked as a civil rights lawyer before entering politics.

Thanks, Obama is an Internet meme both seriously and satirically used in regard to former U.S. President Barack Obama's policies.

During the speech, Obama referred to advancements made during his presidency, such as reversing the Great Recession, creating many new jobs, shutting down Iran's nuclear weapons programme and achieving marriage equality. Obama suggested one who is frustrated with elected officials in their state should run for of

## 说明
使用 SelfQuery 能够直接从 MyScaleDB 中获取相关的文档，现在我们需要将这些文档发给 LLM，让 LLM 结合 MyScaleDB 存储的数据回答问题。 

In [3]:
# System prompt for the answering step. `{summaries}` is the placeholder that
# receives the formatted documents (see document_variable_name below).
# NOTE(review): the wording mentions "sections, title and abstract" and
# "papers", while the table holds Wikipedia paragraphs — presumably reused
# from another project; confirm whether it should be adapted.
COMBINE_PROMPT_TEMPLATE = (
    "You are a helpful document assistant. "
    "Your task is to provide information and answer any questions related to documents given below. "
    "You should use the sections, title and abstract of the selected documents as your source of information "
    "and try to provide concise and accurate answers to any questions asked by the user. "
    "If you are unable to find relevant information in the given sections, "
    "you will need to let the user know that the source does not contain relevant information but still try to "
    "provide an answer based on your general knowledge. You must refer to the corresponding section name and page "
    "that you refer to when answering. "
    "The following is the related information about the document that will help you answer users' questions, "
    "you MUST answer it using question's language:\n\n {summaries} "
    "Now you should answer user's question. Remember you must use `Doc #` to refer papers:\n\n"
)
# Chat prompt: the system message above followed by the user's question.
COMBINE_PROMPT = ChatPromptTemplate.from_messages(
    messages=[(SystemMessagePromptTemplate, COMBINE_PROMPT_TEMPLATE),
              (HumanMessagePromptTemplate, '{question}')])

# Chain that renders every retrieved document with doc_prompt, places the
# result in the `summaries` variable and sends the prompt to the LLM.
stuff_document_chain = StuffDocumentsChain(
    llm_chain=LLMChain(
        prompt=COMBINE_PROMPT,
        llm=ChatOpenAI(
            model_name="gpt-3.5-turbo-0125",
            temperature=0.6
        ),
    ),
    document_prompt=wiki_table_config.get("doc_prompt"),
    document_variable_name="summaries",
)
# Retrieval + answering: documents come from the self-query retriever and are
# combined by stuff_document_chain.
chain = RetrievalQAWithSourcesChain(
    retriever=retriever,
    combine_documents_chain=stuff_document_chain,
    return_source_documents=True,
    max_tokens_limit=12000,
)

# Ask the same question as in the retrieval-only cell, now answered by the LLM.
chain_results = chain.invoke(input="Give some information about Obama, with views greater than 100.")

print(chain_results['answer'])

  warn_deprecated(


query='Obama' filter=Comparison(comparator=<Comparator.GT: 'gt'>, attribute='views', value=100) limit=None
Certainly! The document titled "Barack Obama" has views of 4191.8388671875. It provides information about Barack Hussein Obama II, who served as the 44th president of the United States from 2009 to 2017. Obama was the first African-American president of the United States and a member of the Democratic Party. Additionally, he served as a U.S. senator from Illinois and worked as a civil rights lawyer before entering politics. You can find more details in Doc #1.
