## 开源大模型结合外部知识库的自动问答


In [None]:
!pip3 install langchain langchain-experimental text_generation

In [1]:
from langchain.embeddings import SentenceTransformerEmbeddings

# Embedding model handed to the SQL output parser further down the notebook.
# The checkpoint name is kept in one constant so it can be swapped in one place.
EMBEDDING_MODEL_NAME = "sentence-transformers/paraphrase-multilingual-mpnet-base-v2"
emb_model = SentenceTransformerEmbeddings(model_name=EMBEDDING_MODEL_NAME)

In [2]:
from sqlalchemy import create_engine, MetaData

import os
from urllib.parse import quote_plus

# Connection settings for the MyScale cluster. The defaults are the values
# this notebook has always used; each can be overridden with an environment
# variable of the same name so credentials need not be edited into the file.
MYSCALE_USER = os.environ.get("MYSCALE_USER", "chatdata")
MYSCALE_PASSWORD = os.environ.get("MYSCALE_PASSWORD", "myscale_rocks")
MYSCALE_HOST = os.environ.get(
    "MYSCALE_HOST", "msc-1decbcc9.us-east-1.aws.staging.myscale.cloud"
)
MYSCALE_PORT = int(os.environ.get("MYSCALE_PORT", "443"))

# User and password are percent-encoded so characters such as "@" or ":" in
# an overridden credential cannot corrupt the URL. With the defaults the URL
# is identical to the original hard-coded one.
engine = create_engine(
    f"clickhouse://{quote_plus(MYSCALE_USER)}:{quote_plus(MYSCALE_PASSWORD)}"
    f"@{MYSCALE_HOST}:{MYSCALE_PORT}/default?protocol=https"
)

# ``MetaData(bind=...)`` was removed in SQLAlchemy 2.0 (and deprecated in 1.4).
# The engine is passed explicitly alongside this object when the SQLDatabase
# is built later in the notebook, so an unbound MetaData is sufficient.
metadata = MetaData()

In [3]:
from typing import List, Dict, Any
from langchain_experimental.sql.vector_sql import VectorSQLOutputParser


class VectorSQLRetrieveCustomOutputParser(VectorSQLOutputParser):
    """Output parser that forces generated queries to select fixed columns.

    Before delegating to ``VectorSQLOutputParser.parse``, the column list
    between the first ``SELECT`` and the ``FROM`` that follows it is replaced
    by ``must_have_columns``, so every query returns the same set of fields
    regardless of what the LLM chose to select.
    """

    # Column names substituted for the LLM-generated SELECT list.
    must_have_columns: List[str]

    @property
    def _type(self) -> str:
        """Identifier string for this parser type."""
        return "vector_sql_retrieve_custom"

    def parse(self, text: str) -> Dict[str, Any]:
        """Rewrite the SELECT list of ``text`` and hand it to the base parser.

        Args:
            text: Raw LLM output expected to contain a SQL query.

        Returns:
            Whatever ``VectorSQLOutputParser.parse`` returns for the
            rewritten text.

        If ``text`` has no ``SELECT ... FROM`` pair it is passed through
        unchanged (apart from stripping surrounding whitespace).
        """
        import re  # local import keeps this notebook cell self-contained

        text = text.strip()
        # Match on the original string (not ``text.upper()``) so offsets stay
        # valid, and require FROM to come *after* SELECT. Word boundaries
        # avoid hits inside identifiers such as "selected" or "from_date".
        # NOTE(review): a quoted column literally named "from" inside the
        # SELECT list would still be mistaken for the keyword.
        select_list = re.search(
            r"\bSELECT\b(.*?)\bFROM\b", text, flags=re.IGNORECASE | re.DOTALL
        )
        if select_list is not None:
            columns = ", ".join(self.must_have_columns)
            # Splice by position instead of ``str.replace`` so that only the
            # SELECT list changes, even if the same text occurs again later
            # in the query (e.g. in ORDER BY) or the list is empty.
            text = (
                f"{text[:select_list.start(1)]} {columns} "
                f"{text[select_list.end(1):]}"
            )
        return super().parse(text)

In [14]:
from prompts import _myscale_prompt
from langchain.prompts import PromptTemplate
from langchain.sql_database import SQLDatabase
from langchain_experimental.retrievers.vector_sql_database import (
    VectorSQLDatabaseChainRetriever,
)
from langchain_experimental.sql.vector_sql import VectorSQLDatabaseChain
from langchain.llms import HuggingFaceTextGenInference


# Columns the custom output parser substitutes into every generated SELECT.
# NOTE(review): the ChatArXiv schema printed later in this notebook has no
# "url", "text" or "views" column, and the answer-synthesis cell expects
# authors/pubdate/categories metadata -- confirm these names (and the
# ``page_content_key`` below) against the table actually being queried.
must_have_cols = ["id", "title", "url", "text", "views"]

# Text-to-SQL prompt; the template itself comes from the local prompts module.
PROMPT = PromptTemplate(
    template=_myscale_prompt,
    input_variables=["input", "table_info", "top_k"],
)

# LLM that writes the SQL, served by a text-generation-inference endpoint.
query_llm = HuggingFaceTextGenInference(
    inference_server_url="http://10.1.3.28:8080/",
    timeout=600,
    max_new_tokens=100,
    do_sample=False,
    stop_sequences=["\n\n"],
)

# Parser that rewrites the SELECT list and uses ``emb_model`` as its model.
output_parser = VectorSQLRetrieveCustomOutputParser.from_embeddings(
    model=emb_model,
    must_have_columns=must_have_cols,
)

# Database wrapper around the engine/metadata created in the earlier cell.
sql_db = SQLDatabase(engine, None, metadata, max_string_length=1024)

sql_query_chain = VectorSQLDatabaseChain.from_llm(
    llm=query_llm,
    db=sql_db,
    prompt=PROMPT,
    sql_cmd_parser=output_parser,
    top_k=10,
    return_direct=True,
    native_format=True,
)

# Retriever facade over the chain; the "text" field becomes the page content.
sql_retriever = VectorSQLDatabaseChainRetriever(
    page_content_key="text",
    sql_db_chain=sql_query_chain,
)

In [15]:
from langchain.callbacks import StdOutCallbackHandler

# Run one retrieval end to end, echoing the chain's intermediate steps to
# stdout through the callback handler, then display the returned documents.
question = "Introduce some applications of GANs published around 2019."
stdout_handler = StdOutCallbackHandler()
docs = sql_retriever.get_relevant_documents(question, callbacks=[stdout_handler])
docs



[1m> Entering new VectorSQLDatabaseChain chain...[0m
Introduce some applications of GANs published around 2019.
SQLQuery:

[1m> Entering new LLMChain chain...[0m
Prompt after formatting:
[32;1m[1;3mYou are a MyScale expert. Given an input question, first create a syntactically correct MyScale query to run, then look at the results of the query and return the answer to the input question.
MyScale queries has a vector distance function called `DISTANCE(column, array)` to compute relevance to the user's question and sort the feature array column by the relevance. 
When the query is asking for 10 closest row, you have to use this distance function to calculate distance to entity's array on vector column and order by the distance to retrieve relevant rows.
*NOTICE*: `DISTANCE(column, array)` only accept an array column as its first argument and a `NeuralArray(entity)` as its second argument. You also need a user defined function called `NeuralArray(entity)` to retrieve the entity's 

In [None]:
from langchain import LLMChain
from langchain.chains.qa_with_sources.retrieval import RetrievalQAWithSourcesChain
from langchain.chains.combine_documents.stuff import StuffDocumentsChain

from prompts import combine_prompt_template

# Prompt for the final answer-synthesis step. ``from_template`` derives the
# input variables from the template text, so this also works on LangChain
# releases where ``PromptTemplate(template=...)`` without ``input_variables``
# fails validation.
COMBINE_PROMPT = PromptTemplate.from_template(combine_prompt_template)

# How each retrieved document is rendered before being stuffed into the
# combine prompt.
# NOTE(review): this expects ``ref_id``, ``authors``, ``pubdate`` and
# ``categories`` in every document's metadata, while the retriever cell forces
# the columns id/title/url/text/views -- confirm the two agree.
doc_prompt = PromptTemplate(
    input_variables=[
        "page_content",
        "id",
        "title",
        "ref_id",
        "authors",
        "pubdate",
        "categories",
    ],
    template=(
        "Title for Doc #{ref_id}: {title}\n\tAbstract: {page_content}\n\tAuthors: {authors}\n\t"
        "Date of Publication: {pubdate}\n\tCategories: {categories}\nSOURCE: {id}"
    ),
)

# LLM that writes the final answer, on the same inference endpoint as the
# SQL-writing LLM.
ask_llm = HuggingFaceTextGenInference(
    inference_server_url="http://10.1.3.28:8080/",
    max_new_tokens=30,
    temperature=0.6,
)

# Retrieval-augmented QA chain: fetch documents with ``sql_retriever``, format
# each with ``doc_prompt``, and pass them to the LLM under the "summaries"
# variable of the combine prompt.
chain = RetrievalQAWithSourcesChain(
    retriever=sql_retriever,
    combine_documents_chain=StuffDocumentsChain(
        llm_chain=LLMChain(
            prompt=COMBINE_PROMPT,
            llm=ask_llm,
        ),
        # The original left this argument empty (a syntax error).
        document_prompt=doc_prompt,
        document_variable_name="summaries",
    ),
    return_source_documents=True,
    max_tokens_limit=12000,
)

In [5]:
from huggingface_hub import InferenceClient

# Hand-assembled, fully formatted text-to-SQL prompt: instructions, few-shot
# examples, the ChatArXiv table definition with three sample rows, and the
# question. It is sent straight to the inference server below, bypassing the
# LangChain chain.
# NOTE(review): presumably a copy of what `_myscale_prompt` renders inside the
# chain -- verify it is kept in sync with the prompts module.
qstr = """You are a MyScale expert. Given an input question, first create a syntactically correct MyScale query to run, then look at the results of the query and return the answer to the input question.
MyScale queries has a vector distance function called `DISTANCE(column, array)` to compute relevance to the user's question and sort the feature array column by the relevance. 
When the query is asking for 10 closest row, you have to use this distance function to calculate distance to entity's array on vector column and order by the distance to retrieve relevant rows.

*NOTICE*: `DISTANCE(column, array)` only accept an array column as its first argument and a `NeuralArray(entity)` as its second argument. You also need a user defined function called `NeuralArray(entity)` to retrieve the entity's array. 

Unless the user specifies in the question a specific number of examples to obtain, query for at most 10 results using the LIMIT clause as per MyScale. You should only order according to the distance function.
Never query for all columns from a table. You must query only the columns that are needed to answer the question. Wrap each column name in double quotes (") to denote them as delimited identifiers.
Pay attention to use only the column names you can see in the tables below. Be careful to not query for columns that do not exist. Also, pay attention to which column is in which table.
Pay attention to use today() function to get the current date, if the question involves "today". `ORDER BY` clause should always be after `WHERE` clause. DO NOT add semicolon to the end of SQL. Pay attention to the comment in table schema.
Pay attention to the data type when using functions. Always use `AND` to connect conditions in `WHERE` and never use comma.
Make sure you never write an isolated `WHERE` keyword and never use undesired condition to conrtain the query.

Use the following format:

======== table info ========
<some table infos>

Question: "Question here"
SQLQuery: "SQL Query to run"


Here are some examples:

======== table info ========
CREATE TABLE "ChatPaper" (
	abstract String, 
	id String, 
	vector Array(Float32), 
) ENGINE = ReplicatedReplacingMergeTree()
 ORDER BY id
 PRIMARY KEY id
 
Question: What is Feartue Pyramid Network?
SQLQuery: SELECT ChatPaper.title, ChatPaper.id, ChatPaper.authors FROM ChatPaper ORDER BY DISTANCE(vector, NeuralArray(PaperRank contribution)) LIMIT 10


======== table info ========
CREATE TABLE "ChatPaper" (
	abstract String, 
	id String, 
	vector Array(Float32), 
	categories Array(String), 
	pubdate DateTime, 
	title String, 
	authors Array(String), 
	primary_category String
) ENGINE = ReplicatedReplacingMergeTree()
 ORDER BY id
 PRIMARY KEY id
 
Question: What is PaperRank? What is the contribution of those works? Use paper with more than 2 categories.
SQLQuery: SELECT ChatPaper.title, ChatPaper.id, ChatPaper.authors FROM ChatPaper WHERE length(categories) > 2 ORDER BY DISTANCE(vector, NeuralArray(PaperRank contribution)) LIMIT 10


======== table info ========
CREATE TABLE "ChatArXiv" (
	primary_category String
	categories Array(String), 
	pubdate DateTime, 
	abstract String, 
	title String, 
	paper_id String, 
	vector Array(Float32), 
	authors Array(String), 
) ENGINE = MergeTree()
 ORDER BY paper_id
 PRIMARY KEY paper_id
 
Question: Did Geoffrey Hinton wrote about Capsule Neural Networks? Please use articles published later than 2021.
SQLQuery: SELECT ChatArXiv.title, ChatArXiv.paper_id, ChatArXiv.authors FROM ChatArXiv WHERE has(authors, 'Geoffrey Hinton') AND pubdate > parseDateTimeBestEffort('2021-01-01') ORDER BY DISTANCE(vector, NeuralArray(Capsule Neural Networks)) LIMIT 10


======== table info ========
CREATE TABLE "PaperDatabase" (
	abstract String, 
	categories Array(String), 
	vector Array(Float32), 
	pubdate DateTime, 
	id String, 
	comments String,
	title String, 
	authors Array(String), 
	primary_category String
) ENGINE = MergeTree()
 ORDER BY id
 PRIMARY KEY id
 
Question: Find papers whose abstract has Mutual Information in it.
SQLQuery: SELECT PaperDatabase.title, PaperDatabase.id FROM PaperDatabase WHERE abstract ILIKE '%Mutual Information%' ORDER BY DISTANCE(vector, NeuralArray(Mutual Information)) LIMIT 10

 
Let's begin:

======== table info ========

CREATE TABLE "ChatArXiv" (
	abstract String, 
	id String, 
	vector Array(Float32), 
	pubdate DateTime, 
	title String, 
	categories Array(String), 
	authors Array(String), 
	comment String, 
	primary_category String
) ENGINE = ReplicatedReplacingMergeTree('/clickhouse/tables/bb56d04b-baf0-4676-b051-64fe88c4377c/{shard}', '{replica}')
 ORDER BY id
 PRIMARY KEY id

/*
3 rows from ChatArXiv table:
abstract	id	vector	pubdate	title	categories	authors	comment	primary_category
  Adaptive networks appear in many biological applications. They combine
topological evolution of th	http://arxiv.org/abs/0709.1858v2	[0.007527284,-0.0030175236,0.009047618,-0.04547687,-0.06116927,-0.07906006,-0.06741533,0.0097926,-0.	2007-09-12 21:26:00	Adaptive Coevolutionary Networks: A Review	['physics.soc-ph','cond-mat.stat-mech','q-bio.PE']	['Thilo Gross','Bernd Blasius']	13 pages, 5 figures	physics.soc-ph
  Experimental analysis of data from particle collisions is typically expressed
as statistical limit	http://arxiv.org/abs/1203.6642v3	[0.012555987,0.012607349,-0.004974116,-0.020995466,-0.058307458,-0.042527035,-0.07622006,0.009503578	2012-03-30 03:45:09	Reinterpretion of Experimental Results with Basis Templates	['hep-ex','hep-ph']	['Kanishka Rao','Daniel Whiteson']		hep-ex
  We study the relationship between derived categories of factorizations on
gauged Landau-Ginzburg m	http://arxiv.org/abs/1203.6643v4	[-0.004498747,0.0004967965,0.012601328,-0.0291525,-0.06666261,-0.07153263,-0.073795326,0.011703003,-	2012-03-30 03:45:37	Variation of geometric invariant theory quotients and derived categories	['math.AG']	['Matthew Ballard','David Favero','Ludmil Katzarkov']	Updated references and addresses	math.AG
*/

Question: Introduce some applications of GANs published around 2019.
SQLQuery:"""


# Call the text-generation endpoint directly with the prompt above. As the
# last expression of the cell, the returned completion is displayed as the
# cell output.
client = InferenceClient(model="http://10.1.3.28:8080")
client.text_generation(prompt=qstr, max_new_tokens=100, do_sample=False)

" SELECT ChatArXiv.title, ChatArXiv.id, ChatArXiv.authors, ChatArXiv.comment FROM ChatArXiv WHERE pubdate >= parseDateTime('2019-1-01') ORDER BY DISTANCE(vector, NeuralArray(GAN)) LIMIT 10"