## 数据源配置

In [71]:
from langchain_community.utilities import SQLDatabase

db = SQLDatabase.from_uri("sqlite:///Chinook.db")
print(db.dialect)
print(db.get_usable_table_names())
db.run("SELECT * FROM Artist LIMIT 10;")

sqlite
['Album', 'Artist', 'Customer', 'Employee', 'Genre', 'Invoice', 'InvoiceLine', 'MediaType', 'Playlist', 'PlaylistTrack', 'Track']


"[(1, 'AC/DC'), (2, 'Accept'), (3, 'Aerosmith'), (4, 'Alanis Morissette'), (5, 'Alice In Chains'), (6, 'Antônio Carlos Jobim'), (7, 'Apocalyptica'), (8, 'Audioslave'), (9, 'BackBeat'), (10, 'Billy Cobham')]"

In [73]:
from typing import Dict, Optional
from langchain_core.runnables.utils import Input
from langchain_core.runnables import Runnable, RunnableConfig
from langchain_core.load import Serializable

class StdOutputRunnable(Serializable, Runnable[Input, Input]):
    @property
    def lc_serializable(self) -> bool:
        return True

    def invoke(self, input: Dict, config: Optional[RunnableConfig] = None) -> Input:
        print(input)
        return self._call_with_config(lambda x: x, input, config)

## 多表查询

In [97]:
from langchain.chains.openai_tools import create_extraction_chain_pydantic
from langchain_core.pydantic_v1 import BaseModel, Field
from langchain_openai import ChatOpenAI
import logging

logging.basicConfig(level=logging.DEBUG)

llm = ChatOpenAI(model="gpt-3.5-turbo-1106", temperature=0)


class Table(BaseModel):
    """Table in SQL database."""

    name: str = Field(description="Name of table in SQL database.")
    #schemas: str = Field(description="Schema of table in SQL database.")


table_names = "\n".join(db.get_usable_table_names())
system = f"""Return the names and schemas of ALL the SQL tables that MIGHT be relevant to the user question. \
The tables are:

{table_names}

Remember to include ALL POTENTIALLY RELEVANT tables, even if you're not sure that they're needed."""
print(system)
table_chain = create_extraction_chain_pydantic(Table, llm, system_message=system)
table_chain.invoke({"input": "What are all the genres of Alanis Morisette songs"})

DEBUG:httpx:load_ssl_context verify=True cert=None trust_env=True http2=False
DEBUG:httpx:load_verify_locations cafile='D:\\dev\\miniconda3\\envs\\langchain\\lib\\site-packages\\certifi\\cacert.pem'
DEBUG:httpx:load_ssl_context verify=True cert=None trust_env=True http2=False
DEBUG:httpx:load_verify_locations cafile='D:\\dev\\miniconda3\\envs\\langchain\\lib\\site-packages\\certifi\\cacert.pem'
DEBUG:httpx:load_ssl_context verify=True cert=None trust_env=True http2=False
DEBUG:httpx:load_verify_locations cafile='D:\\dev\\miniconda3\\envs\\langchain\\lib\\site-packages\\certifi\\cacert.pem'
DEBUG:httpx:load_ssl_context verify=True cert=None trust_env=True http2=False
DEBUG:httpx:load_verify_locations cafile='D:\\dev\\miniconda3\\envs\\langchain\\lib\\site-packages\\certifi\\cacert.pem'
DEBUG:httpx:load_ssl_context verify=True cert=None trust_env=True http2=False
DEBUG:httpx:load_verify_locations cafile='D:\\dev\\miniconda3\\envs\\langchain\\lib\\site-packages\\certifi\\cacert.pem'
DEBUG

Return the names and schemas of ALL the SQL tables that MIGHT be relevant to the user question. The tables are:

Album
Artist
Customer
Employee
Genre
Invoice
InvoiceLine
MediaType
Playlist
PlaylistTrack
Track

Remember to include ALL POTENTIALLY RELEVANT tables, even if you're not sure that they're needed.


DEBUG:httpcore.proxy:start_tls.complete return_value=<httpcore._backends.sync.SyncStream object at 0x000001AF668DF310>
DEBUG:httpcore.http11:send_request_headers.started request=<Request [b'POST']>
DEBUG:httpcore.http11:send_request_headers.complete
DEBUG:httpcore.http11:send_request_body.started request=<Request [b'POST']>
DEBUG:httpcore.http11:send_request_body.complete
DEBUG:httpcore.http11:receive_response_headers.started request=<Request [b'POST']>
DEBUG:httpcore.http11:receive_response_headers.complete return_value=(b'HTTP/1.1', 200, b'OK', [(b'Date', b'Thu, 22 Feb 2024 07:42:20 GMT'), (b'Content-Type', b'application/json'), (b'Transfer-Encoding', b'chunked'), (b'Connection', b'keep-alive'), (b'access-control-allow-origin', b'*'), (b'Cache-Control', b'no-cache, must-revalidate'), (b'openai-model', b'gpt-3.5-turbo-1106'), (b'openai-organization', b'user-afy5yzv93geh2cq6ef0fp6be'), (b'openai-processing-ms', b'33857'), (b'openai-version', b'2020-10-01'), (b'strict-transport-security

[Table(name='Genre'), Table(name='Artist'), Table(name='Track')]

In [98]:
from typing import List


def get_tables(categories: List[Table]) -> List[str]:
    tables = []
    for category in categories:
        tables.append(category.name)
    return tables


table_chain = table_chain | get_tables  # noqa
table_chain.invoke({"input": "What are all the genres of Alanis Morisette songs"})

DEBUG:openai._base_client:Request options: {'method': 'post', 'url': '/chat/completions', 'files': None, 'json_data': {'messages': [{'role': 'system', 'content': "Return the names and schemas of ALL the SQL tables that MIGHT be relevant to the user question. The tables are:\n\nAlbum\nArtist\nCustomer\nEmployee\nGenre\nInvoice\nInvoiceLine\nMediaType\nPlaylist\nPlaylistTrack\nTrack\n\nRemember to include ALL POTENTIALLY RELEVANT tables, even if you're not sure that they're needed."}, {'role': 'user', 'content': 'What are all the genres of Alanis Morisette songs'}], 'model': 'gpt-3.5-turbo-1106', 'n': 1, 'stream': False, 'temperature': 0.0, 'tools': [{'type': 'function', 'function': {'name': 'Table', 'description': 'Table in SQL database.', 'parameters': {'type': 'object', 'properties': {'name': {'description': 'Name of table in SQL database.', 'type': 'string'}}, 'required': ['name']}}}]}}
DEBUG:httpcore.connection:connect_tcp.started host='127.0.0.1' port=1089 local_address=None timeou

['Genre', 'Artist', 'Track']

In [99]:
from langchain.chains import create_sql_query_chain
from langchain_core.runnables import RunnablePassthrough
from operator import itemgetter

from langchain_core.prompts import PromptTemplate

template = '''Given an input question, first create a syntactically correct postgres query to run, then look at the results of the query and return the answer, show {top_k}.
Use the following format:

Question: "Question here"
SQLQuery: "SQL Query to run"
SQLResult: "Result of the SQLQuery"
Answer: "Final answer here"

Only use the following tables:

{table_info}.

Question: {input}'''
prompt = PromptTemplate.from_template(template)

query_chain = create_sql_query_chain(llm, db)
table_chain = {"input": itemgetter("question")} | table_chain | StdOutputRunnable()
full_chain = RunnablePassthrough.assign(table_names_to_use=table_chain) | StdOutputRunnable() | query_chain

full_chain.invoke(
    {"question": "What are all the genres of Alanis Morisette songs"}
)

DEBUG:openai._base_client:Request options: {'method': 'post', 'url': '/chat/completions', 'files': None, 'json_data': {'messages': [{'role': 'system', 'content': "Return the names and schemas of ALL the SQL tables that MIGHT be relevant to the user question. The tables are:\n\nAlbum\nArtist\nCustomer\nEmployee\nGenre\nInvoice\nInvoiceLine\nMediaType\nPlaylist\nPlaylistTrack\nTrack\n\nRemember to include ALL POTENTIALLY RELEVANT tables, even if you're not sure that they're needed."}, {'role': 'user', 'content': 'What are all the genres of Alanis Morisette songs'}], 'model': 'gpt-3.5-turbo-1106', 'n': 1, 'stream': False, 'temperature': 0.0, 'tools': [{'type': 'function', 'function': {'name': 'Table', 'description': 'Table in SQL database.', 'parameters': {'type': 'object', 'properties': {'name': {'description': 'Name of table in SQL database.', 'type': 'string'}}, 'required': ['name']}}}]}}
DEBUG:httpcore.connection:connect_tcp.started host='127.0.0.1' port=1089 local_address=None timeou

['Genre', 'Artist', 'Track']
{'question': 'What are all the genres of Alanis Morisette songs', 'table_names_to_use': ['Genre', 'Artist', 'Track']}


DEBUG:httpcore.http11:receive_response_headers.complete return_value=(b'HTTP/1.1', 200, b'OK', [(b'Date', b'Thu, 22 Feb 2024 07:42:53 GMT'), (b'Content-Type', b'application/json'), (b'Transfer-Encoding', b'chunked'), (b'Connection', b'keep-alive'), (b'access-control-allow-origin', b'*'), (b'Cache-Control', b'no-cache, must-revalidate'), (b'openai-model', b'gpt-3.5-turbo-1106'), (b'openai-organization', b'user-afy5yzv93geh2cq6ef0fp6be'), (b'openai-processing-ms', b'2145'), (b'openai-version', b'2020-10-01'), (b'strict-transport-security', b'max-age=15724800; includeSubDomains'), (b'x-ratelimit-limit-requests', b'200'), (b'x-ratelimit-limit-tokens', b'40000'), (b'x-ratelimit-remaining-requests', b'186'), (b'x-ratelimit-remaining-tokens', b'39373'), (b'x-ratelimit-reset-requests', b'1h35m4.293s'), (b'x-ratelimit-reset-tokens', b'940ms'), (b'x-request-id', b'req_57ed0d52ae841f0e1218b8d4765bd103'), (b'CF-Cache-Status', b'DYNAMIC'), (b'Server', b'cloudflare'), (b'CF-RAY', b'8595984338ce69e3-

'SELECT "Genre"."Name"\nFROM "Genre"\nJOIN "Track" ON "Genre"."GenreId" = "Track"."GenreId"\nJOIN "Album" ON "Track"."AlbumId" = "Album"."AlbumId"\nJOIN "Artist" ON "Album"."ArtistId" = "Artist"."ArtistId"\nWHERE "Artist"."Name" = \'Alanis Morissette\'\nLIMIT 5;'

In [51]:
system = """Return the names of the SQL tables that are relevant to the user question. \
The tables are:

Music
Business"""
category_chain = create_extraction_chain_pydantic(Table, llm, system_message=system)
category_chain.invoke({"input": "What are all the genres of Alanis Morisette songs"})

DEBUG:openai._base_client:Request options: {'method': 'post', 'url': '/chat/completions', 'files': None, 'json_data': {'messages': [{'role': 'system', 'content': 'Return the names of the SQL tables that are relevant to the user question. The tables are:\n\nMusic\nBusiness'}, {'role': 'user', 'content': 'What are all the genres of Alanis Morisette songs'}], 'model': 'gpt-3.5-turbo-1106', 'n': 1, 'stream': False, 'temperature': 0.0, 'tools': [{'type': 'function', 'function': {'name': 'Table', 'description': 'Table in SQL database.', 'parameters': {'type': 'object', 'properties': {'name': {'description': 'Name of table in SQL database.', 'type': 'string'}, 'schemas': {'description': 'Schema of table in SQL database.', 'type': 'string'}}, 'required': ['name', 'schemas']}}}]}}
DEBUG:httpcore.http11:send_request_headers.started request=<Request [b'POST']>
DEBUG:httpcore.http11:send_request_headers.complete
DEBUG:httpcore.http11:send_request_body.started request=<Request [b'POST']>
DEBUG:http

[Table(name='Music', schemas='*'), Table(name='Business', schemas='*')]

In [52]:
from typing import List


def get_tables(categories: List[Table]) -> List[str]:
    tables = []
    for category in categories:
        if category.name == "Music":
            tables.extend(
                [
                    "Album",
                    "Artist",
                    "Genre",
                    "MediaType",
                    "Playlist",
                    "PlaylistTrack",
                    "Track",
                ]
            )
        elif category.name == "Business":
            tables.extend(["Customer", "Employee", "Invoice", "InvoiceLine"])
    return tables


table_chain = category_chain | get_tables  # noqa
table_chain.invoke({"input": "What are all the genres of Alanis Morisette songs"})

DEBUG:openai._base_client:Request options: {'method': 'post', 'url': '/chat/completions', 'files': None, 'json_data': {'messages': [{'role': 'system', 'content': 'Return the names of the SQL tables that are relevant to the user question. The tables are:\n\nMusic\nBusiness'}, {'role': 'user', 'content': 'What are all the genres of Alanis Morisette songs'}], 'model': 'gpt-3.5-turbo-1106', 'n': 1, 'stream': False, 'temperature': 0.0, 'tools': [{'type': 'function', 'function': {'name': 'Table', 'description': 'Table in SQL database.', 'parameters': {'type': 'object', 'properties': {'name': {'description': 'Name of table in SQL database.', 'type': 'string'}, 'schemas': {'description': 'Schema of table in SQL database.', 'type': 'string'}}, 'required': ['name', 'schemas']}}}]}}
DEBUG:httpcore.http11:send_request_headers.started request=<Request [b'POST']>
DEBUG:httpcore.http11:send_request_headers.complete
DEBUG:httpcore.http11:send_request_body.started request=<Request [b'POST']>
DEBUG:http

['Album',
 'Artist',
 'Genre',
 'MediaType',
 'Playlist',
 'PlaylistTrack',
 'Track',
 'Album',
 'Artist',
 'Genre',
 'MediaType',
 'Playlist',
 'PlaylistTrack',
 'Track']

## 高基数列（xx）

In [53]:
import ast
import re


def query_as_list(db, query):
    res = db.run(query)
    res = [el for sub in ast.literal_eval(res) for el in sub if el]
    res = [re.sub(r"\b\d+\b", "", string).strip() for string in res]
    return res


proper_nouns = query_as_list(db, "SELECT Name FROM Artist")
proper_nouns += query_as_list(db, "SELECT Title FROM Album")
proper_nouns += query_as_list(db, "SELECT Name FROM Genre")
print(len(proper_nouns))
proper_nouns[:5]

647


['AC/DC', 'Accept', 'Aerosmith', 'Alanis Morissette', 'Alice In Chains']

In [54]:
from langchain_community.vectorstores import FAISS
from langchain_openai import OpenAIEmbeddings

vector_db = FAISS.from_texts(proper_nouns, OpenAIEmbeddings())
retriever = vector_db.as_retriever(search_kwargs={"k": 15})

DEBUG:httpx:load_ssl_context verify=True cert=None trust_env=True http2=False
DEBUG:httpx:load_verify_locations cafile='D:\\dev\\miniconda3\\envs\\langchain\\lib\\site-packages\\certifi\\cacert.pem'
DEBUG:httpx:load_ssl_context verify=True cert=None trust_env=True http2=False
DEBUG:httpx:load_verify_locations cafile='D:\\dev\\miniconda3\\envs\\langchain\\lib\\site-packages\\certifi\\cacert.pem'
DEBUG:httpx:load_ssl_context verify=True cert=None trust_env=True http2=False
DEBUG:httpx:load_verify_locations cafile='D:\\dev\\miniconda3\\envs\\langchain\\lib\\site-packages\\certifi\\cacert.pem'
DEBUG:httpx:load_ssl_context verify=True cert=None trust_env=True http2=False
DEBUG:httpx:load_verify_locations cafile='D:\\dev\\miniconda3\\envs\\langchain\\lib\\site-packages\\certifi\\cacert.pem'
DEBUG:httpx:load_ssl_context verify=True cert=None trust_env=True http2=False
DEBUG:httpx:load_verify_locations cafile='D:\\dev\\miniconda3\\envs\\langchain\\lib\\site-packages\\certifi\\cacert.pem'
DEBUG

In [55]:
from langchain.chains import create_sql_query_chain
from operator import itemgetter

from langchain_core.prompts import ChatPromptTemplate
from langchain_core.runnables import RunnablePassthrough

system = """You are a SQLite expert. Given an input question, create a syntactically \
correct SQLite query to run. Unless otherwise specificed, do not return more than \
{top_k} rows.\n\nHere is the relevant table info: {table_info}\n\nHere is a non-exhaustive \
list of possible feature values. If filtering on a feature value make sure to check its spelling \
against this list first:\n\n{proper_nouns}"""

prompt = ChatPromptTemplate.from_messages([("system", system), ("human", "{input}")])

query_chain = create_sql_query_chain(llm, db, prompt=prompt)
retriever_chain = (
    itemgetter("question")
    | retriever
    | (lambda docs: "\n".join(doc.page_content for doc in docs))
)
chain = RunnablePassthrough.assign(proper_nouns=retriever_chain) | query_chain

In [56]:
# Without retrieval
query = query_chain.invoke(
    {"question": "What are all the genres of elenis moriset songs", "proper_nouns": ""}
)
print(query)
db.run(query)

DEBUG:openai._base_client:Request options: {'method': 'post', 'url': '/chat/completions', 'files': None, 'json_data': {'messages': [{'role': 'system', 'content': 'You are a SQLite expert. Given an input question, create a syntactically correct SQLite query to run. Unless otherwise specificed, do not return more than 5 rows.\n\nHere is the relevant table info: \nCREATE TABLE "Album" (\n\t"AlbumId" INTEGER NOT NULL, \n\t"Title" NVARCHAR(160) NOT NULL, \n\t"ArtistId" INTEGER NOT NULL, \n\tPRIMARY KEY ("AlbumId"), \n\tFOREIGN KEY("ArtistId") REFERENCES "Artist" ("ArtistId")\n)\n\n/*\n3 rows from Album table:\nAlbumId\tTitle\tArtistId\n1\tFor Those About To Rock We Salute You\t1\n2\tBalls to the Wall\t2\n3\tRestless and Wild\t2\n*/\n\n\nCREATE TABLE "Artist" (\n\t"ArtistId" INTEGER NOT NULL, \n\t"Name" NVARCHAR(120), \n\tPRIMARY KEY ("ArtistId")\n)\n\n/*\n3 rows from Artist table:\nArtistId\tName\n1\tAC/DC\n2\tAccept\n3\tAerosmith\n*/\n\n\nCREATE TABLE "Customer" (\n\t"CustomerId" INTEGER N

```sql
SELECT DISTINCT g.Name AS Genre
FROM Track t
JOIN Album a ON t.AlbumId = a.AlbumId
JOIN Genre g ON t.GenreId = g.GenreId
WHERE t.Composer LIKE '%Elenis Moriset%'
```


OperationalError: (sqlite3.OperationalError) near "```sql
SELECT DISTINCT g.Name AS Genre
FROM Track t
JOIN Album a ON t.AlbumId = a.AlbumId
JOIN Genre g ON t.GenreId = g.GenreId
WHERE t.Composer LIKE '%Elenis Moriset%'
```": syntax error
[SQL: ```sql
SELECT DISTINCT g.Name AS Genre
FROM Track t
JOIN Album a ON t.AlbumId = a.AlbumId
JOIN Genre g ON t.GenreId = g.GenreId
WHERE t.Composer LIKE '%Elenis Moriset%'
```]
(Background on this error at: https://sqlalche.me/e/20/e3q8)