In [1]:
import os
import getpass

# Ask for each credential with a hidden-input prompt and store the answer in
# os.environ under the matching variable name. Dict order fixes prompt order.
_credential_prompts = {
    'OPENAI_API_KEY': 'OpenAI API Key:',
    'MYSCALE_HOST': 'MyScale URL:',
    'MYSCALE_PORT': 'MyScale Port:',
    'MYSCALE_USERNAME': 'MyScale Username:',
    'MYSCALE_PASSWORD': 'MyScale Password:',
}
for _env_name, _prompt_text in _credential_prompts.items():
    os.environ[_env_name] = getpass.getpass(_prompt_text)

### Create a MyScale vectorstore and insert data

In [48]:
import json
from langchain.schema import Document
from langchain.vectorstores import MyScale
from langchain.embeddings import HuggingFaceEmbeddings

def str2doc(_str):
    """Build a `Document` from one JSON-encoded string.

    The decoded object's 'abstract' value becomes the page content and its
    'metadata' value becomes the document metadata.
    """
    record = json.loads(_str)
    abstract, meta = record['abstract'], record['metadata']
    return Document(page_content=abstract, metadata=meta)

# Read the JSONL corpus, one JSON record per line. The encoding is pinned to
# UTF-8 so the load does not depend on the platform default, and blank lines
# (e.g. a trailing newline at end of file) are skipped rather than being fed
# to json.loads, which would raise on them.
with open('func_call_data.jsonl', encoding='utf-8') as f:
    docs = [str2doc(line) for line in f if line.strip()]

# Embed the documents and insert them into a MyScale vector store.
vectorstore = MyScale.from_documents(documents=docs, embedding=HuggingFaceEmbeddings())

Inserting data...: 100%|██████████| 8/8 [00:01<00:00,  4.41it/s]


### Define metadata columns

We use `AttributeInfo` from [LangChain](https://python.langchain.com/en/latest/) to describe each metadata column, and then format those descriptions into a string. That string is later injected into the prompt.

In [69]:
from langchain.chains.query_constructor.base import _format_attribute_info, AttributeInfo

# One (name, description, type) triple per column described to the model.
_field_specs = [
    (
        "metadata.pubdate",
        "The date when paper is published, need to use `parseDateTime32BestEffort() to convert timestamps in string format to comparable format.` ",
        "timestamp",
    ),
    ("metadata.authors", "List of author names", "list[string]"),
    ("metadata.title", "Title of the paper", "string"),
    ("text", "Abstract of the paper", "string"),
    ("metadata.categories", "arxiv categories to this paper", "list[string]"),
    (
        "length(metadata.categories)",
        "length of arxiv categories to this paper",
        "int",
    ),
]

metadata_field_info = [
    AttributeInfo(name=field, description=desc, type=kind)
    for field, desc, kind in _field_specs
]

# Render the attribute descriptions as a single string and show it; the
# string is reused later when building the chat messages.
formated = _format_attribute_info(metadata_field_info)
print(formated)


{{
    "metadata.pubdate": {{
        "description": "The date when paper is published, need to use `parseDateTime32BestEffort() to convert timestamps in string format to comparable format.` ",
        "type": "timestamp"
    }},
    "metadata.authors": {{
        "description": "List of author names",
        "type": "list[string]"
    }},
    "metadata.title": {{
        "description": "Title of the paper",
        "type": "string"
    }},
    "text": {{
        "description": "Abstract of the paper",
        "type": "string"
    }},
    "metadata.categories": {{
        "description": "arxiv categories to this paper",
        "type": "list[string]"
    }},
    "length(metadata.categories)": {{
        "description": "length of arxiv categories to this paper",
        "type": "int"
    }}
}}


### ... And here is the magic! Function Call from OpenAI

Still, prompting is important. We inject the metadata and some instructions into the conversation context.

We call the OpenAI API directly, passing the function definition as plain JSON, following the [official documentation](https://platform.openai.com/docs/guides/gpt/function-calling).

In [70]:
import openai

# Natural-language question mixing a semantic query with metadata filters.
query = "What is a Bayesian network? Please use articles published later than Feb 2013 and whose abstract like `artificial` with more than 2 categories and must have `cs.CV` in its category."

# Ask the model to split the question into a search string, a SQL WHERE
# clause, and a result limit via OpenAI function calling.
completion = openai.ChatCompletion.create(
        model="gpt-3.5-turbo-0613",
        temperature=0,
        functions=[{"name": "to_structued_sql", 
                    "description": "Convert the query into a query string and a where string to filter this query. When checking if elements is in a list, please use `has(column, element)`",
                    "parameters": {"type": "object", 
                                   "properties": {"query": {"type": "string"},
                                                  "where_str": {"type": "string",},
                                                  "limit": {"type": "integer", "description": "default to 4"}},
                                   # Every required name must be a declared property;
                                   # the schema declares "query", not "subject".
                                   "required": ["query", "where_str", "limit"]
                                   }
                    },],
        function_call="auto",
        messages=[
            {
                "role": "system",
                "content": "You need to provide `metadata` to construct SQL. I will use `parseDateTime32BestEffort()` to convert timestamps in string format to comparable format.",
            },
            {
                 "role": "user",
                "content": f"Metadata: {formated}"
            },
            {
                "role": "system",
                "content": "Now you can input your query",
            },
            {
                "role": "user",
                "content": query
            },
        ],
    )

import json

# With function_call="auto" the model may answer in plain text instead of
# calling the function, so fail with a clear message in that case.
message = completion.choices[0].message
function_call = message.get("function_call")
if function_call is None:
    raise ValueError(f"Model did not return a function call: {message.get('content')!r}")

# The function arguments arrive as a JSON-encoded string.
search_kwargs = json.loads(function_call.arguments)
print(search_kwargs)


{'query': 'What is a Bayesian network?', 'where_str': "parseDateTime32BestEffort(metadata.pubdate) > parseDateTime32BestEffort('2013-02-01') AND text LIKE '%artificial%' AND length(metadata.categories) > 2 AND has(metadata.categories, 'cs.CV')", 'limit': 10}


### Then search!

This works just like LangChain's self-query retrievers, but it is more flexible: the model can write any SQL filter, and even call user-defined functions.

It is up to you to redefine how databases with vector search interact with AGI systems!!

In [71]:

# Map the model's arguments onto the retriever's parameters explicitly. The
# result count is passed as `k`; forwarding it under the name `limit` would
# leave the default result count in effect.
# NOTE(review): `where_str` is model-generated SQL that is handed to the
# database as-is; validate or sandbox it before using untrusted user input.
ret = vectorstore.similarity_search(
    query=search_kwargs["query"],
    k=search_kwargs.get("limit", 4),
    where_str=search_kwargs.get("where_str"),
)
print(len(ret))
for r in ret:
    print(r)

1
page_content='  Artificial intelligence (AI) has become a part of everyday conversation and\nour lives. It is considered as the new electricity that is revolutionizing the\nworld. AI is heavily invested in both industry and academy. However, there is\nalso a lot of hype in the current AI debate. AI based on so-called deep\nlearning has achieved impressive results in many problems, but its limits are\nalready visible. AI has been under research since the 1940s, and the industry\nhas seen many ups and downs due to over-expectations and related\ndisappointments that have followed.\n  The purpose of this book is to give a realistic picture of AI, its history,\nits potential and limitations. We believe that AI is a helper, not a ruler of\nhumans. We begin by describing what AI is and how it has evolved over the\ndecades. After fundamentals, we explain the importance of massive data for the\ncurrent mainstream of artificial intelligence. The most common representations\nfor AI, methods, an