In [1]:
# Keep notebook output clean for now
import warnings
warnings.filterwarnings('ignore')

In [2]:
from langchain_openai import ChatOpenAI
from langchain.chains import RetrievalQA
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
from langchain_community.document_loaders import TextLoader, DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import Chroma
from langchain.prompts.prompt import PromptTemplate
from langchain_community.embeddings import GPT4AllEmbeddings
from langchain_community.embeddings.sentence_transformer import SentenceTransformerEmbeddings
# from langchain_experimental.text_splitter import SemanticChunker
from langchain.chains.query_constructor.base import AttributeInfo
from langchain.retrievers.self_query.base import SelfQueryRetriever

from pathlib import Path

In [3]:
# Load every .txt file under ./sources (recursively) as one document per file.
loader = DirectoryLoader('./sources', glob="**/*.txt", loader_cls=TextLoader)
docs = loader.load()

# Replace the raw file path in each document's metadata with two name fields
# derived from the file name, e.g. "TX_SB2102.txt" -> law_name "TX_SB2102",
# alt_law_name "SB2102".
for doc in docs:
    # Path(...).name is independent of the path separator and of how deeply the
    # file is nested, which matters because the glob above is recursive.
    doc_name = Path(doc.metadata['source']).name.split('.')[0]
    doc.metadata['law_name'] = doc_name

    # File names without an underscore have no second part; fall back to the
    # full name instead of raising IndexError.
    name_parts = doc_name.split('_')
    doc.metadata['alt_law_name'] = name_parts[1] if len(name_parts) > 1 else doc_name

    del doc.metadata['source']
    print(doc.metadata)

{'law_name': 'TX_SB2102', 'alt_law_name': 'SB2102'}
{'law_name': 'TX_SB2588', 'alt_law_name': 'SB2588'}


In [4]:
# Break each loaded law into 1000-character chunks that overlap by 200 characters.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=200,
)
splits = text_splitter.split_documents(docs)

# Embed the chunks with GPT4All and index them in a Chroma vector store.
# Alternative embedding model kept for experimentation:
# embeddings = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")
embeddings = GPT4AllEmbeddings()
vectorstore = Chroma.from_documents(
    splits,
    embeddings,
)

bert_load_from_file: gguf version     = 2
bert_load_from_file: gguf alignment   = 32
bert_load_from_file: gguf data offset = 695552
bert_load_from_file: model name           = BERT
bert_load_from_file: model architecture   = bert
bert_load_from_file: model file type      = 1
bert_load_from_file: bert tokenizer vocab = 30522


In [5]:
# Schema handed to the self-query retriever so the LLM knows which metadata
# fields it may filter on. Each description must be distinct and show the
# expected value format: with identical descriptions the model cannot tell the
# fields apart and builds filters such as law_name == "SB2102", which never
# match the stored value "TX_SB2102".
metadata_field_info = [
    AttributeInfo(
        name="law_name",
        description=(
            "The full identifier of the law: a prefix and the bill number "
            "joined by an underscore, for example 'TX_SB2102'"
        ),
        type="string",
    ),
    AttributeInfo(
        name="alt_law_name",
        description=(
            "The bill number of the law without any prefix, for example 'SB2102'"
        ),
        type="string",
    ),
]

# One-line summary of what the indexed text chunks contain.
document_content_description = "The contents of a law or piece of legislation"

In [6]:
# Chat client pointed at an OpenAI-compatible server on localhost:1234
# (the api_key value is presumably a placeholder the local server ignores).
# The stdout callback echoes tokens as they stream in.
# NOTE(review): this same client is handed to the self-query retriever below; a
# temperature of 0.7 may make its structured output less reliable -- confirm.
llm = ChatOpenAI(
    base_url="http://localhost:1234/v1",
    api_key="not-needed",
    temperature=0.7,
    max_tokens=1000,
    streaming=True,
    callbacks=[StreamingStdOutCallbackHandler()],
)

In [7]:
# Retriever that asks the LLM to turn a question into a search string plus a
# metadata filter over the fields declared in metadata_field_info.
retriever = SelfQueryRetriever.from_llm(
    llm,
    vectorstore,
    document_content_description,
    metadata_field_info,
    verbose=True,
)

In [8]:
# Smoke-test the self-query retriever with a question that names a single bill.
retriever.get_relevant_documents("What does the law SB2102 entail")

```json
{
    "query": "SB2102",
    "filter": "eq(\"law_name\", \"SB2102\") or eq(\"alt_law_name\", \"SB2102\")"
}
```

OutputParserException: Parsing text
```json
{
    "query": "SB2102",
    "filter": "eq(\"law_name\", \"SB2102\") or eq(\"alt_law_name\", \"SB2102\")"
}
```
 raised following error:
Unexpected token Token('CNAME', 'or') at line 1, column 26.
Expected one of: 
	* $END
Previous tokens: [Token('RPAR', ')')]


In [None]:
# Question-answering chain used by askLLM below. It must be defined (not
# commented out), otherwise every askLLM(...) call fails with NameError on `qa`.
# The chain reuses the `llm` client created earlier in the notebook.
qa = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=retriever,
    return_source_documents=True,
    chain_type_kwargs={
        # The loading cell deletes the `source` metadata key and stores
        # `law_name` instead, so the per-document prompt must use `law_name`;
        # referencing `source` would fail for every retrieved chunk.
        'document_prompt': PromptTemplate(
            input_variables=["page_content", "law_name"],
            template="Source: {law_name}\nContext: {page_content}"
        ),
    },
)

In [None]:
def askLLM(query):
    """Run ``query`` through the global ``qa`` chain and print its sources.

    Args:
        query: The natural-language question to send to the chain.

    Returns:
        None. The function only prints, so calling it as the last expression
        of a cell adds no extra output.

    Raises:
        NameError: If the global ``qa`` chain has not been created yet.
    """
    result = qa.invoke({"query": query})
    print("\n --- SOURCES --- \n")
    for idx, doc in enumerate(result['source_documents'], start=1):
        # The loading cell removes `source` from the metadata and stores
        # `law_name` instead; accept either so the label never raises KeyError.
        label = doc.metadata.get('source') or doc.metadata.get('law_name', 'unknown')
        print(f"Source {idx} ({label}):\n", doc.page_content, "\n")

In [None]:
# Broad question about the overall contents of bill SB2102.
askLLM("What was in SB2102?")

In [None]:
# Same broad question, this time for bill SB2588.
askLLM("What was in SB2588?")

In [None]:
# Narrower question: only the fees mentioned in bill SB2102.
askLLM("What fees are included as part of SB2102?")