In [1]:
# Keep notebook output clean for now
import warnings
warnings.filterwarnings('ignore')

In [2]:
from langchain_openai import ChatOpenAI
from langchain.chains import RetrievalQA
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
from langchain_community.document_loaders import TextLoader, DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import Chroma
from langchain.prompts.prompt import PromptTemplate
from langchain_community.embeddings import GPT4AllEmbeddings
from langchain_community.embeddings.sentence_transformer import SentenceTransformerEmbeddings
from langchain.chains.query_constructor.base import AttributeInfo
from langchain.retrievers.self_query.base import SelfQueryRetriever

from pathlib import Path

In [3]:
# Load every .txt file under ./sources (recursively) as one document per file
loader = DirectoryLoader('./sources', glob="**/*.txt", loader_cls=TextLoader)
docs = loader.load()

for doc in docs:
    # Derive the law name from the file name itself. Path(...).stem works for
    # files in nested sub-directories (which the "**" glob can match) and is
    # independent of the path separator, unlike indexing into split('/').
    doc_name = Path(doc.metadata['source']).stem
    doc.metadata['Law Name'] = doc_name
    # Drop the leading "<prefix>_" (e.g. "TX_SB2102" -> "SB2102"); a name with
    # no underscore is kept whole instead of raising IndexError.
    doc.metadata['Alt Law Name'] = doc_name.split('_', 1)[-1]
    # The raw file path is not wanted in the metadata carried by each chunk
    del doc.metadata['source']
    print(doc.metadata)

{'Law Name': 'TX_SB2102', 'Alt Law Name': 'SB2102'}
{'Law Name': 'TX_SB2588', 'Alt Law Name': 'SB2588'}


In [4]:
# Chunk the loaded laws; chunk size and overlap are the splitter settings below
text_splitter = RecursiveCharacterTextSplitter(
    chunk_overlap=50,
    chunk_size=500,
)
splits = text_splitter.split_documents(docs)

In [5]:
# How many chunks the splitter produced across all loaded documents
print(len(splits))

75


In [6]:
# Inspect the first chunk (text and metadata) as produced by the splitter
print(splits[0])

page_content='S.B. No. 2102\n \n \n \n \t\nAN ACT\n \trelating to the initial registration and inspection period for\n \tcertain rental vehicles; authorizing fees.\n \t       BE IT ENACTED BY THE LEGISLATURE OF THE STATE OF TEXAS:\n \t       SECTION 1.  Section 382.0622(a), Health and Safety Code, is\n \tamended to read as follows:\n \t       (a)  Clean Air Act fees consist of:\n \t             (1)  fees collected by the commission under Sections\n \t382.062, 382.0621, 382.202, and 382.302 and as otherwise provided\n \tby law;' metadata={'Law Name': 'TX_SB2102', 'Alt Law Name': 'SB2102'}


In [7]:
# Prefix every chunk's text with a one-line summary of its metadata so the law
# name is part of the text that is embedded and later shown to the LLM.
for split in splits:
    # "key: val, key: val" for all metadata entries of this chunk
    metadata_summary = ", ".join(f"{key}: {val}" for key, val in split.metadata.items())
    # NOTE(review): "excpert" is a typo for "excerpt". The literal is left
    # unchanged here because it is part of the embedded text and of the
    # recorded outputs; fix it together with a full re-run.
    header = "An excpert from, " + metadata_summary + "\n-----"
    # Only add the header once, so re-running this cell does not stack
    # duplicate headers onto chunks that already have one.
    if not split.page_content.startswith(header):
        split.page_content = header + split.page_content
    

In [8]:
# Re-inspect the first chunk to confirm the metadata header now leads its text
print(splits[0])

page_content='An excpert from, Law Name: TX_SB2102, Alt Law Name: SB2102\n-----S.B. No. 2102\n \n \n \n \t\nAN ACT\n \trelating to the initial registration and inspection period for\n \tcertain rental vehicles; authorizing fees.\n \t       BE IT ENACTED BY THE LEGISLATURE OF THE STATE OF TEXAS:\n \t       SECTION 1.  Section 382.0622(a), Health and Safety Code, is\n \tamended to read as follows:\n \t       (a)  Clean Air Act fees consist of:\n \t             (1)  fees collected by the commission under Sections\n \t382.062, 382.0621, 382.202, and 382.302 and as otherwise provided\n \tby law;' metadata={'Law Name': 'TX_SB2102', 'Alt Law Name': 'SB2102'}


In [9]:
# Embed every chunk with GPT4All embeddings and index the result in Chroma
embeddings = GPT4AllEmbeddings()
vectorstore = Chroma.from_documents(documents=splits, embedding=embeddings)

bert_load_from_file: gguf version     = 2
bert_load_from_file: gguf alignment   = 32
bert_load_from_file: gguf data offset = 695552
bert_load_from_file: model name           = BERT
bert_load_from_file: model architecture   = bert
bert_load_from_file: model file type      = 1
bert_load_from_file: bert tokenizer vocab = 30522


In [10]:
# Retriever over the vector store; k=10 is passed through as a search kwarg
retriever = vectorstore.as_retriever(search_kwargs=dict(k=10))

In [11]:
# Sanity-check retrieval: print which law each returned chunk belongs to.
# The results get their own name so the `docs` corpus loaded earlier is not
# overwritten (re-running the splitter cell would otherwise split these hits).
retrieved_docs = retriever.invoke("What is in SB2102?")
for retrieved in retrieved_docs:
    print(retrieved.metadata['Law Name'])

TX_SB2102
TX_SB2102
TX_SB2102
TX_SB2102
TX_SB2102
TX_SB2102
TX_SB2102
TX_SB2102
TX_SB2588
TX_SB2102


In [12]:
# Chat client pointed at an OpenAI-compatible endpoint on localhost:1234.
# Tokens are streamed to stdout through the callback handler as they arrive.
llm = ChatOpenAI(
    base_url="http://localhost:1234/v1",
    api_key="not-needed",
    temperature=0.7,
    max_tokens=1000,
    streaming=True,
    callbacks=[StreamingStdOutCallbackHandler()],
)

In [13]:
# Retrieval QA chain built from the `llm` and `retriever` defined in the
# preceding cells. chain_type="stuff" and return_source_documents=True are
# passed so the result also carries the retrieved chunks under
# 'source_documents'.
qa = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=retriever,
    return_source_documents=True,
)

In [14]:
def askLLM(query):
    """Run `query` through the `qa` chain and print the chunks it retrieved.

    The `llm` behind the chain is created with a stdout streaming callback,
    so the answer text is expected to appear on stdout while the chain runs;
    this function then prints a "SOURCES" section listing each retrieved
    chunk's text.

    Args:
        query: The question to ask, as a plain string.

    Returns:
        None. Nothing is returned, so calling this as the last expression of
        a cell does not echo the raw result dict.
    """
    # `invoke` is the supported entry point; calling the chain object directly
    # is deprecated and emits a deprecation warning.
    result = qa.invoke({"query": query})
    print("\n --- SOURCES --- \n")
    for idx, doc in enumerate(result['source_documents'], start=1):
        print(f"Source {idx}:\n", doc.page_content, "\n")

In [15]:
askLLM(query="Explain SB2588 to me like I'm 5")

  warn_deprecated(


Sure thing! So, there's a law called SB2588. It's like a set of rules that the grown-ups made for something called a Municipal Utility District in Fort Bend County. This district is responsible for building and taking care of roads and drainage systems, but they need permission from the cities or towns where the roads are located. The law also says that the district can collect money from people who live in the area to help pay for these projects. And there's another law called SB2102 that talks about cars getting inspected every year or two to make sure they don't pollute too much. The grown-ups have made sure all the rules were followed before this law could be passed. That's it! Let me know if you have any other questions.
 --- SOURCES --- 

Source 1:
 An excpert from, Law Name: TX_SB2588, Alt Law Name: SB2588
-----municipality for operation and maintenance macadamized, graveled,
 	or paved roads, or improvements, including storm drainage, in aid
 	of those roads.
 	       Sec. 7974

In [16]:
askLLM(query="Explain SB2102 to me like I'm 5")

Okay, imagine there's a rule that says some cars in Texas need to have their emissions checked every year or every two years. But now, there's a new rule that says some cars can have their emissions checked every three years instead. This rule is part of a bigger plan called SB2102. The plan also says how the state should collect fees for these emissions checks to make sure they get the same amount of money as before. And if the big plan requires it, the state must set up this system. But, even if this new rule applies, cars still need to pass an emissions check if there's a federal or state law that says so. And some cars, like rental cars, have different rules.
 --- SOURCES --- 

Source 1:
 An excpert from, Law Name: TX_SB2102, Alt Law Name: SB2102
-----quality state implementation plan to provide for a three-year
 	emissions inspection period for a vehicle described by Section
 	548.1025(a), Transportation Code, as added by this Act.
 	       (b)  On the approval of a revision to th

In [17]:
askLLM(query="What fees are included as part of SB2102?")

SB2102 includes several fees related to vehicle inspections and registrations. Here is a summary:
- Fees collected for inspections of vehicles other than mopeds under Sections 548.501 and 548.503, Transportation Code, amounting to $2 per inspection.
- A fee of $6 for an inspection of a vehicle under Section 548.5035, Transportation Code.
- Fees required under the law that are not specifically mentioned in the provided context.
Additionally, SB2102 establishes a three-year initial inspection period for certain rental vehicles and sets fees for their inspections. The amount of these fees is to be determined by rule.
 --- SOURCES --- 

Source 1:
 An excpert from, Law Name: TX_SB2102, Alt Law Name: SB2102
-----by law;
 	             (2)  $2 from the portion of each fee collected for
 	inspections of vehicles other than mopeds and remitted to the state
 	under Sections 548.501 and 548.503, Transportation Code;
 	             (3)  $6 from the portion of each fee collected for an
 	inspection