# Imports

In [40]:
# DB Imports
import os
import chromadb
from chromadb.config import Settings


#Ingestions  
from langchain.document_loaders import PyPDFLoader, DirectoryLoader, PDFMinerLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import SentenceTransformerEmbeddings
from langchain.vectorstores import Chroma


# imports for the llm logic
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from transformers import pipeline
import torch
import base64
import textwrap
from langchain.embeddings import SentenceTransformerEmbeddings
from langchain.vectorstores import Chroma
from langchain.llms import HuggingFacePipeline
from langchain.chains import RetrievalQA

## VectorDB config

In [41]:
# Chroma client settings shared by the ingestion step (Chroma.from_documents)
# and by the query step (the Chroma(...) call inside qa_llm).

CHROMA_SETTINGS = Settings(
    # Storage backend requested for the vector store.
    # NOTE(review): the settings printed in the next cell also list sqlite-backed
    # implementations — confirm which backend this chromadb version really uses.
    chroma_db_impl = 'duckdb+parquet', 
    # On-disk location of the store. The same "db" literal is repeated in the
    # ingestion cell and inside qa_llm, so all three have to be kept in sync.
    persist_directory = "db",
    # Opt out of Chroma's anonymized usage telemetry.
    anonymized_telemetry = False
)


In [42]:
print(CHROMA_SETTINGS)

environment='' chroma_db_impl='duckdb+parquet' chroma_api_impl='chromadb.api.local.LocalAPI' chroma_telemetry_impl='chromadb.telemetry.posthog.Posthog' chroma_sysdb_impl='chromadb.db.impl.sqlite.SqliteDB' chroma_producer_impl='chromadb.db.impl.sqlite.SqliteDB' chroma_consumer_impl='chromadb.db.impl.sqlite.SqliteDB' chroma_segment_manager_impl='chromadb.segment.impl.manager.local.LocalSegmentManager' clickhouse_host=None clickhouse_port=None tenant_id='default' topic_namespace='default' persist_directory='db' chroma_server_host=None chroma_server_http_port=None chroma_server_ssl_enabled=False chroma_server_grpc_port=None chroma_server_cors_allow_origins=[] anonymized_telemetry=False allow_reset=False sqlite_database=':memory:' migrations='apply'


## Ingestion

In [43]:
# Directory in which the Chroma store is persisted (mirrors CHROMA_SETTINGS).
persist_directory = "db"

# Walk the "docs" tree and load every PDF found there.
# The pages of each file are appended to `documents` inside the loop, so no
# file is dropped when several PDFs are present, and `documents` is an empty
# list (rather than an undefined name) when no PDF is found at all.
documents = []
for root, dirs, files in os.walk("docs"):
    for file in files:
        if file.endswith(".pdf"):
            print(f'Loading ===========> {file}')
            loader = PDFMinerLoader(os.path.join(root, file))
            documents.extend(loader.load())



In [44]:
documents



In [45]:
# Splitter used to cut the loaded documents into overlapping chunks before embedding.
# chunk_size / chunk_overlap are presumably measured in characters — TODO confirm
# against the splitter's length function.
text_splitter = RecursiveCharacterTextSplitter(chunk_size= 400, chunk_overlap=40)

In [46]:
texts = text_splitter.split_documents(documents)

In [47]:
print(len(texts))

12202


In [48]:
texts[0]

Document(page_content='Title 49 \nTransportation \n\nParts 1 to 99 \n\nRevised as of October 1, 2021 \n\nContaining a codification of documents \nof general applicability and future effect \n\nAs of October 1, 2021 \n\nPublished by the Office of the Federal Register \nNational Archives and Records Administration \nas a Special Edition of the Federal Register \n\nB\nO\nJ\n_\n$\n$\n\nh\nt\ni\n\nI\n\nw\n2\n0\n7\nN\nW\nR\nF\nO\nM\nV\nn\no\n\ne\nn\ny\na\np\nk', metadata={'source': 'docs/CFR-2022-title49-vol1.pdf'})

In [49]:
texts[1]

Document(page_content='w\n2\n0\n7\nN\nW\nR\nF\nO\nM\nV\nn\no\n\ne\nn\ny\na\np\nk\n\nVerDate Sep<11>2014  16:32 Jun 16, 2022 Jkt 253227 PO 00000 Frm 00001 Fmt 8091 Sfmt 8091 Q:\\49\\49V1.TXT PC31\n\n \n \n \n \n\x0cU.S.  GOVERNMENT  OFFICIAL  EDITION  NOTICE \n\nLegal  Status  and  Use  of  Seals  and  Logos', metadata={'source': 'docs/CFR-2022-title49-vol1.pdf'})

In [50]:
texts[2]

Document(page_content='The  seal  of  the  National  Archives  and  Records  Administration \n(NARA)  authenticates  the  Code  of  Federal  Regulations  (CFR)  as \nthe  official  codification  of  Federal  regulations  established  under \nthe Federal Register Act. Under the provisions of 44 U.S.C. 1507, the \ncontents of the CFR, a special edition of the Federal Register, shall', metadata={'source': 'docs/CFR-2022-title49-vol1.pdf'})

In [51]:
texts[3]

Document(page_content='be judicially noticed. The CFR is prima facie evidence of the origi-\nnal documents published in the Federal Register (44 U.S.C. 1510).', metadata={'source': 'docs/CFR-2022-title49-vol1.pdf'})

In [52]:
# Sentence-transformer embedding model used to index the chunks.
# qa_llm() creates its own instance with the same model name; keep the two names
# identical so queries are embedded with the model that indexed the chunks.
embeddings = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")

In [53]:
# Embed every chunk and store the vectors in a Chroma collection under persist_directory.
db = Chroma.from_documents(texts, embeddings, persist_directory=persist_directory, client_settings=CHROMA_SETTINGS)
# Write the store to disk; qa_llm() later reopens it from the "db" directory.
db.persist()
# Drop the reference to this client — presumably so the store is released
# before it is reopened for querying (TODO confirm).
db=None

## LLM 

In [54]:
# Model checkpoint handed to the from_pretrained() calls in the following cells.
# NOTE(review): there is no organisation prefix, so this presumably resolves to a
# local directory relative to the working directory — confirm the folder exists.
checkpoint = "LaMini-T5-738M"

In [55]:
%pwd

'/mnt/e/USTransportationLLM'

In [56]:
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

In [57]:
# Load the seq2seq model for `checkpoint` with float32 weights.
# device_map="auto" presumably leaves device placement to the library — TODO confirm.
base_model = AutoModelForSeq2SeqLM.from_pretrained(
    checkpoint,
    device_map="auto",
    torch_dtype = torch.float32)

In [66]:

def llm_Pipeline():
    """Wrap the loaded seq2seq model in a LangChain LLM object.

    Builds a ``text2text-generation`` pipeline from the module-level
    ``base_model`` and ``tokenizer`` with sampling enabled, then returns that
    pipeline wrapped in ``HuggingFacePipeline``.
    """
    # Generation options forwarded to the pipeline.
    generation_options = {
        "max_length": 512,
        "do_sample": True,
        "temperature": 0.4,
        "top_p": 0.9,
    }
    text_generator = pipeline(
        'text2text-generation',
        model=base_model,
        tokenizer=tokenizer,
        **generation_options,
    )
    return HuggingFacePipeline(pipeline=text_generator)

In [67]:
import streamlit as st

In [68]:
@st.cache_resource
def qa_llm():
    """Assemble the retrieval-QA chain over the persisted Chroma store.

    Creates the LLM via ``llm_Pipeline()``, reopens the vector store in the
    "db" directory with the ``all-MiniLM-L6-v2`` embedding function, and
    combines both into a "stuff"-type ``RetrievalQA`` chain that also returns
    the source documents it retrieved.
    """
    language_model = llm_Pipeline()
    embedding_fn = SentenceTransformerEmbeddings(model_name='all-MiniLM-L6-v2')
    vector_store = Chroma(
        persist_directory="db",
        embedding_function=embedding_fn,
        client_settings=CHROMA_SETTINGS,
    )
    return RetrievalQA.from_chain_type(
        llm=language_model,
        chain_type="stuff",
        retriever=vector_store.as_retriever(),
        return_source_documents=True,
    )

In [69]:
def process_answer(instruction):
    """Answer a question with the retrieval-QA chain.

    Args:
        instruction: The user's question; it is passed to the chain unchanged.

    Returns:
        A ``(answer, generated_text)`` tuple, where ``answer`` is the
        ``'result'`` entry of the chain output and ``generated_text`` is the
        complete chain output.
    """
    qa = qa_llm()
    generated_text = qa(instruction)
    return generated_text['result'], generated_text

In [62]:
q = input("Enter your query here!")

Enter your query here! which is list organization of the United States Department of Transportation?


In [70]:
print(q)

which is list organization of the United States Department of Transportation?


In [71]:
ans, meta = process_answer(q)

In [72]:
print(ans)

The answer is (a) The Secretary of Transportation is the head of the Department.


In [73]:
print(meta)

{'query': 'which is list organization of the United States Department of Transportation?', 'result': 'The answer is (a) The Secretary of Transportation is the head of the Department.', 'source_documents': [Document(page_content='AUTHORITY: 49 U.S.C. 322. \n\nSOURCE:  81  FR  19819,  Apr.  5,  2016,  unless \n\notherwise noted. \n\nSubpart A—General \n\n§ 1.1 Overview. \n\nThis  part  describes  the  organization \nof  the  United  States  Department  of \nTransportation  and  provides  for  the \nperformance  of  duties  imposed  upon, \nand  the  exercise  of  powers  vested  in, \nthe  Secretary  of  Transportation  by \nlaw. \n\n§ 1.2 Organization of the Department. \n\n(a)  The  Secretary  of  Transportation', metadata={'source': 'docs/CFR-2022-title49-vol1.pdf'}), Document(page_content='SOURCE:  81  FR  19819,  Apr.  5,  2016,  unless \n\notherwise noted. \n\nSubpart A—General \n\n§ 1.1 Overview. \n\nThis  part  describes  the  organization \nof  the  United  States  Department  o

In [28]:
q = input("another quer")

KeyboardInterrupt: Interrupted by user

In [None]:
ans, meta = process_answer(q)

In [None]:
print(ans)

In [None]:
print(meta)