In [1]:
from langchain.document_loaders import DirectoryLoader, UnstructuredHTMLLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
import os
from dotenv import load_dotenv

In [2]:
# Pull variables from a local .env file into the process environment.
load_dotenv()

# Fail early with a readable message when the OpenAI key is absent. The previous
# self-assignment (os.environ[k] = os.getenv(k)) changed nothing when the key was
# present and raised an opaque TypeError when it was missing.
if not os.getenv("OPENAI_API_KEY"):
    raise RuntimeError(
        "OPENAI_API_KEY is not set; add it to your .env file or export it "
        "before running this notebook."
    )

In [3]:
# Loader over the "2023-07-26/" directory (path is relative to the working directory).
# NOTE(review): UnstructuredHTMLLoader is imported at the top but never passed
# here (e.g. as loader_cls) — confirm whether the default loader is intended.
loader = DirectoryLoader("2023-07-26/")

In [4]:
# Read everything the directory loader finds into memory as a list of documents.
documents = loader.load()

In [5]:
# Sanity check: how many documents were loaded.
len(documents)

12715

In [6]:
# Break the loaded documents into small, slightly overlapping chunks so each
# one can be embedded and retrieved on its own.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=512,
    chunk_overlap=10,
)

# One entry per chunk; this is what gets embedded later on.
texts = text_splitter.split_documents(documents)

In [7]:
# Sanity check: how many chunks the splitter produced.
len(texts)

111640

In [8]:
from langchain.vectorstores import Chroma
from langchain.embeddings import OpenAIEmbeddings
from langchain.embeddings import HuggingFaceEmbeddings, SentenceTransformerEmbeddings

# Where the Chroma index is persisted on disk (relative to the working directory).
persist_directory = "./storage"
# embeddings = OpenAIEmbeddings()
embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")

# Build the index only once. Calling Chroma.from_documents against a
# persist_directory that already holds the collection embeds every chunk again
# and adds it as a new entry, so each re-run of this cell would both repeat the
# expensive embedding pass and duplicate the stored chunks.
# To force a rebuild (e.g. after the source documents change), delete the
# persist_directory and re-run this cell.
if os.path.isdir(persist_directory) and os.listdir(persist_directory):
    # Reopen the store written by a previous run; the same embedding model must
    # be supplied so queries are embedded consistently with the stored vectors.
    vectordb = Chroma(persist_directory=persist_directory,
                      embedding_function=embeddings)
else:
    vectordb = Chroma.from_documents(documents=texts,
                                     embedding=embeddings,
                                     persist_directory=persist_directory)
    # Flush the freshly built index to disk.
    vectordb.persist()

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

In [9]:
from langchain.chat_models import ChatOpenAI
from langchain.chains import RetrievalQA

# Retriever view over the vector store; k=5 is forwarded as a search argument.
retriever = vectordb.as_retriever(
    search_kwargs={"k": 5},
)

# Chat model that generates the final answer.
llm = ChatOpenAI(model_name='gpt-3.5-turbo')

# Question-answering chain wiring the retriever and the chat model together.
qa = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=retriever,
)

In [12]:
# Question to put to the retrieval-QA chain.
user_input = 'How much was the salary increase ? '

# The question is wrapped behind a "###Prompt " marker before being sent.
# NOTE(review): this whole string is what the chain receives as the query —
# confirm the marker is wanted, since it is presumably also used for retrieval.
query = f"###Prompt {user_input}"

# Run the chain and print only the "result" entry of its response.
llm_response = qa(query)
print(llm_response["result"])

The salary increase is not specified in the given context.
