In [7]:
import yaml, os, openai
from langchain import PromptTemplate
from langchain.chains import LLMChain
from langchain.chains import RetrievalQA
from langchain.vectorstores import Chroma
from langchain.chat_models import ChatOpenAI
from langchain.document_loaders import TextLoader
from langchain.document_loaders import DirectoryLoader
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter

In [8]:
with open('cadentials.yaml') as f:
    credentials = yaml.load(f, Loader=yaml.FullLoader)

os.environ['OPENAI_API_KEY'] = credentials['OPENAI_API_KEY']
os.environ['HUGGINGFACEHUB_API_TOKEN'] = credentials['HUGGINGFACEHUB_API_TOKEN']
os.environ['ENGINE'] = credentials['ENGINE']

openai.api_key = credentials['OPENAI_API_KEY']
openai.api_base = credentials['OPENAI_API_BASE']
openai.api_type = credentials['OPENAI_API_TYPE']
openai.api_version = credentials['OPENAI_API_VERSION']
openai.engine = credentials['ENGINE']

In [9]:
loader = DirectoryLoader(
                        'data/new_articles/', 
                        loader_cls=TextLoader,
                        glob="./*.txt"
                        )

documents = loader.load()

#splitting the text into
text_splitter = RecursiveCharacterTextSplitter(
                                                chunk_size=1000, 
                                                chunk_overlap=200
                                                )
texts = text_splitter.split_documents(documents)

len(texts)

233

In [10]:
texts[3]

Document(page_content='Called ChatGPT Business, OpenAI describes the forthcoming offering as “for professionals who need more control over their data as well as enterprises seeking to manage their end users.”\n\n“ChatGPT Business will follow our API’s data usage policies, which means that end users’ data won’t be used to train our models by default,” OpenAI wrote in a blog post. “We plan to make ChatGPT Business available in the coming months.”\n\nApril 24, 2023\n\nOpenAI applied for a trademark for “GPT,” which stands for “Generative Pre-trained Transformer,” last December. Last month, the company petitioned the USPTO to speed up the process, citing the “myriad infringements and counterfeit apps” beginning to spring into existence.\n\nUnfortunately for OpenAI, its petition was dismissed last week. According to the agency, OpenAI’s attorneys neglected to pay an associated fee as well as provide “appropriate documentary evidence supporting the justification of special action.”', metadat

### create the DB

In [11]:
# !pip install chromadb

In [12]:
persist_directory = 'db/01'

embedding = HuggingFaceEmbeddings(
                                model_name="sentence-transformers/all-mpnet-base-v2",
                                model_kwargs={'device': 'mps'},
                                encode_kwargs={'normalize_embeddings': False}
                                )

vectordb = Chroma.from_documents(
                                documents=texts, 
                                embedding=embedding,
                                persist_directory=persist_directory
                                )

  warn("The installed version of bitsandbytes was compiled without GPU support. "


'NoneType' object has no attribute 'cadam32bit_grad_fp32'


In [13]:
# persiste the db to disk
vectordb.persist()
vectordb = None

In [14]:
vectordb = Chroma(
                persist_directory=persist_directory, 
                embedding_function=embedding
                )

### Make a retriever

In [15]:
vectordb = Chroma(
                persist_directory=persist_directory, 
                embedding_function=embedding
                )
retriever = vectordb.as_retriever()

In [16]:
docs = retriever.get_relevant_documents("How much money did Pando raise?")
docs

[Document(page_content='Signaling that investments in the supply chain sector remain robust, Pando, a startup developing fulfillment management technologies, today announced that it raised $30 million in a Series B round, bringing its total raised to $45 million.\n\nIron Pillar and Uncorrelated Ventures led the round, with participation from existing investors Nexus Venture Partners, Chiratae Ventures and Next47. CEO and founder Nitin Jayakrishnan says that the new capital will be put toward expanding Pando’s global sales, marketing and delivery capabilities.\n\n“We will not expand into new industries or adjacent product areas,” he told TechCrunch in an email interview. “Great talent is the foundation of the business — we will continue to augment our teams at all levels of the organization. Pando is also open to exploring strategic partnerships and acquisitions with this round of funding.”', metadata={'source': 'data/new_articles/05-03-ai-powered-supply-chain-startup-pando-lands-30m-in

In [17]:
retriever = vectordb.as_retriever(search_kwargs={"k": 2})
retriever.search_type, retriever.search_kwargs

('similarity', {'k': 2})

### Make a Chain

In [18]:
llm = ChatOpenAI(
                openai_api_key=os.environ["OPENAI_API_KEY"],
                engine = os.environ["ENGINE"],
                model='gpt-3.5-turbo',
                temperature=0.9, 
                max_tokens = 256
                )

qa_chain = RetrievalQA.from_chain_type(
                                        llm=llm, 
                                        chain_type="stuff", 
                                        retriever=retriever, 
                                        return_source_documents=True
                                        )

                    engine was transferred to model_kwargs.
                    Please confirm that engine is what you intended.


In [19]:
def process_llm_response(llm_response):
    print(llm_response['result'])
    print('\n\nSources:')
    for source in llm_response["source_documents"]:
        print(source.metadata['source'])

In [20]:
# full example
query = "How much money did Pando raise?"
llm_response = qa_chain(query)
process_llm_response(llm_response)

Pando raised $30 million in its Series B round, bringing its total raised to $45 million.


Sources:
data/new_articles/05-03-ai-powered-supply-chain-startup-pando-lands-30m-investment.txt
data/new_articles/05-03-ai-powered-supply-chain-startup-pando-lands-30m-investment.txt


In [21]:
query = "What did databricks acquire?"
llm_response = qa_chain(query)
process_llm_response(llm_response)

Databricks acquired Okera, a data governance platform with a focus on AI.


Sources:
data/new_articles/05-03-databricks-acquires-ai-centric-data-governance-platform-okera.txt
data/new_articles/05-03-databricks-acquires-ai-centric-data-governance-platform-okera.txt


In [22]:
print(qa_chain.combine_documents_chain.llm_chain.prompt.messages[0].prompt.template)

Use the following pieces of context to answer the users question. 
If you don't know the answer, just say that you don't know, don't try to make up an answer.
----------------
{context}


In [23]:
# To cleanup, you can delete the collection
vectordb.delete_collection()
vectordb.persist()