# Document Chat with LangChain & PineCone

## Setup

In [42]:
!pip install openai langchain unstructured tiktoken gradio chromadb pinecone-client ipywidgets

Collecting ipywidgets
  Downloading ipywidgets-8.1.1-py3-none-any.whl.metadata (2.4 kB)
Collecting widgetsnbextension~=4.0.9 (from ipywidgets)
  Downloading widgetsnbextension-4.0.9-py3-none-any.whl.metadata (1.6 kB)
Collecting jupyterlab-widgets~=3.0.9 (from ipywidgets)
  Downloading jupyterlab_widgets-3.0.9-py3-none-any.whl.metadata (4.1 kB)
Downloading ipywidgets-8.1.1-py3-none-any.whl (139 kB)
   ---------------------------------------- 0.0/139.4 kB ? eta -:--:--
   -------------------- ------------------- 71.7/139.4 kB 2.0 MB/s eta 0:00:01
   -------------------- ------------------- 71.7/139.4 kB 2.0 MB/s eta 0:00:01
   -------------------- ------------------- 71.7/139.4 kB 2.0 MB/s eta 0:00:01
   -------------------- ------------------- 71.7/139.4 kB 2.0 MB/s eta 0:00:01
   -------------------- ------------------- 71.7/139.4 kB 2.0 MB/s eta 0:00:01
   -------------------- ------------------- 71.7/139.4 kB 2.0 MB/s eta 0:00:01
   -------------------- ------------------- 71.7/139.4

In [4]:
import os
from dotenv import load_dotenv
from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores import Pinecone, Chroma
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.chains import ConversationalRetrievalChain
from langchain.chat_models import ChatOpenAI

In [5]:
load_dotenv()

os.environ["OPENAI_API_KEY"] = os.getenv("OPENAI_API_KEY")
PINECONE_API_KEY = str(os.getenv("PINECONE_API_KEY"))
PINECONE_ENV = str(os.getenv("PINECONE_ENV_KEY"))

## LangChain Document Loader

In [49]:
from langchain.document_loaders import DirectoryLoader

txt_loader = DirectoryLoader(r'.\transcripts', glob="**/*.txt")

loaders = [txt_loader]

documents = []
for loader in loaders:
    documents.extend(loader.load())

print (f'You have {len(documents)} document(s) in your data')
print (f'There are {len(documents[153].page_content)} characters in your document')

You have 206 document(s) in your data
There are 97 characters in your document


In [57]:
documents[19]

Document(page_content="basically you've got the same sort of conversation chain.", metadata={'source': 'transcripts\\115.txt'})

### Split the Text from the documents

In [58]:
text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=40) #chunk overlap seems to work better
documents = text_splitter.split_documents(documents)

In [59]:
print(len(documents))

199


In [61]:
documents[121]

Document(page_content='about bankruptcy on YouTube.', metadata={'source': 'transcripts\\26.txt'})

## PineCone

### Set up embeddings

In [62]:
embeddings = OpenAIEmbeddings()

### Using pinecone for storing vectors

In [63]:
import pinecone
from tqdm.autonotebook import tqdm

In [64]:
pinecone.init(
    api_key=PINECONE_API_KEY,  # find at app.pinecone.io
    environment=PINECONE_ENV  # next to api key in console
)
vectorstore = Pinecone.from_documents(documents, embeddings, index_name="mia")

In [66]:
query = "Name an energy drink"
docs = vectorstore.similarity_search(query)
print(docs)
print(len(docs))
print(docs[0].page_content)

[Document(page_content="is excessive caffeine. This isn't just like one energy drink.", metadata={'source': 'transcripts\\68.txt'}), Document(page_content="is excessive caffeine. This isn't just like one energy drink.", metadata={'source': 'transcripts\\68.txt'}), Document(page_content='Let drink bang energy just because of the fucking super creatine claims. What are you talking about?', metadata={'source': 'transcripts\\71.txt'}), Document(page_content='Let drink bang energy just because of the fucking super creatine claims. What are you talking about?', metadata={'source': 'transcripts\\71.txt'})]
4
is excessive caffeine. This isn't just like one energy drink.


## Chaining with Chat History

In [74]:
from langchain.llms import OpenAI

In [80]:
retriever = vectorstore.as_retriever(search_type="similarity", search_kwargs={"k":3})
qa = ConversationalRetrievalChain.from_llm(OpenAI(temperature=0.9), retriever)
chat_history = []

In [81]:
query = "What was the energy drink I heard recently?"
result = qa({"question": query, "chat_history": chat_history})
chat_history.append((query, result["answer"]))
result["answer"]

' Bang Energy'

In [82]:
query = "Who is its owner"
result = qa({"question": query, "chat_history": chat_history})
chat_history.append((query, result["answer"]))
result["answer"]

' The owner of Bang Energy is the absolute unhinged CEO of BANG!.'

In [83]:
query = "Yeah but whats his name"
result = qa({"question": query, "chat_history": chat_history})
chat_history.append((query, result["answer"]))
result["answer"]

" I don't know."

## Chatbot

In [72]:
from IPython.display import display
import ipywidgets as widgets

In [73]:
chat_history = []

def on_submit(_):
    query = input_box.value
    input_box.value = ""
    
    if query.lower() == 'exit':
        print("Thanks for the chat!")
        return
    
    result = qa({"question": query, "chat_history": chat_history})
    chat_history.append((query, result['answer']))
    
    display(widgets.HTML(f'<b>User:</b> {query}'))
    display(widgets.HTML(f'<b><font color="Red">Chatbot:</font></b> {result["answer"]}'))

print("Chat with your data. Type 'exit' to stop")

input_box = widgets.Text(placeholder='Please enter your question:')
input_box.on_submit(on_submit)

display(input_box)

Chat with your data. Type 'exit' to stop


  input_box.on_submit(on_submit)


Text(value='', placeholder='Please enter your question:')

HTML(value="<b>User:</b> Yoo what's up")

HTML(value='<b><font color="Red">Chatbot:</font></b>  Hi! I\'m good. How about you?')

HTML(value="<b>User:</b> what's bang's ceo's name?")

HTML(value='<b><font color="Red">Chatbot:</font></b>  I don\'t know.')

HTML(value="<b>User:</b> don't they have a lawsuit?")

HTML(value='<b><font color="Red">Chatbot:</font></b>  It is not clear if Bang has a lawsuit.')

Thanks for the chat!
