# Chat with any documents using langchain

## Setup

In [1]:
!pip install openai langchain unstructured tiktoken gradio chromadb pinecone-client ndg-httpsclient pyopenssl pyasn1



In [2]:
import os
from dotenv import main
from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores import Pinecone, Chroma
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.chains import ConversationalRetrievalChain
from langchain.chat_models import ChatOpenAI

In [3]:
main.load_dotenv()

OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
os.environ["OPENAI_API_KEY"] = OPENAI_API_KEY

PINECONE_API_KEY = str(os.getenv("PINECONE_API_KEY"))
PINECONE_ENV = str(os.getenv("PINECONE_ENV_KEY"))

In [4]:
OPENAI_API_KEY

'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'

[LangChain Document Loader](https://python.langchain.com/en/latest/modules/indexes/document_loaders.html)

In [5]:
from langchain.document_loaders import DirectoryLoader

txt_loader = DirectoryLoader(r'.\transcripts', glob="**/*.txt")

In [6]:
#take all the loader
loaders = [txt_loader]

#lets create document 
documents = []
for loader in loaders:
    documents.extend(loader.load())

In [7]:
print (f'You have {len(documents)} document(s) in your data')
print (f'There are {len(documents[153].page_content)} characters in your document')

You have 206 document(s) in your data
There are 97 characters in your document


In [8]:
documents[153]

Document(page_content="You don't want to know. All of this is just easily faked and easily manufactured and manipulated.", metadata={'source': 'transcripts\\51.txt'})

## Split the Text from the documents

In [9]:
text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=40) #chunk overlap seems to work better
documents = text_splitter.split_documents(documents)
print(len(documents))

199


In [10]:
documents[153]

Document(page_content='People that drink bang are stupid and fat and no other way.', metadata={'source': 'transcripts\\57.txt'})

In [11]:
documents[151]

Document(page_content="It's interesting every fucking drink ever has this in it. It's not like bang on", metadata={'source': 'transcripts\\55.txt'})

## Embeddings and storing it in Vectorestore

In [12]:
OPENAI_API_KEY

'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'

In [13]:
embeddings = OpenAIEmbeddings(openai_api_key=OPENAI_API_KEY)

### Using pinecone for storing vectors

- [Pinecone langchain doc](https://python.langchain.com/en/latest/modules/indexes/vectorstores/examples/pinecone.html?highlight=pinecone#pinecone
)
- What is [vectorstore](https://www.pinecone.io/learn/vector-database/)
- Get your pinecone api key and env -> https://app.pinecone.io/

In [None]:
import pinecone 

# initialize pinecone
pinecone.init(
    api_key=PINECONE_API_KEY,  # find at app.pinecone.io
    environment=PINECONE_ENV  # next to api key in console
)

index_name = "mia"

vectorstore = Pinecone.from_documents(documents, embeddings, index_name=index_name)

In [14]:
# if you already have an index, you can load it like this
import pinecone
from tqdm.autonotebook import tqdm

# initialize pinecone
pinecone.init(
    api_key=PINECONE_API_KEY,  # find at app.pinecone.io
    environment=PINECONE_ENV  # next to api key in console
)

index_name = "mia"
vectorstore = Pinecone.from_existing_index(index_name, embeddings)

  from tqdm.autonotebook import tqdm


#### We had 23 documents so there are 23 vectors being created in Pinecone.

In [15]:
query = "Name an energy drink"
docs = vectorstore.similarity_search(query)

AuthenticationError: Incorrect API key provided: sk-45tko***************************************b7OO. You can find your API key at https://platform.openai.com/account/api-keys.

In [16]:
len(docs) #it went on and search on the 4 different vectors to find the similarity

NameError: name 'docs' is not defined

In [None]:
print(docs[0].page_content)

In [None]:
print(docs[1].page_content)

## Now the langchain part (Chaining with Chat History) --> With One line of Code (Fantastic)
- There are many chains but we use this [link](https://python.langchain.com/en/latest/modules/chains/index_examples/chat_vector_db.html)

In [None]:
from langchain.llms import OpenAI

In [None]:
retriever = vectorstore.as_retriever(search_type="similarity", search_kwargs={"k":2})
qa = ConversationalRetrievalChain.from_llm(OpenAI(temperature=0), retriever)

In [None]:
chat_history = []
query = "How much is spent for training the gpt4all model?"
result = qa({"question": query, "chat_history": chat_history})
result["answer"]

In [None]:
chat_history.append((query, result["answer"]))
chat_history

In [None]:
query = "What is this number multiplied by 2?"
result = qa({"question": query, "chat_history": chat_history})
result["answer"]

## Create a chatbot with memory with simple widgets

In [None]:
from IPython.display import display
import ipywidgets as widgets

In [None]:
chat_history = []

def on_submit(_):
    query = input_box.value
    input_box.value = ""
    
    if query.lower() == 'exit':
        print("Thanks for the chat!")
        return
    
    result = qa({"question": query, "chat_history": chat_history})
    chat_history.append((query, result['answer']))
    
    display(widgets.HTML(f'<b>User:</b> {query}'))
    display(widgets.HTML(f'<b><font color="Orange">Chatbot:</font></b> {result["answer"]}'))

print("Chat with your data. Type 'exit' to stop")

input_box = widgets.Text(placeholder='Please enter your question:')
input_box.on_submit(on_submit)

display(input_box)

### Gradio sample example

In [None]:
import gradio as gr
import random

with gr.Blocks() as demo:
    chatbot = gr.Chatbot()
    msg = gr.Textbox()
    clear = gr.Button("Clear")

    def respond(message, chat_history):
        print(message)
        print(chat_history)
        bot_message = random.choice(["How are you?", "I love you", "I'm very hungry"])
        chat_history.append((message, bot_message))
        print(chat_history)
        return "", chat_history

    msg.submit(respond, [msg, chatbot], [msg, chatbot])
    clear.click(lambda: None, None, chatbot, queue=False)

demo.launch(debug=True, share=True)

### Gradio langchain example

In [None]:
import gradio as gr
with gr.Blocks() as demo:
    chatbot = gr.Chatbot()
    msg = gr.Textbox()
    clear = gr.Button("Clear")
    
    def respond(user_message, chat_history):
        print(user_message)
        print(chat_history)
        # Get response from QA chain
        response = qa({"question": user_message, "chat_history": chat_history})
        # Append user message and response to chat history
        chat_history.append((user_message, response["answer"]))
        print(chat_history)
        return "", chat_history

    msg.submit(respond, [msg, chatbot], [msg, chatbot], queue=False)
    clear.click(lambda: None, None, chatbot, queue=False)

demo.launch(debug=True, share=True)