In [None]:
import openai
import langchain
import pinecone
from langchain.document_loaders import PyPDFDirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores import Pinecone
from langchain.llms import OpenAI

In [None]:
# Called before any cell reads credentials: later cells look up
# OPENAI_API_KEY and PINECONE_API_KEY in os.environ.
# NOTE(review): presumably this populates them from a local .env file — confirm one exists.
from dotenv import load_dotenv
load_dotenv()

In [None]:
import os

In [None]:
## reading documents from a directory
def read_doc(directory):
    """Load documents from ``directory`` using ``PyPDFDirectoryLoader``.

    Args:
        directory: Path handed to ``PyPDFDirectoryLoader``.

    Returns:
        Whatever the loader's ``load()`` call returns.
    """
    return PyPDFDirectoryLoader(directory).load()

In [None]:
# Load everything found under the local documents/ folder.
doc = read_doc(directory='documents/')

In [None]:
# Preview only the first few loaded documents; displaying the whole list
# floods the cell output (the next cell reports the total count).
doc[:3]

In [None]:
# Number of items returned by read_doc.
len(doc)

In [None]:
## dividing into chunks
def chunk_data(docs, chunk_size = 800, chunk_overlap = 50):
    """Split documents into smaller chunks.

    Args:
        docs: Documents to split; passed to the splitter's ``split_documents``.
        chunk_size: ``chunk_size`` given to ``RecursiveCharacterTextSplitter``.
        chunk_overlap: ``chunk_overlap`` given to ``RecursiveCharacterTextSplitter``.

    Returns:
        The result of ``split_documents(docs)`` — the chunks, not the
        unsplit input.
    """
    text_splitter = RecursiveCharacterTextSplitter(chunk_size = chunk_size, chunk_overlap = chunk_overlap)
    # Return the splitter's output. Returning ``docs`` here would hand back
    # the original, unsplit documents and silently discard the chunking.
    chunks = text_splitter.split_documents(docs)
    return chunks


In [None]:
documents = chunk_data(docs = doc)
# Preview a few entries only; displaying the whole list floods the cell
# output (the next cell reports the total count).
documents[:3]


In [None]:
# Number of items returned by chunk_data.
len(documents)

In [None]:
#Embedding technique of OPENAI
# The key is read from the environment; a missing OPENAI_API_KEY raises KeyError here.
embeddings = OpenAIEmbeddings(api_key = os.environ['OPENAI_API_KEY'])
# Show only the class name: the object's full repr may include the API key,
# which would then be saved into the notebook's output.
type(embeddings).__name__

In [None]:
# Embed a sample query and check the length of the returned vector.
# NOTE(review): the Pinecone index in the next cell is created with
# dimension=1536 — confirm this length matches.
vectors = embeddings.embed_query('how are you?')
len(vectors)

In [None]:
# NOTE(review): this import rebinds the name `Pinecone`, which the first cell
# imported from langchain.vectorstores; from here on `Pinecone` is the client class.
from pinecone import Pinecone, ServerlessSpec

pc = Pinecone(
    api_key=os.environ.get("PINECONE_API_KEY")
)

# Create the index only when it is not already listed. The check uses
# `index_name` (not a repeated literal) so the name lives in one place.
index_name = 'langchainvector'
if index_name not in pc.list_indexes().names():
    pc.create_index(
        name=index_name,
        # NOTE(review): presumably must equal the embedding vector length — confirm.
        dimension=1536,
        # NOTE(review): the retrieval cell's comment says "Cosine Similarity",
        # but the index is created with a euclidean metric — confirm which is intended.
        metric='euclidean',
        spec=ServerlessSpec(
            cloud='aws',
            region='us-west-2'
        )
    )

In [None]:
# Imported under an alias because `Pinecone` now refers to the client class
# from the pinecone package.
from langchain.vectorstores import Pinecone as PineconeStore
# Index the chunks produced by chunk_data (`documents`), not the unsplit
# `doc` list, so the chunk_size / chunk_overlap settings take effect.
# NOTE(review): re-running this cell presumably uploads the same chunks again — verify.
index = PineconeStore.from_documents(documents, embeddings, index_name = index_name)

In [None]:
# Similarity search against the vector DB (the metric is whatever the
# Pinecone index was created with).
def retrieve_query(query, k = 2):
    """Look up the stored entries that best match ``query``.

    Args:
        query: Query text passed to ``index.similarity_search``.
        k: Number of results requested from the search (default 2).

    Returns:
        Whatever ``index.similarity_search(query, k = k)`` returns.
    """
    # Forward the caller's ``k``; a hard-coded 2 here would make the
    # parameter a no-op.
    matching_results = index.similarity_search(query, k = k)
    return matching_results

In [None]:
from langchain.chains.question_answering import load_qa_chain
from langchain import OpenAI

In [None]:
# Completion model plus a 'stuff'-type QA chain built on top of it.
llm = OpenAI(
    model_name = 'davinci-002',
    temperature = 0.5,
)
chain = load_qa_chain(llm, chain_type = 'stuff')

In [None]:
# Answer a question using matches pulled from the vector DB
def retrieve_answers(query):
    """Run the QA chain over the search results for ``query``.

    Fetches matches via ``retrieve_query``, prints them, then passes them
    together with the question to ``chain.run``.

    Args:
        query: The question; used both for retrieval and as the chain's
            ``question`` argument.

    Returns:
        The value returned by ``chain.run``.
    """
    matches = retrieve_query(query)
    print(matches)
    return chain.run(input_documents = matches, question = query)

In [None]:
# NOTE(review): 'our_query' is a literal placeholder string — replace it with a real question.
our_query = 'our_query'
# retrieve_answers also prints the retrieved matches before returning.
answer = retrieve_answers(our_query)
print(answer)