In [68]:
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
import os
from apikey import OPENAI_API_KEY, PINECONE_API_KEY, PINECONE_API_ENV 

In [69]:
# Copy the credentials imported from the local `apikey` module into the process environment
os.environ.update({
    'OPENAI_API_KEY': OPENAI_API_KEY,
    'PINECONE_API_KEY': PINECONE_API_KEY,
    'PINECONE_API_ENV': PINECONE_API_ENV,
})

### Load your data

In [153]:
# Location of the PDF to load. Set the REPORT_PDF_PATH environment variable to point at a
# different file; the default is the path this notebook was originally run with.
PDF_PATH = os.environ.get("REPORT_PDF_PATH", "/Users/mrityunjay/Downloads/report2022.pdf")
loader = PyPDFLoader(PDF_PATH)
data = loader.load()

In [154]:
# PDFLoader splits the doc into pages by default, so `data` holds one entry per page
print (f'There are {len(data)} document(s) in your data')
# The character count below is for a single sample entry, not the whole PDF: the second
# entry when there is one, otherwise the first. Indexing data[1] unconditionally raised
# IndexError for PDFs that load as fewer than two entries.
if data:
    sample_page = data[min(1, len(data) - 1)]
    print (f'There are {len(sample_page.page_content)} characters in your document')

There are 144 document(s) in your data
There are 1638 characters in your document


### Chunk your data up into smaller documents

In [155]:
# Splitter configured with chunk_size=2000 and no overlap between consecutive chunks
text_splitter = RecursiveCharacterTextSplitter(
    chunk_overlap=0,
    chunk_size=2000,
)
# Apply it to every loaded page to obtain the smaller documents used from here on
texts = text_splitter.split_documents(data)

In [157]:
# Report how many documents the splitter produced
chunk_total = len(texts)
print (f'Now you have {chunk_total} documents')

Now you have 323 documents


### Create embeddings of your documents to get ready for semantic search

In [158]:
from langchain.vectorstores import Chroma, Pinecone
from langchain.embeddings.openai import OpenAIEmbeddings
import pinecone

In [159]:
# Prefer the values found in the environment variables; when a variable is unset,
# fall back to the placeholder listed next to its name below
OPENAI_API_KEY, PINECONE_API_KEY, PINECONE_API_ENV = (
    os.environ.get(variable, placeholder)
    for variable, placeholder in (
        ('OPENAI_API_KEY', 'YourAPIKey'),
        ('PINECONE_API_KEY', 'YourAPIKey'),
        ('PINECONE_API_ENV', 'us-east1-gcp'),
    )
)

In [160]:
# Embeddings client, constructed with the OpenAI key resolved in the previous cell
embeddings = OpenAIEmbeddings(
    openai_api_key=OPENAI_API_KEY,
)

In [161]:
# Name of the Pinecone index to use; put the name of your own index here
index_name = "langchain-test"

# Initialize pinecone: the API key is found at app.pinecone.io and the environment
# is shown next to the API key in the console
pinecone.init(environment=PINECONE_API_ENV, api_key=PINECONE_API_KEY)

In [162]:
# Build the Pinecone-backed search object from the text of every chunk (only
# `page_content` is passed on; the chunks' other attributes are not)
chunk_texts = [chunk.page_content for chunk in texts]
docsearch = Pinecone.from_texts(chunk_texts, embeddings, index_name=index_name)

### Query those docs to get your answer back

In [166]:
from langchain.llms import OpenAI
from langchain.chains.question_answering import load_qa_chain

In [167]:
# OpenAI LLM created with temperature=0, then wrapped in a question-answering
# chain of type "stuff"
llm = OpenAI(
    openai_api_key=OPENAI_API_KEY,
    temperature=0,
)
chain = load_qa_chain(llm, chain_type="stuff")

In [175]:
# function to ask questions and get answers

def chat(query, k=5):
    """Answer a question from the documents most similar to it.

    Args:
        query: The question to ask; it is used both for the similarity
            search and as the question handed to the chain.
        k: Number of documents requested from ``docsearch.similarity_search``
            and passed on to the chain. Defaults to 5, the value that was
            previously hard-coded.

    Returns:
        Whatever ``chain.run`` returns for the retrieved documents and the
        question (a string in this notebook's recorded outputs).
    """
    # Uses the notebook-level `docsearch` and `chain` objects created in earlier cells.
    docs = docsearch.similarity_search(query, k=k)
    return chain.run(input_documents=docs, question=query)

In [176]:
# Ask whether the year's result is a profit; the bare last line displays the answer
question = "is the result for this year profitable?"
answer = chat(question)
answer

' Yes, the result for this year is profitable. The profit for the period was USD 4.981 billion.'

In [179]:
# Follow-up question comparing this year's profit with last year's
question = "how much more is the actual profit from last year, not EBIDTA?"
answer = chat(question)
answer

' The actual profit from last year (2021) was USD 18,033 million, which is USD 29,321 million in 2022, an increase of USD 11,288 million.'

In [174]:
# NOTE(review): this cell's execution count is lower than the cells above it and the
# recorded answer describes a "paper" — presumably it was run against a different PDF; re-run to confirm
authors_question = "Who are the authors of this paper?"
chat(authors_question)

' M. A. Bhanja, S. Chaudhary, and A. Jatain.'

In [116]:
# NOTE(review): execution count predates the PDF-loading cell above, so the printed
# answer presumably comes from an earlier document — re-run to confirm
models_question = "What all machine learning models are used in this paper?"
ans = chat(models_question)
print(ans)

 The paper uses feature selection followed by various machine learning models ranging from the most basic ones to the latest and most advanced ones, including a radial basis function, a multilayer perceptron, an adaptive neuro-fuzzy classifier with linguistic hedges, a rotation forest classifier, a back propagation neural network, a decision tree, and an XGBoost classifier.


In [117]:
# Relevance check: ask whether the loaded document covers an unrelated topic
relevance_question = "Will this paper be useful for my analysis on capitalism?"
chat(relevance_question)

' No, this paper does not discuss capitalism. It discusses feature selection and implementation of various machine learning models.'

In [152]:
# Ask for a summary of the loaded document
summary_question = "What is this paper, Give a summary?"
chat(summary_question)

' This paper is about Recent Trends in Communication and Intelligent Systems, Algorithms for Intelligent Systems. It discusses the correlation between individual features and target variables, and how this can be used to carry out Parkinson Disease classification.'