In [68]:
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
import os
from apikey import OPENAI_API_KEY, PINECONE_API_KEY, PINECONE_API_ENV 

In [69]:
# Publish the credentials imported from the local `apikey` module as process
# environment variables, so later cells can read them via os.environ.
os.environ.update({
    'OPENAI_API_KEY': OPENAI_API_KEY,
    'PINECONE_API_KEY': PINECONE_API_KEY,
    'PINECONE_API_ENV': PINECONE_API_ENV,
})

### Load your data

In [91]:
# loader = PyPDFLoader("/Users/mrityunjay/Desktop/Mrityunjay/Documents/Student Loan Documents/NYU_OfferLetter.pdf")
# data = loader.load()

In [92]:
# loader = PyPDFLoader("/Users/mrityunjay/Desktop/Mrityunjay/Documents/Research Paper/paper16.pdf")
# data = loader.load()

In [153]:
# Location of the PDF to index. Set the PDF_PATH environment variable to point the
# notebook at another file without editing this cell; when it is unset the original
# hard-coded path is used.
PDF_PATH = os.environ.get("PDF_PATH", "/Users/mrityunjay/Downloads/apmm-annual-report-2022.pdf")

loader = PyPDFLoader(PDF_PATH)
data = loader.load()

In [154]:
# PyPDFLoader already splits the PDF by page, so `data` holds one Document per page.
page_total = len(data)
print(f'You have {page_total} document(s) in your data')

# Index 1 is the second loaded page: the figure printed below is the length of
# that single page, not of the whole PDF.
second_page_text = data[1].page_content
print(f'There are {len(second_page_text)} characters in your document')

You have 144 document(s) in your data
There are 1638 characters in your document


### Chunk your data up into smaller documents

In [155]:
# Re-split the per-page Documents using a chunk size of 2000 and no overlap
# between consecutive chunks.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_overlap=0,
    chunk_size=2000,
)
texts = text_splitter.split_documents(data)

In [156]:
# Preview only the first two chunks; echoing the whole list floods the cell output.
texts[:2]

[Document(page_content='ALL THE WAY\nAnnual Report 2022\nA.P . Møller - Mærsk A/S \nEsplanaden 50, DK-1263 Copenhagen K ⁄ Registration no. 22756214', metadata={'source': '/Users/mrityunjay/Downloads/apmm-annual-report-2022.pdf', 'page': 0}),
 Document(page_content='Improving life for all  \nby integrating the worldOur Purpose\nAt A.P. Moller - Maersk, we strive to go all the way, \nevery day, to deliver a more connected, flexible and \nsustainable future for global logistics. \nWe aspire to provide truly integrated logistics. Across \noceans, ports, on land and in the air, we are combining \nour supply chain infrastructure with the power of our \npeople and technology to drive end-to-end innovation \nthat accelerates our customers’ success.\nA more integrated world improves quality of life and \nprosperity on all levels. It is our responsibility to ensure \na more sustainable tomorrow for coming generations. \nWe believe in an integrated world. One planet.  \nConnected all the way.By i

In [157]:
# Report how many chunks the splitter produced.
chunk_total = len(texts)
print(f'Now you have {chunk_total} documents')

Now you have 323 documents


### Create embeddings of your documents to get ready for semantic search

In [158]:
from langchain.vectorstores import Chroma, Pinecone
from langchain.embeddings.openai import OpenAIEmbeddings
import pinecone

In [159]:
# Read each setting from the environment; if a variable is not set, fall back to
# the placeholder given here (replace the placeholders with your own values).
OPENAI_API_KEY, PINECONE_API_KEY, PINECONE_API_ENV = (
    os.environ.get(variable, placeholder)
    for variable, placeholder in (
        ('OPENAI_API_KEY', 'YourAPIKey'),
        ('PINECONE_API_KEY', 'YourAPIKey'),
        ('PINECONE_API_ENV', 'us-east1-gcp'),
    )
)

In [160]:
# OpenAI embedding client, authenticated with the key resolved above; it is handed
# to the Pinecone vector store when the chunks are indexed.
embeddings = OpenAIEmbeddings(
    openai_api_key=OPENAI_API_KEY,
)

In [161]:
# Initialise the Pinecone client.
#   api_key     -> listed at app.pinecone.io
#   environment -> shown next to the API key in the console
pinecone.init(environment=PINECONE_API_ENV, api_key=PINECONE_API_KEY)

# Name of your Pinecone index; replace it with your own.
index_name = "langchain-test"

In [162]:
# Embed every chunk and store it in the Pinecone index. The chunk metadata (source
# path and page number) is sent along with the text so that each search hit can be
# traced back to the page it came from; previously only the raw text was stored.
# NOTE(review): re-running this cell, or running it for another PDF against the same
# index, presumably adds to what is already stored rather than replacing it — confirm,
# and clear the index between documents if answers should come from one PDF only.
docsearch = Pinecone.from_texts(
    [t.page_content for t in texts],
    embeddings,
    metadatas=[t.metadata for t in texts],
    index_name=index_name,
)

### Query those docs to get your answer back

In [166]:
from langchain.llms import OpenAI
from langchain.chains.question_answering import load_qa_chain

In [167]:
# Completion model for answering, created with temperature 0.
llm = OpenAI(
    openai_api_key=OPENAI_API_KEY,
    temperature=0,
)

# Question-answering chain of type "stuff", driven by the model above.
chain = load_qa_chain(llm, chain_type="stuff")

In [175]:
# function to ask questions and get answers

def chat(query, k=5):
    """Answer ``query`` using chunks retrieved from the vector store.

    Args:
        query: The question to ask, as plain text.
        k: How many of the most similar chunks to fetch from ``docsearch`` and
            hand to the QA chain. Defaults to 5, the value that used to be
            hard-coded.

    Returns:
        The value returned by ``chain.run`` for the retrieved chunks and ``query``.
    """
    # Each call is independent: only the chunks retrieved for this query are
    # passed to the chain, no earlier question or answer is included.
    docs = docsearch.similarity_search(query, k=k)
    return chain.run(input_documents=docs, question=query)

In [176]:
# Ask a question about the indexed document; the value returned by chat() is
# shown as the cell output.
question = "is the result for this year profitable?"
chat(question)

' Yes, the result for this year is profitable. The profit for the period was USD 4.981 billion.'

In [178]:
# Follow-up question. chat() receives only this string, so "last year" is resolved
# from whatever chunks are retrieved for it, not from the previous question or answer.
question = "how much more is the profit from last year"
chat(question)

' The profit before financial items (EBIT) increased to USD 814m (USD 623m) from last year, which is an increase of USD 191m.'

In [174]:
# NOTE(review): this asks about "this paper" although the PDF loaded in this notebook
# is an annual report, and the recorded answer lists paper authors. That suggests the
# index still holds chunks from a previously indexed PDF — confirm before relying on it.
chat("Who are the authors of this paper?")

' M. A. Bhanja, S. Chaudhary, and A. Jatain.'

In [116]:
# NOTE(review): execution count 116 is lower than that of the cell that loads the
# current PDF (153), so the recorded output appears to come from an earlier run
# against a different document — re-run after indexing the intended PDF.
ans = chat("What all machine learning models are used in this paper?")
print(ans)

 The paper uses feature selection followed by various machine learning models ranging from the most basic ones to the latest and most advanced ones, including a radial basis function, a multilayer perceptron, an adaptive neuro-fuzzy classifier with linguistic hedges, a rotation forest classifier, a back propagation neural network, a decision tree, and an XGBoost classifier.


In [117]:
# NOTE(review): execution count 117 is lower than that of the cell that loads the
# current PDF (153); the recorded answer appears to describe a different document.
chat("Will this paper be useful for my analysis on capitalism?")

' No, this paper does not discuss capitalism. It discusses feature selection and implementation of various machine learning models.'

In [152]:
# NOTE(review): execution count 152 is lower than that of the cell that loads the
# current PDF (153); the recorded summary describes a research paper rather than the
# annual report loaded in this notebook — re-run to refresh.
chat("What is this paper, Give a summary?")

' This paper is about Recent Trends in Communication and Intelligent Systems, Algorithms for Intelligent Systems. It discusses the correlation between individual features and target variables, and how this can be used to carry out Parkinson Disease classification.'