In [1]:
# installing through pip
! pip install -q cassio datasets langchain openai tiktoken


In [None]:
# installing PyPDF2
! pip install PyPDF2

In [21]:
# importing langchain components
import os
from dotenv import load_dotenv
from langchain.vectorstores.cassandra import Cassandra
from langchain.indexes.vectorstore import VectorStoreIndexWrapper
from langchain.llms import OpenAI
from langchain.embeddings import OpenAIEmbeddings

# dataset retrieval with HuggingFace
from datasets import load_dataset

# cassio powering the AstraDB to integrate with Langchain
# we will also initialize the db connection
import cassio

# importing PdfReader from PyPDF2
from PyPDF2 import PdfReader

import warnings
warnings.filterwarnings('ignore')

In [16]:
# Load variables via python-dotenv, then look the three secrets up in the
# process environment. A variable that is not set comes back as None.
load_dotenv()
openai_api_key = os.environ.get("OPENAI_API_KEY")
AstraDB_ID = os.environ.get("AstraDB_ID")
AstraDB_Application_Token = os.environ.get("AstraDB_Application_Token")

In [18]:
# Expose the secrets under the names the rest of the notebook uses.
# Three values are required:
#   1. AstraDB_Application_Token
#   2. AstraDB_ID
#   3. OPENAI_API_KEY
# The two AstraDB names are already bound by the cell that reads them with
# os.getenv, so only the upper-case alias for the OpenAI key is created here.
OPENAI_API_KEY = openai_api_key

In [12]:
# path of the PDF file to index, opened with PyPDF2's PdfReader
pdf_path = 'budget.pdf'
pdf_reader = PdfReader(pdf_path)

In [19]:
# reading text from PDF
from typing_extensions import Concatenate
# Collect the text of every page, skipping pages for which extract_text()
# returns nothing, and join once at the end instead of growing the string
# page by page.
raw_text = ''.join(
    page_text
    for page_text in (page.extract_text() for page in pdf_reader.pages)
    if page_text
)

In [None]:
# preview the beginning of the extracted text; showing the whole document
# would flood the cell output
raw_text[:1000]

In [23]:
# initialize the database connection through cassio with the AstraDB
# application token and database id
astra_credentials = {
    'token': AstraDB_Application_Token,
    'database_id': AstraDB_ID,
}
cassio.init(**astra_credentials)

In [25]:
# both OpenAI objects are built with the same API key
openai_settings = {'openai_api_key': OPENAI_API_KEY}
# the OpenAI() LLM object
llm = OpenAI(**openai_settings)
# the OpenAIEmbeddings() object used to create embeddings
embedding = OpenAIEmbeddings(**openai_settings)

In [36]:
# Create the Cassandra() vector store object, passing it the embedding.
# session and keyspace are given as None - presumably so that the connection
# set up by cassio.init is used (TODO confirm against the cassio docs).
cassandra_options = dict(
    table_name='qa_mini_demo',
    embedding=embedding,
    keyspace=None,
    session=None,
)
astra_vector_store = Cassandra(**cassandra_options)


In [30]:
# importing CharacterTextSplitter() and creating an object of it.
from langchain.text_splitter import CharacterTextSplitter
# splitter configuration: break on newlines, chunk_size 800, chunk_overlap 200,
# with lengths measured by the built-in len
splitter_settings = {
    'separator': '\n',
    'chunk_size': 800,
    'chunk_overlap': 200,
    'length_function': len,
}
text_splitter = CharacterTextSplitter(**splitter_settings)
# split the extracted PDF text into a list of chunks
texts = text_splitter.split_text(raw_text)

In [None]:
# preview the first ten chunks produced by the text splitter
texts[:10]

##### Load the text chunks into the vector store

In [38]:
# Insert (at most) the first 50 text chunks into the vector store.
# `astra_vector_store` is the Cassandra object created earlier
# (from langchain.vectorstores.cassandra import Cassandra).
chunks_to_insert = texts[:50]
astra_vector_store.add_texts(chunks_to_insert)
# the inserted items are chunks of the PDF text, so they are reported as such
print('Inserted %i text chunks.' % len(chunks_to_insert))

# wrap the vector store in a VectorStoreIndexWrapper() object for querying
astra_vector_index = VectorStoreIndexWrapper(vectorstore=astra_vector_store)

Inserted 8 headlines.


In [None]:
# Interactive question/answer loop.
# Each non-empty question is sent to the vector index together with the LLM,
# the answer is printed, and then the top 4 matching stored chunks are listed
# with their scores. Typing "quit" (in any letter case) ends the loop.
first_question = True
while True:
    if first_question:
        prompt = '\nEnter your question (or type "quit" to exit): '
    else:
        prompt = '\nWhat is your next question (or type "quit" to exit): '
    query_text = input(prompt).strip()

    if query_text.lower() == 'quit':
        break
    if query_text == '':
        continue

    # a real question has been asked, so later prompts use the
    # "next question" wording
    first_question = False

    print('\nQUESTION: \"%s\"' % query_text)
    answer = astra_vector_index.query(query_text, llm=llm).strip()
    print('\nANSWER: \"%s\"\n' % answer)

    print('FIRST DOCUMENTS BY RELEVANCE:')
    # similarity_search_with_score yields (document, score) pairs
    for doc, score in astra_vector_store.similarity_search_with_score(query_text, k=4):
        print("   [%0.4f] \"%s ...\"" % (score, doc.page_content[:84]))


In [None]:
# q1: What is the current GDP?
# q2: How much will the agriculture target be increased to, and what will the focus be?