In [1]:
# LangChain components to use
from langchain.vectorstores.cassandra import Cassandra
from langchain.indexes.vectorstore import VectorStoreIndexWrapper
from langchain.llms import OpenAI
from langchain.embeddings import OpenAIEmbeddings

# Support for dataset retrieval with Hugging Face
from datasets import load_dataset

# With CassIO, the engine powering the Astra DB integration in LangChain,
# you will also initialize the DB connection:
import cassio

from PyPDF2 import PdfReader
from dotenv import load_dotenv

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
import os

from dotenv import load_dotenv, find_dotenv

# Read the local .env file first so the credentials below can be supplied by it.
_ = load_dotenv(find_dotenv())

# Credentials come from the environment (or the .env file loaded above) rather than
# being pasted into the notebook, so they never end up in a saved or shared copy.
#   ASTRA_DB_APPLICATION_TOKEN: the "AstraCS:..." string found in your Token JSON file
#   ASTRA_DB_ID: your Database ID
try:
    ASTRA_DB_APPLICATION_TOKEN = os.environ["ASTRA_DB_APPLICATION_TOKEN"]
    ASTRA_DB_ID = os.environ["ASTRA_DB_ID"]
except KeyError as missing:
    # Fail here with an actionable message instead of later inside the DB driver.
    raise RuntimeError(
        "Missing environment variable %s; set ASTRA_DB_APPLICATION_TOKEN and "
        "ASTRA_DB_ID in your environment or in a local .env file." % missing
    ) from missing

In [3]:
# Open the PDF to be indexed; replace the path with your own PDF file.
pdfreader = PdfReader('aeer.2104024.pdf')

In [4]:
from typing_extensions import Concatenate  # NOTE(review): not referenced in the visible cells
# Extract the text of every page of the PDF and concatenate it into one string.
pdfreader = PdfReader('aeer.2104024.pdf')
page_texts = (page.extract_text() for page in pdfreader.pages)
# Pages for which extract_text() yields nothing (None or '') are skipped.
raw_text = ''.join(text for text in page_texts if text)

In [5]:
# Display the text extracted from the PDF.
# NOTE(review): this echoes the whole document; a slice such as raw_text[:500] would keep the output short.
raw_text

'See discussions, st ats, and author pr ofiles f or this public ation at : https://www .researchgate.ne t/public ation/355234799\nOil Spill Detection Using Satellite Imagery\nArticle  · August 2021\nDOI: 10.21926/ aeer .2104024\nCITATIONS\n4READS\n2,827\n3 author s, including:\nSome o f the author s of this public ation ar e also w orking on these r elat ed pr ojects:\nGoogle Earth Engine Applic ations  View pr oject\nEstimation of the L eaf Ar ea Inde x (LAI), soil moist ure, and dr yness using the r ed-NIR spectr al sp ace View pr oject\nMeisam Amani\nWSP Envir onment & Infr astruct ure Canada Limit ed\n122 PUBLICA TIONS \xa0\xa0\xa02,594  CITATIONS \xa0\xa0\xa0\nSEE PROFILE\nAll c ontent f ollo wing this p age was uplo aded b y Meisam Amani  on 20 Oct ober 2021.\nThe user has r equest ed enhanc ement of the do wnlo aded file. © 2021 by the author. This is an open access article distributed under the \nconditions of the Creative Commons by Attribution License , which permits \nunrest

Initialize the connection to your database:

_(Do not worry if you see a few warnings; the drivers are just chatty about negotiating protocol versions with the DB.)_

In [6]:
# Initialize CassIO with the Astra DB application token and database ID defined earlier.
cassio.init(token=ASTRA_DB_APPLICATION_TOKEN, database_id=ASTRA_DB_ID)

Create the LangChain embedding and LLM objects for later usage:

In [7]:
# Both objects are built with their default settings.
# NOTE(review): presumably they pick up the OpenAI API key from the environment (e.g. the .env file loaded earlier) — confirm.
llm = OpenAI()  # LLM passed to the index when answering questions
embedding = OpenAIEmbeddings()  # embedding model handed to the vector store

Create your LangChain vector store ... backed by Astra DB!

In [8]:
# Vector store that embeds texts with `embedding` and keeps them in the "qa_mini_demo" table.
# NOTE(review): session and keyspace are passed as None — presumably so the connection
# set up by cassio.init() is used; confirm against the Cassandra vector-store docs.
astra_vector_store = Cassandra(
    embedding=embedding,
    table_name="qa_mini_demo",
    session=None,
    keyspace=None,
)

In [9]:
from langchain.text_splitter import CharacterTextSplitter
# Split the raw text into overlapping chunks so that each piece stays small
# (the intent is to keep every chunk within the model's token budget).
text_splitter = CharacterTextSplitter(
    separator = "\n",  # split on line breaks
    chunk_size = 800,  # measured with length_function below, i.e. in characters
    chunk_overlap  = 200,  # overlap between consecutive chunks
    length_function = len,
)
texts = text_splitter.split_text(raw_text)

In [10]:
# Number of chunks produced by the splitter.
len(texts)

29

In [11]:
# Inspect the first chunk.
texts[0]

'See discussions, st ats, and author pr ofiles f or this public ation at : https://www .researchgate.ne t/public ation/355234799\nOil Spill Detection Using Satellite Imagery\nArticle  · August 2021\nDOI: 10.21926/ aeer .2104024\nCITATIONS\n4READS\n2,827\n3 author s, including:\nSome o f the author s of this public ation ar e also w orking on these r elat ed pr ojects:\nGoogle Earth Engine Applic ations  View pr oject\nEstimation of the L eaf Ar ea Inde x (LAI), soil moist ure, and dr yness using the r ed-NIR spectr al sp ace View pr oject\nMeisam Amani\nWSP Envir onment & Infr astruct ure Canada Limit ed\n122 PUBLICA TIONS \xa0\xa0\xa02,594  CITATIONS \xa0\xa0\xa0\nSEE PROFILE\nAll c ontent f ollo wing this p age was uplo aded b y Meisam Amani  on 20 Oct ober 2021.'

### Load the text chunks into the vector store



In [12]:

# Embed and store every chunk. The chunk count is not hard-coded, so a PDF that
# yields more chunks is not silently truncated.
astra_vector_store.add_texts(texts)

# The message text is kept unchanged so it still matches the recorded output.
print("Inserted %i headlines." % len(texts))

# Wrap the store in an index object that exposes the query() helper used for Q&A.
astra_vector_index = VectorStoreIndexWrapper(vectorstore=astra_vector_store)

Inserted 29 headlines.


In [13]:
# Interactive question/answer loop: keeps prompting until the user types 'quit'.
# For each question it prints the LLM answer and the best-matching stored chunks.
first_question = True
while True:
    # The wording of the prompt changes once a question has been asked.
    prompt = (
        "\nEnter your question (or type 'quit' to exit): "
        if first_question
        else "\nWhat's your next question (or type 'quit' to exit): "
    )
    query_text = input(prompt).strip()

    if query_text.lower() == "quit":
        break
    if not query_text:
        # Blank input: ask again without counting it as a question.
        continue

    first_question = False

    print("\nQUESTION: \"%s\"" % query_text)
    answer = astra_vector_index.query(query_text, llm=llm).strip()
    print("ANSWER: \"%s\"\n" % answer)

    print("FIRST DOCUMENTS BY RELEVANCE:")
    hits = astra_vector_store.similarity_search_with_score(query_text, k=3)
    for doc, score in hits:
        # Show the score and the first 84 characters of each matching chunk.
        print("    [%0.4f] \"%s ...\"" % (score, doc.page_content[:84]))


QUESTION: "What this Doc token about?"
ANSWER: "This document is about a research paper titled "Oil Spill Detection Using Satellite Imagery" and the authors of the paper are Amber Bonnington, Meisam Amani, and Hamid Ebrahimy."

FIRST DOCUMENTS BY RELEVANCE:
    [0.8651] "The a uthors have declared tha t no competing interests exist.  
References  
1. Tea ..."
    [0.8638] "WSP Envir onment & Infr astruct ure Canada Limit ed
122 PUBLICA TIONS    2,594  CITA ..."
    [0.8626] "J Remote Sens. 2021; 2021: 1 -13. 
15. Bhushan B. Bioinspired oil –water separation  ..."
    [0.8622] "See discussions, st ats, and author pr ofiles f or this public ation at : https://ww ..."

QUESTION: "How to detect an oil ?"
ANSWER: "Oil spills can be detected using laser fluorosensors, SAR systems, and optical satellite data. Visual and statistical methods can be used for change detection from optical remote sensing data."

FIRST DOCUMENTS BY RELEVANCE:
    [0.9195] "information on the rate and direction of 