In [None]:
# Install the required libraries (run once, in the notebook's own environment):
# %pip install -q cassio datasets langchain openai tiktoken
# %pip install -q "langchain-astradb>=0.0.1"   # quoted so the shell does not treat ">=" as a redirect

import langchain

# Import the following from langchain
from langchain.vectorstores.cassandra import Cassandra
from langchain.indexes.vectorstore import VectorStoreIndexWrapper # All vectors will be wrapped as one package
from langchain.llms import OpenAI
from langchain.embeddings import OpenAIEmbeddings # We will use Open AI embeddings to convert text into Vectors...

# Dataset retrieval using HuggingFace
from datasets import load_dataset

# CassIO integrates Astra DB with LangChain
import cassio


import warnings
# Install a blanket "ignore" filter: with no category or message given, every
# warning raised after this line is suppressed for the rest of the session.
# NOTE(review): presumably meant to keep cell output quiet; it also hides
# deprecation notices from the libraries imported here — consider narrowing it.
warnings.filterwarnings("ignore")


import os
from langchain_astradb import AstraDBVectorStore
from langchain_core.documents import Document

In [None]:
# pip install PyPDF2
from PyPDF2 import PdfReader # This will read the document...

#### Set up the Astra database

In [None]:
import os

from astrapy.db import AstraDB

def env_secret(env_var, placeholder):
    """Return the credential stored in the environment variable ``env_var``.

    Args:
        env_var: Name of the environment variable to read.
        placeholder: Value returned when the variable is unset or empty.

    Returns:
        The variable's value, or ``placeholder`` if it is missing or empty.
    """
    return os.environ.get(env_var) or placeholder


# Credentials are read from the environment so that real secrets never have to
# be typed into (and saved with) the notebook. Export ASTRA_DB_APPLICATION_TOKEN,
# ASTRA_DB_ID and OPENAI_API_KEY before starting the kernel. The masked
# placeholders below are only a fallback that keeps the names defined; do not
# replace them with real keys.
ASTRA_DB_APPLICATION_TOKEN = env_secret("ASTRA_DB_APPLICATION_TOKEN", "********************")
ASTRA_DB_ID = env_secret("ASTRA_DB_ID", "************************")
OPEN_API_KEY = env_secret("OPENAI_API_KEY", "*********************************************")

In [None]:
# Open the source document; its pages are walked in a later cell.
pdf_path = "Gita.pdf"
pdf = PdfReader(pdf_path)

In [None]:
# Saving the entire pdf as a raw_text
from typing_extensions import Concatenate

# Collect the text of every page into one string.
# - Pages for which extract_text() returns nothing are skipped.
# - Pages are joined with a newline so the last line of one page does not fuse
#   with the first line of the next; the splitter used later breaks on "\n".
# - A single join replaces repeated "+=" on a growing string, and the result
#   no longer starts with a stray space.
page_texts = [page.extract_text() for page in pdf.pages]
raw_text = "\n".join(page_text for page_text in page_texts if page_text)

In [None]:
# Initialise CassIO with the Astra application token and database id defined
# in the setup cell above.
cassio.init(
    token=ASTRA_DB_APPLICATION_TOKEN,
    database_id=ASTRA_DB_ID,
)

In [None]:
# Build the two OpenAI-backed LangChain objects from the same API key:
#   embeddings - converts text into vectors for the vector store
#   llm        - the model handed to the index when a question is asked
openai_credentials = {"openai_api_key": OPEN_API_KEY}

embeddings = OpenAIEmbeddings(**openai_credentials)
llm = OpenAI(**openai_credentials)

In [None]:
# LangChain vector store that keeps the embedded chunks in the "qa_mini_demo"
# table. session and keyspace are passed as None; presumably the store then
# reuses the connection registered by cassio.init() — confirm against the
# LangChain Cassandra documentation.
astra_vector_store = Cassandra(
    embedding=embeddings,
    table_name="qa_mini_demo",
    session=None,
    keyspace=None,
)

In [None]:
from langchain.text_splitter import CharacterTextSplitter
# Splitter that breaks the text on newlines. Sizes are counted in characters
# because length_function is len: chunk_size is 1500 and chunk_overlap is 200.
text_splitter = CharacterTextSplitter(
    separator="\n",
    chunk_size=1500,
    chunk_overlap=200,
    length_function=len,
)

# Cut the whole document into the list of chunks that will be embedded.
text = text_splitter.split_text(raw_text)

In [None]:
# Store the chunks in the Cassandra / Astra DB table and wrap the store in an
# index that can answer questions.
import hashlib

# Derive each row id from the chunk's content. Without explicit ids the store
# assigns a fresh random id per text, so every re-run of this cell would insert
# a second copy of the whole document and searches would return duplicates.
# With content-derived ids a re-run overwrites the same rows instead.
chunk_ids = [
    hashlib.sha256(chunk.encode("utf-8", errors="replace")).hexdigest()
    for chunk in text
]
astra_vector_store.add_texts(text, ids=chunk_ids)
astra_vector_index = VectorStoreIndexWrapper(vectorstore=astra_vector_store)

In [None]:
# Interactive question loop: keep prompting until the user types "quit".
# Each question is answered through the index with the LLM, then the four
# best-matching chunks are printed with their similarity scores.
asked_before = False
while True:
    # The wording of the prompt changes once a question has been asked.
    prompt = (
        "\nWhat's your next question (or type 'quit' to exit): "
        if asked_before
        else "\nEnter your question (or type 'quit' to exit): "
    )
    query_text = input(prompt).strip()

    if query_text.lower() == "quit":
        break
    if not query_text:
        # Blank input: ask again without counting it as a question.
        continue

    asked_before = True

    print("\nQUESTION: \"%s\"" % query_text)
    answer = astra_vector_index.query(query_text, llm=llm).strip()
    print("ANSWER: \"%s\"\n" % answer)

    print("FIRST DOCUMENTS BY RELEVANCE:")
    ranked_hits = astra_vector_store.similarity_search_with_score(query_text, k=4)
    for hit, relevance in ranked_hits:
        # Show at most the first 1000 characters of each retrieved chunk.
        print("    [%0.4f] \"%s ...\"" % (relevance, hit.page_content[:1000]))