# Querying a PDF with Astra DB and LangChain


- Requires a serverless Cassandra database with vector search (Astra DB).

- Get the database application token and the Database ID.

In [1]:
# LangChain components to use
from langchain.vectorstores.cassandra import Cassandra

# wrap all the vectors in a wrapper
from langchain.indexes.vectorstore import VectorStoreIndexWrapper 
from langchain.llms import OpenAI
from langchain.embeddings import OpenAIEmbeddings

# if we want a dataset from Hugging Face
# Support for dataset retrieval with Hugging Face
from datasets import load_dataset

# With CassIO, the engine powering the Astra DB integration in LangChain,
# you will also initialize the DB connection:
import cassio

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
from PyPDF2 import PdfReader

In [None]:
# Credentials are read from environment variables so that real secrets are
# never stored in (or committed with) the notebook. When a variable is not
# set, the original placeholder value is used, so the placeholders can still
# be edited by hand for a quick local run.
import os

# Astra DB application token (generate a token for your database)
ASTRA_DB_APPLICATION_TOKEN = os.environ.get("ASTRA_DB_APPLICATION_TOKEN", "YOUR_API_KEY")

# Astra DB Database ID
ASTRA_DB_ID = os.environ.get("ASTRA_DB_ID", "YOUR_API_KEY")

# OpenAI API key
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY", "YOUR_API_KEY")

In [4]:
# Open the source PDF; its pages are read in the next cell.
PDF_PATH = 'ai_engineer_roadmap_2025.pdf'
pdfreader = PdfReader(PDF_PATH)

In [5]:
# Extract the text of every PDF page and combine it into one string.
# NOTE(review): `Concatenate` is not referenced in any visible cell; it looks
# like an accidental auto-import and can probably be deleted — confirm first.
from typing_extensions import Concatenate

# Pages whose extracted text is empty/falsy are skipped. The pieces are joined
# in a single pass instead of growing the string with repeated `+=`.
page_texts = (page.extract_text() for page in pdfreader.pages)
raw_text = ''.join(content for content in page_texts if content)

In [6]:
# Preview only the beginning of the extracted text; displaying the whole
# document bloats the notebook output and buries the narrative.
raw_text[:1000]

' \n   \ncodebasics.io  \n \n1 \nAI Engineer Roadmap for Beginners  \nFollowing is the roadmap  to learning  AI Engineer  (also known as ML Engineer ) skills for a total \nbeginner. It includes FREE learning resources for technical skills (or tool skills) and soft (or core) skills  \n                       \nFind Your Suitability : Before you start your learning journey, it is important you find out if AI \nengineering  career really suits your natural abilities and interest. Take this test to know your \nsuitability : https://codebasics.io/survey/find -your-match -ds \nProceed further if results show that this career role is a match for you.  \nTotal Duration: 8 Months  (4 hours  of study every day, 6 days a week ) \nAlso, AI Engineer = Data Scientist + Software Engineer  \n \n \n \n \n \n \n   \ncodebasics.io  \n \n2 \n \nWeek 0: Do Proper Research and protect yourself from SCAMS.  \n \n Unfortunately, a lot of systematic scams are happening in ed tech, especially in the \ndata field

In [7]:
## Open the connection to the Astra DB database through CassIO

cassio.init(
    token=ASTRA_DB_APPLICATION_TOKEN,
    database_id=ASTRA_DB_ID,
)

In [8]:
# OpenAI clients, both authenticated with the same API key:
#   embedding -> handed to the vector store below
#   llm       -> handed to the index wrapper when answering questions
embedding = OpenAIEmbeddings(openai_api_key=OPENAI_API_KEY)
llm = OpenAI(openai_api_key=OPENAI_API_KEY)

  llm = OpenAI(openai_api_key=OPENAI_API_KEY)
  embedding = OpenAIEmbeddings(openai_api_key=OPENAI_API_KEY)


In [9]:
## LangChain vector store backed by the "qa_mini_demo" table
astra_vector_store = Cassandra(
    table_name="qa_mini_demo",
    embedding=embedding,
    # No explicit session/keyspace is supplied; presumably the connection
    # configured by cassio.init() is picked up instead — TODO confirm.
    keyspace=None,
    session=None,
)

In [10]:
# Split the raw text into overlapping chunks
from langchain.text_splitter import CharacterTextSplitter

# Chunk sizing, measured with len() (i.e. in characters).
CHUNK_SIZE = 800
CHUNK_OVERLAP = 200

# Splitting on newlines with a bounded chunk size keeps each piece small,
# so that a single chunk does not inflate the token count.
text_splitter = CharacterTextSplitter(
    separator="\n",
    chunk_size=CHUNK_SIZE,
    chunk_overlap=CHUNK_OVERLAP,
    length_function=len,
)
texts = text_splitter.split_text(raw_text)

In [12]:
# Preview the first few chunks (in document order, not ranked); showing
# dozens of 800-character chunks floods the notebook output.
texts[:5]

['codebasics.io  \n \n1 \nAI Engineer Roadmap for Beginners  \nFollowing is the roadmap  to learning  AI Engineer  (also known as ML Engineer ) skills for a total \nbeginner. It includes FREE learning resources for technical skills (or tool skills) and soft (or core) skills  \n                       \nFind Your Suitability : Before you start your learning journey, it is important you find out if AI \nengineering  career really suits your natural abilities and interest. Take this test to know your \nsuitability : https://codebasics.io/survey/find -your-match -ds \nProceed further if results show that this career role is a match for you.  \nTotal Duration: 8 Months  (4 hours  of study every day, 6 days a week ) \nAlso, AI Engineer = Data Scientist + Software Engineer',
 'Total Duration: 8 Months  (4 hours  of study every day, 6 days a week ) \nAlso, AI Engineer = Data Scientist + Software Engineer  \n \n \n \n \n \n \n   \ncodebasics.io  \n \n2 \n \nWeek 0: Do Proper Research and protect

In [13]:
## Load the text chunks into the vector store

# Upper bound on how many chunks are inserted.
MAX_CHUNKS = 50
chunks_to_insert = texts[:MAX_CHUNKS]

# NOTE(review): re-running this cell presumably inserts the same chunks a
# second time — confirm whether add_texts de-duplicates before re-executing.
astra_vector_store.add_texts(chunks_to_insert)

# The inserted items are chunks of PDF text, not headlines.
print("Inserted %i text chunks." % len(chunks_to_insert))

# Wrap the store so it can be queried with an LLM in the next cell.
astra_vector_index = VectorStoreIndexWrapper(vectorstore=astra_vector_store)


Inserted 31 headlines.


In [None]:
## Interactive question/answer loop against the vector database

first_question = True
while True:
    # The opening prompt is shown until a non-empty question has been asked;
    # after that the follow-up prompt is used.
    prompt = (
        "\nEnter your question (or type 'quit' to exit): "
        if first_question
        else "\nWhat's your next question (or type 'quit' to exit): "
    )
    query_text = input(prompt).strip()

    if query_text.lower() == "quit":
        break

    if not query_text:
        # Blank entry: ask again without switching prompts.
        continue

    first_question = False

    print("\nQUESTION: \"%s\"" % query_text)
    answer = astra_vector_index.query(query_text, llm=llm).strip()
    print("ANSWER: \"%s\"\n" % answer)

    print("FIRST DOCUMENTS BY RELEVANCE:")
    matches = astra_vector_store.similarity_search_with_score(query_text, k=4)
    for doc, score in matches:
        # Show only the first 84 characters of each match; raise the slice
        # bound to print longer excerpts.
        snippet = doc.page_content[:84]
        print("    [%0.4f] \"%s ...\"" % (score, snippet))


QUESTION: "How many total number of weeks i need to complete all the roadmap"
ANSWER: "It is not explicitly stated in the context how many total weeks are needed to complete the entire AI Engineer roadmap, as it may vary depending on the individual's pace and learning abilities. However, the total duration mentioned is 8 months, assuming a 6-day study week with 4 hours of study per day. This would roughly translate to 24 weeks. However, it is important to note that this duration is just a guideline and may vary for each individual. Additionally, the roadmap also suggests completing a few weeks of research before starting the actual learning journey. Therefore, the total number of weeks required may be more than 24."

FIRST DOCUMENTS BY RELEVANCE:
    [0.8970] "codebasics.io  
 
1 
AI Engineer Roadmap for Beginners  
Following is the roadmap  t ..."
    [0.8967] "Total Duration: 8 Months  (4 hours  of study every day, 6 days a week ) 
Also, AI En ..."
    [0.8876] "project using a Kagg