In [31]:
from langchain.document_loaders import PyPDFLoader,Docx2txtLoader
from pinecone import Pinecone, ServerlessSpec
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_pinecone import PineconeVectorStore
from langchain.chains import RetrievalQA
from langchain.vectorstores import Chroma
from dotenv import load_dotenv
import tiktoken
import os

In [4]:
load_dotenv()

True

# Load the Document

In [5]:
def load_document(doc):
    """Load a PDF or DOCX file with the matching LangChain loader.

    Args:
        doc: Path to the file. The loader is chosen from the file extension,
            compared case-insensitively ('.pdf' or '.docx').

    Returns:
        Whatever the selected loader's ``load()`` returns, or ``None`` when
        the extension is not supported (a message is printed in that case).
    """
    _, extension = os.path.splitext(doc)

    # Lower-case the extension so 'REPORT.PDF' or 'notes.Docx' are accepted
    # instead of being reported as an unsupported format.
    loaders = {'.pdf': PyPDFLoader, '.docx': Docx2txtLoader}
    loader_cls = loaders.get(extension.lower())

    if loader_cls is None:
        print("Document Format is not Supported")
        return None

    print("Loading: ", doc)
    return loader_cls(doc).load()

In [7]:
def chunk_data(doc, chunk_size=260, chunk_overlap=0):
    """Split loaded documents into chunks.

    Args:
        doc: Documents to split; handed to ``split_documents`` unchanged.
        chunk_size: ``chunk_size`` passed to RecursiveCharacterTextSplitter.
        chunk_overlap: ``chunk_overlap`` passed to RecursiveCharacterTextSplitter.

    Returns:
        The chunks produced by the splitter.
    """
    return RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    ).split_documents(doc)

In [9]:
def print_embedding_cost(texts, model='text-embedding-3-small', cost_per_million_tokens=0.02):
    """Print the token count and estimated embedding cost for ``texts``.

    Args:
        texts: Iterable of objects exposing a ``page_content`` string
            (e.g. the chunks returned by ``chunk_data``).
        model: Model name used to select the tiktoken encoding.
        cost_per_million_tokens: Price in USD per one million tokens. The
            default (0.02) is the value that used to be hard-coded here;
            check it against current pricing for the chosen model.

    Returns:
        None. The totals are printed.
    """
    # Uses the notebook-level `import tiktoken`; no function-local import needed.
    try:
        enc = tiktoken.encoding_for_model(model)
    except KeyError:
        # tiktoken could not map this model name to an encoding; fall back to
        # cl100k_base rather than failing the cost estimate.
        enc = tiktoken.get_encoding('cl100k_base')

    total_tokens = sum(len(enc.encode(page.page_content)) for page in texts)
    cost = (total_tokens / 1_000_000) * cost_per_million_tokens

    print("Total Tokens:", total_tokens)
    print(f"Embedding Cost in USD: ${cost:.6f}")

# Embedding and uploading vectors into DB (Pinecone)

In [16]:
def insert_or_create_embeddings(index_name, chunks):
    """Return a Pinecone-backed vector store for ``index_name``.

    If the index does not exist, it is created (serverless, AWS us-east-1,
    cosine metric, 1536 dimensions), the function waits until Pinecone
    reports it ready, and ``chunks`` are embedded and uploaded.

    If the index already exists, it is opened as-is and ``chunks`` is NOT
    uploaded.

    Args:
        index_name: Name of the Pinecone index to create or open.
        chunks: Documents to embed and upload when the index is created.

    Returns:
        The ``PineconeVectorStore`` for the index.

    Raises:
        ValueError: If PINECONE_API_KEY or OPENAI_API_KEY is not set.
    """
    import time  # local import: only needed while polling for index readiness

    pinecone_api_key = os.getenv("PINECONE_API_KEY")
    openai_api_key = os.getenv("OPENAI_API_KEY")

    if not pinecone_api_key or not openai_api_key:
        raise ValueError("Missing API keys. Set PINECONE_API_KEY and OPENAI_API_KEY.")

    # Deliberately do NOT print the keys: notebook outputs are saved with the
    # file, so printing a secret leaks it to anyone the notebook is shared
    # with. Any key that was ever printed should be rotated.

    # Must match between the embedding model and the index definition.
    embedding_dimensions = 1536

    # Initialize Pinecone client (v3 SDK)
    pc = Pinecone(api_key=pinecone_api_key)

    # Initialize OpenAI embeddings
    embeddings = OpenAIEmbeddings(
        api_key=openai_api_key,
        model='text-embedding-3-small',
        dimensions=embedding_dimensions
    )

    if index_name in pc.list_indexes().names():
        print(f"{index_name} index already exists. Loading embeddings...")
        vector_store = PineconeVectorStore.from_existing_index(
            index_name=index_name,
            embedding=embeddings
        )
        print("Done")
        return vector_store

    print(f"Creating index '{index_name}'...")
    pc.create_index(
        name=index_name,
        dimension=embedding_dimensions,
        metric="cosine",
        spec=ServerlessSpec(cloud="aws", region="us-east-1")
    )

    print(f"Waiting for index '{index_name}' to be ready...")
    while not pc.describe_index(index_name).status['ready']:
        # Sleep between polls instead of spinning: a bare `pass` pins a CPU
        # core and floods the Pinecone API with describe_index calls.
        time.sleep(1)

    # Use PineconeVectorStore from langchain-pinecone
    print(f"Uploading embeddings to index '{index_name}'...")
    vector_store = PineconeVectorStore.from_documents(
        documents=chunks,
        embedding=embeddings,
        index_name=index_name
    )
    print("Done.")
    return vector_store

In [24]:
def delete_index_pinecone(index_name='all'):
    """Delete one Pinecone index, or every index after confirmation.

    Args:
        index_name: Name of the index to delete. With the default ``'all'``
            the user is prompted on stdin, and only the exact reply ``'Y'``
            deletes every index listed by the client; any other reply prints
            a hint and deletes nothing.
    """
    client = Pinecone(api_key=os.getenv("PINECONE_API_KEY"))

    # Single named index: delete it straight away, no confirmation.
    if index_name != 'all':
        print(f"Deleting {index_name} index.........")
        client.delete_index(index_name)
        print("Deleted!")
        return

    confirmation = input("Are you sure? [Y/N]")
    if confirmation != 'Y':
        print("Give index name you wanted to delete")
        return

    print("Deleting all the indexes.........")
    for existing_name in client.list_indexes().names():
        client.delete_index(existing_name)
    print("Deleted!..")

In [25]:
def ask_and_get_answer(vector_store, quest):
    """Answer ``quest`` with a RetrievalQA chain built on ``vector_store``.

    A gpt-3.5-turbo chat model (temperature 0.7) is combined with a
    similarity retriever (k=5) in a 'stuff' chain.

    Args:
        vector_store: Object exposing ``as_retriever``.
        quest: The question passed to the chain's ``invoke``.

    Returns:
        Whatever the chain's ``invoke`` returns.
    """
    chat_model = ChatOpenAI(model='gpt-3.5-turbo', temperature=0.7)
    top_k_retriever = vector_store.as_retriever(
        search_type='similarity',
        search_kwargs={'k': 5},
    )
    qa_chain = RetrievalQA.from_chain_type(
        llm=chat_model,
        chain_type='stuff',
        retriever=top_k_retriever,
    )
    return qa_chain.invoke(quest)

In [41]:
def create_embeddings_chroma(chunks, model='text-embedding-3-small', persist_directory='./chroma_db'):
    """Embed ``chunks`` and build a Chroma vector store from them.

    Args:
        chunks: Documents passed to ``Chroma.from_documents``.
        model: OpenAI embedding model name. For 'text-embedding-3-small'
            the embeddings are requested with ``dimensions=1536``; any other
            model is created without an explicit ``dimensions`` argument.
        persist_directory: Passed to Chroma as ``persist_directory``.

    Returns:
        The vector store returned by ``Chroma.from_documents``.
    """
    embedding_kwargs = {'model': model}
    if model == 'text-embedding-3-small':
        embedding_kwargs['dimensions'] = 1536
    embedder = OpenAIEmbeddings(**embedding_kwargs)

    return Chroma.from_documents(chunks, embedder, persist_directory=persist_directory)

In [42]:
def load_embeddings_chroma(model='text-embedding-3-small', persist_directory='./chroma_db'):
    """Open a Chroma vector store from ``persist_directory``.

    Args:
        model: OpenAI embedding model name. For 'text-embedding-3-small'
            the embeddings are requested with ``dimensions=1536``; any other
            model is created without an explicit ``dimensions`` argument.
        persist_directory: Passed to Chroma as ``persist_directory``.

    Returns:
        The ``Chroma`` instance constructed with that embedding function.
    """
    embedder = (
        OpenAIEmbeddings(model=model, dimensions=1536)
        if model == 'text-embedding-3-small'
        else OpenAIEmbeddings(model=model)
    )
    return Chroma(embedding_function=embedder, persist_directory=persist_directory)

In [19]:
# WARNING: destructive. With 'all', delete_index_pinecone asks for a "Y"
# confirmation on stdin and then deletes every index the Pinecone client lists.
delete_index_pinecone('all')

Deleting all the indexes.........
Deleted!..


## Loading the document

In [None]:
# Load the source PDF.
# NOTE(review): absolute, machine-specific Windows path; point it at your own
# copy of the file before running this cell.
data = load_document(r"C:\Users\navin\Downloads\CONSTITUTION.pdf")

## Chunking The Document

In [21]:
# Split the loaded document with chunk_data's default arguments and show how
# many chunks were produced.
chunks = chunk_data(data)
print(len(chunks))

199


## Embedding Cost

In [22]:
# Print the token count and estimated embedding cost for the chunks.
print_embedding_cost(chunks)

Total Tokens: 9391
Embedding Cost in USD: $0.000188


## Create and Store Embeddings (Pinecone)

In [23]:
# Create the index and upload the chunks, or, if an index with this name
# already exists, just open it (in that case the chunks are not uploaded).
index_name= "akadocument"
vector_store= insert_or_create_embeddings(index_name=index_name, chunks=chunks)

Pinecone API Key: [REDACTED]
OpenAI API Key: [REDACTED]
Creating index 'akadocument'...
Waiting for index 'akadocument' to be ready...
Uploading embeddings to index 'akadocument'...
Done.


## Ask the Question and Get the Answer

In [27]:
# Ask for a summary of the document; the result dict is printed as returned.
question="what is the whole document about, in brief?"
answer= ask_and_get_answer(vector_store=vector_store,quest=question)
print(answer)

{'query': 'what is the whole document about, in brief?', 'result': 'The document in question appears to be discussing the Indian Constitution. It outlines the purposes and functions of the Preamble of the Indian Constitution, as well as provisions related to rights, freedoms, government nature, and various aspects like agriculture, animal husbandry, and environmental protection.'}


In [28]:
# Ask a factual question about the document.
question="When the constitution came in existence?"
answer= ask_and_get_answer(vector_store=vector_store,quest=question)
print(answer)

{'query': 'When the constitution came in existence?', 'result': 'The Indian Constitution came into existence on 26th November 1949 when it was adopted and enacted by the Constituent Assembly. It officially came into effect on 26th January 1950.'}


In [30]:
# Probe question.
# NOTE(review): presumably meant to check how the chain responds to something
# the document does not cover; confirm the intent.
question="When will constitution get vanished from india?"
answer= ask_and_get_answer(vector_store=vector_store,quest=question)
print(answer)

{'query': 'When will constitution get vanished from india?', 'result': 'Constitutions do not typically "get vanished" from countries. They are fundamental documents that outline the framework of a country\'s government, laws, and rights. The Indian Constitution is a foundational document that has been in place since it was adopted on January 26, 1950. There are processes in place for amending the constitution, but it is not something that would simply vanish.'}


# Chroma DB as a Retriever

In [34]:
# Load a second document; this rebinds `data`, replacing the previously loaded one.
# NOTE(review): absolute, machine-specific Windows path; adjust before running.
data = load_document(r"C:\Users\navin\Downloads\Resume DS.pdf")

Loading:  C:\Users\navin\Downloads\Resume DS.pdf


In [37]:
# Re-chunk the new document with explicit size/overlap (rebinds `chunks`)
# and show the chunk count.
chunks= chunk_data(data, chunk_size=256, chunk_overlap=50)
print(len(chunks))

31


In [43]:
# Rebind `vector_store` to a Chroma store built from the new chunks, using the
# default persist_directory ('./chroma_db').
# NOTE(review): the model here differs from load_embeddings_chroma's default;
# presumably the same model must be passed when reopening this store. Confirm.
vector_store= create_embeddings_chroma(chunks=chunks, model='text-embedding-ada-002')

In [44]:
# Same QA helper as before, now running against the Chroma-backed store.
q= "what kind of document is this?, and whom it belongs?"
answer= ask_and_get_answer(vector_store=vector_store, quest=q)
print(answer)

{'query': 'what kind of document is this?, and whom it belongs?', 'result': 'This document appears to be a resume or curriculum vitae (CV) belonging to an individual with a Bachelor of Technology in Mechanical Engineering from Shivaji University, Kolhapur. It highlights skills in programming languages, web frameworks, data processing, analysis, and visualization, as well as experience in collaborating with analysts, data scientists, and stakeholders to translate business requirements into technical solutions.'}


In [45]:
# Each ask_and_get_answer call builds a fresh chain, so this question is
# answered without any memory of the previous one.
q= "Is this person compatible to work with AI?, or Is he lagging anywhere that he can tend to learn?"
answer= ask_and_get_answer(vector_store=vector_store, quest=q)
print(answer)

{'query': 'Is this person compatible to work with AI?, or Is he lagging anywhere that he can tend to learn?', 'result': "Based on the provided information, this individual seems well-equipped to work with AI. They have experience and expertise in technologies like Agentic AI, generative models, machine learning algorithms, and data engineering. They have also completed relevant certificates in Generative AI and Data Science. However, without more specific details about their practical experience with AI projects or specific areas they may need improvement in, it's challenging to give a comprehensive assessment."}


# Adding Memory (Chat History)

In [47]:
from langchain.chains import ConversationalRetrievalChain
from langchain.memory import ConversationBufferMemory

# Chat model used by the conversational chain.
llm= ChatOpenAI(model='gpt-3.5-turbo',temperature=0.7)
# Similarity retriever over the current `vector_store`, requesting k=6 results.
retriever= vector_store.as_retriever(search_type='similarity', search_kwargs={'k': 6})
# Memory object that exposes the conversation under the 'chat_history' key
# as message objects (return_messages=True).
memory= ConversationBufferMemory(memory_key='chat_history',return_messages=True)

# Conversational retrieval chain wired to the model, retriever and memory above.
# verbose=True makes the chain print its intermediate prompts (see cell outputs).
crc= ConversationalRetrievalChain.from_llm(
    llm=llm,
    retriever=retriever,
    memory=memory,
    chain_type='stuff',
    verbose= True
)

In [48]:
def ask_question(q, chain):
    """Send question ``q`` to ``chain`` and return the chain's result.

    Args:
        q: The question; passed to the chain under the 'question' key.
        chain: Object exposing ``invoke`` that accepts a dict input.

    Returns:
        Whatever ``chain.invoke`` returns.
    """
    payload = {'question': q}
    return chain.invoke(payload)

In [49]:
# First turn of the conversation with the memory-backed chain; prints the
# full result returned by the chain.
q="How is the person technology wise?"
result= ask_question(q, crc)

print(result)



[1m> Entering new StuffDocumentsChain chain...[0m


[1m> Entering new LLMChain chain...[0m
Prompt after formatting:
[32;1m[1;3mSystem: Use the following pieces of context to answer the user's question.
If you don't know the answer, just say that you don't know, don't try to make up an answer.
----------------
Education
Bachelor of Technology·Mechanical Engineering·
Shivaji University, Kolhapur
8.0 CGPA·May 2015 - Jun 2018
Skills
Programming Languages
PythonSQL
Web Frameworks
FlaskFastAPIDjango
Data Processing, Analysis and Visualization

technologies, including Agentic AI, to advance data-centric roles in AI and machine learning.
Professional Experience
Senior Engineer·Cloud4C Services Pvt. Ltd. Hyderabad·Jan 2023 - Jul 2025

and comprehensive reporting.
Collaborated with analysts, data scientists, and stakeholders to translate business requirements into robust technical
solutions.

Developed interactive dashboards and visual reports in Power BI to effectively communicate insig

In [51]:
# Show only the answer text from the last result.
print(result['answer'])

The person is skilled in programming languages like Python and SQL, as well as web frameworks such as Flask, FastAPI, and Django. They also have experience in data processing, analysis, and visualization. Additionally, they have a certificate in Generative AI and have completed a Post Graduate Program in Data Science.


In [52]:
# Second turn: "this person" relies on the chat history kept in the chain's memory.
q="Is this person compatible to do Data Science or AI Engineer Job Role?, if yes why?"
result= ask_question(q, crc)

print(result)



[1m> Entering new LLMChain chain...[0m
Prompt after formatting:
[32;1m[1;3mGiven the following conversation and a follow up question, rephrase the follow up question to be a standalone question, in its original language.

Chat History:

Human: How is the person technology wise?
Assistant: The person is skilled in programming languages like Python and SQL, as well as web frameworks such as Flask, FastAPI, and Django. They also have experience in data processing, analysis, and visualization. Additionally, they have a certificate in Generative AI and have completed a Post Graduate Program in Data Science.
Follow Up Input: Is this person compatible to do Data Science or AI Engineer Job Role?, if yes why?
Standalone question:[0m

[1m> Finished chain.[0m


[1m> Entering new StuffDocumentsChain chain...[0m


[1m> Entering new LLMChain chain...[0m
Prompt after formatting:
[32;1m[1;3mSystem: Use the following pieces of context to answer the user's question.
If you don't know the 

In [53]:
# Show only the answer text from the last result.
print(result['answer'])

Yes, based on the provided information, this person is well-equipped to perform a job in Data Science or as an AI Engineer. They have specialized skills in statistical modeling, machine learning, and generative AI, which are essential for roles in data science and AI engineering. Their proficiency in Python, SQL, Kafka, and RabbitMQ, along with expertise in supervised and unsupervised learning, model evaluation, and data preprocessing, makes them suitable for these roles. Additionally, their experience in exploring emerging technologies like Agentic AI and generative models demonstrates a commitment to staying updated with advancements in the field, making them a strong candidate for roles in data science and AI engineering.


In [54]:
# Ask for a count of technologies; the next cell refers back to this number.
q="Tell me the technologies he comfortable working it in number like [python,sql,ml] makes 3"
result= ask_question(q, crc)

print(result['answer'])



[1m> Entering new LLMChain chain...[0m
Prompt after formatting:
[32;1m[1;3mGiven the following conversation and a follow up question, rephrase the follow up question to be a standalone question, in its original language.

Chat History:

Human: How is the person technology wise?
Assistant: The person is skilled in programming languages like Python and SQL, as well as web frameworks such as Flask, FastAPI, and Django. They also have experience in data processing, analysis, and visualization. Additionally, they have a certificate in Generative AI and have completed a Post Graduate Program in Data Science.
Human: Is this person compatible to do Data Science or AI Engineer Job Role?, if yes why?
Assistant: Yes, based on the provided information, this person is well-equipped to perform a job in Data Science or as an AI Engineer. They have specialized skills in statistical modeling, machine learning, and generative AI, which are essential for roles in data science and AI engineering. Th

In [56]:
# Follow-up that only makes sense with memory: "number you told me" refers to
# the previous answer.
q="multiply proficient technologies number you told me with 2 and tell me the answer"
result= ask_question(q, crc)

print(result['answer'])



[1m> Entering new LLMChain chain...[0m
Prompt after formatting:
[32;1m[1;3mGiven the following conversation and a follow up question, rephrase the follow up question to be a standalone question, in its original language.

Chat History:

Human: How is the person technology wise?
Assistant: The person is skilled in programming languages like Python and SQL, as well as web frameworks such as Flask, FastAPI, and Django. They also have experience in data processing, analysis, and visualization. Additionally, they have a certificate in Generative AI and have completed a Post Graduate Program in Data Science.
Human: Is this person compatible to do Data Science or AI Engineer Job Role?, if yes why?
Assistant: Yes, based on the provided information, this person is well-equipped to perform a job in Data Science or as an AI Engineer. They have specialized skills in statistical modeling, machine learning, and generative AI, which are essential for roles in data science and AI engineering. Th

In [57]:
# Ask for a number derived from the document; the next cell builds on it.
q="How many characters are there in person name?"
result= ask_question(q, crc)

print(result['answer'])



[1m> Entering new LLMChain chain...[0m
Prompt after formatting:
[32;1m[1;3mGiven the following conversation and a follow up question, rephrase the follow up question to be a standalone question, in its original language.

Chat History:

Human: How is the person technology wise?
Assistant: The person is skilled in programming languages like Python and SQL, as well as web frameworks such as Flask, FastAPI, and Django. They also have experience in data processing, analysis, and visualization. Additionally, they have a certificate in Generative AI and have completed a Post Graduate Program in Data Science.
Human: Is this person compatible to do Data Science or AI Engineer Job Role?, if yes why?
Assistant: Yes, based on the provided information, this person is well-equipped to perform a job in Data Science or as an AI Engineer. They have specialized skills in statistical modeling, machine learning, and generative AI, which are essential for roles in data science and AI engineering. Th

In [58]:
# Follow-up that depends on the previous answer being available in chat history.
q="multiply the number of characters with 2"
result= ask_question(q, crc)

print(result['answer'])



[1m> Entering new LLMChain chain...[0m
Prompt after formatting:
[32;1m[1;3mGiven the following conversation and a follow up question, rephrase the follow up question to be a standalone question, in its original language.

Chat History:

Human: How is the person technology wise?
Assistant: The person is skilled in programming languages like Python and SQL, as well as web frameworks such as Flask, FastAPI, and Django. They also have experience in data processing, analysis, and visualization. Additionally, they have a certificate in Generative AI and have completed a Post Graduate Program in Data Science.
Human: Is this person compatible to do Data Science or AI Engineer Job Role?, if yes why?
Assistant: Yes, based on the provided information, this person is well-equipped to perform a job in Data Science or as an AI Engineer. They have specialized skills in statistical modeling, machine learning, and generative AI, which are essential for roles in data science and AI engineering. Th

In [59]:
# Print each entry the chain returned under 'chat_history' for the last call.
for item in result['chat_history']:
    print(item)

content='How is the person technology wise?' additional_kwargs={} response_metadata={}
content='The person is skilled in programming languages like Python and SQL, as well as web frameworks such as Flask, FastAPI, and Django. They also have experience in data processing, analysis, and visualization. Additionally, they have a certificate in Generative AI and have completed a Post Graduate Program in Data Science.' additional_kwargs={} response_metadata={}
content='Is this person compatible to do Data Science or AI Engineer Job Role?, if yes why?' additional_kwargs={} response_metadata={}
content='Yes, based on the provided information, this person is well-equipped to perform a job in Data Science or as an AI Engineer. They have specialized skills in statistical modeling, machine learning, and generative AI, which are essential for roles in data science and AI engineering. Their proficiency in Python, SQL, Kafka, and RabbitMQ, along with expertise in supervised and unsupervised learning,

## Using Custom Prompt

In [61]:
from langchain.chains import ConversationalRetrievalChain
from langchain.memory import ConversationBufferMemory
from langchain.prompts import SystemMessagePromptTemplate, ChatPromptTemplate, HumanMessagePromptTemplate

# Rebuild the model, retriever and memory. `memory` is a new object, so the
# conversation history from the previous chain is not carried over.
llm= ChatOpenAI(model='gpt-3.5-turbo',temperature=0.7)
retriever= vector_store.as_retriever(search_type='similarity', search_kwargs={'k': 6})
memory= ConversationBufferMemory(memory_key='chat_history',return_messages=True)

# System message template; {context} is the placeholder for retrieved text.
system_template= r'''
Use the following pieces of context to answer the user's question.
-----------
Context: ```{context}```
'''

# Human message template; carries the question and the chat history.
user_template= r'''
Question: ```{question}```
Chat History: ```{chat_history}```
'''

messages= [
    SystemMessagePromptTemplate.from_template(system_template),
    HumanMessagePromptTemplate.from_template(user_template)
]

# Chat prompt assembled from the two message templates above.
qa_prompt= ChatPromptTemplate.from_messages(messages)

# Rebinds `crc` to a chain that passes the custom prompt to the
# document-combining step via combine_docs_chain_kwargs.
crc= ConversationalRetrievalChain.from_llm(
    llm=llm,
    retriever=retriever,
    memory=memory,
    chain_type='stuff',
    combine_docs_chain_kwargs={'prompt':qa_prompt},
    verbose= True
)

In [62]:
# First turn with the custom-prompt chain.
q="Does this person belongs to reputed college?"
result= ask_question(q, crc)

print(result['answer'])



[1m> Entering new StuffDocumentsChain chain...[0m


[1m> Entering new LLMChain chain...[0m
Prompt after formatting:
[32;1m[1;3mSystem: 
Use the following pieces of context to answer the user's question.
-----------
Context: ```Education
Bachelor of Technology·Mechanical Engineering·
Shivaji University, Kolhapur
8.0 CGPA·May 2015 - Jun 2018
Skills
Programming Languages
PythonSQL
Web Frameworks
FlaskFastAPIDjango
Data Processing, Analysis and Visualization

Certificates
Generative AI·Great Learning
Post Graduate Program in Data Science·Great Learning 2025 - 2025

reliability and scalability.
Explored emerging technologies, including Agentic AI and generative models, to drive innovation in analytics and AI
solutions.
Associate Manager·MAN Trucks and Bus India Pvt. Ltd. Pune·Jan 2020 - May 2022

technologies, including Agentic AI, to advance data-centric roles in AI and machine learning.
Professional Experience
Senior Engineer·Cloud4C Services Pvt. Ltd. Hyderabad·Jan 2023 - Jul 202

In [63]:
# Second turn with the custom-prompt chain.
q="Tell me the technologies in numbers?"
result= ask_question(q, crc)

print(result['answer'])



[1m> Entering new LLMChain chain...[0m
Prompt after formatting:
[32;1m[1;3mGiven the following conversation and a follow up question, rephrase the follow up question to be a standalone question, in its original language.

Chat History:

Human: Does this person belongs to reputed college?
Assistant: Yes, the person mentioned in the context holds a Bachelor of Technology degree in Mechanical Engineering from Shivaji University, Kolhapur. Shivaji University is a well-known and reputable university in India, so yes, the person belongs to a reputed college.
Follow Up Input: Tell me the technologies in numbers?
Standalone question:[0m

[1m> Finished chain.[0m


[1m> Entering new StuffDocumentsChain chain...[0m


[1m> Entering new LLMChain chain...[0m
Prompt after formatting:
[32;1m[1;3mSystem: 
Use the following pieces of context to answer the user's question.
-----------
Context: ```technologies, including Agentic AI, to advance data-centric roles in AI and machine learning.


In [64]:
# Ask for a computed difference; the next cell refers back to this result.
q="Tell me college completion year? and when did he got job year and the difference between this year in months?"
result= ask_question(q, crc)

print(result['answer'])



[1m> Entering new LLMChain chain...[0m
Prompt after formatting:
[32;1m[1;3mGiven the following conversation and a follow up question, rephrase the follow up question to be a standalone question, in its original language.

Chat History:

Human: Does this person belongs to reputed college?
Assistant: Yes, the person mentioned in the context holds a Bachelor of Technology degree in Mechanical Engineering from Shivaji University, Kolhapur. Shivaji University is a well-known and reputable university in India, so yes, the person belongs to a reputed college.
Human: Tell me the technologies in numbers?
Assistant: The person in the context has experience with multiple technologies, including:
- Programming Languages: Python, SQL
- Web Frameworks: Flask, FastAPI, Django
- Messaging & Streaming: Kafka, RabbitMQ
- APIs & Automation: REST APIs, Postman, Paramiko, JumpSSH, concurrent.futures
- Containerization & Deployment: Gitlab, Docker
- Operating Systems: Linux, Windows

In total, the per

In [65]:
# Follow-up: "that difference" refers to the previous answer, so it depends on
# the chat history held in the chain's memory.
q="multiply that difference with 5 and tell me the solution"
result= ask_question(q, crc)

print(result['answer'])



[1m> Entering new LLMChain chain...[0m
Prompt after formatting:
[32;1m[1;3mGiven the following conversation and a follow up question, rephrase the follow up question to be a standalone question, in its original language.

Chat History:

Human: Does this person belongs to reputed college?
Assistant: Yes, the person mentioned in the context holds a Bachelor of Technology degree in Mechanical Engineering from Shivaji University, Kolhapur. Shivaji University is a well-known and reputable university in India, so yes, the person belongs to a reputed college.
Human: Tell me the technologies in numbers?
Assistant: The person in the context has experience with multiple technologies, including:
- Programming Languages: Python, SQL
- Web Frameworks: Flask, FastAPI, Django
- Messaging & Streaming: Kafka, RabbitMQ
- APIs & Automation: REST APIs, Postman, Paramiko, JumpSSH, concurrent.futures
- Containerization & Deployment: Gitlab, Docker
- Operating Systems: Linux, Windows

In total, the per