In [1]:
import os 
from dotenv import load_dotenv

# Load variables from a .env file into the process environment.
load_dotenv()

OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")

# Fail fast when the key is missing or empty.
if OPENAI_API_KEY is None or OPENAI_API_KEY == "":
    raise ValueError("OPENAI_API_KEY not found in environment variables")

print("OpenAI API Key loaded successfully")


OpenAI API Key loaded successfully


In [2]:
# Display the current working directory.
os.getcwd()

'c:\\Users\\TempAccess\\Documents\\Dhruv\\RAG\\simple_rag'

In [3]:
# Move from the notebook folder ("simple_rag") up to the project root so the
# `helper_function_openai` import and the relative "data/..." paths resolve on
# any machine, instead of hard-coding one user's absolute Windows path.
# The name check keeps the cell idempotent: re-running it never climbs a second level.
notebook_dir = os.getcwd()
if os.path.basename(notebook_dir).lower() == "simple_rag":
    os.chdir(os.path.dirname(notebook_dir))
os.getcwd()

'C:\\Users\\TempAccess\\Documents\\Dhruv\\RAG'

In [None]:
from helper_function_openai import (
    SimpleRAG,
    RAGRetriever,
    OpenAIEmbedder,
    OpenAIChat,
    FAISSVectorStore,
    read_pdf,
    read_pdf_with_metadata,
    chunk_text,
    chunk_documents,
    show_context,
    Document
)

In [5]:
# Build the pipeline object with the embedding model, chat model and sampling temperature.
rag = SimpleRAG(embedding_model="text-embedding-3-small", chat_model="gpt-4o-mini", temperature=0.0)

In [6]:
# Display the pipeline object's repr to confirm it was constructed.
rag

<helper_function_openai.SimpleRAG at 0x2468c0bef90>

In [7]:
# Build the PDF path from the working directory (the project root after the
# chdir cell above) rather than hard-coding a user-specific absolute path.
pdf_path = os.path.join(os.getcwd(), "data", "Understanding_Climate_Change.pdf")

# Index the PDF and display the value index_pdf returns (the number of chunks).
num_chunks = rag.index_pdf(path=pdf_path, chunk_size=1000, chunk_overlap=200)

num_chunks

97

In [8]:
question = "What is the main cause of climate change?"

# Ask the pipeline; the returned dict carries the reply under the "answer" key.
result = rag.query(question)

print(f"Question: {question}")
print("--" * 10)
# Single quotes inside the f-string: reusing the outer double quotes is a
# SyntaxError on Python versions before 3.12.
print(f"Answer: {result['answer']}")

Question: What is the main cause of climate change?
--------------------
Answer: The main cause of climate change is the increase in greenhouse gases in the atmosphere, primarily driven by human activities such as the burning of fossil fuels and deforestation.


In [9]:
# Display the full dict returned by rag.query.
result

{'answer': 'The main cause of climate change is the increase in greenhouse gases in the atmosphere, primarily driven by human activities such as the burning of fossil fuels and deforestation.',
 'question': 'What is the main cause of climate change?'}

In [10]:
question = "What is the main cause of climate change?"

# Same question, this time also asking for the retrieved context
# (read from result["context"] in the following cells).
result = rag.query(question, return_context=True)

print(f"Question: {question}")
print("--" * 10)
# Single quotes inside the f-string: reusing the outer double quotes is a
# SyntaxError on Python versions before 3.12.
print(f"Answer: {result['answer']}")

Question: What is the main cause of climate change?
--------------------
Answer: The main cause of climate change is the increase in greenhouse gases in the atmosphere, primarily driven by human activities such as the burning of fossil fuels and deforestation.


In [11]:
# Print the raw context list that came back with the answer.
print(f"Context: {result['context']}")

Context: ['man civilization. \nMost of these climate changes are attributed to very small variations in Earth\'s orbit that \nchange the amount of solar energy our planet receives. During the Holocene epoch, which began at the end of the last ice age, human societies flourished, but the industrial era has seen \nunprecedented changes. \nModern Observations \nModern scientific observations indicate a rapid increase in global temperatures, sea levels, \nand extreme weather events. The Intergovernmental Panel on Climate Change (IPCC) has \ndocumented these changes extensively. Ice core samples, tree rings, and ocean sediments \nprovide a historical record that scientists use to understand past climate conditions and \npredict future trends. The evidence overwhelmingly shows that recent changes are primarily \ndriven by human activities, particularly the emission of greenhouse gases. \nChapter 2: Causes of Climate Change \nGreenhouse Gases \nThe primary cause of recent climate change is th

In [12]:
# Same context, formatted by the show_context helper.
show_context(result["context"])


Context 1:
man civilization. 
Most of these climate changes are attributed to very small variations in Earth's orbit that 
change the amount of solar energy our planet receives. During the Holocene epoch, which began at the end of the last ice age, human societies flourished, but the industrial era has seen 
unprecedented changes. 
Modern Observations 
Modern scientific observations indicate a rapid increase in global temperatures, sea levels, 
and extreme weather events. The Intergovernmental Panel on Climate Change (IPCC) has 
documented these changes extensively. Ice core samples, tree rings, and ocean sediments 
provide a historical record that scientists use to understand past climate conditions and 
predict future trends. The evidence overwhelmingly shows that recent changes are primarily 
driven by human activities, particularly the emission of greenhouse gases. 
Chapter 2: Causes of Climate Change 
Greenhouse Gases 
The primary cause of recent climate change is the increase in

In [13]:
# Show the two contexts retrieved for a new query (k=2).
rag.show_context("What are greenhouse gases??", k=2)

Query: What are greenhouse gases??


Context 1:
----------------------------------------
use gases in the 
atmosphere. Greenhouse gases, such as carbon dioxide (CO2), methane (CH4), and nitrous 
oxide (N2O), trap heat from the sun, creating a "greenhouse effect." This effect is essential for life on Earth, as it keeps the planet warm enough to support life. However, human 
activities have intensified this natural process, leading to a warmer climate. 
Fossil Fuels 
Burning fossil fuels for energy releases large amounts of CO2. This includes coal, oil, and 
natural gas used for elect...


Context 2:
----------------------------------------
man civilization. 
Most of these climate changes are attributed to very small variations in Earth's orbit that 
change the amount of solar energy our planet receives. During the Holocene epoch, which began at the end of the last ice age, human societies flourished, but the industrial era has seen 
unprecedented changes. 
Modern Observations 
Modern sc

## Break Down the Code

### Retrieve all the text first

In [14]:
# Read the text of the PDF (presumably a single string; it is passed to chunk_text below).
raw_text = read_pdf(pdf_path)

# Preview only the beginning: echoing the whole document floods the notebook output.
raw_text[:500]



In [15]:
# Load the PDF as a list of page-level documents, each with its own metadata.
page_documents = read_pdf_with_metadata(pdf_path)

first_page = page_documents[0]
print(f"Total pages: {len(page_documents)}")
print(f"\nPage 1 metadata: {first_page.metadata}")
print(f"Page 1 content preview: {first_page.content[:200]}...")

Total pages: 33

Page 1 metadata: {'source': 'C:\\Users\\TempAccess\\Documents\\Dhruv\\RAG\\data\\Understanding_Climate_Change.pdf', 'page': 1, 'total_pages': 33}
Page 1 content preview: Understanding Climate Change 
Chapter 1: Introduction to Climate Change 
Climate change refers to significant, long-term changes in the global climate. The term 
"global climate" encompasses the plane...


## Let's now do text chunking

In [16]:
# Split the raw text into chunks (size 1000, overlap 250).
chunks = chunk_text(raw_text, chunk_size=1000, chunk_overlap=250)

first_chunk = chunks[0]
print(f"Created {len(chunks)} chunks")
print(f"\nChunk 1 ({len(first_chunk)} chars):")
print(first_chunk[:300] + "...")

Created 76 chunks

Chunk 1 (947 chars):
Understanding Climate Change 
Chapter 1: Introduction to Climate Change 
Climate change refers to significant, long-term changes in the global climate. The term 
"global climate" encompasses the planet's overall weather patterns, including temperature, 
precipitation, and wind patterns, over an exte...


In [17]:
# Chunk the page-level documents; each resulting chunk carries metadata.
chunked_docs = chunk_documents(page_documents, chunk_size=1000, chunk_overlap=200)

print(f"Created {len(chunked_docs)} document chunks")
# Index 5 is the sixth chunk (0-based), even though the label reads "Chunk 5".
sixth_chunk = chunked_docs[5]
print(f"\nChunk 5 metadata: {sixth_chunk.metadata}")

Created 97 document chunks

Chunk 5 metadata: {'source': 'C:\\Users\\TempAccess\\Documents\\Dhruv\\RAG\\data\\Understanding_Climate_Change.pdf', 'page': 2, 'total_pages': 33, 'chunk_index': 2, 'chunk_count': 3}


In [18]:
# Show only the first few chunks: dumping every Document makes the output unreadable.
chunked_docs[:3]

[Document(content='Understanding Climate Change \nChapter 1: Introduction to Climate Change \nClimate change refers to significant, long-term changes in the global climate. The term \n"global climate" encompasses the planet\'s overall weather patterns, including temperature, \nprecipitation, and wind patterns, over an extended period. Over the past century, human \nactivities, particularly the burning of fossil fuels and deforestation, have significantly \ncontributed to climate change. \nHistorical Context \nThe Earth\'s climate has changed throughout history. Over the past 650,000 years, there have \nbeen seven cycles of glacial advance and retreat, with the abrupt end of the last ice age about \n11,700 years ago marking the beginning of the modern climate era and human civilization. \nMost of these climate changes are attributed to very small variations in Earth\'s orbit that \nchange the amount of solar energy our planet receives. During the Holocene epoch, which', metadata={'sourc

### Now let's create embeddings with the help of OpenAI

In [19]:
# Standalone embedder, using the same embedding model name as the pipeline above.
embedder = OpenAIEmbedder(model="text-embedding-3-small")

embedder

<helper_function_openai.OpenAIEmbedder at 0x246c582dc40>

In [20]:
# Embed a single string; the result displays as a flat list of floats.
single_embedding = embedder.embed_text("What is climate Change??")
# NOTE(review): this echoes the whole vector; consider displaying a slice such as single_embedding[:5].
single_embedding

[0.05243690684437752,
 -0.01864018477499485,
 0.03788062557578087,
 0.03642285615205765,
 -0.0023608398623764515,
 0.0020700893364846706,
 -0.010097213089466095,
 -0.07258845865726471,
 -0.01773979514837265,
 -0.015177976340055466,
 0.016292743384838104,
 0.024289049208164215,
 -0.030806146562099457,
 0.015338759869337082,
 -0.021737948060035706,
 0.010354466736316681,
 0.014802814461290836,
 0.027333220466971397,
 -0.011254855431616306,
 0.00959878321737051,
 -0.01482425257563591,
 -0.013280728831887245,
 0.0007034284644760191,
 -0.006050824653357267,
 -0.03837369754910469,
 0.004022270906716585,
 -0.016689343377947807,
 -0.043647401034832,
 0.03674442321062088,
 0.01464203093200922,
 0.032649800181388855,
 -0.02801923081278801,
 0.011254855431616306,
 0.021277036517858505,
 0.02326003462076187,
 -0.0005932246567681432,
 -0.07134506106376648,
 0.00021789534366689622,
 0.015006473287940025,
 -0.042468320578336716,
 -0.032092414796352386,
 -0.019765669479966164,
 -0.016582153737545013,


In [21]:
# Number of values in the embedding vector.
len(single_embedding)

1536

In [22]:
# Three short statements used to demonstrate batch embedding.
sample_texts = [
    "Climate change is caused by greenhouse gases",
    "Carbon dioxide traps heat in the atmosphere",
    "Renewable energy can help reduce emissions.",
]

# Embed the whole list in one call; the result displays as a list of vectors.
embeddings = embedder.embed_texts(sample_texts)
embeddings

[[0.050574108958244324,
  -0.009185713715851307,
  0.03919493407011032,
  0.01626417227089405,
  0.0015086026396602392,
  -0.015766095370054245,
  -0.025804288685321808,
  -0.027643347159028053,
  -0.002665197942405939,
  -0.022356053814291954,
  0.025938386097550392,
  0.04076579585671425,
  -0.005344763398170471,
  -0.0016774224350228906,
  -0.08582272380590439,
  -0.045938149094581604,
  0.011752733029425144,
  0.03770069777965546,
  -0.04471210762858391,
  0.0071311406791210175,
  -0.0009572443086653948,
  0.00489457743242383,
  0.030765915289521217,
  0.017346534878015518,
  -0.06904131919145584,
  0.04218340292572975,
  0.0018414530204609036,
  -0.026053328067064285,
  0.032739073038101196,
  0.055210065096616745,
  0.01903233863413334,
  -0.015766095370054245,
  0.0010608111042529345,
  -0.03612983599305153,
  0.003876869333907962,
  -0.002395804738625884,
  -0.055899713188409805,
  0.04057422652840614,
  -0.025708504021167755,
  -0.021379053592681885,
  0.00721255736425519,
  -

In [23]:
# (number of vectors, length of the first vector)
len(embeddings), len(embeddings[0])

(3, 1536)

In [24]:
# Wrap each sample sentence in a Document tagged with a "sample" source.
sample_docs = [Document(content=sentence, metadata={"source": "sample"}) for sentence in sample_texts]

sample_docs

[Document(content='Climate change is caused by greenhouse gases', metadata={'source': 'sample'}, embedding=None),
 Document(content='Carbon dioxide traps heat in the atmosphere', metadata={'source': 'sample'}, embedding=None),
 Document(content='Renewable energy can help reduce emissions.', metadata={'source': 'sample'}, embedding=None)]

In [25]:
# Embed the sample Documents; the displayed Documents now carry an `embedding` list.
embedded_docs = embedder.embed_documents(sample_docs)

embedded_docs

[Document(content='Climate change is caused by greenhouse gases', metadata={'source': 'sample'}, embedding=[0.05060269683599472, -0.00918394885957241, 0.03920655697584152, 0.01626104861497879, 0.0014975391095504165, -0.015810947865247726, -0.025799330323934555, -0.027695495635271072, -0.002693415619432926, -0.02239006571471691, 0.025875942781567574, 0.04075796157121658, -0.005338948220014572, -0.0017034357879310846, -0.08588284999132156, -0.04596762731671333, 0.011721745133399963, 0.03771260753273964, -0.04466520994901657, 0.0071920184418559074, -0.0009893813403323293, 0.004869695752859116, 0.030817463994026184, 0.017362356185913086, -0.06910466402769089, 0.0421752966940403, 0.0018375080544501543, -0.026086628437042236, 0.03269447386264801, 0.055199459195137024, 0.019019106402993202, -0.015791796147823334, 0.0010145199485123158, -0.0361037403345108, 0.003892883425578475, -0.0023714029230177402, -0.05588897317647934, 0.040604736655950546, -0.02574187144637108, -0.021336641162633896, 0.0

## Now let's add the embeddings to the FAISS vector store

In [26]:
# Embedding dimension reported by the embedder; passed to the vector store below.
embedder.dimension

1536

In [27]:
# Create a vector store sized to the embedder's dimension.
vector_store = FAISSVectorStore(dimension=embedder.dimension)
vector_store

<helper_function_openai.FAISSVectorStore at 0x246c5d17aa0>

In [28]:
# Add the embedded sample documents to the vector store.
vector_store.add_documents(embedded_docs)


In [29]:
# Off-topic query (nothing in the sample documents is about France),
# used to see how the store scores unrelated text.
query = "What is the capital of france??"

# Embed the query, then ask the store for its 2 nearest documents.
query_embedding = embedder.embed_text(query)

results = vector_store.search(query_embeddings=query_embedding, k=2)
results

[RetrievalResult(document=Document(content='Renewable energy can help reduce emissions.', metadata={'source': 'sample'}, embedding=[-0.006266915705054998, 0.058018237352371216, 0.020619448274374008, 0.018175743520259857, 0.00040223970427177846, 0.06216239556670189, -0.07657686620950699, 0.05405426025390625, 0.03272534906864166, 0.012072117999196053, -0.010799590498209, -0.0007735108374617994, -0.05067586898803711, 0.013153202831745148, 0.017094658687710762, -0.009329990483820438, 0.004853622056543827, 0.009132917039096355, 0.007973003201186657, 0.0623876228928566, -0.04878396913409233, -0.007348001003265381, 0.027522627264261246, 0.002411326626315713, 0.008125030435621738, 0.0009952177060768008, -0.003944271709769964, -0.024549642577767372, 0.0745948776602745, -0.008046201430261135, 0.05337858200073242, -0.03677941858768463, -0.033243369311094284, -0.03216228261590004, 0.03536049276590347, 0.025067662820219994, -0.03993258252739906, -0.054414618760347366, -0.014594649896025658, -0.0324

In [30]:
print(f"Query: {query}\n")

# Iterate as `hit`, not `result`, so this loop does not overwrite the `result`
# dict produced by the earlier rag.query(...) cells.
for hit in results:
    print(f"rank {hit.rank} (score: {hit.score})")
    print(f"Content: {hit.document.content}")
    print("-" * 50)

Query: What is the capital of france??

rank 0 (score: 0.04704802855849266)
Content: Renewable energy can help reduce emissions.
--------------------------------------------------
rank 1 (score: 0.032909803092479706)
Content: Carbon dioxide traps heat in the atmosphere
--------------------------------------------------


In [31]:
# On-topic query against the same three sample documents.
query = "How can we reduce CO2?"

query_embedding = embedder.embed_text(query)

results = vector_store.search(query_embeddings=query_embedding, k=2)

print(f"Query: {query}\n")

# Iterate as `hit`, not `result`, so this loop does not overwrite the `result`
# dict produced by the earlier rag.query(...) cells.
for hit in results:
    print(f"rank {hit.rank} (score: {hit.score})")
    print(f"Content: {hit.document.content}")
    print("-" * 50)

Query: How can we reduce CO2?

rank 0 (score: 0.5275881290435791)
Content: Renewable energy can help reduce emissions.
--------------------------------------------------
rank 1 (score: 0.4239322543144226)
Content: Carbon dioxide traps heat in the atmosphere
--------------------------------------------------


# OpenAI Chat Completion

In [32]:
# Chat wrapper configured with model name, temperature and a max_tokens limit.
chat = OpenAIChat(model_name="gpt-4o-mini", temperature=0.0, max_tokens=1000)

chat

<helper_function_openai.OpenAIChat at 0x246c5d17c20>

In [33]:
# Conversation as a list of role/content dicts: a system message, then the user's question.
system_message = {"role": "system", "content": "You are a helpful assistant."}
user_message = {"role": "user", "content": "what is RAG in ai??"}
messages = [system_message, user_message]

messages

[{'role': 'system', 'content': 'You are a helpful assistant.'},
 {'role': 'user', 'content': 'what is RAG in ai??'}]

In [34]:
# Send the message list to the chat model; the reply displays as a string.
response = chat.chat(messages)
response

'RAG in AI typically refers to "Retrieval-Augmented Generation." It is a model architecture that combines retrieval-based methods with generative models to improve the quality and relevance of generated text.\n\n### Key Components of RAG:\n\n1. **Retrieval Component**: This part of the model retrieves relevant documents or pieces of information from a large corpus based on the input query. It often uses techniques like dense retrieval or traditional keyword-based search to find the most relevant context.\n\n2. **Generative Component**: After retrieving relevant information, the generative model (often based on architectures like Transformers) uses this context to produce coherent and contextually appropriate responses or text. This allows the model to generate more informed and accurate outputs based on real-world data.\n\n### Benefits of RAG:\n\n- **Improved Accuracy**: By leveraging external knowledge, RAG can provide more accurate and contextually relevant responses than models that

In [35]:
# Hand-written snippets supplied as the context for the question.
context = [
    "Greenhouse gases trap heat in Earth's atmosphere.",
    "CO2 is the main greenhouse gas from human activities.",
    "Burning fossil fuels releases large amounts of CO2.",
]

# Ask the question together with the context list.
answer = chat.chat_with_context(query="What causes greenhouse gas emissions?", context=context)

print(f"Got Answer based on context: {answer}")

Got Answer based on context: Greenhouse gas emissions are caused by human activities, primarily through the burning of fossil fuels, which releases large amounts of CO2.


In [36]:
# Same question with no context supplied, for comparison with the context-based answer.
baseline_messages = [
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": "What causes greenhouse gas emissions?"},
]
answer = chat.chat(baseline_messages)

In [37]:
# Print the reply obtained without any supplied context.
print(f"Answer without context: {answer}")

Answer without context: Greenhouse gas emissions are caused by a variety of human activities and natural processes. Here are some of the primary sources:

### Human Activities

1. **Fossil Fuel Combustion**: The burning of coal, oil, and natural gas for electricity, heat, and transportation is the largest single source of global greenhouse gas emissions. This includes:
   - Power plants
   - Vehicles (cars, trucks, airplanes)
   - Industrial processes

2. **Deforestation**: Trees absorb carbon dioxide (CO2) from the atmosphere. When forests are cut down for agriculture, urban development, or logging, the carbon stored in trees is released back into the atmosphere.

3. **Agriculture**: Agricultural practices contribute to greenhouse gas emissions in several ways:
   - Methane (CH4) emissions from enteric fermentation in livestock (cows, sheep, etc.)
   - Nitrous oxide (N2O) emissions from fertilized soils and manure management
   - Land-use changes for crop production

4. **Waste Manage

In [38]:
# Ask the model to extract key information; chat_json's result prints below as a dict.
extraction_instruction = "Extract key information from the user's text."
passage = "Climate change is caused by CO2 emissions from burning fossil fuels. The main sources are transportation and power plants."
messages = [
    {"role": "system", "content": extraction_instruction},
    {"role": "user", "content": passage},
]

json_response = chat.chat_json(messages)
print("Structured response:")
print(json_response)

Structured response:
{'cause': 'Climate change', 'main_cause': 'CO2 emissions', 'source_of_emissions': ['burning fossil fuels', 'transportation', 'power plants']}


## Let's see the entire RAG retriever

In [39]:
# Retrieval-only component: index the same PDF with the same chunk settings as before.
retriever = RAGRetriever(embedding_model="text-embedding-3-small")

num_indexed = retriever.index_pdf(pdf_path, chunk_size=1000, chunk_overlap=200)

print(f"Indexed {num_indexed} chunks")

Indexed 97 chunks


In [40]:
query = "What are the effects of rising sea levels??"

# Retrieve the 3 best-matching chunks for the query.
results = retriever.retrieve(query, k=3)

print(f"Query: {query}")

# Iterate as `hit`, not `result`, so this loop does not overwrite the `result`
# dict produced by the earlier rag.query(...) cells.
for hit in results:
    print(f"Rank {hit.rank} (Score: {hit.score})")
    # Single quotes inside the f-string: reusing the outer double quotes is a
    # SyntaxError on Python versions before 3.12.
    print(f"    PAGE: {hit.document.metadata.get('page', 'N/A')}")
    print(f"    Content: {hit.document.content[:200]}...")

Query: What are the effects of rising sea levels??
Rank 0 (Score: 0.5581742525100708)
    PAGE: 3
    Content: human 
activities. For example, spring is arriving earlier, and winters are becoming shorter and 
milder in many regions. This shift disrupts plant and animal life cycles and agricultural 
practices. ...
Rank 1 (Score: 0.5241072773933411)
    PAGE: 4
    Content: The Arctic is warming at more than twice the global average rate, leading to significant ice 
loss. Antarctic ice sheets are also losing mass, contributing to sea level rise. This melting 
affects glo...
Rank 2 (Score: 0.5164402723312378)
    PAGE: 4
    Content: ate 
conflicts. 
Flooding 
Heavy rainfall events are becoming more common, leading to increased flooding. Urban 
areas with poor drainage and infrastructure are particularly at risk. Flood management ...


In [41]:
# Fetch the contexts for `query` with k=2 and print them with the show_context helper.
context_list = retriever.retrieve_context(query, k=2)
show_context(context_list)


Context 1:
human 
activities. For example, spring is arriving earlier, and winters are becoming shorter and 
milder in many regions. This shift disrupts plant and animal life cycles and agricultural 
practices. Melting Ice and Rising Sea Levels 
Warmer temperatures are causing polar ice caps and glaciers to melt, contributing to rising 
sea levels. Sea levels have risen by about 20 centimeters (8 inches) in the past century, 
threatening coastal communities and ecosystems. 
Polar Ice Melt



Context 2:
The Arctic is warming at more than twice the global average rate, leading to significant ice 
loss. Antarctic ice sheets are also losing mass, contributing to sea level rise. This melting 
affects global ocean currents and weather patterns. 
Glacial Retreat 
Glaciers around the world are retreating, affecting water supplies for millions of people. 
Regions dependent on glacial meltwater, such as the Himalayas and the Andes, face 
particular risks. Glacial melt also impacts hydropower ge

In [42]:
test_queries = [
    "What is the greenhouse effect?",
    "How does deforestation contribute to climate change?",
    "What are renewable energy sources?",
    "What is the paris agreement?",
    ""
]

# Loop names are distinct from the notebook-level `query` and `result`
# so this cell does not overwrite them.
for test_question in test_queries:
    # Skip blank entries: there is nothing to retrieve or answer for an empty question.
    if not test_question.strip():
        print("\n Q: (blank query skipped)")
        continue
    response = rag.query(test_question, k=2)
    print(f"\n Q: {test_question}")
    print("A: ", response)


 Q: What is the greenhouse effect?
A:  {'answer': 'The greenhouse effect is the process by which greenhouse gases, such as carbon dioxide (CO2), methane (CH4), and nitrous oxide (N2O), trap heat from the sun in the atmosphere. This effect is essential for life on Earth, as it keeps the planet warm enough to support life.', 'question': 'What is the greenhouse effect?'}

 Q: How does deforestation contribute to climate change?
A:  {'answer': 'Deforestation contributes to climate change by releasing stored carbon back into the atmosphere when trees are cut down for timber or to clear land for agriculture. This reduces the number of trees available to absorb CO2, exacerbating the greenhouse effect.', 'question': 'How does deforestation contribute to climate change?'}

 Q: What are renewable energy sources?
A:  {'answer': 'Renewable energy sources mentioned in the context are wind, solar, and hydroelectric power. These sources produce little to no greenhouse gas emissions and are sustainab

In [44]:
# Save the retriever's vector store; the path is relative to the current working directory.
save_path = "data/climate_vectorstore"
retriever.vector_store.save(save_path)

In [45]:
# Load a store back from the path that was just saved.
loading_store = FAISSVectorStore.load(save_path)
loading_store

<helper_function_openai.FAISSVectorStore at 0x246c5d8f1a0>

In [46]:
# Number of documents held by the reloaded store.
print(f"Loaded {len(loading_store.documents)}")

Loaded 97


In [48]:
# Embed a question and search the reloaded store to confirm it is usable.
test_query = embedder.embed_text("What is the global warming?")
results = loading_store.search(test_query, k=2)

for hit in results:
    page = hit.document.metadata['page']
    print(f"{hit.score:.4f} {page}: {hit.document.content}")

0.5217 1: Understanding Climate Change 
Chapter 1: Introduction to Climate Change 
Climate change refers to significant, long-term changes in the global climate. The term 
"global climate" encompasses the planet's overall weather patterns, including temperature, 
precipitation, and wind patterns, over an extended period. Over the past century, human 
activities, particularly the burning of fossil fuels and deforestation, have significantly 
contributed to climate change. 
Historical Context 
The Earth's climate has changed throughout history. Over the past 650,000 years, there have 
been seven cycles of glacial advance and retreat, with the abrupt end of the last ice age about 
11,700 years ago marking the beginning of the modern climate era and human civilization. 
Most of these climate changes are attributed to very small variations in Earth's orbit that 
change the amount of solar energy our planet receives. During the Holocene epoch, which
0.5007 1: man civilization. 
Most of these