In [52]:
from pypdf import PdfReader


In [53]:
# Path to the source PDF; it is relative, so it resolves against the kernel's working directory.
pdfpath = "../data/intro_to_ai.pdf"
# Open the document once; every later cell reads pages through this reader.
reader = PdfReader(pdfpath)

In [54]:
# Number of pages in the loaded PDF.
len(reader.pages)


5

In [55]:
# Extract the text of the first page only, as a quick look at what extraction produces.
page = reader.pages[0]
text = page.extract_text()
# Show just the first 1000 characters instead of the whole page.
text[:1000]


' Table of Contents Chapter 1: Introduction: ......................................................................................................................... 4 What is AI ............................................................................................................................................ 4 What is ML .......................................................................................................................................... 6 What is the difference between AI and ML ........................................................................................ 9 Maths ................................................................................................................................................ 11 Current state of AI and ML:............................................................................................................... 13 Expected applications in the next 10 years: .....................................................

In [56]:
# Gather the text of every page first and join once at the end, instead of
# growing a string with `+=` inside the loop. The comprehension variables stay
# local, so the notebook-level `page` from the first-page cell is not rebound.
extracted_pages = [
    extracted
    for extracted in (pdf_page.extract_text() for pdf_page in reader.pages)
    if extracted  # skip pages for which no text was extracted
]

# Each page's text is followed by a newline, so page boundaries stay visible
# in the combined document.
all_text = "".join(f"{extracted}\n" for extracted in extracted_pages)


In [57]:
# Total character count of the combined text (includes the newline added after each page).
len(all_text)


12605

In [58]:
# Preview the first 1000 characters of the combined text.
all_text[:1000]


' Table of Contents Chapter 1: Introduction: ......................................................................................................................... 4 What is AI ............................................................................................................................................ 4 What is ML .......................................................................................................................................... 6 What is the difference between AI and ML ........................................................................................ 9 Maths ................................................................................................................................................ 11 Current state of AI and ML:............................................................................................................... 13 Expected applications in the next 10 years: .....................................................

In [59]:
# Preview the last 1000 characters of the combined text.
all_text[-1000:]


'AI is its ability to process and analyze large amounts of data in a matter of seconds, which can lead to improved decision making. This has the potential to benefit a wide range of industries, including finance, where AI algorithms can be used to identify patterns in financial data that can be used to inform investment decisions. In healthcare, AI can be used to analyze medical images and patient data to improve the accuracy of diagnoses and treatment plans. Additionally, AI can be used in manufacturing to optimize production processes, and in transportation to improve the safety of autonomous vehicles. Another potential benefit of AI is the automation of repetitive and dangerous tasks, which can improve working conditions and increase productivity. For example, in manufacturing, robots with AI capabilities can be used to perform tasks such as welding and painting, which can reduce the risk of workplace injuries. Similarly, in healthcare, AI can be used to automate tasks such as data 

In [60]:
chunk_size = 800  # maximum number of characters per chunk
overlap = 150  # characters repeated at the start of the following chunk


def chunk_text(text, size, overlap_chars):
    """Split ``text`` into fixed-size character windows that overlap.

    Args:
        text: The string to split.
        size: Maximum number of characters per chunk; must be positive.
        overlap_chars: Number of trailing characters of one chunk that are
            repeated at the start of the next; must satisfy
            ``0 <= overlap_chars < size``.

    Returns:
        A list of substrings in document order. Empty ``text`` gives ``[]``.

    Raises:
        ValueError: If ``size`` is not positive, or ``overlap_chars`` is
            negative or not smaller than ``size`` (the window would never
            move forward and the loop would not terminate).
    """
    if size <= 0:
        raise ValueError("chunk size must be positive")
    if not 0 <= overlap_chars < size:
        raise ValueError("overlap must be non-negative and smaller than the chunk size")

    step = size - overlap_chars
    pieces = []
    start = 0
    while start < len(text):
        end = start + size
        pieces.append(text[start:end])
        # Once a window reaches the end of the text, stop: the next window
        # would contain only characters this one already holds.
        if end >= len(text):
            break
        start += step
    return pieces


chunks = chunk_text(all_text, chunk_size, overlap)


In [61]:
# Number of chunks produced from the combined text.
len(chunks)


20

In [62]:
# First 500 characters of the first chunk.
chunks[0][:500]


' Table of Contents Chapter 1: Introduction: ......................................................................................................................... 4 What is AI ............................................................................................................................................ 4 What is ML .......................................................................................................................................... 6 What is the difference bet'

In [63]:
# First 500 characters of the chunk at the middle index of the list.
chunks[len(chunks)//2][:500]


'.... 99 Medical Image Segmentation ......................................................................................................... 101 Chapter 3: Big Data Analysis and Algorithms ...................................................................................... 104 Big data & Demo Hadoop-I ............................................................................................................. 106 Hadoop Ecosystem & Demo Hadoop-II ...............................................'

In [64]:
from openai import OpenAI


In [65]:
from dotenv import load_dotenv
# Load variables from a .env file into the process environment.
# NOTE(review): presumably this is where the OpenAI API key comes from — confirm.
load_dotenv()


True

In [66]:
# No arguments are passed, so the client uses its default configuration.
# NOTE(review): expected to read the API key from the environment — confirm.
client = OpenAI()


In [67]:
# Use the first chunk as a single input for trying out the embedding call.
sample_chunk = chunks[0]
len(sample_chunk)


800

In [68]:
# Embed the single sample chunk before processing the full list of chunks.
embedding_response = client.embeddings.create(
    model="text-embedding-3-small",
    input=sample_chunk
)


In [69]:
# The embedding for the sample chunk is the first entry of the response data.
vector = embedding_response.data[0].embedding
# Length (dimensionality) of the returned embedding vector.
len(vector)


1536

In [70]:
# Will hold one {"text": ..., "embedding": ...} record per chunk; populated in the next cell.
chunk_embeddings = []


In [71]:
# Chunks sent per embeddings request.
# NOTE(review): 100 is a conservative choice for ~800-character chunks; confirm
# against the current per-request input limits of the embeddings endpoint.
EMBED_BATCH_SIZE = 100

# Rebuild the list from scratch so re-running this cell cannot append a second
# copy of every record.
chunk_embeddings = []

for batch_start in range(0, len(chunks), EMBED_BATCH_SIZE):
    batch = chunks[batch_start:batch_start + EMBED_BATCH_SIZE]

    # The endpoint accepts a list of inputs, so one request covers the whole
    # batch instead of one round trip per chunk.
    batch_response = client.embeddings.create(
        model="text-embedding-3-small",
        input=batch,
    )

    # Every returned item carries the index of the input it belongs to; order
    # by it so each text is paired with its own vector.
    ordered_items = sorted(batch_response.data, key=lambda entry: entry.index)

    chunk_embeddings.extend(
        {
            "text": chunk_body,
            "embedding": entry.embedding,
        }
        for chunk_body, entry in zip(batch, ordered_items)
    )


In [72]:
# Number of embedded chunk records; on a fresh run this equals len(chunks).
len(chunk_embeddings)


20

In [73]:
# Length of the first stored embedding vector.
len(chunk_embeddings[0]["embedding"])


1536

In [74]:
import numpy as np


In [75]:
def cosine_similarity(vec1, vec2):
    """Return the cosine of the angle between two vectors.

    Args:
        vec1: First vector, as any array-like of numbers.
        vec2: Second vector, with the same length as ``vec1``.

    Returns:
        The dot product divided by the product of the two Euclidean norms.
        If either vector has zero norm, the similarity is undefined and
        ``0.0`` is returned instead of ``nan``, so downstream sorting by
        score stays well-defined.
    """
    vec1 = np.asarray(vec1)
    vec2 = np.asarray(vec2)
    denominator = np.linalg.norm(vec1) * np.linalg.norm(vec2)
    if denominator == 0:
        # Avoid 0/0 -> nan (and the accompanying RuntimeWarning).
        return 0.0
    return np.dot(vec1, vec2) / denominator


In [76]:
# Query used to find the most relevant chunks via embedding similarity.
question = "What are the main fields of Artificial Intelligence according to this document?"


In [77]:
# Embed the question with the same model used for the chunks, so the vectors can be compared.
question_embedding = client.embeddings.create(
    model="text-embedding-3-small",
    input=question
).data[0].embedding


In [78]:
# Score every embedded chunk against the question; each record keeps the chunk
# text alongside its similarity so the results can be ranked afterwards.
scores = [
    {
        "text": record["text"],
        "score": cosine_similarity(question_embedding, record["embedding"]),
    }
    for record in chunk_embeddings
]


In [79]:
# Rank the records from highest to lowest similarity score.
scores = sorted(scores, key=lambda x: x["score"], reverse=True)


In [80]:
# Print the five best-ranked chunks: a header with rank and score, then a
# 500-character preview of the chunk text.
for rank, hit in enumerate(scores[:5], start=1):
    header = f"\n--- Result {rank} (score: {hit['score']:.3f}) ---"
    preview = hit["text"][:500]
    print(header)
    print(preview)



--- Result 1 (score: 0.529) ---
 Table of Contents Chapter 1: Introduction: ......................................................................................................................... 4 What is AI ............................................................................................................................................ 4 What is ML .......................................................................................................................................... 6 What is the difference bet

--- Result 2 (score: 0.522) ---
0 Virtual Tour creation ....................................................................................................................... 152 Conclusion: .......................................................................................................................................... 154               
 Chapter 1: Introduction:  What is AI  Artificial Intelligence (AI) is a field of computer science that aims to cre

In [81]:
# Number of best-ranked chunks to pass on as context.
top_k = 8
# `scores` was sorted by descending similarity in an earlier cell, so this slice takes the top matches.
context_chunks = [item["text"] for item in scores[:top_k]]


In [82]:
# Merge the selected chunks into one string, with a visible divider line between them.
context = "\n\n---\n\n".join(context_chunks)


In [83]:
# Prompt that restricts the model to the retrieved context. The question is
# interpolated from `question` — the same string that was embedded for
# retrieval — so the retrieval query and the question the model answers
# cannot drift apart.
# NOTE(review): the guidance about "MAIN FIELDS" is still specific to this
# particular question; revise it if `question` changes.
prompt = f"""
You are an AI assistant that answers questions using ONLY the provided context.

The context may include chapter titles, section headings, and sub-sections.
The question asks for MAIN FIELDS, meaning only high-level categories,
not individual techniques, algorithms, or subtopics.

Extract only the highest-level fields discussed in the document.
If the answer is not present, say:
"I don't know based on the document."

Context:
{context}

Question:
{question}

Answer (as a short bullet list of fields only):
"""


In [84]:
# Send the whole prompt as a single user message; temperature 0 keeps sampling
# randomness to a minimum.
chat_messages = [{"role": "user", "content": prompt}]

response = client.chat.completions.create(
    model="gpt-4o-mini",
    messages=chat_messages,
    temperature=0,
)


In [85]:
# Print the text content of the first choice in the chat response.
print(response.choices[0].message.content)


- Artificial Intelligence (AI)
- Machine Learning (ML)
- Computer Vision
- Natural Language Processing
- Augmented Reality
