In [1]:
# 📥 Step 1: Extract raw text from multiple NCERT PDFs using PyMuPDF
import fitz  # PyMuPDF
import os

def extract_text_from_pdf(pdf_path):
    text = ""
    with fitz.open(pdf_path) as doc:
        for page in doc:
            text += page.get_text()
    return text

pdf_folder = "ncert_pdfs"
pdf_files = ["hesc113.pdf", "hesc108.pdf", "hesc107.pdf", "hesc106.pdf"]

for pdf_file in pdf_files:
    full_path = os.path.join(pdf_folder, pdf_file)
    text = extract_text_from_pdf(full_path)
    print(f"📄 Text from {pdf_file} (First 1000 chars):\n")
    print(text[:1000])
    print("\n" + "-"*80 + "\n")


📄 Text from hesc113.pdf (First 1000 chars):

LIGHT
LIGHT
T
he world is largely known through
the senses. The sense of sight
is one of the most important
senses. Through it we see mountains,
rivers, trees, plants, chairs, people and
so many other things around us. We also
see clouds, rainbows and birds flying
in the sky. At night we see the moon
and the stars. You are able to see the
words and sentences printed on this
page. How is seeing made possible?
13.1 What makes Things
Visible
Have you ever thought how we see
various objects? You may say that  eyes
see the objects. But, can you see an
object in the dark? It means that eyes
alone cannot see any object. It is only
when light from an object enters our
eyes that we see the object. The light
may have been emitted by the object, or
may have been reflected by it.
You learnt in Class VII that a polished
or a shiny surface can act as a mirror. A
mirror changes the direction  of light
that falls on it. Can you tell in which
direction the l

In [2]:
# 🧹 Step 2: Clean extracted text and save to .txt files
import re

output_folder = "cleaned_texts"
os.makedirs(output_folder, exist_ok=True)

def clean_text(text):
    text = re.sub(r'-\n', '', text)
    text = re.sub(r'\n+', '\n', text)
    text = re.sub(r'\s+\n', '\n', text)
    return text.strip()

for filename in os.listdir(pdf_folder):
    if filename.endswith(".pdf"):
        pdf_path = os.path.join(pdf_folder, filename)
        doc = fitz.open(pdf_path)
        full_text = "".join(page.get_text() for page in doc)
        cleaned = clean_text(full_text)
        txt_filename = os.path.splitext(filename)[0] + ".txt"
        output_path = os.path.join(output_folder, txt_filename)
        with open(output_path, "w", encoding="utf-8") as f:
            f.write(cleaned)
        print(f"✅ Saved: {output_path}")


✅ Saved: cleaned_texts/hesc108.txt
✅ Saved: cleaned_texts/hesc106.txt
✅ Saved: cleaned_texts/hesc107.txt
✅ Saved: cleaned_texts/hesc113.txt


In [3]:
# 📚 Step 3: Combine cleaned summaries into final summary format
input_folder = "cleaned_texts"
final_folder = "final_summaries"
os.makedirs(final_folder, exist_ok=True)

for filename in os.listdir(input_folder):
    if filename.endswith(".txt"):
        with open(os.path.join(input_folder, filename), "r") as f:
            content = f.read()
        final_text = f"📘 Summary of {filename}\n\n{content}"
        with open(os.path.join(final_folder, filename), "w") as f:
            f.write(final_text)
print("✅ All final summary files created.")


✅ All final summary files created.


In [4]:
# 📁 Step 4: Merge all summary files into one combined file
output_file = "final_combined_summary.txt"

with open(output_file, "w", encoding="utf-8") as outfile:
    for filename in sorted(os.listdir(final_folder)):
        filepath = os.path.join(final_folder, filename)
        with open(filepath, "r", encoding="utf-8") as infile:
            outfile.write(infile.read())
            outfile.write("\n" + "="*100 + "\n\n")

print(f"✅ Combined file created: {output_file}")


IsADirectoryError: [Errno 21] Is a directory: 'final_summaries/.ipynb_checkpoints'

In [5]:
# 🧱 Step 5: Convert combined summary into structured topic-wise JSON
import json

with open("final_combined_summary.txt", "r", encoding="utf-8") as f:
    text = f.read()

chapters = text.split("Chapter")[1:]
data = {}

for chapter in chapters:
    lines = chapter.strip().split("\n")
    chapter_title = "Chapter " + lines[0].strip()
    data[chapter_title] = {"Topics": []}
    current_topic = None
    current_subtopic = None

    for line in lines[1:]:
        line = line.strip()
        if line.startswith("Topic:"):
            current_topic = {"Topic Name": line.replace("Topic:", "").strip(), "Subtopics": []}
            data[chapter_title]["Topics"].append(current_topic)
        elif line.startswith("Sub-topic:"):
            current_subtopic = {
                "Subtopic Name": line.replace("Sub-topic:", "").strip(),
                "Content": {
                    "Paragraphs": [],
                    "Examples": [],
                    "Figures": [],
                    "Activities": [],
                    "Questions": [],
                    "Tables": [],
                    "Boxed Facts": []
                }
            }
            current_topic["Subtopics"].append(current_subtopic)
        else:
            if current_subtopic:
                if line.startswith("Paragraph:"):
                    current_subtopic["Content"]["Paragraphs"].append(line.replace("Paragraph:", "").strip())
                elif line.startswith("Example:"):
                    current_subtopic["Content"]["Examples"].append(line.replace("Example:", "").strip())
                elif line.startswith("Figure:"):
                    current_subtopic["Content"]["Figures"].append(line.replace("Figure:", "").strip())
                elif line.startswith("Activity:"):
                    current_subtopic["Content"]["Activities"].append(line.replace("Activity:", "").strip())
                elif line.startswith("Question:"):
                    current_subtopic["Content"]["Questions"].append(line.replace("Question:", "").strip())
                elif line.startswith("Table:"):
                    current_subtopic["Content"]["Tables"].append(line.replace("Table:", "").strip())
                elif line.startswith("Boxed Fact:"):
                    current_subtopic["Content"]["Boxed Facts"].append(line.replace("Boxed Fact:", "").strip())

with open("structured_output.json", "w", encoding="utf-8") as f:
    json.dump(data, f, indent=2, ensure_ascii=False)

print("✅ structured_output.json file created.")


✅ structured_output.json file created.


In [7]:
!pip install pandas openpyxl


Collecting pandas
  Using cached pandas-2.3.1-cp313-cp313-macosx_11_0_arm64.whl.metadata (91 kB)
Collecting openpyxl
  Downloading openpyxl-3.1.5-py2.py3-none-any.whl.metadata (2.5 kB)
Collecting pytz>=2020.1 (from pandas)
  Using cached pytz-2025.2-py2.py3-none-any.whl.metadata (22 kB)
Collecting tzdata>=2022.7 (from pandas)
  Using cached tzdata-2025.2-py2.py3-none-any.whl.metadata (1.4 kB)
Collecting et-xmlfile (from openpyxl)
  Downloading et_xmlfile-2.0.0-py3-none-any.whl.metadata (2.7 kB)
Using cached pandas-2.3.1-cp313-cp313-macosx_11_0_arm64.whl (10.7 MB)
Downloading openpyxl-3.1.5-py2.py3-none-any.whl (250 kB)
Using cached pytz-2025.2-py2.py3-none-any.whl (509 kB)
Using cached tzdata-2025.2-py2.py3-none-any.whl (347 kB)
Downloading et_xmlfile-2.0.0-py3-none-any.whl (18 kB)
Installing collected packages: pytz, tzdata, et-xmlfile, pandas, openpyxl
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5/5[0m [openpyxl]4/5[0m [openpyxl]
[1A[2KSuccessfully installed et-

In [8]:
#this is to create a clean, structured Excel sheet

import json
import pandas as pd

# Load the structured JSON
with open("final_structured_with_topics.json", "r", encoding="utf-8") as f:
    data = json.load(f)

# Prepare rows for the Excel
rows = []

for chapter_id, chapter in data.items():
    chapter_title = chapter.get("ChapterTitle", "")
    for topic in chapter.get("Topics", []):
        rows.append({
            "Chapter ID": chapter_id,
            "Chapter Title": chapter_title,
            "Topic Number": topic.get("TopicNumber", ""),
            "Topic Title": topic.get("TopicTitle", ""),
            "Topic Content": topic.get("Content", "")
        })

# Create DataFrame
df = pd.DataFrame(rows)

# Save to Excel
excel_path = "ncert_extracted_topics.xlsx"
df.to_excel(excel_path, index=False)
print(f"✅ Excel file created: {excel_path}")


✅ Excel file created: ncert_extracted_topics.xlsx


In [9]:
import json

with open("final_structured_with_topics.json", "r", encoding="utf-8") as f:
    data = json.load(f)

lines = []

for chapter_id, chapter in data.items():
    lines.append(f"📘 Chapter: {chapter_id} - {chapter.get('ChapterTitle', 'Untitled')}")
    
    for topic in chapter.get("Topics", []):
        topic_title = topic.get("TopicTitle", "Untitled Topic")
        lines.append(f"  └── 🔢 {topic.get('TopicNumber', '?')} {topic_title}")
        
        content = topic.get("Content", "")
        
        # Try detecting content types
        if "activity" in content.lower():
            lines.append("        └── 📌 Includes: Activity")
        if "question" in content.lower():
            lines.append("        └── ❓ Includes: Questions")
        if "table" in content.lower():
            lines.append("        └── 📊 Includes: Table")
        if "figure" in content.lower() or "diagram" in content.lower() or "image" in content.lower():
            lines.append("        └── 🖼️ Includes: Figures/Images")
        if "example" in content.lower():
            lines.append("        └── 💡 Includes: Example")
        if "box" in content.lower() or "fact" in content.lower():
            lines.append("        └── 📦 Includes: Boxed Fact")
        if "paragraph" in content.lower() or len(content) > 100:
            lines.append("        └── 📄 Includes: Paragraphs")

    lines.append("\n")

# Save to file
with open("knowledge_graph.txt", "w", encoding="utf-8") as f:
    f.write("\n".join(lines))

print("✅ Knowledge graph saved as: knowledge_graph.txt")


✅ Knowledge graph saved as: knowledge_graph.txt


In [10]:
import pandas as pd
import math

# Load your Excel
df = pd.read_excel("ncert_extracted_topics.xlsx")

# Ask user for number of days to complete the study
total_days = int(input("📅 Enter number of study days (e.g., 10, 15, 30): "))

# Remove rows with missing topic titles
df = df.dropna(subset=["Topic Title"])

# Total number of topics
topics = df[["Chapter ID", "Chapter Title", "Topic Number", "Topic Title"]].drop_duplicates().reset_index(drop=True)
num_topics = len(topics)

# Calculate topics per day
topics_per_day = math.ceil(num_topics / total_days)

# Split into days
study_plan = {}
for i in range(total_days):
    start = i * topics_per_day
    end = min((i + 1) * topics_per_day, num_topics)
    if start >= end:
        break
    day_topics = topics.iloc[start:end]
    study_plan[f"Day {i+1}"] = day_topics.to_dict(orient="records")

# Save as text file
with open("study_planner.txt", "w", encoding="utf-8") as f:
    for day, topics in study_plan.items():
        f.write(f"{day}:\n")
        for t in topics:
            f.write(f"  📘 {t['Chapter Title']} - {t['Topic Number']} {t['Topic Title']}\n")
        f.write("\n")

print("✅ Study planner saved as: study_planner.txt")


📅 Enter number of study days (e.g., 10, 15, 30):  15


✅ Study planner saved as: study_planner.txt
