<a href="https://colab.research.google.com/github/noobie105/10MS_RAG_Application/blob/main/10MS_TA.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Converting a PDF to a Word document (.docx) using Gemini

In [None]:
!pip install pdf2image python-docx
!apt-get update
!apt-get install -y poppler-utils

In [None]:
import os
import google.generativeai as genai
from docx import Document
import re
from google.colab import files
from google.colab import userdata
import time
from pdf2image import convert_from_path
import requests

# Configure the Gemini API and return a ready-to-use model
def configure_gemini():
    """Set up the Gemini client and return a model that has answered one probe request.

    The API key is read from the Colab secret named ``GOOGLE_AOI_KEY_2``.
    Any failure along the way (missing key, configuration error, failed probe
    request) is reported on stdout instead of being raised.

    Returns:
        The ``genai.GenerativeModel`` for ``models/gemini-2.0-flash``, or
        ``None`` when anything went wrong.
    """
    try:
        secret = userdata.get("GOOGLE_AOI_KEY_2")
        if secret:
            genai.configure(api_key=secret)
            gemini = genai.GenerativeModel('models/gemini-2.0-flash')
            # One throwaway prompt so that a bad key fails here rather than
            # later, in the middle of processing pages.
            gemini.generate_content("Test")
            return gemini
        raise ValueError("API key not found in Colab Secrets as 'GOOGLE_AOI_KEY_2'.")
    except Exception as err:
        print(f"Error configuring Gemini API: {err}")
        print("Ensure the 'GOOGLE_AOI_KEY_2' secret is set in Colab Secrets: https://colab.research.google.com/drive/1...")
    return None

# Extract the text of a single PDF page with Gemini
def extract_text_from_page(pdf_path, page_num, model, max_retries=3, retry_delay=5):
    """Extract the text of one PDF page by sending a rendered image of it to Gemini.

    The page is rendered to a PNG at 300 dpi, uploaded with
    ``genai.upload_file`` and passed to ``model.generate_content`` together
    with an extraction prompt. The local PNG and the uploaded file are removed
    after every attempt, whether the attempt succeeded or not.

    Args:
        pdf_path: Path of the PDF file to read.
        page_num: Page to extract; passed to ``convert_from_path`` as both
            ``first_page`` and ``last_page``.
        model: Object exposing ``generate_content`` that accepts a list of
            ``[uploaded_file, prompt]``.
        max_retries: Maximum number of attempts for this page.
        retry_delay: Seconds to wait before retrying after an exception.

    Returns:
        The extracted text, or ``None`` if every attempt failed or produced
        no text.
    """
    # Defined before the loop so the cleanup in ``finally`` can always refer
    # to it, even when rendering fails before anything was written to disk.
    temp_image_path = f"/content/temp_page_{page_num}.png"

    for attempt in range(1, max_retries + 1):
        # Stays None until the upload succeeds; tells ``finally`` whether
        # there is a remote file to delete.
        sample_file = None
        try:
            images = convert_from_path(pdf_path, first_page=page_num, last_page=page_num, dpi=300)
            if not images:
                print(f"Error: No image generated for page {page_num} on attempt {attempt}.")
                continue
            images[0].save(temp_image_path, 'PNG')

            sample_file = genai.upload_file(path=temp_image_path, display_name=f"Page_{page_num}")
            print(f"Uploaded page {page_num} as: {sample_file.uri} on attempt {attempt}")

            prompt = """
            Extract all text from the provided PDF page image, preserving the original Bengali script, sentence structure, and formatting as much as possible.
            Include all questions, answers, passages, and vocabulary notes.
            Ensure no content is omitted from the page.
            Output the text in a clean, readable format without summarizing or modifying content.
            """
            response = model.generate_content([sample_file, prompt])

            text = response.text
            if text:
                return text
            print(f"Warning: No text extracted from page {page_num} on attempt {attempt}.")
        except Exception as e:
            # Any failure (rendering, upload, API call, reading the response)
            # is logged and the page is retried.
            print(f"Error extracting text from page {page_num} on attempt {attempt}: {e}")
            if attempt < max_retries:
                print(f"Retrying page {page_num} in {retry_delay} seconds...")
                time.sleep(retry_delay)
        finally:
            # Runs on success (before the return), on failure and on
            # ``continue``, so neither the remote upload nor the PNG leaks.
            if sample_file is not None:
                try:
                    genai.delete_file(sample_file.name)
                    print(f"Deleted temporary file: {sample_file.name}")
                except Exception as cleanup_error:
                    # A failed cleanup must not discard text that was
                    # already extracted successfully.
                    print(f"Warning: Could not delete uploaded file {sample_file.name}: {cleanup_error}")
            if os.path.exists(temp_image_path):
                os.remove(temp_image_path)

    print(f"Failed to extract text from page {page_num} after {max_retries} attempts.")
    return None

def clean_text(text):
    """Reduce extracted page text to Bengali characters with single spacing.

    Steps, in order:
      1. Trim the text and collapse every whitespace run to one space.
      2. Drop every character that is not in U+0980-U+09FF, whitespace,
         or the danda (``।``).
      3. Merge three specific vowel-sign pairs that are separated by a space
         (presumably OCR artifacts -- TODO confirm against real pages).
      4. Remove a fixed Bengali phrase and any whitespace following it
         (presumably a recurring page banner -- TODO confirm).
      5. Collapse whitespace and trim again, because steps 2 and 4 can leave
         double spaces or a stray leading/trailing space behind.

    Args:
        text: Raw text for one page; ``None`` or an empty string is accepted.

    Returns:
        The cleaned text, or ``""`` when *text* is falsy.
    """
    if not text:
        return ""
    text = re.sub(r'\s+', ' ', text.strip())
    text = re.sub(r'[^\u0980-\u09FF\s।]', '', text)
    text = text.replace('া ু', 'ৌ').replace('ি ী', 'ী').replace('ু ু', 'ূ')
    text = re.sub(r'অনলাইন ব্যাচ বাংলা ইংরেজি আইসিটি\s*', '', text)
    # Final pass: removing characters above turns "x abc y" into "x  y" and
    # can expose spaces at either end, so normalise once more.
    return re.sub(r'\s+', ' ', text).strip()

def save_to_word(text, output_path="/content/preprocessed_text.docx"):
    """Store *text* in a new Word document and trigger its download.

    The whole text is added as a single paragraph to a fresh ``Document``,
    which is saved at *output_path*; the saved file is then passed to
    ``files.download``.

    Args:
        text: Text to write into the document.
        output_path: Destination path of the ``.docx`` file.
    """
    word_file = Document()
    word_file.add_paragraph(text)
    word_file.save(output_path)
    print(f"Preprocessed text saved to {output_path}")
    files.download(output_path)

def preprocess_pdf(pdf_path="/content/HSC26-Bangla1st-Paper.pdf"):
    """Run the full pipeline: PDF pages -> Gemini text extraction -> cleaned .docx.

    For every page the text is extracted with ``extract_text_from_page``,
    cleaned with ``clean_text`` and prefixed with a ``--- Page N ---`` marker.
    The combined text is saved through ``save_to_word`` and the first 1000
    characters are printed as a preview. Problems are reported on stdout and
    end the run early; nothing is raised to the caller.

    Args:
        pdf_path: Path of the PDF file to process.

    Returns:
        None.
    """
    model = configure_gemini()
    if not model:
        print("Error: Failed to initialize Gemini model. Check your API key in Colab Secrets.")
        return

    try:
        # Local import: only the page count needs this helper. It reads the
        # PDF metadata instead of rasterising every page just to count them.
        from pdf2image import pdfinfo_from_path

        total_pages = pdfinfo_from_path(pdf_path)["Pages"]
        print(f"Total pages in PDF: {total_pages}")
    except Exception as e:
        print(f"Error determining page count: {e}")
        return

    # Collected per page and joined once at the end rather than growing a
    # string with ``+=`` on every iteration.
    page_chunks = []
    for page_num in range(1, total_pages + 1):
        print(f"Extracting page {page_num}...")
        page_text = extract_text_from_page(pdf_path, page_num, model)
        if page_text:
            cleaned_text = clean_text(page_text)
            page_chunks.append(f"\n\n--- Page {page_num} ---\n{cleaned_text}")
        else:
            print(f"Warning: No text extracted from page {page_num} after retries.")
        # Pause between pages (presumably to stay under the API rate limit --
        # TODO confirm); pointless after the final page.
        if page_num < total_pages:
            time.sleep(5)

    all_text = "".join(page_chunks)
    if not all_text.strip():
        print("Error: No text extracted from any page. Ensure the PDF is valid.")
        return
    save_to_word(all_text)

    print("\nPreprocessed Text Preview:")
    preview = (all_text[:1000] + "...") if len(all_text) > 1000 else all_text
    print(preview)

# Entry point: run the whole pipeline on the default PDF path when this
# module is executed directly (not when it is imported).
if __name__ == "__main__":
    preprocess_pdf()