In [1]:
import os
import json
import pdfplumber
import pytesseract
from PIL import Image
import pandas as pd
from datetime import datetime
from docx import Document
from pptx import Presentation
import warnings
import re

# Quiet pdfminer (the parser underneath pdfplumber): only ERROR and above
# from the "pdfminer" logger hierarchy get through.
import logging
logging.getLogger("pdfminer").setLevel(logging.ERROR)

class CropBoxFilter:
    """Logging filter that rejects the WARNING-level 'CropBox missing' record."""

    def filter(self, record):
        """Return False for the CropBox warning, True for every other record."""
        # Anything that is not a WARNING is always let through.
        if record.levelno != logging.WARNING:
            return True
        rendered = record.getMessage()
        return 'CropBox missing from /Page, defaulting to MediaBox' not in rendered

# NOTE(review): a filter attached to a logger is only consulted for records
# logged directly on that logger, so this root-logger filter does not see
# records emitted by child loggers; the level set above is what silences
# pdfminer's warnings.
logging.getLogger().addFilter(CropBoxFilter())

# Folder picker built on tkinter's standard directory dialog
def select_folder():
    """Ask the user to choose a folder and return the chosen path.

    A Tk root window is created only to host the dialog; it is hidden
    before the dialog opens and destroyed once the dialog returns.

    Returns:
        Whatever ``filedialog.askdirectory`` returns; the caller treats a
        falsy value as "nothing selected".
    """
    from tkinter import Tk, filedialog

    hidden_root = Tk()
    hidden_root.withdraw()  # keep the empty root window off screen
    chosen_dir = filedialog.askdirectory(title='Select folder to process')
    hidden_root.destroy()
    return chosen_dir

def extract_text_from_pdf(pdf_path):
    """Extract the embedded text of a PDF using pdfplumber.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        The text of every page that yielded any, joined by newlines and
        stripped; "" when nothing could be extracted. Extraction is
        best-effort: if an error occurs it is reported on stdout and the
        text collected before the error is returned.
    """
    page_texts = []
    try:
        with pdfplumber.open(pdf_path) as pdf:
            for page in pdf.pages:
                page_text = page.extract_text()
                if page_text:
                    page_texts.append(page_text)
    except Exception as exc:
        # Deliberately broad: one unreadable PDF must not abort a whole
        # folder run, but the cause has to be visible to the operator.
        print(f"[WARN] PDF text extraction failed for {pdf_path}: {exc}")
    return "\n".join(page_texts).strip()

def ocr_pdf(pdf_path):
    """Run Tesseract OCR over every page of a PDF.

    Each page is rendered at 300 DPI via pdfplumber and the rendered image
    is passed to ``pytesseract.image_to_string``.

    Args:
        pdf_path: Path of the PDF file to OCR.

    Returns:
        The recognised text of all pages that produced any, joined by
        newlines and stripped; "" when nothing was recognised. OCR is
        best-effort: if an error occurs it is reported on stdout and the
        text collected before the error is returned.
    """
    page_texts = []
    try:
        with pdfplumber.open(pdf_path) as pdf:
            for page in pdf.pages:
                # pdfplumber documents PageImage.original as a PIL image,
                # so it is handed to Tesseract directly with no extra
                # array round trip.
                page_image = page.to_image(resolution=300).original
                page_text = pytesseract.image_to_string(page_image)
                if page_text:
                    page_texts.append(page_text)
    except Exception as exc:
        # Deliberately broad: OCR failures (missing Tesseract binary,
        # unrenderable page, ...) must not abort the folder run, but the
        # cause has to be visible to the operator.
        print(f"[WARN] OCR failed for {pdf_path}: {exc}")
    return "\n".join(page_texts).strip()

def extract_text_from_excel(excel_path):
    """Render every sheet of an Excel workbook as plain text.

    Each sheet is read with all cells as strings, empty cells become "",
    and the sheet is written as a ``--- Sheet: <name> ---`` header line
    followed by the table rendered with ``DataFrame.to_string``.

    Args:
        excel_path: Path of the workbook to read.

    Returns:
        The concatenated sheet dumps, stripped; "" when nothing could be
        read. Extraction is best-effort: if an error occurs it is reported
        on stdout and the text collected before the error is returned.
    """
    parts = []
    try:
        # The context manager closes the workbook handle even when a sheet
        # fails to parse.
        with pd.ExcelFile(excel_path) as xls:
            for sheet_name in xls.sheet_names:
                df = pd.read_excel(xls, sheet_name=sheet_name, dtype=str)
                parts.append(f"\n--- Sheet: {sheet_name} ---\n")
                parts.append(df.fillna('').to_string(index=False, header=True))
    except Exception as exc:
        # Deliberately broad: one unreadable workbook must not abort the
        # folder run, but the cause has to be visible to the operator.
        print(f"[WARN] Excel extraction failed for {excel_path}: {exc}")
    return "".join(parts).strip()

def extract_text_from_docx(docx_path):
    """Extract the paragraph text of a Word (.docx) document.

    Only ``doc.paragraphs`` is read; each paragraph's text becomes one line.

    Args:
        docx_path: Path of the .docx file to read.

    Returns:
        The paragraph texts joined by newlines and stripped; "" when
        nothing could be read. Extraction is best-effort: if an error
        occurs it is reported on stdout and the text collected before the
        error is returned.
    """
    lines = []
    try:
        doc = Document(docx_path)
        for para in doc.paragraphs:
            lines.append(para.text)
    except Exception as exc:
        # Deliberately broad: one unreadable document must not abort the
        # folder run, but the cause has to be visible to the operator.
        print(f"[WARN] Word extraction failed for {docx_path}: {exc}")
    return "\n".join(lines).strip()

def extract_text_from_pptx(pptx_path):
    """Extract the text of every shape on every slide of a .pptx file.

    Shapes that do not expose a ``text`` attribute are skipped.

    Args:
        pptx_path: Path of the .pptx file to read.

    Returns:
        The shape texts joined by newlines and stripped; "" when nothing
        could be read. Extraction is best-effort: if an error occurs it is
        reported on stdout and the text collected before the error is
        returned.
    """
    lines = []
    try:
        prs = Presentation(pptx_path)
        for slide in prs.slides:
            for shape in slide.shapes:
                if hasattr(shape, "text"):
                    lines.append(shape.text)
    except Exception as exc:
        # Deliberately broad: one unreadable deck must not abort the
        # folder run, but the cause has to be visible to the operator.
        print(f"[WARN] PowerPoint extraction failed for {pptx_path}: {exc}")
    return "\n".join(lines).strip()

def _file_extension(file_name):
    """Return the lower-cased extension of file_name without the dot ('' if none)."""
    return os.path.splitext(file_name)[1].lower().lstrip('.')


def _extract_pdf(file_path):
    """Return (text, reason) for a PDF: text layer first, OCR as fallback."""
    text = extract_text_from_pdf(file_path)
    if text:
        return text, 'extracted as text PDF'
    text = ocr_pdf(file_path)
    if text:
        return text, 'extracted via OCR'
    return text, 'no text extracted (PDF and OCR failed)'


def _extract_for_extension(ext, file_path):
    """Return (text, reason) for a supported extension, or None if unsupported."""
    if ext == 'pdf':
        return _extract_pdf(file_path)
    if ext in ('xls', 'xlsx'):
        extractor, label = extract_text_from_excel, 'Excel'
    elif ext == 'docx':
        extractor, label = extract_text_from_docx, 'Word'
    elif ext == 'pptx':
        extractor, label = extract_text_from_pptx, 'PowerPoint'
    else:
        return None
    text = extractor(file_path)
    reason = f'extracted from {label}' if text else f'no text extracted ({label})'
    return text, reason


def process_folder(folder_path, output_json_path):
    """Extract text from every supported file under a folder into one JSON file.

    Walks ``folder_path`` recursively. PDF, XLS/XLSX, DOCX and PPTX files
    are dispatched on their extension; every other file is reported and
    skipped. Files that yield no text are reported and left out of the
    output.

    Args:
        folder_path: Root directory to walk.
        output_json_path: Destination of the JSON list. Each entry has the
            keys ``file_name``, ``file_path``, ``type``, ``extracted_at``
            and ``text``.

    Returns:
        None. Progress and warnings are printed; the result is written to
        ``output_json_path``.
    """
    data = []
    for root, _dirs, files in os.walk(folder_path):
        for file in files:
            print(f"[INFO] Processing file: {file}")
            file_path = os.path.join(root, file)
            # splitext only yields a real extension, so a file without one
            # (e.g. a file literally named "pdf") is never mistaken for a
            # supported type.
            ext = _file_extension(file)
            extracted_at = datetime.now().isoformat()
            outcome = _extract_for_extension(ext, file_path)
            if outcome is None:
                print(f"[WARN] Skipped unsupported file type: {file}")
                continue
            text, reason = outcome
            if not text:
                print(f"[WARN] No text extracted from: {file} ({reason})")
                continue
            data.append({
                'file_name': file,
                'file_path': file_path,
                'type': ext,
                'extracted_at': extracted_at,
                'text': text,
            })
    with open(output_json_path, 'w', encoding='utf-8') as f:
        json.dump(data, f, ensure_ascii=False, indent=2)
    print(f"Processed {len(data)} documents. Output saved to {output_json_path}")

if __name__ == "__main__":
    # Results are written outside the repo, one JSON file per selected folder.
    output_dir = r'C:/Users/Omar Essam2/OneDrive - Rowad Modern Engineering/x004 Data Science/03.rme.db/05.llm/extracted_json'
    os.makedirs(output_dir, exist_ok=True)
    folder = select_folder()
    if folder:
        # Name the output after the last path component of the chosen folder.
        folder_name = os.path.basename(os.path.normpath(folder))
        output_json = os.path.join(output_dir, f"{folder_name}_extracted.json")
        process_folder(folder, output_json)
    else:
        print("No folder selected. Exiting.")

[INFO] Processing file: Copy of G.M. %s Comparison Ehab 12-12-23.xlsb
[WARN] Skipped unsupported file type: Copy of G.M. %s Comparison Ehab 12-12-23.xlsb
[INFO] Processing file: G.M. %s Comparison Ehab 30-3-2024 cutdate 14-3-2024.xlsb
[WARN] Skipped unsupported file type: G.M. %s Comparison Ehab 30-3-2024 cutdate 14-3-2024.xlsb
[INFO] Processing file: G.M. %s Comparison Ehab 30-9-2023.xlsb
[WARN] Skipped unsupported file type: G.M. %s Comparison Ehab 30-9-2023.xlsb
[INFO] Processing file: G.M. %s Comparison Ehab 30-7-2023-1.xlsx
[WARN] No text extracted from: G.M. %s Comparison Ehab 30-7-2023-1.xlsx (no text extracted (Excel))
[INFO] Processing file: Copy of G.M. %s Comparison Ehab 30-9-2023LLNN.xlsb
[WARN] Skipped unsupported file type: Copy of G.M. %s Comparison Ehab 30-9-2023LLNN.xlsb
[INFO] Processing file: G.M. %s Comparison Ehab 30-9-2023L.xlsb
[WARN] Skipped unsupported file type: G.M. %s Comparison Ehab 30-9-2023L.xlsb
[INFO] Processing file: G.M. %s Comparison Ehab 30-9-2023LL

In [2]:
import os
import json
import openai
import chromadb
from chromadb.config import Settings
from dotenv import load_dotenv
from tqdm import tqdm

# Input/output locations (kept outside the repo)
EXTRACTED_DIR = r'C:/Users/Omar Essam2/OneDrive - Rowad Modern Engineering/x004 Data Science/03.rme.db/05.llm/extracted_json'
CHROMA_DB_DIR = r'C:/Users/Omar Essam2/OneDrive - Rowad Modern Engineering/x004 Data Science/03.rme.db/05.llm/chroma_db'
# Plain-text ledger of chunk ids that have already been embedded
ID_TRACK_FILE = os.path.join(CHROMA_DB_DIR, 'embedded_chunk_ids.txt')
COLLECTION_NAME = 'company_docs'

# Chunking parameters, both measured in characters
CHUNK_SIZE = 1000
CHUNK_OVERLAP = 200

# Pull OPENAI_API_KEY from .env into the environment, then hand it to the client
load_dotenv()
openai.api_key = os.environ.get('OPENAI_API_KEY')

# Helper: chunk text
def chunk_text(text, chunk_size=CHUNK_SIZE, overlap=CHUNK_OVERLAP):
    """Split text into fixed-size character windows that overlap.

    Consecutive chunks start ``chunk_size - overlap`` characters apart; the
    final chunk may be shorter than ``chunk_size``.

    Args:
        text: The string to split.
        chunk_size: Maximum number of characters per chunk; must be positive.
        overlap: Number of characters shared by neighbouring chunks; must
            be smaller than ``chunk_size``.

    Returns:
        A list of chunk strings ([] for empty text).

    Raises:
        ValueError: If ``chunk_size`` is not positive or ``overlap`` is not
            smaller than ``chunk_size``. Such a combination would never
            advance through the text.
    """
    step = chunk_size - overlap
    if chunk_size <= 0 or step <= 0:
        raise ValueError(
            f"chunk_size must be positive and larger than overlap "
            f"(got chunk_size={chunk_size}, overlap={overlap})"
        )
    # Slicing clamps at the end of the string, so the last window needs no
    # explicit min().
    return [text[start:start + chunk_size] for start in range(0, len(text), step)]

# Helper: get OpenAI embeddings
def get_embedding(text, model="text-embedding-ada-002"):
    """Return the OpenAI embedding vector for one piece of text.

    Args:
        text: The string to embed; it is sent as a one-element input list.
        model: Name of the embedding model to use. Defaults to the model
            this script has always used, so existing callers are
            unaffected.
            NOTE(review): vectors from different models are presumably not
            comparable, so keep one model per collection.

    Returns:
        The ``embedding`` of the first (and only) item in the response.
    """
    resp = openai.embeddings.create(
        input=[text],
        model=model
    )
    return resp.data[0].embedding

# Initialize Chroma DB
client = chromadb.PersistentClient(path=CHROMA_DB_DIR, settings=Settings(allow_reset=True))
# Open the collection, creating it on first run. get_or_create_collection
# replaces the list-then-branch lookup, so this no longer depends on the
# element type returned by list_collections() (which has varied between
# chromadb releases -- verify against the pinned version).
collection = client.get_or_create_collection(COLLECTION_NAME)

def load_embedded_ids():
    """Return the set of chunk ids recorded in ID_TRACK_FILE.

    Each non-blank line of the file is one id, with surrounding whitespace
    removed. A missing file yields an empty set.
    """
    known_ids = set()
    if os.path.exists(ID_TRACK_FILE):
        with open(ID_TRACK_FILE, 'r', encoding='utf-8') as ledger:
            for raw_line in ledger:
                cleaned = raw_line.strip()
                if cleaned:
                    known_ids.add(cleaned)
    return known_ids

def save_embedded_id(chunk_id):
    """Append chunk_id as a new line at the end of ID_TRACK_FILE."""
    with open(ID_TRACK_FILE, mode='a', encoding='utf-8') as ledger:
        ledger.writelines([chunk_id + '\n'])

def process_json_files():
    """Embed every not-yet-embedded text chunk from the extracted JSON files.

    Reads each ``*.json`` file in EXTRACTED_DIR, splits every document's
    text with ``chunk_text``, and for each chunk whose id is not already
    recorded: requests an embedding, adds it to the Chroma collection with
    its metadata, and records the id in the tracking file. A failure on
    one chunk is printed and the run continues with the next chunk.

    Returns:
        None. Progress is shown with tqdm and a summary line is printed.
    """
    files = [f for f in os.listdir(EXTRACTED_DIR) if f.endswith('.json')]
    doc_count = 0
    embedded_ids = load_embedded_ids()
    for file in files:
        with open(os.path.join(EXTRACTED_DIR, file), 'r', encoding='utf-8') as f:
            docs = json.load(f)
        for doc in tqdm(docs, desc=f"Processing {file}"):
            text = doc['text']
            if not text.strip():
                continue
            for i, chunk in enumerate(chunk_text(text)):
                # NOTE(review): the id is built from the bare file name, so
                # two files with the same name in different folders share
                # ids and only the first is embedded. Changing the scheme
                # would invalidate the existing tracking file, so it is
                # left as is.
                chunk_id = f"{doc['file_name']}_{i}"
                if chunk_id in embedded_ids:
                    continue  # Skip already embedded chunk
                meta = {
                    'file_name': doc['file_name'],
                    'file_path': doc['file_path'],
                    'type': doc['type'],
                    'extracted_at': doc['extracted_at'],
                    'chunk': i
                }
                try:
                    emb = get_embedding(chunk)
                    collection.add(
                        documents=[chunk],
                        embeddings=[emb],
                        metadatas=[meta],
                        ids=[chunk_id]
                    )
                    save_embedded_id(chunk_id)
                    # Keep the in-memory set in step with the tracking file
                    # so an id seen again later in this same run is skipped
                    # instead of being re-embedded and counted twice.
                    embedded_ids.add(chunk_id)
                    doc_count += 1
                except Exception as e:
                    print(f"Error embedding chunk: {e}")
    print(f"Finished! {doc_count} new chunks embedded and stored in Chroma DB.")

# Entry point: embed all pending chunks when this cell/script is run directly.
if __name__ == "__main__":
    process_json_files() 

Processing 01 Dash Boards_extracted.json: 100%|██████████| 29/29 [00:00<00:00, 56208.33it/s]
Processing 02 artical_extracted.json: 100%|██████████| 21/21 [00:00<00:00, 24755.59it/s]
Processing 02 Monthly Progress Presentation_extracted.json: 100%|██████████| 514/514 [00:00<00:00, 123397.19it/s]
Processing 0207 Qasr Rashwan Lot A_extracted.json: 100%|██████████| 5/5 [00:00<00:00, 25731.93it/s]
Processing 0208 Qasr Rashwan Lot B_extracted.json: 100%|██████████| 3/3 [00:00<00:00, 12006.60it/s]
Processing 07 RME Global Reports_extracted.json: 100%|██████████| 1007/1007 [58:26<00:00,  3.48s/it]   

Finished! 6754 new chunks embedded and stored in Chroma DB.



