In [None]:
%%writefile "genai_utils/index_images_es.py"
#!/usr/bin/env python 

import pdfplumber, os, base64, glob, tqdm, json
from io import BytesIO
import io, base64
from PIL import Image
from IPython.display import HTML
from genai_utils.describe_image import describe_image
from genai_utils import db_elastic

def _extractImagesFromPDF(file=None, **kwargs):
    """Collect the embedded images and the per-page text of a PDF.

    For every image entry pdfplumber reports on a page, the page area inside
    that image's bounding box is rendered with ``to_image()``, converted to
    RGB, PNG-encoded and wrapped in a base64 ``data:`` URL.  Identical URLs
    are collapsed, so an image that repeats (e.g. on several pages) is
    returned once, in first-seen order.

    Args:
        file: Path of the PDF as a string; it must end in "pdf".
        **kwargs: Accepted for call compatibility; not used.

    Returns:
        dict with two keys:
            ``images``: list of unique ``data:image/png;base64,<payload>`` URLs.
            ``texts``: list holding one ``page.extract_text()`` result per
                page, in page order.

    Raises:
        AssertionError: if ``file`` is not a string ending in "pdf".
    """
    # Raised explicitly rather than via a bare `assert` so the check is not
    # stripped under `python -O`, and so a missing/None argument fails with
    # the same error type instead of an AttributeError on `.endswith`.
    if not isinstance(file, str) or not file.endswith("pdf"):
        raise AssertionError("Called with non PDF File!!")

    unique_urls = {}  # dict used as an insertion-ordered set
    texts = []
    with pdfplumber.open(file) as doc:
        for page in doc.pages:
            texts.append(page.extract_text())
            for img in page.images:
                bbox = (img['x0'], img['top'], img['x1'], img['bottom'])
                pil_image = page.within_bbox(bbox).to_image().original
                buf = BytesIO()
                pil_image.convert("RGB").save(buf, format='PNG')
                b64Image = base64.b64encode(buf.getvalue()).decode("utf-8")
                # The payload is PNG, so the media type must say image/png
                # (it used to claim image/jpg), and a data URL carries no
                # whitespace between the comma and the base64 payload.
                unique_urls["data:image/png;base64," + b64Image] = 1
    return dict(images=list(unique_urls), texts=texts)

def indexImagesFromPDF(file, savedir="/tmp/genai_utils/", verbose =0):
    """Extract the images of a PDF and optionally write them to disk as PNGs.

    Args:
        file: Path of the PDF, forwarded to ``_extractImagesFromPDF``.
        savedir: Directory that receives one ``<pdf basename>__<n>.png`` file
            per extracted image.  It is created when missing.  A falsy value
            (``None`` or ``""``) skips saving entirely.
        verbose: When truthy, each image is also rendered inline through
            IPython and a snippet of extracted text is printed.

    Returns:
        ``(ret, files)`` where ``ret`` is the dict returned by
        ``_extractImagesFromPDF`` and ``files`` is the list of PNG paths
        written, or ``None`` when saving was skipped.
    """
    ret = _extractImagesFromPDF(file)

    if not savedir:
        return ret, None

    # The target directory may not exist yet (e.g. first run on a clean /tmp);
    # without this the first imgd.save() fails with FileNotFoundError.
    os.makedirs(savedir, exist_ok=True)

    if verbose:
        # `display` is only injected as a builtin inside an IPython session;
        # import it explicitly so the module also works from plain Python.
        from IPython.display import display

    bname = os.path.basename(file)
    texts = ret['texts']
    files = []
    for i, img in enumerate(ret['images']):
        # Everything after the first comma of the data URL is the base64 payload.
        img1 = img[img.index(",") + 1:].strip()
        imgd = Image.open(io.BytesIO(base64.decodebytes(img1.encode())))

        # os.path.join avoids the "//" an f-string produces when savedir
        # already ends with a slash (as the default does).
        sfile = os.path.join(savedir, f"{bname}__{i}.png")
        imgd.save(sfile)
        files.append(sfile)
        print(f"Saved {sfile}")
        if verbose:
            display(HTML(f"<img src='{img}'> "))
            # NOTE(review): `texts` holds one entry per *page* while `i`
            # counts *images*, so this snippet is not guaranteed to come from
            # the page the image sits on.  Guard the lookup so a PDF with
            # more images than pages (or a page without text) cannot crash.
            if i < len(texts) and texts[i]:
                print(texts[i][0:128])
    return ret, files
    
def index_directory(directory, outf=None, recurse=0):
    """Describe every image found under ``directory`` and cache the results.

    PNG/JPG/JPEG files are picked up directly; PDFs are first run through
    ``indexImagesFromPDF`` (with its default save directory) and the PNGs it
    writes are added to the work list.  The raw bytes of each image are passed
    to ``describe_image`` and its return value is stored under the image path.

    Files are matched with the pattern ``<directory>/**/*.<ext>``.  With
    ``recurse`` truthy, ``**`` spans zero or more directory levels; with
    ``recurse`` falsy, ``glob`` treats ``**`` like ``*``, so only files exactly
    one directory level below ``directory`` are matched.

    Args:
        directory: Root directory (or glob prefix) to scan.
        outf: Existing ``{image_path: description}`` mapping.  Paths already
            present are skipped; new results are added to it in place.  A
            fresh dict is created when omitted.
        recurse: Truthy to let ``**`` match nested directories.

    Returns:
        The ``outf`` mapping, including any newly described images.
    """
    # A `{}` default would be created once and shared by every call that omits
    # `outf`, silently carrying results over between unrelated runs.
    if outf is None:
        outf = {}
    recursive = bool(recurse)

    def _find(ext):
        # All files with the given extension below `directory`.
        return glob.glob(os.path.join(directory, f'**/*.{ext}'), recursive=recursive)

    pngs = _find('png')
    jpgs = _find('jpg')
    jpes = _find('jpeg')
    pdfs = _find('pdf')

    images = []
    for pdfFile in tqdm.tqdm(pdfs):
        print(f"Getting images from {pdfFile}")
        _, files = indexImagesFromPDF(pdfFile)
        images.extend(files or [])

    # Drop exact duplicate paths (order preserved) so nothing is queued twice.
    image_paths = list(dict.fromkeys([*pngs, *jpgs, *jpes, *images]))
    for image_path in tqdm.tqdm(image_paths):
        if image_path in outf:
            continue
        try:
            # Reading happens inside the try so that one unreadable file is
            # reported and skipped like any other per-image failure instead of
            # aborting the whole run.
            with open(image_path, 'rb') as f:
                image_data = f.read()
            description = describe_image(image_data)
            print(f"Indexed {image_path}: {description}")
            outf[image_path] = description
        except Exception as e:
            # Best effort: keep going with the remaining images.
            print(f"Failed to index {image_path}: {e}")

    return outf

def getDocs(outf):
    """Convert a ``{source: text}`` mapping into LangChain ``Document`` objects.

    Each value becomes the ``page_content`` of one document and its key is
    recorded as ``metadata["source"]``.  The key and the first 32 characters of
    the value are printed for every entry.

    Args:
        outf: Mapping whose values support slicing (e.g. strings).

    Returns:
        list of ``Document`` objects, in the mapping's iteration order.
    """
    from langchain_core.documents import Document

    documents = []
    for source_path, description in outf.items():
        print(source_path, description[:32])
        documents.append(
            Document(page_content=description, metadata={"source": source_path})
        )
    return documents

def save(outf, file="/tmp/genai_utils/images_dir.json"):
    """Write ``outf`` to ``file`` as JSON.

    Args:
        outf: JSON-serialisable object (here the ``{path: description}`` cache).
        file: Destination path.  Missing parent directories are created.

    Raises:
        TypeError: if ``outf`` is not JSON-serialisable; in that case the
            destination file is left untouched.
    """
    # Serialise before opening: opening with "wt" truncates the file, so a
    # serialisation error afterwards would wipe the previously saved cache.
    payload = json.dumps(outf)

    parent = os.path.dirname(file)
    if parent:
        # The default location lives under /tmp and may not exist yet.
        os.makedirs(parent, exist_ok=True)

    with open(file, "wt") as f:
        f.write(payload)
    
def load(file="/tmp/genai_utils/images_dir.json"):
    """Read back a JSON document written by ``save``.

    Args:
        file: Path of the JSON file.

    Returns:
        The decoded JSON content, or an empty dict when ``file`` does not exist.
    """
    if not os.path.exists(file):
        return {}
    with open(file, "rt") as handle:
        return json.load(handle)

# Module-level cache: this runs at import time, so `outf` starts out as
# whatever load() finds at its default path (an empty dict if the file is absent).
outf=load()

Overwriting genai_utils/index_images_es.py


In [8]:
from genai_utils import db_elastic
# `%%writefile` only writes the module to disk; it does not run it. Import the
# names this cell needs so it also works after Restart Kernel -> Run All.
# `outf` is the module-level cache the module loads from disk on import.
from genai_utils.index_images_es import getDocs, index_directory, outf

outf = index_directory("/tmp/**", outf=outf, recurse=1)

# NOTE(review): credentials are hard-coded; fine for a local dev cluster, but
# move them to environment variables / a secrets store before sharing.
m, url, user,pw = "all-minilm:L6-v2", "http://localhost:9200", "elastic", "elastic"
index = "sageai_images"

db_elastic.loadES( model=m, index=index, es_url=url , es_user=user, es_pass=pw, docs=getDocs(outf) )
db_elastic.esCountIndex(index=index, es_url=url, es_user=user, es_pass= pw)
    

0it [00:00, ?it/s]
100%|██████████| 10/10 [00:00<00:00, 384798.53it/s]
2025-05-27 18:27:24,103 elastic_transport.transport INFO: GET http://localhost:9200/ [status:200 duration:0.005s]
2025-05-27 18:27:24,106 elastic_transport.transport INFO: HEAD http://localhost:9200/sageai_images [status:200 duration:0.002s]
2025-05-27 18:27:24,216 httpx INFO: HTTP Request: POST http://127.0.0.1:11434/api/embed "HTTP/1.1 200 OK"


/tmp/a.png The image is a flowchart depicti
/tmp/gg/a.png The image is a flowchart depicti
/tmp/genai_utils/z4.pdf__0.png The image is a flowchart depicti
/tmp/genai_utils/z4.pdf__1.png The image is a line graph with a
/tmp/genai_utils/z4.pdf__3.png The image is a bar chart with a 
/tmp/genai_utils/z4.pdf__2.png The image is a line graph with a
/tmp/genai_utils/z4.pdf__4.png The image is a line graph with a


2025-05-27 18:27:24,256 elastic_transport.transport INFO: PUT http://localhost:9200/_bulk?refresh=true [status:200 duration:0.038s]
2025-05-27 18:27:24,265 elastic_transport.transport INFO: POST http://localhost:9200/sageai_images/_count [status:200 duration:0.004s]


Total documents in index 'sageai_images': {'count': 6, '_shards': {'total': 1, 'successful': 1, 'skipped': 0, 'failed': 0}}


ObjectApiResponse({'count': 6, '_shards': {'total': 1, 'successful': 1, 'skipped': 0, 'failed': 0}})

In [11]:
res = db_elastic.esSearchIndex(None, index= index, query="SAda")
# Show the first hit; an empty result displays nothing instead of raising IndexError.
res[0] if len(res) > 0 else None

2025-05-27 18:28:17,449 elastic_transport.transport INFO: GET http://localhost:9200/ [status:200 duration:0.005s]
2025-05-27 18:28:17,490 httpx INFO: HTTP Request: POST http://127.0.0.1:11434/api/embed "HTTP/1.1 200 OK"
2025-05-27 18:28:17,502 elastic_transport.transport INFO: POST http://localhost:9200/sageai_images/_search?_source_includes=metadata,text [status:200 duration:0.011s]


{'page_content': 'The image is a flowchart depicting a system architecture for processing and summarizing audio/video meetings. The flowchart is divided into two main sections: the user interface and the processing pipeline. \n\nIn the user interface section, labeled "User Interface," there is a box labeled "Upload Meeting (audio/video)" which represents the input of a meeting recording. This input is directed to a box labeled "Diarize and Transcript," which is part of the processing pipeline. The diarization and transcription process is facilitated by AWS Transcribe, as indicated by the text "AWS Transcribe" within the box.\n\nThe output from the "Diarize and Transcript" box is labeled "Transcript" and is directed to a box labeled "BART Summarizer." This box represents the use of the BART model for summarizing the transcribed content.\n\nThe flowchart also includes a section labeled "LLM" which stands for Large Language Model. This section includes a box labeled "GPT-3 Turbo" and anot

In [15]:
res = db_elastic.esTextSearch(query="Sada", index= index, )
# Jupyter only echoes a cell's last top-level expression; an expression nested
# inside an `if` block is evaluated and discarded. Use a conditional expression
# so the first hit is actually shown when there is one.
res[0] if len(res) > 0 else None

2025-05-27 18:29:06,895 elastic_transport.transport INFO: POST http://localhost:9200/sageai_images/_search?q=Sada [status:200 duration:0.015s]


# Test

In [None]:
# Imported here so the cell does not depend on names left over in the kernel.
import os

from genai_utils.index_images_es import indexImagesFromPDF

file= os.path.expanduser("~/Desktop/data/z4.pdf")
# indexImagesFromPDF returns a (result_dict, saved_files) pair, so `ret` is that tuple.
ret = indexImagesFromPDF(file)