In [1]:
from os import path

PROJECT_ROOT = path.abspath(globals()['_dh'][0])
DATALAKE_PATH = path.abspath(path.join(PROJECT_ROOT, '..', '..', 'datalake', 'imgproj'))

In [2]:
import base64
import os

import cv2
import numpy as np
import pandas as pd

from utils.image_toolbox import load_image


def jpeg_as_base64(fqfn: str) -> str:
    with open(fqfn, 'rb') as f:
        jpeg_bytes = f.read()
        image_str = base64.b64encode(jpeg_bytes).decode('utf-8')
        return image_str


def read(folder_path: str, labels: list[str] = ['0', '1'], k: int = 3) -> pd.DataFrame:
    img_labels: list[int] = []
    img_fqfns: list[str] = []
    img_file_names: list[str] = []
    img_b64s: list[str] = []
    img_greyscales: list[np.ndarray] = []
    img_dims: list[tuple[int, int]] = []  # (Height, Width)

    for label in labels:
        label_counter = 0
        label_folder = os.path.join(folder_path, label)
        if not os.path.isdir(label_folder):
            continue  # Skip if the folder does not exist

        for file_name in os.listdir(label_folder):
            if file_name.endswith('.jpg') or file_name.endswith('.jpeg'):
                label_counter += 1
                if label_counter > k:
                    break

                img_file_names.append(file_name)

                img_fqfn = os.path.join(label_folder, file_name)
                img_fqfns.append(img_fqfn)

                img_b64 = jpeg_as_base64(img_fqfn)
                img_b64s.append(img_b64)

                image_grey = load_image(img_fqfn, flag=cv2.IMREAD_GRAYSCALE)
                img_greyscales.append(image_grey)

                image_dimension = image_grey.shape[:2]  # do not include color channel
                img_dims.append(image_dimension)

                img_labels.append(int(label))


    return pd.DataFrame(data={
        'file_name': img_file_names,
        'fqfn': img_fqfns,
        'jpeg_file_b64': img_b64s,
        'img_grey': img_greyscales,
        'img_height': [dim[0] for dim in img_dims] ,
        'img_width': [dim[1] for dim in img_dims] ,
        'label': img_labels,
    })


In [3]:
df = read(DATALAKE_PATH, labels=['0', '1'], k=3)
df[['img_grey', 'img_height', 'img_width']]

Unnamed: 0,img_grey,img_height,img_width
0,"[[255, 255, 255, 255, 255, 255, 255, 255, 255,...",6132,390
1,"[[250, 250, 250, 250, 250, 250, 250, 250, 250,...",2121,980
2,"[[249, 249, 249, 249, 249, 249, 249, 249, 249,...",980,980
3,"[[255, 255, 255, 255, 255, 255, 255, 255, 255,...",980,980
4,"[[255, 255, 255, 255, 255, 255, 255, 255, 255,...",980,980
5,"[[255, 255, 255, 255, 255, 255, 255, 255, 255,...",980,980


In [4]:
from imgproj.classifier.mlflow_loader import MLflowModel

# Instantiate and use the model as usual
model = MLflowModel()
df['outcome'] = model.predict(inputs=df)
print(df)


2025-02-02 17:09:41,442 - tensor_craft - INFO - img_grey[0].shape=(6132, 390)
2025-02-02 17:09:41,533 - tensor_craft - INFO - Label=0
2025-02-02 17:09:41,534 - tensor_craft - INFO - Label=0
2025-02-02 17:09:41,534 - tensor_craft - INFO - Label=0
2025-02-02 17:09:41,534 - tensor_craft - INFO - Label=1
2025-02-02 17:09:41,534 - tensor_craft - INFO - Label=1
2025-02-02 17:09:41,534 - tensor_craft - INFO - Label=1


                                           file_name  \
0          46227448-9f75-4799-9129-ddc1b0e3e62b.jpeg   
1          fb8355c4-c524-4e3a-bd19-e93e48ba19a1.jpeg   
2                                      636575470.jpg   
3  000053f06e606e017aa100004000000000000000000000...   
4  00006a606e606c06708600004000000000000000000000...   
5  000049706e602c11180100001000000000000000000000...   

                                                fqfn  \
0  /Users/shershen/datalake/imgproj/0/46227448-9f...   
1  /Users/shershen/datalake/imgproj/0/fb8355c4-c5...   
2   /Users/shershen/datalake/imgproj/0/636575470.jpg   
3  /Users/shershen/datalake/imgproj/1/000053f06e6...   
4  /Users/shershen/datalake/imgproj/1/00006a606e6...   
5  /Users/shershen/datalake/imgproj/1/000049706e6...   

                                       jpeg_file_b64  \
0  /9j/4AAQSkZJRgABAQAAAQABAAD/4gHYSUNDX1BST0ZJTE...   
1  /9j/4AAQSkZJRgABAQAAAQABAAD/4gHYSUNDX1BST0ZJTE...   
2  /9j/4AAQSkZJRgABAQAAAQABAAD/2wBDAAgGBgcGBQg