In [0]:
import sys
import os
def add_to_sys_path(path):
    """Append ``path`` to ``sys.path`` unless it is already listed.

    Re-running this cell on a live kernel would otherwise add the same
    entry again on every execution.

    Args:
        path: directory to make importable.
    """
    if path not in sys.path:
        sys.path.append(path)


# Make the two workspace checkouts importable (the compute_engine imports in
# the next cell are presumably resolved from these — confirm).
add_to_sys_path("/Workspace/Users/pdacosta@integralads.com/.ide/ctx-inference_stack/src")
add_to_sys_path("/Workspace/Users/pdacosta@integralads.com/.ide/ctx-logoface-detector")

# Select the AWS named profile through the AWS_PROFILE environment variable.
os.environ["AWS_PROFILE"] = "saml"

In [0]:
from compute_engine.units.generator import UnitGenerator, UnitDeclatation
from compute_engine.structures.entities import RunInfo
from compute_engine.structures.messages import MessageHeader, RunInfoMessage, EndOfComputeMessage
from compute_engine.structures.entities import FrameDetection, FrameDetections
from compute_engine.units.handler import Handler, RunVariable
from compute_engine.utils.message_capture import MessageCapture

from concurrent.futures import Future
import pyarrow.parquet as pq
import asyncio
import pyarrow as pa
import tqdm
import pandas as pd
from pyspark.sql.functions import concat, lit, col
from dbricks.dbutils.boxes import transf_any_box

In [0]:
def strip_dbfs_prefix(path, prefix='/dbfs'):
    """Return ``path`` without its leading ``prefix`` mount component.

    Only a leading ``/dbfs`` path component is removed; any later occurrence
    of the same text inside the path is preserved (a plain ``str.replace``
    would strip every occurrence).

    Args:
        path: local (FUSE) path, e.g. ``/dbfs/mnt/...``.
        prefix: mount prefix to drop; defaults to ``/dbfs``.

    Returns:
        The path without the prefix, or ``path`` unchanged when it does not
        start with that component.
    """
    if path == prefix or path.startswith(prefix + '/'):
        return path[len(prefix):]
    return path


# --- detector settings (passed to the detector unit) ---
n_workers = 2
model_batch_size = 16
detection_min_size_percentage = 0.01
confidence_threshold = 0.0

# --- driver-loop settings ---
batch_size = 16          # number of uris submitted to the pipeline per loop iteration
generate_uris = True     # True: rebuild the uri list from val.csv; False: reuse the saved one
ignore_progress = True   # True: always start from index 0, ignoring progress.txt

# --- output locations (local /dbfs paths) ---
dbfs_mnt_path_faces = '/dbfs/mnt/innovation/pdacosta/data/wider_face/preds_faces/xywh'
dbfs_mnt_path_uris = '/dbfs/mnt/innovation/pdacosta/data/wider_face/preds_faces/xywh/uris'

# Same locations without the /dbfs prefix, as used by the spark readers/writers.
pyspark_mnt_path_faces = strip_dbfs_prefix(dbfs_mnt_path_faces)
pyspark_mnt_path_uris = strip_dbfs_prefix(dbfs_mnt_path_uris)

# --- input annotations ---
csv_mnt_path = "/dbfs/mnt/innovation/pdacosta/data/wider_face/dataset/"
pyspark_csv_mnt_path = strip_dbfs_prefix(csv_mnt_path)

# Prefix prepended to each annotation `path` to build the image uri.
s3_prefix = "s3://mls.us-east-1.innovation/pdacosta/data/wider_face/dataset/"

# File holding the index written by save_batch, used to resume a run.
progress_file = os.path.join(dbfs_mnt_path_uris, 'progress.txt')

In [0]:
# Legacy global kept so any cell that still reads `i` keeps working;
# gen_message no longer reads or writes it.
i = 0

# Next run id to hand out. It lives in its own container so that notebook
# cells using `i` as a loop variable cannot reset or skew the counter (and
# gen_message cannot silently change their loop index).
_run_id_state = {"next_id": 0}


def gen_message(url):
    """Build the RunInfo describing one image to run through the pipeline.

    Each call consumes a new run id, starting at 0 when this cell is executed.

    Args:
        url: location of the image, stored under ``source.url``.

    Returns:
        A ``RunInfo`` built with ``atomic=None``.
    """
    run_id = _run_id_state["next_id"]
    _run_id_state["next_id"] = run_id + 1

    payload = {
        "export": 'json',
        "pipeline": {"mode": "semi_auto"},
        "source": {
            "kind": "image",
            "url": url,
            "uuid": "0"
        },
        "run": {"id": run_id},
        "company": {"id": 0},
    }
    return RunInfo(**payload, atomic=None)

In [0]:
# Create the generator and apply its configuration entries one by one,
# in the same order as before.
unit_generator = UnitGenerator(model_engine="tf")
for _cfg_key, _cfg_value in (
    ("database_client", None),
    ("bucket_name", "reminiz.production"),
    ("cloud_provider", "aws"),
    ("models_local_path", "./"),
    ("s3_bucket_url", ""),
):
    unit_generator.set_config(_cfg_key, _cfg_value)

# Settings forwarded to the detector unit (values come from the config cell).
detector_kwargs = dict(
    max_workers=n_workers,
    model_path="models/detectors/faces/face_detector_march_20-20230116-tf/",
    batchsize=model_batch_size,
    detection_min_size_percentage=detection_min_size_percentage,
    confidence_threshold=confidence_threshold,
)

units = []
with unit_generator:
    # Stages are chained with `@=`: downloader, frame extractor, resizer, detector.
    input_unit = unit_generator.units.downloader()
    x = input_unit
    units.append(x)

    x @= unit_generator.units.run_frame_extractor()
    units.append(x)

    x @= unit_generator.units.frame_resizer(target_size=416)
    units.append(x)

    # NOTE(review): the result of this last stage is not appended to `units` — confirm intent.
    x @= unit_generator.units.detector(**detector_kwargs)

[32m2023-10-05 14:27:27,141[0m [35m0712-085056-y3y7acgy-10-104-23-245[0m [34mMainThread#5421:cloud_storage.l67[0m [1;30mINFO[0m Begin to download 3 files from models/detectors/faces/face_detector_march_20-20230116-tf/
[32m2023-10-05 14:27:27,142[0m [35m0712-085056-y3y7acgy-10-104-23-245[0m [34mMainThread#5421:cloud_storage.l97[0m [1;30mINFO[0m Downloading models/detectors/faces/face_detector_march_20-20230116-tf/saved_model.pb file to ./models/detectors/faces/face_detector_march_20-20230116-tf//saved_model.pb ...
[32m2023-10-05 14:27:27,806[0m [35m0712-085056-y3y7acgy-10-104-23-245[0m [34mMainThread#5421:cloud_storage.l99[0m [1;30mINFO[0m Downloading models/detectors/faces/face_detector_march_20-20230116-tf/saved_model.pb file to ./models/detectors/faces/face_detector_march_20-20230116-tf//saved_model.pb done
[32m2023-10-05 14:27:27,807[0m [35m0712-085056-y3y7acgy-10-104-23-245[0m [34mMainThread#5421:cloud_storage.l97[0m [1;30mINFO[0m Downloading models/

Instructions for updating:
This function will only be available through the v1 compatibility library as tf.compat.v1.saved_model.loader.load or tf.compat.v1.saved_model.load. There will be a new function for importing SavedModels in Tensorflow 2.0.
INFO:tensorflow:Restoring parameters from ./models/detectors/faces/face_detector_march_20-20230116-tf/variables/variables


[32m2023-10-05 14:27:36,986[0m [35m0712-085056-y3y7acgy-10-104-23-245[0m [34mMainThread#5421:detector.l82[0m [1;30mINFO[0m warm_up detector ...
[32m2023-10-05 14:27:43,862[0m [35m0712-085056-y3y7acgy-10-104-23-245[0m [34mMainThread#5421:detector.l87[0m [1;30mINFO[0m warm_up detector done


In [0]:
# Location of the saved (asset, uri) listing, read back when generate_uris is False.
uris_csv_path = os.path.join(pyspark_mnt_path_uris, "uris.csv")

if generate_uris:
    # Annotations of the validation split.
    val_df = spark.read.csv(os.path.join(pyspark_csv_mnt_path, "val.csv"), header=True)

    df = (
        val_df
        # full image location = s3 prefix + relative path from the annotations
        .withColumn("uri", concat(lit(s3_prefix), col("path")))
        # drop images with a side shorter than 96 px
        .filter((col("width") >= 96) & (col("height") >= 96))
        # only these two columns are needed downstream
        .select("asset", "uri")
        # one row per asset
        .dropDuplicates(["asset"])
    )
    # NOTE: a left_anti join against the assets already stored under
    # pyspark_mnt_path_faces used to be sketched here; it is currently disabled.

    # Persist the listing so a later run can skip the generation step.
    df.write.mode("overwrite").csv(uris_csv_path, header=True)
else:
    df = spark.read.csv(uris_csv_path, header=True)

# Bring the listing to the driver as two aligned python lists.
rows = df.collect()
assets = [row.asset for row in rows]
uris = [row.uri for row in rows]
# No cache()/persist() call is visible in this notebook, so this is presumably a no-op.
df.unpersist()

print(f"Number of images to process: {len(assets)}")
print(f"Assets: {assets[:2]}")
print(f"Uris: {uris[:2]}")

Number of images to process: 3226
Assets: ['0_Parade_Parade_0_102.jpg', '0_Parade_Parade_0_12.jpg']
Uris: ['s3://mls.us-east-1.innovation/pdacosta/data/wider_face/dataset/val/0_Parade_Parade_0_102.jpg', 's3://mls.us-east-1.innovation/pdacosta/data/wider_face/dataset/val/0_Parade_Parade_0_12.jpg']


In [0]:
# Smoke test: push the first uri through the pipeline and wait for its result.
first_run_info = gen_message(url=uris[0])
f = input_unit(run_info=first_run_info)
f.result()

[32m2023-10-05 14:31:14,347[0m [35m0712-085056-y3y7acgy-10-104-23-245[0m [34mThread-10#5421:unit.l417[0m [1;30mINFO[0m [94mdownloader unit running with 1 worker(s) ...[0m
[32m2023-10-05 14:31:14,347[0m [35m0712-085056-y3y7acgy-10-104-23-245[0m [34mThread-12#5421:unit.l417[0m [1;30mINFO[0m [94mrun_frame_extractor unit running with 1 worker(s) ...[0m
[32m2023-10-05 14:31:14,348[0m [35m0712-085056-y3y7acgy-10-104-23-245[0m [34mThread-14#5421:unit.l417[0m [1;30mINFO[0m [94mframe_resizer unit running with 1 worker(s) ...[0m
[32m2023-10-05 14:31:14,348[0m [35m0712-085056-y3y7acgy-10-104-23-245[0m [34mThread-16#5421:unit.l417[0m [1;30mINFO[0m [94mdetector unit running with 2 worker(s) ...[0m
[32m2023-10-05 14:31:14,352[0m [35m0712-085056-y3y7acgy-10-104-23-245[0m [34mThread-10#5421:downloader.l83[0m [1;30mDEBUG[0m [32mDownloading asset s3://mls.us-east-1.innovation/pdacosta/data/wider_face/dataset/val/0_Parade_Parade_0_102.jpg ...[0m
[32m2023

Out[9]: [{'frame': array([[[107, 115, 118],
          [108, 116, 119],
          [124, 132, 134],
          ...,
          [ 42,  40,  25],
          [ 38,  34,  25],
          [ 34,  30,  21]],
  
         [[106, 114, 117],
          [112, 120, 123],
          [144, 152, 154],
          ...,
          [ 39,  37,  22],
          [ 34,  30,  21],
          [ 29,  25,  16]],
  
         [[102, 109, 117],
          [117, 124, 132],
          [164, 172, 175],
          ...,
          [ 40,  39,  21],
          [ 36,  34,  19],
          [ 33,  31,  16]],
  
         ...,
  
         [[173, 169, 160],
          [173, 169, 160],
          [174, 170, 161],
          ...,
          [252, 232,  37],
          [252, 230,  46],
          [252, 230,  46]],
  
         [[172, 168, 159],
          [172, 168, 159],
          [173, 169, 160],
          ...,
          [252, 232,  37],
          [252, 230,  46],
          [252, 230,  46]],
  
         [[171, 167, 158],
          [171, 167, 158],
       

[32m2023-10-05 14:31:15,819[0m [35m0712-085056-y3y7acgy-10-104-23-245[0m [34mThread-15#5421:performance_monitor.l59[0m [1;30mINFO[0m [93m                      detector - 1.0 msg/s[0m


In [0]:
def save_batch(assets, uris, boxes, idx):
    """Write one chunk of detections to parquet and record the progress index.

    The rows are written to ``face_boxes_<idx zero-padded to 6>.parquet`` under
    ``dbfs_mnt_path_faces``; on success ``idx`` is written to ``progress_file``.

    Args:
        assets: asset names, one per image.
        uris: image uris, aligned with ``assets``.
        boxes: per-image detections, aligned with ``assets``.
        idx: index used in the file name and stored in the progress file.

    Raises:
        Exception: if the parquet file or the progress file cannot be written;
            the underlying error is attached as ``__cause__``.
    """
    # One record per image; `image_boxes` avoids shadowing the `boxes` argument.
    records = [
        {"asset": asset, "uri": uri, "boxes": image_boxes}
        for asset, uri, image_boxes in zip(assets, uris, boxes)
    ]
    table = pa.Table.from_pandas(pd.DataFrame(records))
    parquet_filename = os.path.join(dbfs_mnt_path_faces, f"face_boxes_{str(idx).zfill(6)}.parquet")

    try:
        # Make sure both target directories exist before writing.
        os.makedirs(dbfs_mnt_path_faces, exist_ok=True)
        os.makedirs(os.path.dirname(progress_file), exist_ok=True)

        pq.write_table(table, parquet_filename)
        # Progress is only recorded once the parquet file has been written.
        with open(progress_file, 'w') as f:
            f.write(str(idx))
    except Exception as exc:
        # `except Exception` (not a bare except) lets KeyboardInterrupt through;
        # `from exc` keeps the real cause in the traceback.
        raise Exception('Could not save the parquet file') from exc

In [0]:
# Index in `uris` from which the inference loop starts.
if ignore_progress:
    progress = 0
else:
    try:
        with open(progress_file, 'r') as f:
            progress = int(f.read())
    except (OSError, ValueError):
        # Missing/unreadable file or non-integer content: start from the beginning.
        # Other errors (including KeyboardInterrupt) are no longer swallowed.
        progress = 0

n_uris = len(uris)
print(f'Progress: {progress} out of {n_uris} images')

Progress: 0 out of 3226 images


In [0]:

save_uris = []
save_boxes = []
save_assets = []
for i in range(progress, n_uris, batch_size):
    batch_uris = uris[i:i+batch_size]
    batch_assets = assets[i:i+batch_size]
    # we call the model with the batch of uris
    calls = [asyncio.wrap_future(input_unit(run_info=gen_message(url=uri))) for uri in batch_uris]
    
    results = await asyncio.gather(*calls)
    
    # we need to get the boxes now
    batch_boxes = []
    for res in results:
        frame_detections = res[0]["frame_detections"]
        detections = frame_detections.detections
        boxes = []
        if detections:
            for frame_detection in detections:
                score = frame_detection.classification[0][-1]
                box = frame_detection.box
                box = transf_any_box(box, "xyxy", "xywh")
                boxes.append({"score": score, "box": box})
                
        batch_boxes.append(boxes)
    
    save_uris += batch_uris
    save_boxes += batch_boxes
    save_assets += batch_assets
    
    
    if (i + batch_size) % 20000 == 0:
        
        save_batch(save_assets, save_uris, save_boxes, i+batch_size)

        # reset the batch count, save uris, and save boxes
        save_uris = []
        save_boxes = []
        save_assets = []
        
# Handle the final batch
if save_uris:
    save_batch(save_assets, save_uris, save_boxes, n_uris)

[32m2023-10-05 14:34:12,490[0m [35m0712-085056-y3y7acgy-10-104-23-245[0m [34mThread-10#5421:downloader.l83[0m [1;30mDEBUG[0m [32mDownloading asset s3://mls.us-east-1.innovation/pdacosta/data/wider_face/dataset/val/0_Parade_Parade_0_102.jpg ...[0m
[32m2023-10-05 14:34:12,492[0m [35m0712-085056-y3y7acgy-10-104-23-245[0m [34mThread-10#5421:downloader.l83[0m [1;30mDEBUG[0m [32mDownloading asset s3://mls.us-east-1.innovation/pdacosta/data/wider_face/dataset/val/0_Parade_Parade_0_12.jpg ...[0m
[32m2023-10-05 14:34:12,494[0m [35m0712-085056-y3y7acgy-10-104-23-245[0m [34mThread-10#5421:downloader.l83[0m [1;30mDEBUG[0m [32mDownloading asset s3://mls.us-east-1.innovation/pdacosta/data/wider_face/dataset/val/0_Parade_Parade_0_120.jpg ...[0m
[32m2023-10-05 14:34:12,496[0m [35m0712-085056-y3y7acgy-10-104-23-245[0m [34mThread-10#5421:downloader.l83[0m [1;30mDEBUG[0m [32mDownloading asset s3://mls.us-east-1.innovation/pdacosta/data/wider_face/dataset/val/0_Parad