# Import dependencies

In [1]:
from qai_hub_models.models.facemap_3dmm.model import MODEL_ASSET_VERSION, MODEL_ID
from qai_hub_models.utils.asset_loaders import CachedWebModelAsset
from ai_edge_litert.interpreter import Interpreter
from PIL import Image
import coremltools as ct
import os, cv2, numpy as np
from pathlib import Path
import shutil

Torch version 2.5.1 has not been tested with coremltools. You may run into unexpected errors. Torch 2.5.0 is the most recent version that has been tested.


# Initialize Assets

In [2]:
# Files pulled from the model's asset store. The three .npy files are the
# ones _load_assets reads; the image and box file are used by the demo-check
# cell further down.
ASSET_NAMES = [
    "meanFace.npy",
    "shapeBasis.npy",
    "blendShape.npy",
    "face_img.jpg",
    "face_img_fbox.txt",
]

# Local directory the assets are copied into; created if it does not exist.
out_dir = Path("./assets")
out_dir.mkdir(parents=True, exist_ok=True)

# Maps each asset name to the (string) path of its local copy.
resolved = {}
for name in ASSET_NAMES:
    # The value returned by fetch() is used as the copy source below.
    src_path = CachedWebModelAsset.from_asset_store(
        MODEL_ID, MODEL_ASSET_VERSION, name
    ).fetch()
    dst_path = out_dir / name
    # copy2 copies the file contents together with its metadata.
    shutil.copy2(src_path, dst_path)
    resolved[name] = str(dst_path)

print(resolved)

{'meanFace.npy': 'assets/meanFace.npy', 'shapeBasis.npy': 'assets/shapeBasis.npy', 'blendShape.npy': 'assets/blendShape.npy', 'face_img.jpg': 'assets/face_img.jpg', 'face_img_fbox.txt': 'assets/face_img_fbox.txt'}


# Define utility functions

In [3]:
ASSETS_DIR = "./assets"


def _load_assets(assets_dir=ASSETS_DIR):
    """Read the face-model arrays stored as .npy files in ``assets_dir``.

    Each array is cast to float32 and reshaped for a fixed vertex count of
    68: ``meanFace.npy`` to (204, 1), ``shapeBasis.npy`` to (204, 219) and
    ``blendShape.npy`` to (204, 39).

    Returns:
        ``(face, basis_id, basis_exp, vn)`` where ``vn`` is the vertex count.
    """
    vn = 68
    root = Path(assets_dir)

    def _read(filename, columns):
        # Cast first, then force the (3 * vn, columns) layout.
        return np.load(root / filename).astype(np.float32).reshape(3 * vn, columns)

    mean_face = _read("meanFace.npy", 1)
    identity_basis = _read("shapeBasis.npy", 219)
    expression_basis = _read("blendShape.npy", 39)
    return mean_face, identity_basis, expression_basis, vn


def _project(output, face, basis_id, basis_exp, vn):
    """Convert a raw parameter vector into 2-D landmarks and pose angles.

    ``output`` is read by position: ``[0:219]`` identity coefficients,
    ``[219:258]`` expression coefficients, ``258/259/260`` pitch/yaw/roll,
    ``261/262`` x/y translation and ``263`` the focal term. Every entry is
    rescaled with the fixed constants below before use.

    Args:
        output: 1-D array with at least 264 entries.
        face: mean shape, column vector of length ``3 * vn``.
        basis_id: identity basis, ``(3 * vn, 219)``.
        basis_exp: expression basis, ``(3 * vn, 39)``.
        vn: number of vertices.

    Returns:
        ``(landmarks, pitch, yaw, roll)`` where ``landmarks`` is a float32
        ``(vn, 2)`` array and the angles are Python floats in radians.
    """
    # Rescale the raw outputs to their working ranges.
    id_coeffs = output[0:219] * 3.0
    exp_coeffs = output[219:258] * 0.5 + 0.5
    pitch = output[258] * np.pi / 2.0
    yaw = output[259] * np.pi / 2.0
    roll = output[260] * np.pi / 2.0
    shift_x = output[261] * 60.0
    shift_y = output[262] * 60.0
    depth = 500.0
    focal = output[263] * 150.0 + 450.0

    # Constant rotation by -pi about the x axis, applied before the pose.
    flip_cos, flip_sin = np.cos(-np.pi), np.sin(-np.pi)
    flip = np.array(
        [
            [1.0, 0.0, 0.0],
            [0.0, flip_cos, -flip_sin],
            [0.0, flip_sin, flip_cos],
        ],
        dtype=np.float32,
    )

    # Per-axis rotations; every angle is negated before use.
    cos_r, sin_r = np.cos(-roll), np.sin(-roll)
    cos_p, sin_p = np.cos(-pitch), np.sin(-pitch)
    cos_y, sin_y = np.cos(-yaw), np.sin(-yaw)
    rot_z = np.array(
        [[cos_r, -sin_r, 0.0], [sin_r, cos_r, 0.0], [0.0, 0.0, 1.0]],
        dtype=np.float32,
    )
    rot_y = np.array(
        [[cos_y, 0.0, sin_y], [0.0, 1.0, 0.0], [-sin_y, 0.0, cos_y]],
        dtype=np.float32,
    )
    rot_x = np.array(
        [[1.0, 0.0, 0.0], [0.0, cos_p, -sin_p], [0.0, sin_p, cos_p]],
        dtype=np.float32,
    )
    rotation = rot_y @ (rot_x @ (flip @ rot_z))

    # Mean shape plus weighted identity and expression offsets.
    mesh = (
        face
        + basis_id @ id_coeffs.reshape(-1, 1)
        + basis_exp @ exp_coeffs.reshape(-1, 1)
    )

    # Rotate, translate, then scale x/y by focal / depth.
    verts = mesh.reshape(vn, 3) @ rotation.T
    verts[:, 0] += shift_x
    verts[:, 1] += shift_y
    verts[:, 2] += depth
    landmarks = verts[:, :2] * (focal / depth)
    return landmarks.astype(np.float32), float(pitch), float(yaw), float(roll)


def _normalize(inp, mode):
    """Cast ``inp`` to float32 and rescale it according to ``mode``.

    ``"0_1"`` divides by 255, ``"neg1_1"`` divides by 127.5 and subtracts 1;
    any other mode returns the float32 values unscaled.
    """
    pixels = inp.astype(np.float32)
    if mode == "0_1":
        pixels = pixels / 255.0
    elif mode == "neg1_1":
        pixels = (pixels / 127.5) - 1.0
    return pixels


def _rect_to_square(x0, y0, x1, y1, H, W, scale=1.1):
    """Grow an inclusive box into a centred square, clipped to the image.

    The square's side is the longer box side (inclusive pixel count) times
    ``scale``. Corners are rounded to integers and clamped to
    ``[0, W - 1]`` x ``[0, H - 1]``, so the result may stop being square
    near the image border.

    Returns:
        ``(x0, y0, x1, y1)`` of the adjusted box.
    """
    center_x = 0.5 * (x0 + x1)
    center_y = 0.5 * (y0 + y1)
    half = max(x1 - x0 + 1, y1 - y0 + 1) * scale / 2

    left = max(0, int(round(center_x - half)))
    top = max(0, int(round(center_y - half)))
    right = min(W - 1, int(round(center_x + half)))
    bottom = min(H - 1, int(round(center_y + half)))
    return left, top, right, bottom


def _prep(
    img,
    bbox,
    ih,
    iw,
    bbox_order="x0y0x1y1",
    square_crop=True,
    square_scale=1.1,
    norm="0_1",
    to_rgb=True,
):
    """Crop, resize and normalise an image region into a model input batch.

    Args:
        img: image array; its first two dimensions are height and width.
            With ``to_rgb`` it is converted with ``COLOR_BGR2RGB``, so it is
            presumably BGR as produced by ``cv2.imread``.
        bbox: four box values, or ``None`` to use the whole image.
        ih, iw: target height and width passed to ``cv2.resize``.
        bbox_order: ``"x0x1y0y1"`` for that layout; anything else is read as
            ``x0, y0, x1, y1``.
        square_crop: expand the box to a square via ``_rect_to_square``.
        square_scale: scale factor forwarded to ``_rect_to_square``.
        norm: mode forwarded to ``_normalize``.
        to_rgb: swap the channel order before resizing.

    Returns:
        ``(batch, (x0, y0, x1, y1))``: the normalised crop with a leading
        batch axis, and the inclusive crop box actually used.
    """
    img_h, img_w = img.shape[:2]

    if bbox is None:
        left, top, right, bottom = 0, 0, img_w - 1, img_h - 1
    elif bbox_order == "x0x1y0y1":
        left, right, top, bottom = (int(v) for v in bbox)
    else:
        left, top, right, bottom = (int(v) for v in bbox)

    if square_crop:
        left, top, right, bottom = _rect_to_square(
            left, top, right, bottom, img_h, img_w, square_scale
        )

    # Clamp to the image so the slice below stays in range.
    left, top = max(0, left), max(0, top)
    right, bottom = min(img_w - 1, right), min(img_h - 1, bottom)

    crop = img[top : bottom + 1, left : right + 1]
    if to_rgb:
        crop = cv2.cvtColor(crop, cv2.COLOR_BGR2RGB)
    crop = cv2.resize(crop, (iw, ih), interpolation=cv2.INTER_LINEAR)

    batch = np.expand_dims(_normalize(crop, norm), axis=0)
    return batch, (left, top, right, bottom)


def _transform(lm, bbox, rh, rw):
    """Map landmarks from centred crop coordinates into image coordinates.

    Each point is shifted by half of ``rw``/``rh``, scaled by the ratio of
    the box size to ``rw``/``rh``, and offset by the box's top-left corner.
    ``lm`` itself is not modified.

    Args:
        lm: ``(N, 2)`` array of x/y landmark positions.
        bbox: inclusive ``(x0, y0, x1, y1)`` crop box.
        rh, rw: reference height and width of the crop coordinate frame.

    Returns:
        A new array with the same shape and dtype as ``lm``.
    """
    left, top, right, bottom = bbox
    box_w = (right - left) + 1
    box_h = (bottom - top) + 1

    mapped = lm.copy()
    xs, ys = mapped[:, 0], mapped[:, 1]
    mapped[:, 0] = (xs + rw / 2.0) * (box_w / rw) + left
    mapped[:, 1] = (ys + rh / 2.0) * (box_h / rh) + top
    return mapped


def _draw_pose_text(vis, bbox, deg):
    """Draw a pitch/yaw/roll label on a half-transparent black backdrop.

    The label is anchored near the top-left corner of ``bbox`` and kept
    inside the image. Font scale and stroke width grow with the longer box
    side.

    Args:
        vis: image to annotate.
        bbox: inclusive ``(x0, y0, x1, y1)`` box.
        deg: sequence of three values formatted as pitch, yaw, roll.

    Returns:
        The annotated image: a new array produced by ``cv2.addWeighted``.
    """
    left, top, right, bottom = bbox
    longest = max((right - left) + 1, (bottom - top) + 1)

    font = cv2.FONT_HERSHEY_SIMPLEX
    font_scale = max(0.5, 0.001 * longest)
    stroke = max(1, int(round(font_scale * 2)))
    text = f"pitch:{deg[0]:.1f} yaw:{deg[1]:.1f} roll:{deg[2]:.1f}"

    (text_w, text_h), baseline = cv2.getTextSize(text, font, font_scale, stroke)
    margin = max(2, int(round(0.2 * text_h)))

    # Text origin, clamped so the label stays inside the image.
    org_x = max(0, min(left, vis.shape[1] - text_w - margin))
    org_y = min(vis.shape[0] - margin, top + text_h + margin)

    # Filled rectangle on a copy, blended 50/50 with the original.
    backdrop = vis.copy()
    cv2.rectangle(
        backdrop,
        (org_x - margin, org_y - text_h - margin),
        (org_x + text_w + margin, org_y + baseline + margin),
        (0, 0, 0),
        -1,
    )
    blended = cv2.addWeighted(backdrop, 0.5, vis, 0.5, 0)

    cv2.putText(
        blended,
        text,
        (org_x, org_y),
        font,
        font_scale,
        (0, 255, 0),
        stroke,
        cv2.LINE_AA,
    )
    return blended


def annotate_image(
    image_path,
    interpreter,
    assets_dir=ASSETS_DIR,
    bbox=None,
    save_path=None,
    bbox_order="x0y0x1y1",
    square_crop=True,
    square_scale=1.1,
    norm="0_1",
    to_rgb=True,
    radius=10,
    thickness=-1,
):
    """Run landmark inference on one image and draw the result.

    Args:
        image_path: path of the image to read (``str`` or path-like).
        interpreter: object exposing ``get_input_details``,
            ``get_output_details``, ``set_tensor``, ``invoke`` and
            ``get_tensor``.
        assets_dir: directory passed to ``_load_assets``.
        bbox: optional face box; ``None`` uses the whole image.
        save_path: if truthy, the annotated image is written there and
            missing parent directories are created.
        bbox_order, square_crop, square_scale, norm, to_rgb: forwarded to
            ``_prep``.
        radius: landmark dot radius; a value ``<= 0`` derives it from the
            crop box size.
        thickness: forwarded to ``cv2.circle``.

    Returns:
        ``(landmarks, pose)``: landmarks in image coordinates, and a dict
        with the pose angles in radians and degrees.

    Raises:
        FileNotFoundError: if ``image_path`` cannot be read as an image.
    """
    face, basis_id, basis_exp, vn = _load_assets(assets_dir)
    inp_info = interpreter.get_input_details()[0]
    out_info = interpreter.get_output_details()[0]
    ih, iw = int(inp_info["shape"][1]), int(inp_info["shape"][2])

    # cv2.imread reports failure by returning None instead of raising, which
    # would otherwise surface later as an AttributeError inside _prep.
    img = cv2.imread(str(image_path))
    if img is None:
        raise FileNotFoundError(f"Could not read image: {image_path}")

    inp, bbox_xyxy = _prep(
        img, bbox, ih, iw, bbox_order, square_crop, square_scale, norm, to_rgb
    )
    interpreter.set_tensor(inp_info["index"], inp.astype(np.float32))
    interpreter.invoke()
    out = interpreter.get_tensor(out_info["index"])[0]

    lm_crop, pitch, yaw, roll = _project(out, face, basis_id, basis_exp, vn)
    lm_img = _transform(lm_crop, bbox_xyxy, ih, iw)
    deg = np.array([pitch, yaw, roll]) * (180.0 / np.pi)

    vis = _draw_pose_text(img.copy(), bbox_xyxy, deg)
    if radius <= 0:
        # Auto-size the dots relative to the crop box.
        bw = (bbox_xyxy[2] - bbox_xyxy[0]) + 1
        bh = (bbox_xyxy[3] - bbox_xyxy[1]) + 1
        radius = max(3, int(0.008 * max(bw, bh)))
    for x, y in lm_img:
        cv2.circle(vis, (int(round(x)), int(round(y))), radius, (0, 255, 0), thickness)

    if save_path:
        Path(save_path).parent.mkdir(parents=True, exist_ok=True)
        cv2.imwrite(str(save_path), vis)

    return lm_img, {
        "pitch_rad": pitch,
        "yaw_rad": yaw,
        "roll_rad": roll,
        "pitch_deg": deg[0],
        "yaw_deg": deg[1],
        "roll_deg": deg[2],
    }

# Create adapter class for CoreML inference

In [4]:
class CoreMLInterpreterAdapter:
    """Expose a Core ML model through the interpreter-style methods used here.

    Provides ``get_input_details``, ``get_output_details``, ``set_tensor``,
    ``invoke`` and ``get_tensor`` so the same calling code can drive either
    backend. The ``index`` arguments are accepted but ignored: there is a
    single input slot and a single output slot.
    """

    def __init__(
        self,
        mlmodel,
        input_name="input",
        output_name="output",
        expects_rgb=True,
        input_range="uint8",
    ):
        """Store the model and read the input image size from its spec.

        Args:
            mlmodel: model object providing ``get_spec`` and ``predict``.
            input_name: key of the image input passed to ``predict``.
            output_name: key of the output read from the prediction.
            expects_rgb: when False, the channel axis is reversed before
                prediction.
            input_range: ``"uint8"`` means non-uint8 inputs are clipped to
                [0, 1] and scaled by 255 before the uint8 cast.
        """
        self.mlmodel = mlmodel
        self.input_name = input_name
        self.output_name = output_name
        self.expects_rgb = expects_rgb
        self.input_range = input_range
        self._in = None
        self._out = None
        # The first input of the spec is read as an image type.
        image_type = self.mlmodel.get_spec().description.input[0].type.imageType
        self._shape = np.array([1, image_type.height, image_type.width, 3])

    def get_input_details(self):
        """Return a one-element list describing the input (name, index, shape)."""
        details = {"name": self.input_name, "index": 0, "shape": self._shape}
        return [details]

    def _infer_out_shape(self):
        """Probe the model with an all-zero image to learn the output size."""
        height, width = int(self._shape[1]), int(self._shape[2])
        blank = Image.fromarray(np.zeros((height, width, 3), np.uint8))
        prediction = self.mlmodel.predict({self.input_name: blank})
        flat_size = int(np.array(prediction[self.output_name]).size)
        return np.array([1, flat_size])

    def get_output_details(self):
        """Return a one-element list describing the output.

        The output shape is discovered on first use and cached afterwards.
        """
        try:
            shape = self._out_shape
        except AttributeError:
            shape = self._out_shape = self._infer_out_shape()
        return [{"name": self.output_name, "index": 0, "shape": shape}]

    def set_tensor(self, index, arr):
        """Remember ``arr`` as the pending input; ``index`` is ignored."""
        self._in = arr

    def invoke(self):
        """Run the model on the pending input and store a ``(1, N)`` float32 result.

        Raises:
            RuntimeError: if ``set_tensor`` has not been called.
        """
        if self._in is None:
            raise RuntimeError("set_tensor must be called before invoke")

        frame = np.squeeze(self._in, axis=0)
        if frame.dtype != np.uint8:
            if self.input_range == "uint8":
                # Inputs are treated as [0, 1] floats and mapped to [0, 255].
                frame = np.clip(frame, 0.0, 1.0) * 255.0
            frame = frame.astype(np.uint8)
        if frame.shape[-1] == 3 and not self.expects_rgb:
            frame = frame[..., ::-1]

        prediction = self.mlmodel.predict({self.input_name: Image.fromarray(frame)})
        self._out = np.array(
            prediction[self.output_name], dtype=np.float32
        ).reshape(1, -1)

    def get_tensor(self, index):
        """Return the result of the last ``invoke``; ``index`` is ignored.

        Raises:
            RuntimeError: if ``invoke`` has not been called.
        """
        if self._out is None:
            raise RuntimeError("invoke must be called before get_tensor")
        return self._out

# Initialize TFLite Interpreter

In [5]:
# Paths and settings shared by the cells below.
MODEL_PATH = "models/LandmarkDetectionModel.tflite"
SAMPLE_DIR = "sample"
RESULTS_DIR = "results"
CONF_THRESH = 0.5
NUM_LANDMARKS = 68
# One output directory per backend.
os.makedirs(f"tflite-{RESULTS_DIR}", exist_ok=True)
os.makedirs(f"coreml-{RESULTS_DIR}", exist_ok=True)

try:
    tflite_interpreter = Interpreter(model_path=MODEL_PATH)
    tflite_interpreter.allocate_tensors()
except Exception as e:
    print(f"❌ Error loading model: {e}")
    print("Please ensure the model file is at the correct path:", MODEL_PATH)
    # Re-raise rather than call exit(): exit() is a helper meant for the
    # interactive shell and would end the whole session, discarding the
    # traceback. Re-raising stops this cell and keeps the original error.
    raise
else:
    print("✅ Model loaded and tensors allocated successfully.")

input_details = tflite_interpreter.get_input_details()[0]
output_details = tflite_interpreter.get_output_details()[0]

# Indices 1 and 2 of the input shape are used as height and width.
INPUT_SHAPE = input_details["shape"]
INPUT_HEIGHT = INPUT_SHAPE[1]
INPUT_WIDTH = INPUT_SHAPE[2]

print("\n--- Model Details ---")
print("Inputs:\n", input_details)
print("Outputs:\n", output_details)
print("---------------------\n")

✅ Model loaded and tensors allocated successfully.

--- Model Details ---
Inputs:
 {'name': 'image', 'index': 0, 'shape': array([  1, 128, 128,   3], dtype=int32), 'shape_signature': array([  1, 128, 128,   3], dtype=int32), 'dtype': <class 'numpy.float32'>, 'quantization': (0.0, 0), 'quantization_parameters': {'scales': array([], dtype=float32), 'zero_points': array([], dtype=int32), 'quantized_dimension': 0}, 'sparsity_parameters': {}}
Outputs:
 {'name': 'parameters_3dmm', 'index': 85, 'shape': array([  1, 265], dtype=int32), 'shape_signature': array([  1, 265], dtype=int32), 'dtype': <class 'numpy.float32'>, 'quantization': (0.0, 0), 'quantization_parameters': {'scales': array([], dtype=float32), 'zero_points': array([], dtype=int32), 'quantized_dimension': 0}, 'sparsity_parameters': {}}
---------------------



INFO: Created TensorFlow Lite XNNPACK delegate for CPU.


# Initialize CoreML Interpreter

In [6]:
# NOTE: rebinds MODEL_PATH (and the detail/shape names below) from the
# TFLite cell to the Core ML model.
MODEL_PATH = "models/LandmarkDetectionModel.mlpackage"

try:
    mlmodel = ct.models.MLModel(model=MODEL_PATH)
    coreml_interpreter = CoreMLInterpreterAdapter(
        mlmodel,
        input_name="input",
        output_name="output",
        expects_rgb=True,
        input_range="uint8",
    )
except Exception as e:
    print(f"❌ Error loading model: {e}")
    print("Please ensure the model file is at the correct path:", MODEL_PATH)
    # Re-raise rather than call exit(): exit() is a helper meant for the
    # interactive shell and would end the whole session, discarding the
    # traceback. Re-raising stops this cell and keeps the original error.
    raise

input_details = coreml_interpreter.get_input_details()[0]
# The adapter runs one prediction on a blank image here to learn the
# output size.
output_details = coreml_interpreter.get_output_details()[0]

# Indices 1 and 2 of the input shape are used as height and width.
INPUT_SHAPE = input_details["shape"]
INPUT_HEIGHT = INPUT_SHAPE[1]
INPUT_WIDTH = INPUT_SHAPE[2]

print("\n--- Model Details ---")
print("Inputs:\n", input_details)
print("Outputs:\n", output_details)
print("---------------------\n")


--- Model Details ---
Inputs:
 {'name': 'input', 'index': 0, 'shape': array([  1, 128, 128,   3])}
Outputs:
 {'name': 'output', 'index': 0, 'shape': array([  1, 265])}
---------------------



# Fetch image files from sample directory

In [7]:
# List the .jpg files in SAMPLE_DIR. The success message is printed only
# when files were actually found; a `finally` clause would also print it
# after the directory-missing and no-images cases.
print("🔄Loading sample images...")
try:
    image_files = [f for f in os.listdir(SAMPLE_DIR) if f.endswith(".jpg")]
except FileNotFoundError:
    print(f"❌ Error: The directory '{SAMPLE_DIR}' was not found.")
    image_files = []
else:
    if image_files:
        print("✅Sample Images Loaded")
    else:
        print("⚠️ No .jpg images found in the 'sample' directory.")

🔄Loading sample images...
✅Sample Images Loaded


# Run inference on each sample image (.jpg)

In [12]:
# Annotate every sample .jpg with both backends; each backend writes its
# "<stem>_annotated.jpg" into its own results directory.
paths = list(Path(SAMPLE_DIR).glob("*.jpg"))
for p in paths:
    out_path = str(Path(f"tflite-{RESULTS_DIR}") / (p.stem + "_annotated.jpg"))
    tflite_result = annotate_image(
        str(p),
        tflite_interpreter,
        assets_dir=ASSETS_DIR,
        bbox=None,
        save_path=out_path,
    )

    out_path = str(Path(f"coreml-{RESULTS_DIR}") / (p.stem + "_annotated.jpg"))
    coreml_result = annotate_image(
        str(p),
        coreml_interpreter,
        assets_dir=ASSETS_DIR,
        bbox=None,
        save_path=out_path,
    )

In [13]:
# Check against the reference face image and box shipped with the model assets.
import numpy as np  # numpy is already imported as np in the first cell; re-importing is harmless

# The unpacking below reads the four values in the box file as
# x0, x1, y0, y1 and reorders them into the (x0, y0, x1, y1) layout that
# annotate_image expects by default.
fbox = np.loadtxt("./assets/face_img_fbox.txt")
x0, x1, y0, y1 = [int(v) for v in fbox]
bbox = (x0, y0, x1, y1)
out_path = "tflite-results/qcom_demo_check.jpg"
annotate_image(
    "./assets/face_img.jpg",
    tflite_interpreter,
    bbox=bbox,
    save_path=out_path,
)
out_path = "coreml-results/qcom_demo_check.jpg"
# Last expression of the cell: its return value (landmarks, pose dict) is
# what the notebook displays as the cell output.
annotate_image(
    "./assets/face_img.jpg",
    coreml_interpreter,
    bbox=bbox,
    save_path=out_path,
)

(array([[126.84751, 197.22092],
        [126.02574, 232.8127 ],
        [128.57668, 265.71866],
        [134.9893 , 303.11694],
        [146.13406, 327.9792 ],
        [159.5923 , 346.98584],
        [176.73459, 361.64655],
        [196.4888 , 374.2413 ],
        [216.82167, 379.0624 ],
        [237.51495, 377.39938],
        [258.7813 , 367.97137],
        [277.66373, 356.07495],
        [293.48608, 339.30475],
        [308.07635, 316.36453],
        [319.83136, 280.3706 ],
        [327.2766 , 248.2154 ],
        [331.84674, 212.91841],
        [159.69366, 188.71666],
        [169.64903, 186.18839],
        [180.50935, 185.65121],
        [192.07141, 187.02788],
        [201.98207, 190.04187],
        [258.62445, 195.53204],
        [268.6644 , 194.58078],
        [280.19525, 195.42703],
        [290.85797, 197.76718],
        [300.45587, 201.6398 ],
        [229.38399, 208.49495],
        [227.99039, 227.62337],
        [226.4346 , 248.9091 ],
        [225.14165, 265.04727],
        

# Compare Results

In [14]:
def infer_params(
    image_path,
    interpreter,
    assets_dir,
    bbox=None,
    bbox_order="x0y0x1y1",
    square_crop=True,
    square_scale=1.1,
    norm="0_1",
    to_rgb=True,
):
    """Return the raw model output vector for one image.

    Runs the same preprocessing and inference steps as ``annotate_image``
    but skips projection and drawing.

    Args:
        image_path: path of the image to read (``str`` or path-like).
        interpreter: object exposing ``get_input_details``,
            ``get_output_details``, ``set_tensor``, ``invoke`` and
            ``get_tensor``.
        assets_dir: unused. The raw output does not depend on the face-model
            arrays, so they are no longer loaded; the parameter is kept so
            existing positional calls keep working.
        bbox: optional face box; ``None`` uses the whole image.
        bbox_order, square_crop, square_scale, norm, to_rgb: forwarded to
            ``_prep``.

    Returns:
        The first row of the output tensor, flattened to 1-D float64.

    Raises:
        FileNotFoundError: if ``image_path`` cannot be read as an image.
    """
    inp_info = interpreter.get_input_details()[0]
    out_info = interpreter.get_output_details()[0]
    ih, iw = int(inp_info["shape"][1]), int(inp_info["shape"][2])

    # cv2.imread reports failure by returning None instead of raising.
    img = cv2.imread(str(image_path))
    if img is None:
        raise FileNotFoundError(f"Could not read image: {image_path}")

    inp, _ = _prep(
        img, bbox, ih, iw, bbox_order, square_crop, square_scale, norm, to_rgb
    )
    interpreter.set_tensor(inp_info["index"], inp.astype(np.float32))
    interpreter.invoke()
    out = interpreter.get_tensor(out_info["index"])[0]
    return out.ravel().astype(np.float64)


def cosine_sim(a, b):
    """Return the cosine similarity of two arrays, compared as flat vectors.

    Both inputs are flattened and promoted to float64. If either vector has
    zero norm the similarity is undefined and ``np.nan`` is returned.
    """
    u = a.ravel().astype(np.float64)
    v = b.ravel().astype(np.float64)
    norm_u, norm_v = np.linalg.norm(u), np.linalg.norm(v)
    if norm_u == 0 or norm_v == 0:
        return np.nan
    similarity = np.dot(u, v) / (norm_u * norm_v)
    return float(similarity)

In [17]:
# Compare the two backends on every sample image:
#   sims         -> (file name, cosine similarity of the raw output vectors)
#   angle_deltas -> (file name, absolute per-angle difference in degrees)
sims = []
angle_deltas = []
paths = list(Path(SAMPLE_DIR).glob("*.jpg"))
for p in paths:
    out_path = str(Path(f"tflite-{RESULTS_DIR}") / (p.stem + "_annotated.jpg"))
    tflite_result = annotate_image(
        str(p),
        tflite_interpreter,
        assets_dir=ASSETS_DIR,
        bbox=None,
        save_path=out_path,
    )
    out_path = str(Path(f"coreml-{RESULTS_DIR}") / (p.stem + "_annotated.jpg"))
    coreml_result = annotate_image(
        str(p),
        coreml_interpreter,
        assets_dir=ASSETS_DIR,
        bbox=None,
        save_path=out_path,
    )

    # Raw parameter vectors from each backend for the similarity score.
    t_params = infer_params(str(p), tflite_interpreter, ASSETS_DIR)
    c_params = infer_params(str(p), coreml_interpreter, ASSETS_DIR)
    if t_params.shape != c_params.shape:
        raise ValueError(
            f"Shape mismatch {t_params.shape} vs {c_params.shape} on {p.name}"
        )
    sims.append((p.name, cosine_sim(t_params, c_params)))

    # Second element of each annotate_image result is the pose dict.
    t_pose = tflite_result[1]
    c_pose = coreml_result[1]

    print(t_pose)

    angle_deltas.append(
        (
            p.name,
            {
                label: (
                    None
                    if t_pose[key] is None or c_pose[key] is None
                    else float(abs(t_pose[key] - c_pose[key]))
                )
                for label, key in (
                    ("yawΔ", "yaw_deg"),
                    ("pitchΔ", "pitch_deg"),
                    ("rollΔ", "roll_deg"),
                )
            },
        )
    )

for name, s in sims:
    print(f"{name}: cos_sim={s:.6f}")

# Summary statistics over the defined (non-NaN) similarities.
vals = [s for _, s in sims if not np.isnan(s)]
if vals:
    print("mean:", float(np.mean(vals)))
    print("min:", float(np.min(vals)))
    print("p5/p50/p95:", [float(np.percentile(vals, q)) for q in (5, 50, 95)])

for name, d in angle_deltas:
    print(name, {k: (None if v is None else round(v, 2)) for k, v in d.items()})

{'pitch_rad': 0.048106609821965914, 'yaw_rad': -0.7189976459608002, 'roll_rad': 0.030785039486177133, 'pitch_deg': 2.7563057094812393, 'yaw_deg': -41.19553059339523, 'roll_deg': 1.763852834701538}
{'pitch_rad': 0.4935035468493173, 'yaw_rad': -0.031622168024512336, 'roll_rad': -0.022954027304515394, 'pitch_deg': 28.275670409202576, 'yaw_deg': -1.811816766858101, 'roll_deg': -1.3151688873767853}
{'pitch_rad': -0.03541107318476343, 'yaw_rad': -0.007352626266396744, 'roll_rad': -0.3743799533763554, 'pitch_deg': -2.028905041515827, 'yaw_deg': -0.42127445340156555, 'roll_deg': -21.4503912627697}
{'pitch_rad': -0.1089539721531283, 'yaw_rad': 0.03182729254591486, 'roll_rad': 0.456609408277499, 'pitch_deg': -6.24260276556015, 'yaw_deg': 1.8235695362091064, 'roll_deg': 26.16179198026657}
{'pitch_rad': 0.17187581083741468, 'yaw_rad': 0.7500865042736795, 'roll_rad': 0.029549107776089066, 'pitch_deg': 9.847758561372757, 'yaw_deg': 42.976790964603424, 'roll_deg': 1.6930391639471054}
{'pitch_rad': -0