In [1]:
# This notebook is adapted for WSL, adb commands are different than in Linux environments
import glob
import json
import os
import shutil
import subprocess
import time
import zipfile
from pathlib import Path

import numpy as np
import sys
import torch
from PIL import Image
from transformers import AutoConfig, AutoProcessor

from qwen_vl_utils import process_vision_info
from common_files.nsptargets import NspTargets
# os.environ["TOKENIZERS_PARALLELISM"] = "false"

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Check and initialize paths to files in contestant_uploads/
# contestant_uploads/
# ├── ar*-ar*-cl*
# │   └── weight_sharing_model_*_of_*.serialized.bin
# ├── embedding_weights*.raw
# ├── inputs.json
# ├── mask.raw
# ├── position_ids_cos.raw
# ├── position_ids_sin.raw
# ├── serialized_binaries
# │   └── veg.serialized.bin
# └── tokenizer.json

execution_ws = os.getcwd()
context_path = execution_ws + "/contestant_uploads"
missing = []

# ---- Find ar*-ar*-cl* folders ----
ar_folders = glob.glob(os.path.join(context_path, "ar*-ar*-cl*"))
ar_folders = [f for f in ar_folders if os.path.isdir(f)]

if len(ar_folders) == 0:
    missing.append("ar*-ar*-cl* folder")
elif len(ar_folders) > 1:
    raise RuntimeError(
        f"Multiple ar*-ar*-cl* folders found: "
        f"{[os.path.basename(f) for f in ar_folders]}. "
        "Exactly one is required."
    )
else:
    ar_foldername = os.path.basename(ar_folders[0])

# ---- Check .serialized.bin(s) inside ar* folder ----
ar_bins = glob.glob(os.path.join(context_path, ar_foldername, "*.serialized.bin"))
if not ar_bins:
    missing.append("*.serialized.bin inside ar*-ar*-cl* folder")

# ---- Check embedding_weights*.raw ----
embed_files = glob.glob(os.path.join(context_path, "embedding_weights*.raw"))
embed_weights_filename = embed_files[0] if embed_files else None

if embed_weights_filename is None:
    missing.append("embedding_weights*.raw")
else:
    embed_weights_filename = os.path.basename(embed_weights_filename)

# ---- Required raw files ----
required_raws = [
    "mask.raw",
    "position_ids_cos.raw",
    "position_ids_sin.raw"
]

for raw in required_raws:
    if not os.path.isfile(os.path.join(context_path, raw)):
        missing.append(raw)

# ---- Check serialized_binaries folder ----
serialized_dir = os.path.join(context_path, "serialized_binaries")
if not os.path.isdir(serialized_dir):
    missing.append("serialized_binaries/")
else:
    serialized_bins = glob.glob(os.path.join(serialized_dir, "*.serialized.bin"))
    if not serialized_bins:
        missing.append("*.serialized.bin inside serialized_binaries/")

# ---- Check JSON files ----
required_jsons = ["inputs.json", "tokenizer.json"]
for jf in required_jsons:
    if not os.path.isfile(os.path.join(context_path, jf)):
        missing.append(jf)

# ---- Final validation ----
if missing:
    print("Missing required files/folders:")
    for m in missing:
        print(f"  - {m}")
    raise RuntimeError("Validation failed: required contestant_uploads content missing")

print("Contestant's upload files passed validation")

inputs_json_path = os.path.join(context_path, "inputs.json")

with open(inputs_json_path, "r") as f:
    inputs = json.load(f)

# ---- Top-level inputs ----
qwen2_vl_processor_input = inputs["qwen_vl_processor"]
llm_config_input = inputs["llm_config"]
inp_h_input = inputs["data_preprocess_inp_h"]
inp_w_input = inputs["data_preprocess_inp_w"]
run_veg_n_tokens_input  = inputs["run_veg_n_tokens"]
run_veg_embedding_dim_input  = inputs["run_veg_embedding_dim"]
# ---- Genie config ----
genie_config = inputs["genie_config"]

Contestant's upload files passed validation


In [3]:
ADB = "/mnt/c/platform-tools/adb.exe"
cmd = f'{ADB} devices -l'  # command as a single string
result = subprocess.run(cmd, capture_output=True, text=True, shell=True)

# Example result:
    # List of devices attached
    # d2d2c4d6               device product:canoe model:Canoe_for_arm64 device:canoe transport_id:1

lines = [line.strip() for line in result.stdout.splitlines() if line.strip()]
device_lines = lines[1:]  # Everything after the header

if not device_lines:
    raise RuntimeError("Error: No devices found.")

device_id = device_lines[0].split()[0]
print(f"Successfully connected to: {device_id}")

# Set up NSP Target
sys.path.append('../../')

# Change GEN4 here to GEN5
nsp_target = NspTargets.Android.GEN5 # For GEN5

Successfully connected to: d2d2c4d6


In [4]:
# Configure QNN SDK and Genie paths
# Link /qnn_assets to /qnn
if os.path.isdir("qnn_assets"):
    os.unlink("qnn_assets")
os.symlink("/qnn", "qnn_assets")

execution_ws = os.getcwd()
QNN_SDK_dir = os.path.join(execution_ws, "qnn_assets")
QNN_lib_dir = os.path.join(QNN_SDK_dir, "lib/aarch64-android")
QNN_binary = os.path.join(QNN_SDK_dir, "bin/aarch64-android/qnn-net-run")
GENIE_lib_dir = os.path.join(QNN_SDK_dir, "lib/aarch64-android")
GENIE_binary = os.path.join(QNN_SDK_dir, "bin/aarch64-android/genie-t2t-run")
QNN_skel = os.path.join(QNN_SDK_dir, "lib/hexagon-" + nsp_target.dsp_arch,  "unsigned", "lib" + nsp_target.qnn_htp_lib_name + "Skel.so")

des_dir = os.path.join(execution_ws, "to_device")
# Sub directories of des_dir for Qualla directory structure:
des_dir_models = os.path.join(des_dir, "models")
des_dir_qwen2_models = os.path.join(des_dir_models, "qwen2-vl")
des_dir_qwen2_models_2B = os.path.join(des_dir_qwen2_models, "2B-FT")
des_dir_qwen2_model_2B_data = os.path.join(des_dir_qwen2_models_2B, "data")

if os.path.exists(des_dir):
    shutil.rmtree(des_dir) # clear destination dir 
os.makedirs(des_dir_qwen2_model_2B_data) # will recursively create all required directories

# on device destination directories
target_device_dir = "/data/local/tmp/qwen2_vl_assets"

# This is the path of the prepared model binaries for Qwen2-VL (example2)
qwen2_models_context_path = os.path.join(context_path, ar_foldername) # MIGHT NEED TO CHANGE THIS LATER ON

# This is the list of split LLM model files (in order):
llm_model_names = os.listdir(qwen2_models_context_path)
llm_model_names.sort()
llm_model_names = [f for f in llm_model_names if os.path.isfile(os.path.join(qwen2_models_context_path, f))]

veg_models_context_path = os.path.join(context_path + "/serialized_binaries")

for model_bin in os.listdir(veg_models_context_path):
    src_file = os.path.join(veg_models_context_path, model_bin)
    # Only copy files, skip directories
    if os.path.isfile(src_file):
        shutil.copy(src_file, des_dir)

# Copy model binaries 
for model_bin in os.listdir(qwen2_models_context_path):
    src_file = os.path.join(qwen2_models_context_path, model_bin)
    if os.path.isfile(src_file):
        shutil.copy(src_file, des_dir_qwen2_models_2B)

qwen2_vl_embedding_buffer_file = os.path.join(context_path, embed_weights_filename)
shutil.copy(qwen2_vl_embedding_buffer_file, des_dir_qwen2_models_2B)

# Copy necessary libraries to a common location
QNN_libs = ["libQnnHtp.so", "libQnnHtpNetRunExtensions.so", "libQnnHtpPrepare.so", "lib" + nsp_target.qnn_htp_lib_name + "Stub.so", "libQnnSystem.so"]
for lib in QNN_libs:
    shutil.copy(os.path.join(QNN_lib_dir, lib), des_dir)

GENIE_libs = ["libGenie.so"]
for lib in GENIE_libs:
    shutil.copy(os.path.join(GENIE_lib_dir, lib), des_dir)

# Copy binaries
shutil.copy(QNN_binary, des_dir)
shutil.copy(GENIE_binary, des_dir)

# Copy Skel
shutil.copy(QNN_skel, des_dir)

qwen2_vl_tokenizer_path = context_path + "/tokenizer.json"
shutil.copy(qwen2_vl_tokenizer_path, des_dir_qwen2_models)

'/home/zacks/Tutorial_for_Qwen2_VL_2b_IoT/inference_script/to_device/models/qwen2-vl/tokenizer.json'

In [5]:
# htp_backend_extensions.json goes inside the LLM model data folder:
qwen2_data_folder_rel_path = os.path.relpath(des_dir_qwen2_model_2B_data, des_dir)
target_device_data_dir = os.path.join(target_device_dir, qwen2_data_folder_rel_path)

# HTP backend extensions config file (htp_backend_extensions.json) example
htp_backend_extensions_data = {
    "backend_extensions": {
        "shared_library_path": "libQnnHtpNetRunExtensions.so",
        "config_file_path": os.path.join(target_device_data_dir, "htp_backend_ext_config.json")
    }
}

# HTP backend config file (htp_backend_ext_config.json) example
htp_backend_ext_config_data = {
    "devices": [
        {
            "cores":[{
                "perf_profile": "burst",
                "rpc_control_latency": 100
            }]
        }
    ]
}

# write the config files to the destination
with open(os.path.join(des_dir, 'htp_backend_extensions.json'),'w') as f:
    f.write(json.dumps(htp_backend_extensions_data, indent=4))
# Genie and QNN will use the same htp_backend_ext_config_data, so it will be dumped to the location expected by Genie
with open(os.path.join(des_dir_qwen2_model_2B_data,  'htp_backend_ext_config.json'),'w') as f:
    f.write(json.dumps(htp_backend_ext_config_data, indent=4))

dialog = genie_config["dialog"]

# tokenizer
dialog["tokenizer"]["path"] = str(
    "models/qwen2-vl/tokenizer.json"
)

# backend extensions
dialog["engine"]["backend"]["extensions"] = str(
    "models/qwen2-vl/2B-FT/data/htp_backend_ext_config.json"
)

BASE_MODEL_DIR = Path("models/qwen2-vl/2B-FT")

# ctx-bins (supports multiple files)
dialog["engine"]["model"]["binary"]["ctx-bins"] = [
    str(BASE_MODEL_DIR / name) for name in llm_model_names
]

with open(os.path.join(des_dir, 'qwen2-vl-e2t-htp.json'), 'w') as f:
    f.write(json.dumps(genie_config, indent=4))

In [6]:
# OPTIONAL: remove target directory from the device if retrying these steps
# Changed for WSL:
RH = "localhost"
cmd_rm = [
    ADB,
    "-H", RH,
    "-s", device_id,
    "shell", "rm", "-rf", target_device_dir
]
result_rm = subprocess.run(cmd_rm, capture_output=True, text=True)
print(result_rm.stdout, result_rm.stderr)

 


In [7]:
zip_path = os.path.join(des_dir, "package.zip")
device_zip_path = f"{target_device_dir}/package.zip"

if os.path.exists(zip_path):
    os.remove(zip_path)

zip_path = os.path.join(os.path.dirname(des_dir), "package.zip")
device_zip_path = f"{target_device_dir}/package.zip"
# --- 1. create ZIP ---
print("Zipping directory:", des_dir)
t0 = time.time()
with zipfile.ZipFile(zip_path, 'w', compression=zipfile.ZIP_STORED) as zipf:
    for root, dirs, files in os.walk(des_dir):
        for file in files:
            full_path = os.path.join(root, file)
            arcname = os.path.relpath(full_path, des_dir)
            zipf.write(full_path, arcname)

t1 = time.time()
print(f"\tZipping completed in {t1 - t0:.2f}s")

# --- 2. adb push ZIP ---
print("Pushing ZIP to device...")
cmd_push = [
    ADB, "-H", RH, "-s", device_id,
    "push", zip_path, device_zip_path
]

t2 = time.time()
proc_push = subprocess.run(cmd_push, capture_output=True, text=True)
t3 = time.time()

# print("adb push stdout:", proc_push.stdout)
# print("adb push stderr:", proc_push.stderr)
print(f"adb push time: {t3 - t2:.2f}s\n")

# --- 3. adb unzip ---
print("Unzipping on device...")
cmd_unzip = (
    f'{ADB} -H {RH} -s {device_id} shell "cd {target_device_dir} && unzip -o package.zip"'
)

t4 = time.time()
proc_unzip = subprocess.run(cmd_unzip, shell=True, capture_output=True, text=True)
t5 = time.time()

print(f"\tadb unzip time: {t5 - t4:.2f}s")

# --- 4. Remove ZIP from device and host ---
print("Removing ZIPs from device and host...")
subprocess.run([
    ADB, "-H", RH, "-s", device_id,
    "shell", "rm", "-f", device_zip_path
], capture_output=True, text=True)
os.remove(zip_path)

print(f"\nTotal time: {t5 - t0:.2f}s")

Zipping directory: /home/zacks/Tutorial_for_Qwen2_VL_2b_IoT/inference_script/to_device
	Zipping completed in 5.26s
Pushing ZIP to device...
adb push time: 89.54s

Unzipping on device...
	adb unzip time: 1.11s
Removing ZIPs from device and host...

Total time: 95.91s


In [8]:
# Initializing the Lookup table using:
lookup_table_np = np.fromfile(os.path.join(des_dir_qwen2_models_2B, embed_weights_filename), dtype=np.float32) # CHANGE
# Reshape lookup table to n-vocab x embedding_vector_len
lookup_table_np = lookup_table_np.reshape(genie_config["dialog"]["context"]["n-vocab"], genie_config["dialog"]["embedding"]["size"])
def get_embeddings(token_ids):
    token_embeddings =  []
    # Get embedding for each token:
    for token_id in token_ids:
        token_embeddings.append(lookup_table_np[token_id, :])
    # Stack all token embeddings together:
    token_embeddings_np = np.stack(token_embeddings, axis=0)
    return token_embeddings_np

qwen2_vl_processor = AutoProcessor.from_pretrained(qwen2_vl_processor_input) # CHANGE

def data_preprocess(processor, img_path, inp_h=342, inp_w=512, prompt=''):
    messages = [{
        "role": "user",
        "content": [
            {
                "type": "image",
                "image": img_path,
                "resized_height": inp_h,
                "resized_width": inp_w,
            },
            {
                "type": "text",
                "text": prompt
            },
        ],
    }]
    # Preparation for inference
    text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    image_inputs, _ = process_vision_info(messages)
    inputs = processor(
        text=text,
        images=image_inputs,
        return_tensors="pt",
    )
    return inputs

# Define generic qnn-net-run block
def run_qnn_net_run(model_context, input_data_list):
    # Define tmp directory path for intermediate artifacts
    tmp_dirpath = os.path.abspath('tmp/inputs')
    os.makedirs(tmp_dirpath, exist_ok=True)
    
    # Dump each input data from input_data_list as raw file
    # and prepare input_list_filepath for qnn-net-run
    input_list_text = ''
    for index, input_data in enumerate(input_data_list):
        # Create and dump each input into raw file
        raw_file_path = f'{tmp_dirpath}/input_{index}.raw'
        input_data.tofile(raw_file_path)
        # Keep appending raw_file_path into input_list_text for input_list_filepath file
        input_list_text += target_device_dir + '/inputs/' + os.path.basename(raw_file_path) + ' '

    cos_data  = os.path.join(context_path, "position_ids_cos.raw")
    sin_data  = os.path.join(context_path, "position_ids_sin.raw")
    mask_data = os.path.join(context_path, "mask.raw")
    shutil.copy(cos_data, tmp_dirpath)
    shutil.copy(sin_data, tmp_dirpath)
    shutil.copy(mask_data, tmp_dirpath)
    input_list_text += target_device_dir + '/inputs/position_ids_cos.raw' + ' '
    input_list_text += target_device_dir + '/inputs/position_ids_sin.raw' + ' '
    input_list_text += target_device_dir + '/inputs/mask.raw' + ' '

    print(input_list_text)
    
    # Create input_list_filepath and add prepared input_list_text into this file
    input_list_filepath = f'{tmp_dirpath}/../input_list.txt'
    with open(input_list_filepath, 'w') as f:
        f.write(input_list_text)

    # Push input_list_filepath and input data raw files to device
    !{ADB} -H {RH} -s {device_id} push {input_list_filepath} {target_device_dir} > /dev/null
    !{ADB} -H {RH} -s {device_id} push {tmp_dirpath} {target_device_dir} > /dev/null
    
    # Execute qnn-net-run on shell
    !{ADB} -H {RH} -s {device_id} shell LD_LIBRARY_PATH={target_device_dir} ADSP_LIBRARY_PATH={target_device_dir} \
    {target_device_dir}/qnn-net-run --retrieve_context {model_context} --backend {target_device_dir}/libQnnHtp.so \
    --input_list {target_device_dir}/input_list.txt --output_dir {target_device_dir} \
    --config_file {target_device_dir}/htp_backend_extensions.json > {tmp_dirpath}/log.txt

    # Pull the output file from device
    !{ADB} -H {RH} -s {device_id} pull {target_device_dir}/Result_0/vision_embedding.raw {tmp_dirpath} > /dev/null
    
    # Read the output data generated by qnn-net-run
    output_data = np.fromfile(f'{tmp_dirpath}/vision_embedding.raw', dtype=np.float32)
    
    # Delete all intermediate artifacts
    # shutil.rmtree(tmp_dirpath)
    
    return output_data

def run_veg(pixel_values, n_tokens=216, embedding_dim=1536):
    input_data_list = [pixel_values]
    output_data = run_qnn_net_run(f'{target_device_dir}/veg.serialized.bin', input_data_list)
    # VEG output should be of shape (1, 529, 1536) 
    # (640. 640) -> (644, 644) -> (46  46) -> 529 = 46 * 46 / 4
    # (342, 512) -> (336, 504) -> (24, 36) -> 24 * 36 /4 =  216
    output_data = output_data.reshape((1, n_tokens, embedding_dim))
    return output_data


In [10]:
# --- Configuration ---
host_image_folder = "dataset/images"
host_prompts_folder = "dataset/prompts"
device_embeds_dir = f"{target_device_dir}/ImageEmbeds"
llm_config = AutoConfig.from_pretrained(
    llm_config_input, # CHANGE: let contestants config this model name
    trust_remote_code=True
) 

# --- Ensure Device Directory Exists ---
print(f"Creating directory on device: {device_embeds_dir}")
subprocess.run(
    f'{ADB} -H {RH} -s {device_id} shell "mkdir -p {device_embeds_dir}"', 
    shell=True, 
    check=True
)

# --- Prepare File Lists ---
image_files = glob.glob(os.path.join(host_image_folder, "*.png"))
prompt_files = glob.glob(os.path.join(host_prompts_folder, "*.txt"))
print(f"Found {len(image_files)} images and {len(prompt_files)} prompt variations.")

TMP_ROOT = Path("tmp")
TMP_ROOT.mkdir(parents=True, exist_ok=True)

# Processing Loop
for img_path in image_files:
    img_name_root = Path(img_path).stem

    # 1. Image Preprocessing & Vision Encoder (Done ONCE per image)
    try:
        temp_inputs = data_preprocess(qwen2_vl_processor, img_path, inp_h_input, inp_w_input, "")
        pixel_values = temp_inputs['pixel_values'].detach().numpy().astype(np.float32)

        print(f"Running Vision Encoder for {img_name_root}...")
        image_embeddings_raw = run_veg(pixel_values)
        image_embeddings_torch = torch.from_numpy(image_embeddings_raw)

    except Exception as e:
        print(f"Failed to process image {img_name_root}: {e}")
        continue

    # 2. Prompt Loop (Combine the fixed image with each unique prompt)
    for prompt_path in prompt_files:
        prompt_name = Path(prompt_path).stem
        raw_filename = f"{img_name_root}_{prompt_name}.raw"
        raw_path = TMP_ROOT / raw_filename

        try:
            with open(prompt_path, "r", encoding="utf-8") as f:
                prompt_text = f.read()

            # Preprocess text with the image
            inputs = data_preprocess(qwen2_vl_processor, img_path, inp_h_input, inp_w_input, prompt_text)
            token_ids = inputs['input_ids']

            # Generate Text Embeddings locally
            inputs_embeds = torch.from_numpy(get_embeddings(token_ids))

            # Masking logic to insert cached image embeddings into new text embeddings
            image_mask = ((inputs['input_ids'] == llm_config.image_token_id)
                          .unsqueeze(-1)
                          .expand_as(inputs_embeds))

            # Combine
            final_embeds = inputs_embeds.masked_scatter(image_mask, image_embeddings_torch).detach().numpy()

            final_embeds.tofile(raw_path)
            print(f"Generated {raw_filename}")

        except Exception as e:
            print(f"  -> Failed prompt {prompt_name} for image {img_name_root}: {e}")

# --- 3. Zip everything once ---
zip_path = TMP_ROOT / "all_image_prompt_embeddings.zip"
print(f"\nZipping all embeddings → {zip_path.name}")
with zipfile.ZipFile(zip_path, "w", compression=zipfile.ZIP_STORED) as zf:
    for raw_file in TMP_ROOT.glob("*.raw"):
        zf.write(raw_file, arcname=raw_file.name)

# --- 4. Push zip to device ---
print("Pushing zip to device...")
subprocess.run(
    f'{ADB} -H {RH} -s {device_id} push {zip_path} {device_embeds_dir}/',
    shell=True,
    check=True
)

# --- 5. Unzip on device ---
print("Unzipping on device...")
subprocess.run(
    f'{ADB} -H {RH} -s {device_id} shell '
    f'"cd {device_embeds_dir} && unzip -o {zip_path.name}"',
    shell=True,
    check=True
)

# --- 6. Cleanup ---
for raw_file in TMP_ROOT.glob("*.raw"):
    raw_file.unlink()
zip_path.unlink()

print("All embeddings processed, zipped, pushed, and unzipped on device")

Unrecognized keys in `rope_scaling` for 'rope_type'='default': {'mrope_section'}


Creating directory on device: /data/local/tmp/qwen2_vl_assets/ImageEmbeds
Found 10 images and 2 prompt variations.
Running Vision Encoder for kev2...
/data/local/tmp/qwen2_vl_assets/inputs/input_0.raw /data/local/tmp/qwen2_vl_assets/inputs/position_ids_cos.raw /data/local/tmp/qwen2_vl_assets/inputs/position_ids_sin.raw /data/local/tmp/qwen2_vl_assets/inputs/mask.raw 
/home/zacks/Tutorial_for_Qwen2_VL_2b_IoT/inference_script/tmp/inputs/../input_list.txt: 1 file pushed, 0 skipped. 0.0 MB/s (219 bytes in 0.007s)


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


/home/zacks/Tutorial_for_Qwen2_VL_2b_IoT/inference_script/tmp/inputs\: 6 files pushed, 0 skipped. 25.8 MB/s (8654337 bytes in 0.321s)


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


/data/local/tmp/qwen2_vl_assets/Result_0/vision_embedding.raw: 1 file pulled, 0 skipped. 24.8 MB/s (1327104 bytes in 0.051s)


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Generated kev2_prompt2.raw
Generated kev2_prompt1.raw
Running Vision Encoder for kev5...
/data/local/tmp/qwen2_vl_assets/inputs/input_0.raw /data/local/tmp/qwen2_vl_assets/inputs/position_ids_cos.raw /data/local/tmp/qwen2_vl_assets/inputs/position_ids_sin.raw /data/local/tmp/qwen2_vl_assets/inputs/mask.raw 
/home/zacks/Tutorial_for_Qwen2_VL_2b_IoT/inference_script/tmp/inputs/../input_list.txt: 1 file pushed, 0 skipped. 0.0 MB/s (219 bytes in 0.008s)


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


/home/zacks/Tutorial_for_Qwen2_VL_2b_IoT/inference_script/tmp/inputs\: 6 files pushed, 0 skipped. 24.8 MB/s (8654337 bytes in 0.333s)


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


/data/local/tmp/qwen2_vl_assets/Result_0/vision_embedding.raw: 1 file pulled, 0 skipped. 19.1 MB/s (1327104 bytes in 0.066s)


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Generated kev5_prompt2.raw
Generated kev5_prompt1.raw
Running Vision Encoder for SDXL_catA_p5...
/data/local/tmp/qwen2_vl_assets/inputs/input_0.raw /data/local/tmp/qwen2_vl_assets/inputs/position_ids_cos.raw /data/local/tmp/qwen2_vl_assets/inputs/position_ids_sin.raw /data/local/tmp/qwen2_vl_assets/inputs/mask.raw 
/home/zacks/Tutorial_for_Qwen2_VL_2b_IoT/inference_script/tmp/inputs/../input_list.txt: 1 file pushed, 0 skipped. 0.0 MB/s (219 bytes in 0.005s)


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


/home/zacks/Tutorial_for_Qwen2_VL_2b_IoT/inference_script/tmp/inputs\: 6 files pushed, 0 skipped. 25.3 MB/s (8654337 bytes in 0.327s)


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


/data/local/tmp/qwen2_vl_assets/Result_0/vision_embedding.raw: 1 file pulled, 0 skipped. 17.6 MB/s (1327104 bytes in 0.072s)


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Generated SDXL_catA_p5_prompt2.raw
Generated SDXL_catA_p5_prompt1.raw
Running Vision Encoder for SDXL_catA_p4...
/data/local/tmp/qwen2_vl_assets/inputs/input_0.raw /data/local/tmp/qwen2_vl_assets/inputs/position_ids_cos.raw /data/local/tmp/qwen2_vl_assets/inputs/position_ids_sin.raw /data/local/tmp/qwen2_vl_assets/inputs/mask.raw 
/home/zacks/Tutorial_for_Qwen2_VL_2b_IoT/inference_script/tmp/inputs/../input_list.txt: 1 file pushed, 0 skipped. 0.0 MB/s (219 bytes in 0.009s)


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


/home/zacks/Tutorial_for_Qwen2_VL_2b_IoT/inference_script/tmp/inputs\: 6 files pushed, 0 skipped. 33.8 MB/s (8654337 bytes in 0.244s)


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


/data/local/tmp/qwen2_vl_assets/Result_0/vision_embedding.raw: 1 file pulled, 0 skipped. 18.4 MB/s (1327104 bytes in 0.069s)
Generated SDXL_catA_p4_prompt2.raw
Generated SDXL_catA_p4_prompt1.raw
Running Vision Encoder for SDXL_catA_p3...
/data/local/tmp/qwen2_vl_assets/inputs/input_0.raw /data/local/tmp/qwen2_vl_assets/inputs/position_ids_cos.raw /data/local/tmp/qwen2_vl_assets/inputs/position_ids_sin.raw /data/local/tmp/qwen2_vl_assets/inputs/mask.raw 
/home/zacks/Tutorial_for_Qwen2_VL_2b_IoT/inference_script/tmp/inputs/../input_list.txt: 1 file pushed, 0 skipped. 0.0 MB/s (219 bytes in 0.008s)


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


/home/zacks/Tutorial_for_Qwen2_VL_2b_IoT/inference_script/tmp/inputs\: 6 files pushed, 0 skipped. 34.4 MB/s (8654337 bytes in 0.240s)


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


/data/local/tmp/qwen2_vl_assets/Result_0/vision_embedding.raw: 1 file pulled, 0 skipped. 18.9 MB/s (1327104 bytes in 0.067s)


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Generated SDXL_catA_p3_prompt2.raw
Generated SDXL_catA_p3_prompt1.raw
Running Vision Encoder for kev1...
/data/local/tmp/qwen2_vl_assets/inputs/input_0.raw /data/local/tmp/qwen2_vl_assets/inputs/position_ids_cos.raw /data/local/tmp/qwen2_vl_assets/inputs/position_ids_sin.raw /data/local/tmp/qwen2_vl_assets/inputs/mask.raw 
/home/zacks/Tutorial_for_Qwen2_VL_2b_IoT/inference_script/tmp/inputs/../input_list.txt: 1 file pushed, 0 skipped. 0.0 MB/s (219 bytes in 0.008s)


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


/home/zacks/Tutorial_for_Qwen2_VL_2b_IoT/inference_script/tmp/inputs\: 6 files pushed, 0 skipped. 21.2 MB/s (8654337 bytes in 0.388s)


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


/data/local/tmp/qwen2_vl_assets/Result_0/vision_embedding.raw: 1 file pulled, 0 skipped. 15.0 MB/s (1327104 bytes in 0.084s)


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Generated kev1_prompt2.raw
Generated kev1_prompt1.raw
Running Vision Encoder for SDXL_catA_p2...
/data/local/tmp/qwen2_vl_assets/inputs/input_0.raw /data/local/tmp/qwen2_vl_assets/inputs/position_ids_cos.raw /data/local/tmp/qwen2_vl_assets/inputs/position_ids_sin.raw /data/local/tmp/qwen2_vl_assets/inputs/mask.raw 
/home/zacks/Tutorial_for_Qwen2_VL_2b_IoT/inference_script/tmp/inputs/../input_list.txt: 1 file pushed, 0 skipped. 0.0 MB/s (219 bytes in 0.011s)


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


/home/zacks/Tutorial_for_Qwen2_VL_2b_IoT/inference_script/tmp/inputs\: 6 files pushed, 0 skipped. 25.4 MB/s (8654337 bytes in 0.325s)


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


/data/local/tmp/qwen2_vl_assets/Result_0/vision_embedding.raw: 1 file pulled, 0 skipped. 19.4 MB/s (1327104 bytes in 0.065s)


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Generated SDXL_catA_p2_prompt2.raw
Generated SDXL_catA_p2_prompt1.raw
Running Vision Encoder for SDXL_catA_p1...
/data/local/tmp/qwen2_vl_assets/inputs/input_0.raw /data/local/tmp/qwen2_vl_assets/inputs/position_ids_cos.raw /data/local/tmp/qwen2_vl_assets/inputs/position_ids_sin.raw /data/local/tmp/qwen2_vl_assets/inputs/mask.raw 
/home/zacks/Tutorial_for_Qwen2_VL_2b_IoT/inference_script/tmp/inputs/../input_list.txt: 1 file pushed, 0 skipped. 0.0 MB/s (219 bytes in 0.010s)


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


/home/zacks/Tutorial_for_Qwen2_VL_2b_IoT/inference_script/tmp/inputs\: 6 files pushed, 0 skipped. 23.9 MB/s (8654337 bytes in 0.346s)


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


/data/local/tmp/qwen2_vl_assets/Result_0/vision_embedding.raw: 1 file pulled, 0 skipped. 23.8 MB/s (1327104 bytes in 0.053s)


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Generated SDXL_catA_p1_prompt2.raw
Generated SDXL_catA_p1_prompt1.raw
Running Vision Encoder for kev3...
/data/local/tmp/qwen2_vl_assets/inputs/input_0.raw /data/local/tmp/qwen2_vl_assets/inputs/position_ids_cos.raw /data/local/tmp/qwen2_vl_assets/inputs/position_ids_sin.raw /data/local/tmp/qwen2_vl_assets/inputs/mask.raw 
/home/zacks/Tutorial_for_Qwen2_VL_2b_IoT/inference_script/tmp/inputs/../input_list.txt: 1 file pushed, 0 skipped. 0.0 MB/s (219 bytes in 0.015s)


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


/home/zacks/Tutorial_for_Qwen2_VL_2b_IoT/inference_script/tmp/inputs\: 6 files pushed, 0 skipped. 26.6 MB/s (8654337 bytes in 0.310s)


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


/data/local/tmp/qwen2_vl_assets/Result_0/vision_embedding.raw: 1 file pulled, 0 skipped. 19.7 MB/s (1327104 bytes in 0.064s)


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Generated kev3_prompt2.raw
Generated kev3_prompt1.raw
Running Vision Encoder for kev4...
/data/local/tmp/qwen2_vl_assets/inputs/input_0.raw /data/local/tmp/qwen2_vl_assets/inputs/position_ids_cos.raw /data/local/tmp/qwen2_vl_assets/inputs/position_ids_sin.raw /data/local/tmp/qwen2_vl_assets/inputs/mask.raw 
/home/zacks/Tutorial_for_Qwen2_VL_2b_IoT/inference_script/tmp/inputs/../input_list.txt: 1 file pushed, 0 skipped. 0.0 MB/s (219 bytes in 0.007s)


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


/home/zacks/Tutorial_for_Qwen2_VL_2b_IoT/inference_script/tmp/inputs\: 6 files pushed, 0 skipped. 34.8 MB/s (8654337 bytes in 0.237s)


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


/data/local/tmp/qwen2_vl_assets/Result_0/vision_embedding.raw: 1 file pulled, 0 skipped. 19.8 MB/s (1327104 bytes in 0.064s)


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Generated kev4_prompt2.raw
Generated kev4_prompt1.raw

Zipping all embeddings → all_image_prompt_embeddings.zip
Pushing zip to device...


tmp/all_image_prompt_embeddings.zip: 1 file pushed, 0 skipped. 38.4 MB/s (29862182 bytes in 0.742s)


Unzipping on device...
Archive:  all_image_prompt_embeddings.zip
  inflating: SDXL_catA_p5_prompt2.raw
  inflating: SDXL_catA_p1_prompt2.raw
  inflating: kev2_prompt2.raw
  inflating: SDXL_catA_p1_prompt1.raw
  inflating: SDXL_catA_p3_prompt1.raw
  inflating: SDXL_catA_p3_prompt2.raw
  inflating: SDXL_catA_p4_prompt1.raw
  inflating: kev4_prompt2.raw
  inflating: kev5_prompt2.raw
  inflating: SDXL_catA_p2_prompt2.raw
  inflating: kev3_prompt2.raw
  inflating: kev1_prompt1.raw
  inflating: kev1_prompt2.raw
  inflating: SDXL_catA_p4_prompt2.raw
  inflating: kev4_prompt1.raw
  inflating: kev2_prompt1.raw
  inflating: SDXL_catA_p5_prompt1.raw
  inflating: kev5_prompt1.raw
  inflating: kev3_prompt1.raw
  inflating: SDXL_catA_p2_prompt1.raw
All embeddings processed, zipped, pushed, and unzipped on device


In [11]:
# --- Configuration ---

# Paths
device_output_dir = "Outputs"      # Relative to target_device_dir
host_output_dir = "Host_Outputs"    # Local folder for results to pull into
os.makedirs(host_output_dir, exist_ok=True)

# --- 1. Define the Batch Script ---
batch_script_content = f"""#!/bin/sh
cd {target_device_dir}
export LD_LIBRARY_PATH={target_device_dir}
export ADSP_LIBRARY_PATH={target_device_dir}

# Clean and create output directories
rm -rf {device_output_dir}
mkdir -p {device_output_dir}

# Use a count to track if anything actually gets done
count=0

for f in {device_embeds_dir}/*.raw; do
    # Strict check to see if the file exists
    [ -e "$f" ] || continue
    
    filename=$(basename "$f")
    output_name="${{filename%.*}}.txt"
    
    echo "------------------------------------------------" >> timing_summary.txt
    echo "$filename" >> timing_summary.txt
    echo "Running $filename..."
    
    {{ time ./genie-t2t-run \\
        -c qwen2-vl-e2t-htp.json \\
        -e "$f" \\
        -t models/qwen2-vl/2B-FT/{embed_weights_filename} \\
        > "{device_output_dir}/$output_name"; }} 2>> timing_summary.txt
    
    count=$((count + 1))
done

echo "\nBatch processing complete. Processed $count files."
"""

script_name = "batch_run.sh"
# Use newline='\n' to ensure Linux line endings for the script
with open(script_name, "w", newline='\n') as f:
    f.write(batch_script_content)

# --- 2. Push Script & Prepare Device ---
print("Pushing batch script to device...")
subprocess.run(f'{ADB} -H {RH} -s {device_id} push {script_name} {target_device_dir}/', shell=True, check=True)
subprocess.run(f'{ADB} -H {RH} -s {device_id} shell "chmod +x {target_device_dir}/{script_name}"', shell=True, check=True)

# --- 3. Execute Batch Script (One ADB Call) ---
print("\nStarting continuous batch inference on device...")
host_start_time = time.time()

# This command triggers the script and blocks until it finishes
cmd = f'{ADB} -H {RH} -s {device_id} shell "{target_device_dir}/{script_name}"'
process = subprocess.run(cmd, shell=True)

host_end_time = time.time()

if process.returncode != 0:
    print("Error occurred during batch execution. Check connection.")
else:
    print("Device finished execution.")

# --- 4. Pull Results and Timing Logs ---
print("\nPulling inference outputs...")
# Pull the folder containing all output text files
subprocess.run(f'{ADB} -H {RH} -s {device_id} pull {target_device_dir}/{device_output_dir}/. {host_output_dir}/', shell=True, check=True)

print("Pulling timing summary...")
# Pull the timing log file
subprocess.run(f'{ADB} -H {RH} -s {device_id} pull {target_device_dir}/timing_summary.txt {host_output_dir}/', shell=True, check=True)

# --- 5. Report ---
total_duration = host_end_time - host_start_time
print(f"\nTotal Host Wall-Clock Time: {total_duration:.2f} seconds")
print(f"Results saved to: {host_output_dir}/")
print(f"Timing log saved to: {host_output_dir}/timing_summary.txt")

# Cleanup local script file
if os.path.exists(script_name):
    os.remove(script_name)

Pushing batch script to device...


batch_run.sh: 1 file pushed, 0 skipped. 0.1 MB/s (1074 bytes in 0.020s)



Starting continuous batch inference on device...
Starting Batch Inference...
Running SDXL_catA_p1_prompt1.raw...
Running SDXL_catA_p1_prompt2.raw...
Running SDXL_catA_p2_prompt1.raw...
Running SDXL_catA_p2_prompt2.raw...
Running SDXL_catA_p3_prompt1.raw...
Running SDXL_catA_p3_prompt2.raw...
Running SDXL_catA_p4_prompt1.raw...
Running SDXL_catA_p4_prompt2.raw...
Running SDXL_catA_p5_prompt1.raw...
Running SDXL_catA_p5_prompt2.raw...
Running kev1_prompt1.raw...
Running kev1_prompt2.raw...
Running kev2_prompt1.raw...
Running kev2_prompt2.raw...
Running kev3_prompt1.raw...
Running kev3_prompt2.raw...
Running kev4_prompt1.raw...
Running kev4_prompt2.raw...
Running kev5_prompt1.raw...
Running kev5_prompt2.raw...
Batch processing complete. Processed 20 files.
Device finished execution.

Pulling inference outputs...
Pulling timing summary...

Total Host Wall-Clock Time: 50.30 seconds
Results saved to: HostOutputs/
Timing log saved to: HostOutputs/timing_summary.txt


/data/local/tmp/qwen2_vl_assets/Outputs/./: 20 files pulled, 0 skipped. 0.1 MB/s (10382 bytes in 0.085s)
/data/local/tmp/qwen2_vl_assets/timing_summary.txt: 1 file pulled, 0 skipped. 1.1 MB/s (4400 bytes in 0.004s)
