In [1]:
import sys

# Report which Python interpreter backs this kernel, then a greeting (one item per line).
print(sys.executable, "Hello MindSpore!", sep="\n")


/home/noor/A/projects/Upstyle/.venv/bin/python
Hello MindSpore!


In [5]:
!pip install -q torch torchvision --upgrade

# Install Hugging Face transformers + tokenizers + huggingface_hub
!pip install -q transformers tokenizers huggingface-hub --upgrade

# ONNX + ONNX Runtime for validation
!pip install -q onnx onnxruntime onnx-simplifier

# Pillow for image I/O, and timm (if FashionCLIP uses timm backbones)
!pip install -q pillow timm

# Optional for debugging graphs
!pip install -q graphviz

# (Do not install MindSpore (large) here if you only need converter_lite)
# You need to download MindSpore-Lite converter offline package (see later cell).

!pip install onnx --upgrade
!pip install onnxruntime --upgrade





In [6]:
# Cell 2 — Python imports and utility functions
import os
import torch
import numpy as np
from PIL import Image
from transformers import CLIPProcessor, CLIPModel, CLIPTokenizer, CLIPVisionModel, CLIPTextModel
import onnx
import onnxruntime as ort


In [7]:
# Smoke test: confirms the kernel executes cells; it defines nothing that other cells use.
print("working?")

working?


In [9]:

HF_MODEL = "patrickjohncyh/fashion-clip"

# Load the checkpoint through the stock Transformers CLIP classes.
# If loading fails, print the manual-fallback hint and then re-raise: every later
# cell needs `model` and `processor`, so continuing would only fail further down
# (or silently reuse objects left over from an earlier run of this kernel).
try:
    model = CLIPModel.from_pretrained(HF_MODEL, dtype=torch.float32)  # loads vision + text encoders
    processor = CLIPProcessor.from_pretrained(HF_MODEL)
except Exception as e:
    print("Transformers CLIPModel load failed — error:", e)
    print("Fallback: clone the GitHub repo and use the provided model code (see README).")
    # The fallback is manual (clone the repo and import its FashionCLIP class); it is not automated here.
    raise
else:
    # Inference only: switch off dropout and other training-time behaviour.
    model.eval()
    print("Loaded HF FashionCLIP via Transformers (CLIPModel).")


Loaded HF FashionCLIP via Transformers (CLIPModel).


In [11]:
# Cell 4 — Prepare inputs and run a forward pass in PyTorch
from PIL import Image
import requests
from io import BytesIO

# Example image: a local file path (despite the variable name). Replace with your own data.
img_url = "/home/noor/A/projects/Upstyle/upstyle_ai/clothes_data/Hoodie8.jpg"  # sample fashion-like image
# Open inside a context manager so the file handle is closed promptly;
# convert() returns a fully loaded copy that remains valid afterwards.
with Image.open(img_url) as img_file:
    image = img_file.convert("RGB")

# Example text prompt(s)
texts = ["a photo of a red dress", "black leather jacket"]

# Process inputs using the CLIPProcessor (handles transforms + tokenization)
inputs = processor(text=texts, images=image, return_tensors="pt", padding=True)

# Inspect shapes
print({k: v.shape for k, v in inputs.items()})

# Run the model to get embeddings (PyTorch).
# return_dict=True makes the model return an output object whose fields are read by
# name. Positional indexing is avoided on purpose: in the tuple form the first
# entries are the logits, not the embeddings.
with torch.no_grad():
    outputs = model(
        input_ids=inputs["input_ids"],
        attention_mask=inputs["attention_mask"],
        pixel_values=inputs["pixel_values"],
        return_dict=True,
    )

image_embeds = outputs.image_embeds
text_embeds = outputs.text_embeds

print("PyTorch image_embeds.shape:", image_embeds.shape)
print("PyTorch text_embeds.shape:", text_embeds.shape)


{'pixel_values': torch.Size([1, 3, 224, 224]), 'input_ids': torch.Size([2, 8]), 'attention_mask': torch.Size([2, 8])}
PyTorch image_embeds.shape: torch.Size([1, 512])
PyTorch text_embeds.shape: torch.Size([2, 512])


In [12]:
# Cell 5 — Create a wrapper nn.Module for ONNX export that accepts (pixel_values, input_ids, attention_mask)
# and returns normalized embeddings or logits (depending on FashionCLIP's forward signature).

import torch.nn as nn

class FashionCLIPWrapper(nn.Module):
    """Export-friendly adapter around a CLIP model.

    Takes the three tensors positionally, in the order
    (pixel_values, input_ids, attention_mask), and returns a plain tuple
    ``(image_embeds, text_embeds)`` of L2-normalized embeddings.
    """

    def __init__(self, clip_model):
        super().__init__()
        # The wrapped model; it must accept input_ids / attention_mask / pixel_values
        # keywords and return an object exposing .image_embeds and .text_embeds.
        self.clip = clip_model

    @staticmethod
    def _unit_norm(embeds):
        """Scale each vector along the last dimension to unit L2 length."""
        return embeds / embeds.norm(p=2, dim=-1, keepdim=True)

    def forward(self, pixel_values, input_ids, attention_mask):
        """Run the wrapped model and return normalized (image_embeds, text_embeds)."""
        clip_out = self.clip(
            input_ids=input_ids,
            attention_mask=attention_mask,
            pixel_values=pixel_values,
        )
        normalized_image = self._unit_norm(clip_out.image_embeds)
        normalized_text = self._unit_norm(clip_out.text_embeds)
        return normalized_image, normalized_text

# Wrap the loaded CLIP model and put the wrapper into eval mode.
# `wrapper.eval()` is the cell's last expression, so the notebook displays the module repr it returns.
wrapper = FashionCLIPWrapper(model)
wrapper.eval()


FashionCLIPWrapper(
  (clip): CLIPModel(
    (text_model): CLIPTextTransformer(
      (embeddings): CLIPTextEmbeddings(
        (token_embedding): Embedding(49408, 512)
        (position_embedding): Embedding(77, 512)
      )
      (encoder): CLIPEncoder(
        (layers): ModuleList(
          (0-11): 12 x CLIPEncoderLayer(
            (self_attn): CLIPAttention(
              (k_proj): Linear(in_features=512, out_features=512, bias=True)
              (v_proj): Linear(in_features=512, out_features=512, bias=True)
              (q_proj): Linear(in_features=512, out_features=512, bias=True)
              (out_proj): Linear(in_features=512, out_features=512, bias=True)
            )
            (layer_norm1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
            (mlp): CLIPMLP(
              (activation_fn): QuickGELUActivation()
              (fc1): Linear(in_features=512, out_features=2048, bias=True)
              (fc2): Linear(in_features=2048, out_features=512, bias=Tru

In [15]:
!pip -q install onnxscript onnx onnxruntime --upgrade

# Cell 6 — Export the wrapper to ONNX
import torch

# Construct dummy inputs matching real shapes
# pixel_values: (batch_size, 3, H, W)
batch_size = 1
pixel_values = inputs["pixel_values"]  # from processor earlier (tensor)
input_ids = inputs["input_ids"]
attention_mask = inputs["attention_mask"]

# If processor returned batch dimension mismatch, ensure shapes:
print("pixel_values.shape:", pixel_values.shape)
print("input_ids.shape:", input_ids.shape, "attention_mask.shape:", attention_mask.shape)

onnx_path = "fashionclip.onnx"

# Set export options
# - opset_version: 13 or higher recommended
# - dynamic_axes: batch size dynamic and text sequence length dynamic
dynamic_axes = {
    "pixel_values": {0: "batch_size", 2: "height", 3: "width"},
    "input_ids": {0: "batch_size", 1: "seq_len"},
    "attention_mask": {0: "batch_size", 1: "seq_len"},
    "image_embeds": {0: "batch_size"},
    "text_embeds": {0: "batch_size", 1: "seq_len"}  # text_embeds may have shape (batch, embed_dim) - adjust if needed
}

# ONNX export requires a tuple of inputs; ensure wrapper signature matches export
try:
    torch.onnx.export(
        wrapper,
        (pixel_values, input_ids, attention_mask),
        onnx_path,
        export_params=True,
        opset_version=13,
        do_constant_folding=True,
        input_names=["pixel_values", "input_ids", "attention_mask"],
        output_names=["image_embeds", "text_embeds"],
        dynamic_axes=dynamic_axes,
    )
    print("ONNX export succeeded:", onnx_path)
except Exception as e:
    print("ONNX export failed — check model structure / ops. Error:", e)
    # Useful debugging: try torch.onnx.export with verbose=True or export smaller components (vision/text separately).


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


pixel_values.shape: torch.Size([1, 3, 224, 224])
input_ids.shape: torch.Size([2, 8]) attention_mask.shape: torch.Size([2, 8])


  torch.onnx.export(
W1118 15:28:53.307000 15323 torch/onnx/_internal/exporter/_compat.py:114] Setting ONNX exporter to use operator set version 18 because the requested opset_version 13 is a lower version than we have implementations for. Automatic version conversion will be performed, which may not be successful at converting to the requested version. If version conversion is unsuccessful, the opset version of the exported model will be kept at 18. Please consider setting opset_version >=18 to leverage latest ONNX features


[torch.onnx] Obtain model graph for `FashionCLIPWrapper([...]` with `torch.export.export(..., strict=False)`...
[torch.onnx] Obtain model graph for `FashionCLIPWrapper([...]` with `torch.export.export(..., strict=False)`... ❌
[torch.onnx] Obtain model graph for `FashionCLIPWrapper([...]` with `torch.export.export(..., strict=True)`...


E1118 15:28:57.313000 15323 torch/_guards.py:368] [0/0] Error while creating guard:
E1118 15:28:57.313000 15323 torch/_guards.py:368] [0/0] Name: ''
E1118 15:28:57.313000 15323 torch/_guards.py:368] [0/0]     Source: shape_env
E1118 15:28:57.313000 15323 torch/_guards.py:368] [0/0]     Create Function: SHAPE_ENV
E1118 15:28:57.313000 15323 torch/_guards.py:368] [0/0]     Guard Types: None
E1118 15:28:57.313000 15323 torch/_guards.py:368] [0/0]     Code List: None
E1118 15:28:57.313000 15323 torch/_guards.py:368] [0/0]     Object Weakref: None
E1118 15:28:57.313000 15323 torch/_guards.py:368] [0/0]     Guarded Class Weakref: None
E1118 15:28:57.313000 15323 torch/_guards.py:368] [0/0] Traceback (most recent call last):
E1118 15:28:57.313000 15323 torch/_guards.py:368] [0/0]   File "/home/noor/A/projects/Upstyle/.venv/lib/python3.11/site-packages/torch/_guards.py", line 366, in create
E1118 15:28:57.313000 15323 torch/_guards.py:368] [0/0]     return self.create_fn(builder, self)
E1118 1

[torch.onnx] Obtain model graph for `FashionCLIPWrapper([...]` with `torch.export.export(..., strict=True)`... ❌
ONNX export failed — check model structure / ops. Error: Failed to export the model with torch.export. [96mThis is step 1/3[0m of exporting the model to ONNX. Next steps:
- Modify the model code for `torch.export.export` to succeed. Refer to https://pytorch.org/docs/stable/generated/exportdb/index.html for more information.
- Debug `torch.export.export` and submit a PR to PyTorch.
- Create an issue in the PyTorch GitHub repository against the [96m*torch.export*[0m component and attach the full error stack as well as reproduction scripts.

## Exception summary

<class 'torch._dynamo.exc.UserError'>: Constraints violated (L['pixel_values'].size()[2], L['pixel_values'].size()[3])! For more information, run with TORCH_LOGS="+dynamic".
  - You marked L['pixel_values'].size()[2] as dynamic but your code specialized it to be a constant (224). If you're using mark_dynamic, eithe

In [34]:
# Cell 7 — Run onnx-simplifier over the exported graph (helps compatibility with some converters)
# Needs the onnx-simplifier package, which is imported as `onnxsim`
from onnxsim import simplify

onnx_model = onnx.load(onnx_path)
# simplify() returns the simplified model plus a flag reporting whether its check passed.
model_simp, check = simplify(onnx_model)
if not check:
    print("ONNX simplification failed or returned not checkable. Keeping original ONNX.")
else:
    simplified_path = "fashionclip.simplified.onnx"
    onnx.save(model_simp, simplified_path)
    print("Saved simplified ONNX to:", simplified_path)
    # From here on `onnx_path` refers to the simplified file.
    onnx_path = simplified_path


Saved simplified ONNX to: fashionclip.simplified.onnx


In [35]:
# Cell 8 — Validate parity: run ONNXRuntime and compare to PyTorch outputs (cosine similarity)
import numpy as np
from numpy.linalg import norm

# Open an ONNX Runtime session on the model file currently referenced by `onnx_path`
ort_session = ort.InferenceSession(onnx_path)

# ONNX Runtime is fed numpy arrays: move every torch tensor to CPU and convert it,
# keyed by the input name used when the model was exported
ort_inputs = {
    input_name: tensor.cpu().numpy()
    for input_name, tensor in (
        ("pixel_values", pixel_values),
        ("input_ids", input_ids),
        ("attention_mask", attention_mask),
    )
}

# output_names=None asks the session for all model outputs;
# the first two are taken as image and text embeddings (assumes the export order)
ort_outs = ort_session.run(None, ort_inputs)
onnx_image_embeds, onnx_text_embeds = ort_outs[0], ort_outs[1]

# Reference embeddings from the PyTorch forward pass, as numpy arrays
pt_image, pt_text = (t.cpu().numpy() for t in (image_embeds, text_embeds))

def cos_sim(a, b):
    """Cosine similarity between two arrays, each flattened to 1-D first.

    A small constant (1e-8) is added to the denominator so all-zero inputs
    yield 0.0 instead of a division by zero.
    """
    u = a.reshape(-1)
    v = b.reshape(-1)
    denominator = norm(u) * norm(v) + 1e-8
    return np.dot(u, v) / denominator

# Report PyTorch-vs-ONNX agreement; values near 1.0 are expected
# (tiny numerical differences are fine).
image_parity = cos_sim(pt_image, onnx_image_embeds)
print("Cosine similarity image embedding (PyTorch vs ONNX):", image_parity)
text_parity = cos_sim(pt_text, onnx_text_embeds)
print("Cosine similarity text embedding (PyTorch vs ONNX):", text_parity)


Cosine similarity image embedding (PyTorch vs ONNX): 0.9999999303953517
Cosine similarity text embedding (PyTorch vs ONNX): 1.0000000546046477


In [38]:
# Environment setup for the MindSpore side of the conversion.
# NOTE(review): the first command upgrades numpy to the latest release and the second
# immediately pins numpy==1.23.5, so the numpy upgrade is undone — confirm which is intended.
# NOTE(review): tokenizers is pinned to 0.15.0 here although an earlier cell upgrades it;
# verify the installed transformers version still accepts this pin.
!pip install onnx onnxruntime numpy --upgrade
!pip install numpy==1.23.5 tokenizers==0.15.0 mindspore==2.7.1 mindformers==1.3.0

# MindSpore Lite Python wheel (CPU)
# Example for Linux (adjust version if needed)
# NOTE(review): this wheel is tagged cp310 (CPython 3.10). The recorded output of this cell
# shows pip rejecting it ("is not a supported wheel on this platform") while the other
# packages resolve to cp311 wheels, i.e. the kernel appears to run Python 3.11.
# Use a wheel whose cpXY tag matches the kernel's Python version.
!pip install https://ms-release.obs.cn-north-4.myhuaweicloud.com/2.3.0-rc1/MindSpore/lite/release/linux/cpu/x86_64/mindspore_lite-2.3.0rc1-cp310-cp310-linux_x86_64.whl


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)




huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Collecting numpy==1.23.5
  Downloading numpy-1.23.5-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (2.3 kB)
Collecting tokenizers==0.15.0
  Using cached tokenizers-0.15.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.7 kB)
INFO: pip is looking at multiple versions of scipy to determine which version is compatible with other requirements. This could take a while.
Collecting scipy>=1.5.4 (from mindspore==2.7.1)
  Downloading scipy-1.16.2-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (62 kB)
  Using cached scipy-1.16.1-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (61 kB)
  Downloading scipy-1.16.0-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (61 kB)
  Downloading scipy-1.15.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (61 kB)
Downloading numpy-1.23.5-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (17.1 MB)
[2K   [38;2;114;156;31m━━━━━━━━━

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


[31mERROR: mindspore_lite-2.3.0rc1-cp310-cp310-linux_x86_64.whl is not a supported wheel on this platform.[0m[31m
[0m