## CLIP Demo Code

In [None]:
# CLIP Demo Code

import torch
import clip
from PIL import Image

# Run on the GPU when CUDA is available; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
model, preprocess = clip.load("ViT-B/32", device=device)

# Inputs: the preprocessed image with a leading axis added by unsqueeze(0),
# and the three candidate captions after tokenization, both moved to `device`.
image = preprocess(Image.open("images/CLIP.png")).unsqueeze(0).to(device)
text = clip.tokenize(["a diagram", "a dog", "a cat"]).to(device)

# Inference only, so gradient tracking is switched off for everything below.
with torch.no_grad():
    # Stand-alone encodings of each input; the print below does not use them.
    image_features = model.encode_image(image)
    text_features = model.encode_text(text)

    # The joint forward pass returns two logit tensors; the per-image one is
    # normalized with a softmax along its last axis and copied out as NumPy.
    logits_per_image, logits_per_text = model(image, text)
    probs = torch.softmax(logits_per_image, dim=-1).cpu().numpy()

print("Label probs:", probs)  # prints: [[0.9927937  0.00421068 0.00299572]]

## CLIP processing for our app

In [13]:
import clip
import torch
from PIL import Image
import requests
from io import BytesIO

# Choose the compute device (GPU if CUDA is usable, else CPU), echo the
# choice, then load the ViT-B/32 CLIP checkpoint and its preprocessing
# callable onto that device.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print(device)
clip_model, preprocess = clip.load("ViT-B/32", device=device)

def process_and_upload_image(image_url, timeout=10.0):
    """Download an image and embed it with the notebook's CLIP model.

    Despite the name, nothing is uploaded here: the function only fetches
    the image, runs it through ``preprocess`` and ``clip_model.encode_image``
    (both notebook-level globals), and returns the result in two forms.

    Args:
        image_url: URL of the image; passed to ``requests.get``.
        timeout: Seconds to wait for the server, forwarded to
            ``requests.get``. Pass ``None`` to wait indefinitely, which was
            the behavior before this parameter existed.

    Returns:
        A tuple ``(image_embedding, image_features)``:
        ``image_features`` is the tensor returned by
        ``clip_model.encode_image`` for a batch holding this one image;
        ``image_embedding`` is the first row of that tensor converted to a
        plain Python list.

    Raises:
        requests.HTTPError: if the server answers with an error status.
    """
    # Download the image. A timeout keeps a stalled server from hanging the
    # cell forever, and the status check stops an HTTP error page from being
    # handed to the image decoder.
    response = requests.get(image_url, timeout=timeout)
    response.raise_for_status()
    image = Image.open(BytesIO(response.content))

    # Preprocess the image, add a leading batch axis, and move it to `device`.
    image_input = preprocess(image).unsqueeze(0).to(device)

    # Inference only: no gradients are needed for the embedding.
    with torch.no_grad():
        image_features = clip_model.encode_image(image_input)
        image_embedding = image_features.cpu().numpy().tolist()[0]

    return image_embedding, image_features

# Embed one sample image from the web and keep both returned forms.
url = "https://i.imgflip.com/3u1lfv.jpg"
embedding, features = process_and_upload_image(url)

# Show the type and size of each result: the tensor first, then the list.
for value, size in ((features, features.shape), (embedding, len(embedding))):
    print(type(value), size)

cuda
<class 'torch.Tensor'> torch.Size([1, 512])
<class 'list'> 512
