# Zero Shot Image Classification using CLIP

In [3]:
import torch
import clip
import os
from PIL import Image

## Development

### Load Model

On Linux, you can download the pretrained model from a terminal with `wget`:
```bash
wget https://openaipublic.azureedge.net/clip/models/40d365715913c9da98579312b702a82c18be219cc2a73407c4526f58eba950af/ViT-B-32.pt
```

In [20]:
# Path to the CLIP checkpoint file (see the wget command above)
model_path = 'ViT-B-32.pt'
# clip.load returns two objects: the model called in `predict`, and the
# `transformations` callable applied to each image in `preprocess`.
# The checkpoint is loaded on the CPU with jit=True.
model, transformations = clip.load(model_path, device='cpu', jit=True)

### Preprocess

In [5]:
def preprocess(image, classes):
    """Convert one image and its candidate labels into model inputs.

    Args:
        image: Image object accepted by the module-level `transformations`
            callable (PIL images in this notebook).
        classes: Text labels/prompts passed as-is to `clip.tokenize`.

    Returns:
        A 2-tuple `(image_batch, text_tokens)`: the transformed image with an
        extra leading axis of size 1, and the tokenized labels.
    """
    # Apply the transform returned by clip.load, then add a leading axis
    image_batch = transformations(image)
    image_batch = image_batch.unsqueeze(0)

    # Tokenize the labels in one call
    text_tokens = clip.tokenize(classes)

    return image_batch, text_tokens

### Load Sample Data

In [6]:
# Collect the sample images found in the local `images` directory.
image_paths = []
images = []

# os.listdir returns entries in arbitrary order, so sort them to keep the
# printed indices reproducible across runs and platforms. Non-file entries
# (e.g. sub-directories) are skipped because Image.open cannot read them.
image_files = sorted(
    name for name in os.listdir('images')
    if os.path.isfile(os.path.join('images', name))
)

for i, file in enumerate(image_files):
    img_path = os.path.join('images', file)
    image_paths.append(img_path)
    images.append(Image.open(img_path))
    print(f'{i}: {file}')

0: dancing.png
1: elon_masked.jpg
2: kids_playing.jpg
3: plane.jpg
4: traffic.jpg


In [14]:
# Candidate class descriptions; later cells wrap each one as
# "A photo of {x}" before passing it to predict().
classes = [
    "kids playing", 
    "dancing",
    "elon musk wearing a face mask", 
    "aeroplane",
    "traffic",
]

In [21]:
def predict(image, classes):
    """Rank candidate text prompts for a single image using the loaded model.

    Args:
        image: Image accepted by `preprocess` (PIL images in this notebook).
        classes: List of text prompts, one per candidate class.

    Returns:
        A list of `(prompt, probability)` tuples sorted by descending
        probability. Probabilities are rounded to 4 decimal places.
    """
    # Preprocess inputs
    image_input, classes_input = preprocess(image, classes)

    # Forward pass on the model. This is inference only, so disable autograd
    # to avoid building a gradient graph.
    with torch.no_grad():
        logits_per_image, _logits_per_text = model(image_input, classes_input)

    # Normalize the image-to-text similarity logits using softmax.
    # squeeze(0) drops only the batch axis added in `preprocess`; a bare
    # squeeze() would also collapse a single-class result to a scalar, making
    # tolist() return a float instead of a list.
    probs = logits_per_image.softmax(dim=-1).squeeze(0).tolist()

    # Pair each prompt with its rounded probability
    output = [(label, round(prob, 4)) for label, prob in zip(classes, probs)]

    # Highest probability first
    return sorted(output, key=lambda x: x[1], reverse=True)

In [16]:
# Run the notebook-level predict() on every sample image, wrapping each class
# name in the "A photo of ..." prompt template.
for path, image in zip(image_paths, images):
    print(f'Input: {path}', 'Output:', sep='\n')
    print(predict(image, [f"A photo of {x}" for x in classes]))
    print('--------------------------------')

Input: images\dancing.png
Output:
[('A photo of dancing', 0.9004), ('A photo of kids playing', 0.0744), ('A photo of aeroplane', 0.0137), ('A photo of elon musk wearing a face mask', 0.0072), ('A photo of traffic', 0.0043)]
--------------------------------
Input: images\elon_masked.jpg
Output:
[('A photo of elon musk wearing a face mask', 1.0), ('A photo of kids playing', 0.0), ('A photo of dancing', 0.0), ('A photo of aeroplane', 0.0), ('A photo of traffic', 0.0)]
--------------------------------
Input: images\kids_playing.jpg
Output:
[('A photo of kids playing', 0.9994), ('A photo of dancing', 0.0005), ('A photo of elon musk wearing a face mask', 0.0), ('A photo of aeroplane', 0.0), ('A photo of traffic', 0.0)]
--------------------------------
Input: images\plane.jpg
Output:
[('A photo of aeroplane', 0.9983), ('A photo of traffic', 0.0016), ('A photo of dancing', 0.0001), ('A photo of kids playing', 0.0), ('A photo of elon musk wearing a face mask', 0.0)]
----------------------------

## Deployment

### Test Inference Script

In [17]:
from inference import CLIPImageClassifier

In [18]:
# Build the classifier imported from inference.py, using the same checkpoint
# path (`model_path`) as the Development section.
classifier = CLIPImageClassifier(model_path)

In [19]:
# Repeat the sample predictions through the packaged classifier so its output
# can be compared with the Development section.
for path, image in zip(image_paths, images):
    print(f'Input: {path}', 'Output:', sep='\n')
    print(classifier.predict(image, [f"A photo of {x}" for x in classes]))
    print('--------------------------------')

Input: images\dancing.png
Output:
[('A photo of dancing', 0.9004), ('A photo of kids playing', 0.0744), ('A photo of aeroplane', 0.0137), ('A photo of elon musk wearing a face mask', 0.0072), ('A photo of traffic', 0.0043)]
--------------------------------
Input: images\elon_masked.jpg
Output:
[('A photo of elon musk wearing a face mask', 1.0), ('A photo of kids playing', 0.0), ('A photo of dancing', 0.0), ('A photo of aeroplane', 0.0), ('A photo of traffic', 0.0)]
--------------------------------
Input: images\kids_playing.jpg
Output:
[('A photo of kids playing', 0.9994), ('A photo of dancing', 0.0005), ('A photo of elon musk wearing a face mask', 0.0), ('A photo of aeroplane', 0.0), ('A photo of traffic', 0.0)]
--------------------------------
Input: images\plane.jpg
Output:
[('A photo of aeroplane', 0.9983), ('A photo of traffic', 0.0016), ('A photo of dancing', 0.0001), ('A photo of kids playing', 0.0), ('A photo of elon musk wearing a face mask', 0.0)]
----------------------------

### 1. Initialize Hub API Project
Open a terminal and run the following command,
```
hub init clip-classifier
```

### 2. Integration

#### i. Copy the files from `model_files/` to `clip-classifier/model/` folder in Hub API project

#### ii. Replace the `clip-classifier/src/main.py` code with this,
```python
import json
import os
# Add your own import statements
from inference import CLIPImageClassifier

# This environment variable gives you the
# path to the directory of your model. You
# can use this in your code to load the model
# and other large files.
MODEL_DIR = os.getenv("MODEL_DIR")
# Create the classifier once at import time from the checkpoint in MODEL_DIR.
# NOTE(review): os.getenv returns None when MODEL_DIR is unset, which would
# make os.path.join fail — presumably the Hub runtime always sets it; confirm.
# NOTE(review): no request handler is shown in this snippet and `json` is
# unused here — confirm whether the rest of main.py should be kept.
classifier = CLIPImageClassifier(os.path.join(MODEL_DIR, 'ViT-B-32.pt'))

```

#### iii. Add the libraries in `clip-classifier/src/requirements.txt`
```
torch
clip
```

### 3. Build and Deploy

Change directory into the `clip-classifier` project folder in the terminal and then run the following commands,
```bash
hub build
hub deploy
```

### Test the Deployed API

In [22]:
import os
import json
import requests
import base64

# Paste your API key and username here.
# Avoid committing a real key with the notebook.
API_KEY = "YOUR API KEY HERE"
USERNAME = "YOUR USERNAME HERE"
API_NAME = "clip-classifier" # replace with your project name if you named it anything else other than "clip-classifier"

# The API endpoint for your Hub API project
endpoint = f"https://api.cellstrathub.com/{USERNAME}/{API_NAME}"

# Headers sent with every request: the API key plus a JSON content type,
# matching the JSON-encoded body built in the request cell.
headers = {
  "x-api-key": API_KEY,
  "Content-Type": "application/json"
}

In [23]:
# Load images as base64 encoded strings
# Load images as base64 encoded strings
image_strings = []

# Read all the images. os.listdir returns entries in arbitrary order, so sort
# them to make `image_strings[i]` refer to the same file on every run.
for img in sorted(os.listdir('images')):
    img_path = os.path.join('images', img)

    # Skip sub-directories and other non-file entries
    if os.path.isfile(img_path):

        # read the image
        with open(img_path, 'rb') as f:
            img_bytes = f.read()

        # convert to a base64 string (done after the file is closed)
        img_str = base64.b64encode(img_bytes).decode('utf-8')

        image_strings.append(img_str)

In [None]:
# Request body: one base64-encoded image plus the candidate class names
payload = {
    'image': image_strings[0],
    'classes': classes
}

# Send the POST request. Without a timeout, requests waits indefinitely if the
# endpoint never answers, which would hang the notebook cell.
response = requests.post(
    endpoint,
    headers=headers,
    data=json.dumps(payload),
    timeout=60,
).json()

if response.get('statusCode') == 200:
    # Parse the output: `body` is a JSON string whose `output` field is itself
    # JSON-encoded, hence the two json.loads calls.
    body = json.loads(response['body'])
    print('Predictions:', json.loads(body['output']))
else:
    # Show the raw response for any other status
    print(response)