# Zero Shot Image Classification using CLIP

In [1]:
pip install git+https://github.com/openai/CLIP.git

Collecting git+https://github.com/openai/CLIP.git
  Cloning https://github.com/openai/CLIP.git to /tmp/pip-req-build-c9xrf7en
  Running command git clone -q https://github.com/openai/CLIP.git /tmp/pip-req-build-c9xrf7en
Collecting ftfy
  Downloading ftfy-6.0.3.tar.gz (64 kB)
[K     |████████████████████████████████| 64 kB 2.9 MB/s  eta 0:00:01
Building wheels for collected packages: clip, ftfy
  Building wheel for clip (setup.py) ... [?25ldone
[?25h  Created wheel for clip: filename=clip-1.0-py3-none-any.whl size=1369094 sha256=59b0f981b2c99028935eb125929e43950cd1bd3bc9e776daf46589f3f37e64aa
  Stored in directory: /tmp/pip-ephem-wheel-cache-utuq989v/wheels/ab/4f/3a/5e51521b55997aa6f0690e095c08824219753128ce8d9969a3
  Building wheel for ftfy (setup.py) ... [?25ldone
[?25h  Created wheel for ftfy: filename=ftfy-6.0.3-py3-none-any.whl size=41913 sha256=dcd7cacfb4bb4660fe347aef780ade7e057a40a68711f553ed0b8dfae98176ea
  Stored in directory: /home/ubuntu/.cache/pip/wheels/7f/40/63/4bf60

In [2]:
import torch
import clip
import os
from PIL import Image

## Development

### Load Model

On Linux, you can download the pretrained model from a terminal with `wget`:
```bash
wget https://openaipublic.azureedge.net/clip/models/40d365715913c9da98579312b702a82c18be219cc2a73407c4526f58eba950af/ViT-B-32.pt
```

In [3]:
# Local checkpoint file (see the wget command above)
model_path = 'ViT-B-32.pt'

# clip.load returns the model together with the image transform
# that `preprocess` applies to every input image
model, transformations = clip.load(
    model_path,
    device='cpu',
    jit=True,
)

### Preprocess

In [4]:
def preprocess(image, classes):
    """Turn an image and a list of class texts into model inputs.

    Args:
        image: image handed to the `transformations` callable returned by
            `clip.load` (a PIL image in this notebook).
        classes: text descriptions handed to `clip.tokenize`.

    Returns:
        A `(image_batch, text_tokens)` tuple: the transformed image with a
        leading dimension added by `unsqueeze(0)`, and the tokenized texts.
    """
    image_batch = transformations(image).unsqueeze(0)
    text_tokens = clip.tokenize(classes)
    return image_batch, text_tokens

### Load Sample Data

In [5]:
# Collect the sample images in sorted order so the printed indices are stable
# across machines (os.listdir returns entries in arbitrary order), and skip
# anything that is not a regular file (e.g. a checkpoint sub-directory), which
# Image.open could not read.
image_paths = []
images = []
image_files = sorted(
    name for name in os.listdir('images')
    if os.path.isfile(os.path.join('images', name))
)
for i, file in enumerate(image_files):
    img_path = os.path.join('images', file)
    image_paths.append(img_path)
    images.append(Image.open(img_path))
    print(f'{i}: {file}')

0: dancing.jpg
1: elon_masked.jpg
2: kids_playing.jpg
3: plane.jpg
4: traffic.jpg


In [1]:
# Candidate class descriptions for zero-shot classification
classes = [
    "kids playing",
    "dancing",
    "elon musk wearing a face mask",
    "aeroplane",
    "traffic",
]

In [7]:
def predict(image, classes):
    """Rank `classes` by how well each text matches `image` according to the model.

    Args:
        image: image passed on to `preprocess` (a PIL image in this notebook).
        classes: list of text descriptions, one per candidate class.

    Returns:
        A list of `(class_text, probability)` tuples sorted by descending
        probability, with each probability rounded to 4 decimal places.
    """
    # Preprocess inputs
    image_input, classes_input = preprocess(image, classes)

    # Forward pass on the model; gradients are not needed for inference
    with torch.no_grad():
        logits_per_image, _ = model(image_input, classes_input)

    # Normalize the image-to-text logits into probabilities using softmax.
    # squeeze(0) removes only the batch dimension, so a single-class input
    # still produces a one-element list instead of a bare float.
    probs = logits_per_image.softmax(dim=-1).squeeze(0).tolist()

    # Pair every class with its rounded probability and sort best-first
    output = [(label, round(prob, 4)) for label, prob in zip(classes, probs)]

    return sorted(output, key=lambda pair: pair[1], reverse=True)

In [9]:
# Classify every sample image, phrasing each class as a caption-style prompt
for path, image in zip(image_paths, images):
    print(f'Input: {path}', 'Output:', sep='\n')
    print(predict(image, [f"A photo of {x}" for x in classes]))
    print('--------------------------------')

Input: images/dancing.jpg
Output:
[('A photo of dancing', 0.8808), ('A photo of kids playing', 0.0899), ('A photo of aeroplane', 0.0159), ('A photo of traffic', 0.0078), ('A photo of elon musk wearing a face mask', 0.0056)]
--------------------------------
Input: images/elon_masked.jpg
Output:
[('A photo of elon musk wearing a face mask', 1.0), ('A photo of kids playing', 0.0), ('A photo of dancing', 0.0), ('A photo of aeroplane', 0.0), ('A photo of traffic', 0.0)]
--------------------------------
Input: images/kids_playing.jpg
Output:
[('A photo of kids playing', 0.9994), ('A photo of dancing', 0.0005), ('A photo of elon musk wearing a face mask', 0.0), ('A photo of aeroplane', 0.0), ('A photo of traffic', 0.0)]
--------------------------------
Input: images/plane.jpg
Output:
[('A photo of aeroplane', 0.9987), ('A photo of traffic', 0.0012), ('A photo of dancing', 0.0001), ('A photo of kids playing', 0.0), ('A photo of elon musk wearing a face mask', 0.0)]
----------------------------

## Deployment

### Test Inference Script

In [10]:
from inference import CLIPImageClassifier

In [11]:
# Build the classifier from the same checkpoint path used in the Development section
classifier = CLIPImageClassifier(model_path)

In [12]:
# Repeat the sample run through the inference-script classifier
for path, image in zip(image_paths, images):
    print(f'Input: {path}', 'Output:', sep='\n')
    print(classifier.predict(image, [f"A photo of {x}" for x in classes]))
    print('--------------------------------')

Input: images/dancing.jpg
Output:
[('A photo of dancing', 0.8808), ('A photo of kids playing', 0.0899), ('A photo of aeroplane', 0.0159), ('A photo of traffic', 0.0078), ('A photo of elon musk wearing a face mask', 0.0056)]
--------------------------------
Input: images/elon_masked.jpg
Output:
[('A photo of elon musk wearing a face mask', 1.0), ('A photo of kids playing', 0.0), ('A photo of dancing', 0.0), ('A photo of aeroplane', 0.0), ('A photo of traffic', 0.0)]
--------------------------------
Input: images/kids_playing.jpg
Output:
[('A photo of kids playing', 0.9994), ('A photo of dancing', 0.0005), ('A photo of elon musk wearing a face mask', 0.0), ('A photo of aeroplane', 0.0), ('A photo of traffic', 0.0)]
--------------------------------
Input: images/plane.jpg
Output:
[('A photo of aeroplane', 0.9987), ('A photo of traffic', 0.0012), ('A photo of dancing', 0.0001), ('A photo of kids playing', 0.0), ('A photo of elon musk wearing a face mask', 0.0)]
----------------------------

### 1. Initialize Hub API Project
Open a terminal and run the following command:
```
hub init clip-classifier
```

### 2. Integration

#### i. Copy the `ViT-B-32.pt` model file to `clip-classifier/model/` folder in Hub API project

#### ii. Replace the code in `clip-classifier/src/main.py` with the following:
```python
import os
from hub import hub_handler
# Add your own import statements
from inference import CLIPImageClassifier
from utils import convert_base64_to_image

# This environment variable gives you the
# path to the directory of your model. You
# can use this in your code to load model
# and other large files.
# NOTE(review): os.getenv returns None when MODEL_DIR is unset, and
# os.path.join(None, ...) then raises TypeError - confirm the Hub runtime
# always sets it.
MODEL_DIR = os.getenv("MODEL_DIR")
# Built once at module import, so every call to the handler reuses this instance
classifier = CLIPImageClassifier(os.path.join(MODEL_DIR, 'ViT-B-32.pt'))

@hub_handler
def inference_handler(inputs, _):
    '''The main inference function which gets triggered when the API is invoked.

    Args:
        inputs: mapping read by key. ``inputs['image']`` is handed to
            ``convert_base64_to_image`` (presumably a base64-encoded image
            string - confirm against the client), and ``inputs['classes']``
            is passed unchanged to ``classifier.predict``.
        _: second positional parameter; unused in this function.

    Returns:
        Whatever ``classifier.predict`` returns for the decoded image and
        the given classes.
    '''
    
    image = convert_base64_to_image(inputs['image'], return_type='pillow')
    # Print the decoded image object and the requested classes to stdout
    print(image)
    print(inputs['classes'])
    
    output = classifier.predict(image, inputs['classes'])

    return output
```

#### iii. Add the libraries to `clip-classifier/src/requirements.txt`
```
torch
git+https://github.com/openai/CLIP.git
```

### 3. Build and Deploy

Change directory into the `clip-classifier` project folder in the terminal and then run the following commands:
```bash
hub build
hub deploy
```

#### Tip:
You can run the `build` and `deploy` commands directly in the `clip-hub-api` folder and skip the integration step, since that folder already contains the final source code for the project.

### Test the Deployed API

In [2]:
import os
import json
import requests
import base64

# Read your credentials from the environment instead of pasting them here:
# a key typed into a notebook ends up in version control and in shared copies.
# Set them before starting Jupyter, e.g.
#   export CELLSTRAT_API_KEY=<your key>
#   export CELLSTRAT_USERNAME=<your username>
API_KEY = os.environ.get("CELLSTRAT_API_KEY", "<YOUR_API_KEY>")
USERNAME = os.environ.get("CELLSTRAT_USERNAME", "nerdimite")
API_NAME = "clip-classifier" # replace with your project name if you named it anything else other than "clip-classifier"

# The API endpoint for your Hub API project
endpoint = f"https://api.cellstrathub.com/{USERNAME}/{API_NAME}"

# Headers sent with every request: the API key plus a JSON content type
headers = {
  "x-api-key": API_KEY,
  "Content-Type": "application/json"
}

In [3]:
# Load images as base64 encoded strings
image_strings = []

# Read all the images in sorted order: os.listdir returns entries in
# arbitrary order, and a later cell picks an image by its position in this list
for img in sorted(os.listdir('images')):
    img_path = os.path.join('images', img)

    if os.path.isfile(img_path):

        # read the image
        with open(img_path, 'rb') as f:
            img_bytes = f.read()

        # convert to a base64 string
        img_str = base64.b64encode(img_bytes).decode('utf-8')

        image_strings.append(img_str)

In [4]:
%%time
payload = {
    'image': image_strings[2],
    'classes': classes
}

# Send the POST request. Without a timeout, requests waits indefinitely if
# the API never answers; 60 s leaves room for a slow first call.
response = requests.post(
    endpoint,
    headers=headers,
    data=json.dumps(payload),
    timeout=60,
).json()

if response.get('statusCode') == 200:
    # Parse the output
    print('Predictions:', response['body']['output'])
else:
    # Show the whole response so the failure reason is visible
    print(response)

Predictions: [['kids playing', 0.999], ['aeroplane', 0.0006], ['dancing', 0.0004], ['elon musk wearing a face mask', 0.0], ['traffic', 0.0]]
Wall time: 15.3 s


In [5]:
# Send a GET request to the same endpoint and display the decoded JSON response
requests.get(endpoint, headers=headers).json()

{'statusCode': 200,
 'headers': {'Content-Type': 'application/json',
  'Access-Control-Allow-Origin': '*'},
 'body': 'Model Loaded in Memory'}