In [2]:
%pip install transformers Pillow torch torchvision torchaudio
from transformers import BlipProcessor, BlipForConditionalGeneration
from PIL import Image

Collecting transformers
  Using cached transformers-4.47.1-py3-none-any.whl (10.1 MB)
Collecting torch
  Using cached torch-2.5.1-cp39-cp39-manylinux1_x86_64.whl (906.5 MB)
Collecting torchvision
  Using cached torchvision-0.20.1-cp39-cp39-manylinux1_x86_64.whl (7.2 MB)
Collecting torchaudio
  Using cached torchaudio-2.5.1-cp39-cp39-manylinux1_x86_64.whl (3.4 MB)
Collecting tokenizers<0.22,>=0.21
  Using cached tokenizers-0.21.0-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.0 MB)
Collecting safetensors>=0.4.1
  Using cached safetensors-0.4.5-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (436 kB)
Collecting huggingface-hub<1.0,>=0.24.0
  Using cached huggingface_hub-0.27.0-py3-none-any.whl (450 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl (24.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m19.2 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01

In [5]:
# Init model and processor from Hugging Face
processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base")

# Load image
image = Image.open("ai-logo.png")

# Preprocess image
inputs = processor(images=image, return_tensors="pt")

# Generate caption
outputs = model.generate(**inputs)
caption = processor.decode(outputs[0], skip_special_tokens=True)

print(caption)

a dinosaur with headphones and a keyboard


## Asking question about the image

In [10]:
import requests
from PIL import Image
from transformers import BlipProcessor, BlipForConditionalGeneration

#Load BLIP model and processor
processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base")

# Image
image_url = "https://storage.googleapis.com/sfr-vision-language-research/BLIP/demo.jpg"
raw_image = Image.open(requests.get(image_url, stream=True).raw)
#raw_image = Image.open("ai-logo.png")

# Question
question = "What animal is it?"

# Use the processor to encode the question and the image
inputs = processor(images=raw_image, text=question, return_tensors="pt")

outputs = model.generate(**inputs)

answer = processor.decode(outputs[0], skip_special_tokens=True)
print(answer)


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


what animal is it?


## Using gradio to make a demo

### Simple example

In [None]:
'''
    In the code below, we define a simple function that takes two inputs: a text and a number. 
    The function returns the text repeated a number of times equal to the number input. 
    We then create a Gradio interface for this function, specifying the input types and the output type. Finally, we launch the interface.
    
    Why Gradio?
    Gradio is a Python library that allows you to create simple interfaces for your machine learning models.
'''
import gradio as gr
def greet(name, intensity):
    return "Hello " + name + "!" * int(intensity)

demo = gr.Interface(
    fn=greet, 
    inputs=["text", "number"], 
    outputs="text")
demo.launch()

### Create a Gradio interface for the BLIP model 

In [None]:
# What is BLIP ?
# BLIP is a multimodal model that can generate image captions from images and questions.

In [None]:
import gradio as gr
from transformers import BlipProcessor, BlipForConditionalGeneration

processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base")

def generate_caption(image):
    """Generate a caption for an image

    Args:
        image (PIL.Image): Image to be captioned

    Returns:
        str: Generated caption
    """
    inputs = processor(images=image, return_tensors="pt")
    outputs = model.generate(**inputs)
    caption = processor.decode(outputs[0], skip_special_tokens=True)
    return caption

def caption_image(image):
    """Take an image and return a caption

    Args:
        image (PIL.Image): Image to be captioned

    Returns:
        str: Generated caption
    """
    try:
        caption = generate_caption(image)
        return caption
    except Exception as e:
        return str(e)
    
iface = gr.Interface(fn=caption_image, 
    inputs=gr.Image(type="pil"), 
    outputs="text",
    title="Image Captioning with BLIP",
    description="Updload an image and generate a caption.")

iface.launch()

### Image classification in PyTorch

In [2]:
import torch
# What is resnet18 ?
# ResNet-18 is a convolutional neural network that is 18 layers deep. You can load the model using torch.hub.load().
model = torch.hub.load('pytorch/vision:v0.9.0', 'resnet18', pretrained=True).eval()

Downloading: "https://github.com/pytorch/vision/zipball/v0.9.0" to /home/daoliangshu/.cache/torch/hub/v0.9.0.zip
Downloading: "https://download.pytorch.org/models/resnet18-f37072fd.pth" to /home/daoliangshu/.cache/torch/hub/checkpoints/resnet18-f37072fd.pth
100%|██████████| 44.7M/44.7M [00:03<00:00, 12.8MB/s]


In [9]:
import requests
from PIL import Image
from torchvision import transforms

# Download human-readable labels for ImageNet.
response = requests.get("https://git.io/JJkYN")
labels= response.text.split("\n")

def predict(inp):
    """Predict the class of an image
    Args:
        inp (PIL.Image): Image to be classified
    Returns:
        dict: Class confidences
    """
    
    # Convert the PIL image to a PyTorch tensor
    inp = transforms.ToTensor()(inp).unsqueeze(0)
    with torch.no_grad():
        prediction = torch.nn.functional.softmax(model(inp)[0], dim=0)
        confidences = {labels[i]: float(prediction[i]) for i in range(1000)}
    
    # The dictionary's keys are the class labels, and its values are the corresponding confidence probabilities.
    return confidences

import gradio as gr

gr.Interface(fn = predict, 
    inputs = gr.Image(type="pil"),
    outputs = gr.Label(num_top_classes=6), # customize it to show only the top 6 classes 
    examples=["/content/lion.jpg", "/content/cheetah.jpg"]).launch()
        

Running on local URL:  http://127.0.0.1:7866

To create a public link, set `share=True` in `launch()`.




Traceback (most recent call last):
  File "/home/daoliangshu/anaconda3/lib/python3.9/site-packages/gradio/queueing.py", line 536, in process_events
    response = await route_utils.call_process_api(
  File "/home/daoliangshu/anaconda3/lib/python3.9/site-packages/gradio/route_utils.py", line 322, in call_process_api
    output = await app.get_blocks().process_api(
  File "/home/daoliangshu/anaconda3/lib/python3.9/site-packages/gradio/blocks.py", line 1935, in process_api
    result = await self.call_function(
  File "/home/daoliangshu/anaconda3/lib/python3.9/site-packages/gradio/blocks.py", line 1520, in call_function
    prediction = await anyio.to_thread.run_sync(  # type: ignore
  File "/home/daoliangshu/anaconda3/lib/python3.9/site-packages/anyio/to_thread.py", line 28, in run_sync
    return await get_asynclib().run_sync_in_worker_thread(func, *args, cancellable=cancellable,
  File "/home/daoliangshu/anaconda3/lib/python3.9/site-packages/anyio/_backends/_asyncio.py", line 818, in r