In [4]:
import requests
from PIL import Image
from io import BytesIO
from bs4 import BeautifulSoup
from transformers import AutoProcessor, BlipForConditionalGeneration

In [5]:
# The processor and the model are loaded from the same checkpoint; naming it
# once keeps the two in sync if the checkpoint is ever swapped.
BLIP_CHECKPOINT = "Salesforce/blip-image-captioning-base"

# Processor: prepares images as model inputs and decodes generated token ids
# back into text (used for both purposes in the captioning loop below).
processor = AutoProcessor.from_pretrained(BLIP_CHECKPOINT)
# Model whose generate() output is decoded into the caption.
model = BlipForConditionalGeneration.from_pretrained(BLIP_CHECKPOINT)

In [6]:
# Download the article HTML. The timeout keeps the cell from hanging forever on
# a stalled connection, and raise_for_status() stops here on an HTTP error
# instead of silently parsing an error page as if it were the article.
response = requests.get("https://en.wikipedia.org/wiki/Google", timeout=30)
response.raise_for_status()

# Parse with the parser bundled with the standard library (no lxml needed).
soup = BeautifulSoup(response.text, 'html.parser')

In [7]:
# Collect every <img> tag found anywhere in the parsed page.
img_elements = soup.find_all(name="img")

In [10]:
# Caption every usable image on the page and write one "url: caption" line per
# image to captions.txt. Failures on a single image are reported and skipped so
# that one bad download does not abort the whole run.
with open('captions.txt', 'w', encoding='utf-8') as cap_file:
    for img in img_elements:
        img_url = img.get('src')

        # <img> tags without a src attribute yield None (or an empty string);
        # there is nothing to download, and the substring checks below would
        # raise TypeError on None.
        if not img_url:
            continue

        # Skip if image SVG or too small
        if 'svg' in img_url or '1x1' in img_url:
            continue

        # Correct URL: protocol-relative URLs ("//host/path") need a scheme.
        if img_url.startswith('//'):
            img_url = 'https:' + img_url

        # Skip URLs that don't start with http:// or https://.
        # A tuple of prefixes is used because the previous
        # "not http:// or not https://" test was true for every string and
        # therefore discarded valid absolute URLs as well.
        elif not img_url.startswith(('http://', 'https://')):
            continue

        try:
            # Use a dedicated name so the page-level `response` from the fetch
            # cell is not overwritten; bound the wait and fail fast on HTTP
            # errors so the message below names the real cause.
            img_response = requests.get(img_url, timeout=30)
            img_response.raise_for_status()

            # Convert image data to PIL Image
            raw_img = Image.open(BytesIO(img_response.content))

            # Skip very small images (total area under 400 pixels)
            width, height = raw_img.size
            if width * height < 400:
                continue

            raw_img = raw_img.convert('RGB')

            # Process image, generate & decode caption
            inp = processor(images=raw_img, return_tensors='pt')
            out = model.generate(**inp)
            caption = processor.decode(out[0], skip_special_tokens=True)

            cap_file.write("{}: {}\n".format(img_url, caption))
        except Exception as e:
            # Best-effort: report the failure and move on to the next image.
            print("Error processing image {}: {}".format(img_url, e))