In [2]:
# %pip install -q streamlit==1.48.1  # uncomment on a fresh runtime; %pip targets the kernel's environment

Collecting streamlit
  Downloading streamlit-1.48.1-py3-none-any.whl.metadata (9.5 kB)
Collecting watchdog<7,>=2.1.5 (from streamlit)
  Downloading watchdog-6.0.0-py3-none-manylinux2014_x86_64.whl.metadata (44 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.3/44.3 kB[0m [31m2.6 MB/s[0m eta [36m0:00:00[0m
Collecting pydeck<1,>=0.8.0b4 (from streamlit)
  Downloading pydeck-0.9.1-py2.py3-none-any.whl.metadata (4.1 kB)
Downloading streamlit-1.48.1-py3-none-any.whl (9.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.9/9.9 MB[0m [31m76.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pydeck-0.9.1-py2.py3-none-any.whl (6.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.9/6.9 MB[0m [31m69.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading watchdog-6.0.0-py3-none-manylinux2014_x86_64.whl (79 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m79.1/79.1 kB[0m [31m7.5 MB/s[0m eta [36m0:00:00[0m
[?25hInst

In [3]:
# ==============================
# STREAMLIT CIVIL CAPTION APP
# ==============================
import html

import torch
from PIL import Image
import streamlit as st
from transformers import BlipProcessor, BlipForConditionalGeneration, CLIPProcessor, CLIPModel

# Run inference on the GPU when PyTorch can see one; otherwise fall back to CPU.
if torch.cuda.is_available():
    DEVICE = "cuda"
else:
    DEVICE = "cpu"

# Load models
@st.cache_resource
def load_models():
    """Load the BLIP captioning model and the CLIP matching model.

    Both models are moved to ``DEVICE`` and switched to eval mode. The
    function is wrapped in ``st.cache_resource`` so the loaded objects can be
    reused instead of being rebuilt on every call.

    Returns:
        A 4-tuple ``(blip_processor, blip_model, clip_processor, clip_model)``.
    """
    blip_checkpoint = "Salesforce/blip-image-captioning-base"
    clip_checkpoint = "openai/clip-vit-base-patch32"

    # BLIP: preprocessing first, then the caption generator itself.
    captioner_processor = BlipProcessor.from_pretrained(blip_checkpoint)
    captioner = BlipForConditionalGeneration.from_pretrained(blip_checkpoint)
    captioner = captioner.to(DEVICE).eval()

    # CLIP: the model is loaded before its processor, same as the call order
    # this function has always used.
    matcher = CLIPModel.from_pretrained(clip_checkpoint)
    matcher = matcher.to(DEVICE).eval()
    matcher_processor = CLIPProcessor.from_pretrained(clip_checkpoint)

    return captioner_processor, captioner, matcher_processor, matcher

# Module-level handles used by the captioning and classification functions below.
blip_processor, blip_model, clip_proc, clip_model = load_models()

# Vocabulary
# Candidate phrases scored against the image by CLIP: one list of equipment
# names and one list of activities.
equipment_labels = [
    "bulldozer",
    "tower crane",
    "front loader",
    "hydraulic excavator",
    "backhoe loader",
    "soil compactor",
    "motor grader",
    "road roller",
    "asphalt paver",
    "cement mixer truck",
    "skid steer loader",
    "construction forklift",
    "dump truck",
    "drilling rig",
    "concrete pump truck",
    "wheel scraper",
]
action_phrases = [
    "digging soil",
    "lifting materials",
    "hauling debris",
    "paving road",
    "mixing concrete",
    "compacting ground",
    "grading surface",
    "pushing dirt",
    "loading truck",
    "pouring concrete",
    "drilling ground",
]

# Functions
@torch.inference_mode()
def generate_blip_caption(image: Image.Image) -> str:
    """Generate a free-form caption for ``image`` with the BLIP model.

    Args:
        image: The picture to describe.

    Returns:
        The decoded text of the first generated sequence, with special
        tokens removed.
    """
    model_inputs = blip_processor(images=image, return_tensors="pt")
    model_inputs = model_inputs.to(DEVICE)
    # Generation is capped at 30 newly produced tokens.
    token_ids = blip_model.generate(**model_inputs, max_new_tokens=30)
    first_sequence = token_ids[0]
    return blip_processor.decode(first_sequence, skip_special_tokens=True)

@torch.inference_mode()
def _best_clip_match(image: Image.Image, candidates: list) -> str:
    """Return the entry of ``candidates`` that CLIP scores as closest to ``image``.

    Args:
        image: The picture to score.
        candidates: Text phrases to compare against the image.

    Returns:
        The candidate string with the highest image/text similarity.
    """
    inputs = clip_proc(text=candidates, images=image, return_tensors="pt", padding=True).to(DEVICE)
    out = clip_model(**inputs)
    # Scale both embeddings to unit length so the dot product below is a
    # cosine similarity.
    img_emb = out.image_embeds / out.image_embeds.norm(dim=-1, keepdim=True)
    txt_emb = out.text_embeds / out.text_embeds.norm(dim=-1, keepdim=True)
    # One image is passed in, so dropping the leading dimension leaves one
    # score per candidate.
    sims = (img_emb @ txt_emb.T).squeeze(0)
    return candidates[sims.argmax().item()]


def classify_equipment(image: Image.Image) -> str:
    """Pick the entry of ``equipment_labels`` that best matches ``image``."""
    return _best_clip_match(image, equipment_labels)


def pick_action(image: Image.Image) -> str:
    """Pick the entry of ``action_phrases`` that best matches ``image``."""
    return _best_clip_match(image, action_phrases)

def refine_caption(raw_caption: str, equip: str, action: str) -> str:
    """Build the final caption from the raw caption, equipment and action.

    The raw caption is lower-cased. If it does not mention ``equip`` it is
    replaced entirely by ``"<equip> at construction site"``. If the resulting
    text does not mention ``action``, ``", <action>"`` is appended.

    Args:
        raw_caption: Caption text to start from.
        equip: Equipment label that must appear in the result.
        action: Action phrase that must appear in the result.

    Returns:
        The adjusted caption string.
    """
    lowered = raw_caption.lower()
    base = lowered if equip in lowered else f"{equip} at construction site"
    return base if action in base else f"{base}, {action}"

# ==============================
# STREAMLIT UI
# ==============================
# NOTE: this page only renders when the script is launched with
# `streamlit run <script>.py`. Executed directly in a notebook kernel it just
# prints Streamlit's hint to use that command.
st.set_page_config(page_title="Construction Image Captioning", layout="centered")

st.title("🏗️ Construction Equipment Caption Generator")
st.markdown("Upload an image of construction equipment and get an **AI-generated caption** tailored to civil engineering context.")

uploaded_file = st.file_uploader("📤 Upload an Image", type=["jpg", "png", "jpeg"])

if uploaded_file is not None:
    # The extension filter does not guarantee the bytes are a decodable image,
    # so report unreadable uploads instead of crashing with a traceback.
    try:
        image = Image.open(uploaded_file).convert("RGB")
    except OSError:
        st.error("Could not read the uploaded file as an image. Please upload a valid JPG or PNG.")
        st.stop()

    # `use_column_width` is deprecated; `use_container_width` is its replacement.
    st.image(image, caption="Uploaded Image", use_container_width=True)

    with st.spinner("🔍 Analyzing image..."):
        raw = generate_blip_caption(image)
        equip = classify_equipment(image)
        action = pick_action(image)
        tailored = refine_caption(raw, equip, action)

    st.success("✅ Caption Generated!")
    # The caption is model output derived from a user-supplied image, and it is
    # rendered as raw HTML, so escape it before interpolation.
    st.markdown(
        f"<h3 style='color:#1E90FF; text-align:center;'>📌 {html.escape(tailored)}</h3>",
        unsafe_allow_html=True
    )


2025-08-16 12:43:58.343 
  command:

    streamlit run /usr/local/lib/python3.11/dist-packages/colab_kernel_launcher.py [ARGUMENTS]
Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


preprocessor_config.json:   0%|          | 0.00/287 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/506 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

config.json: 0.00B [00:00, ?B/s]

pytorch_model.bin:   0%|          | 0.00/990M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/990M [00:00<?, ?B/s]

config.json: 0.00B [00:00, ?B/s]

pytorch_model.bin:   0%|          | 0.00/605M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/605M [00:00<?, ?B/s]

preprocessor_config.json:   0%|          | 0.00/316 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/592 [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/389 [00:00<?, ?B/s]

