from IPython.display import Image, display

display(Image(filename='Image-to-Text-LLM-700x450.png'))  # the file may also be a .png or .jpeg


In [None]:
![Project Overview](assets/Image-to-Text-LLM-700x450.png)

### **1. Loading Necessary Libraries**

In [None]:
import streamlit as st
from PIL import Image
from transformers import BlipProcessor, BlipForConditionalGeneration
from langchain.chat_models import ChatOpenAI
from langchain.agents import initialize_agent, AgentType
from langchain.tools import BaseTool, Tool
from dotenv import load_dotenv

### **2. Load Environment - Do not forget! If you skip this step, you cannot use your API key!**

#### To use the OpenAI API, you need to obtain an API key and store it in a `.env` file as follows:

##### **OPENAI_API_KEY=sk-...**

Then you may run the code below:

In [None]:
# Load the variables declared in the local `.env` file (e.g. OPENAI_API_KEY)
# into the process environment. The call returns False when no `.env` file
# was found/loaded, so check the file location if you see False below.
load_dotenv()

False

### **3. Agent Tool Design**

**Pretrained Model Link :** https://huggingface.co/Salesforce/blip-image-captioning-base<br>
**Paper :** https://arxiv.org/abs/2201.12086

**BLIP (Bootstrapping Language-Image Pre-training)** is a powerful, modern vision-language model that brings images and text together to generate better captions, answer visual questions (VQA), and improve image-level understanding.

**Aim:** To analyze an image and generate text in natural language (e.g., "A red mug on the table.")

**Area of use:** Automatic product descriptions, social media captions, visual descriptions for people with visual impairments, etc.


In [None]:
_BLIP_CHECKPOINT = "Salesforce/blip-image-captioning-base"


@st.cache_resource
def _load_blip():
    """Load the BLIP processor and captioning model once and reuse them.

    `from_pretrained` downloads the checkpoint from the Hugging Face Hub on
    first use (it is then served from the local cache). Instantiating the
    objects is still slow, so the pair is cached with `st.cache_resource`
    instead of being rebuilt on every tool call / Streamlit rerun.
    """
    processor = BlipProcessor.from_pretrained(_BLIP_CHECKPOINT)
    model = BlipForConditionalGeneration.from_pretrained(_BLIP_CHECKPOINT)
    return processor, model


def generate_caption(image_path: str) -> str:
    """Generate a natural-language caption for the image at `image_path`.

    Args:
        image_path: Path of the image file to describe.

    Returns:
        The caption decoded from the BLIP model output, with special tokens
        removed. The agent passes this text on to the LLM.
    """
    processor, model = _load_blip()

    # Agent-supplied tool input may carry stray whitespace or quotes around
    # the path; remove them before touching the file system.
    image_path = image_path.strip().strip("'\"")

    # Open inside a context manager so the file handle is released, and
    # normalise to 3-channel RGB before handing the image to the processor.
    with Image.open(image_path) as source:
        image = source.convert("RGB")

    # The processor turns the image into model inputs ("pt" = PyTorch tensors).
    inputs = processor(image, return_tensors="pt")
    # The model generates the caption as a sequence of token IDs.
    output = model.generate(**inputs)
    # Decode the first (only) sequence back into plain text.
    return processor.decode(output[0], skip_special_tokens=True)

### **4. Agent Design**

##### a. Model definition using  LLM:

In [None]:
# Chat model that drives the agent. The OpenAI model id is "gpt-4" (with a
# hyphen); "gpt4" is not a valid model name and the API rejects it.
# temperature=0.7 is kept from the original configuration.
llm = ChatOpenAI(model="gpt-4", temperature=0.7)

##### b. Here I use a LangChain `Tool` object:

In [None]:
# Expose the BLIP captioning function to the agent as a LangChain tool.
# `Tool` receives the callable through its `func` parameter, and the call
# must be closed with a parenthesis to be valid Python.
caption_tool = Tool(
    name="image_caption_tool",
    func=generate_caption,
    description="Gets the image path, and make a sentence that explains what the image is.",
)

##### c. Initializing the Agent:

In [None]:
# Build the agent from the LLM and the captioning tool.
# The imported factory is `initialize_agent` (lowercase "i").
agent = initialize_agent(
    tools=[caption_tool],
    llm=llm,
    # If you would like to create more complex structures, it will be better to use other Agent Types.
    agent=AgentType.ZERO_SHOT_REACT_DESCRIPTION,
    # verbose=True is passed through to the agent as in the original setup.
    verbose=True,
)

### **5. Result And Streamlit**

In [None]:
# Page heading shown at the top of the Streamlit app.
st.title("🤖💬 Creates a product comment and generates SEO-compliant tags using the loaded image.")

In [None]:
# Let the user pick a product image; `image` stays falsy until a file is uploaded.
image = st.file_uploader("Please Load a Product Image", type=["jpg", "jpeg", "png"])
if image:
    # Persist the upload as 'temp.jpg' so the captioning tool can open it by
    # path. The file is overwritten each time a new image is uploaded.
    # `getvalue()` returns the complete buffer regardless of the current read
    # position, whereas `read()` returns nothing once the buffer was consumed.
    with open("temp.jpg", "wb") as f:
        f.write(image.getvalue())

    st.image("temp.jpg", caption="The Image has been loaded", use_container_width=True)
    st.info("Analyzing...")

In [None]:
# Step-by-step (Chain-of-Thought style) prompt for the agent.
# Step 1 tells the agent to call `image_caption_tool` on the path given in the
# first line of the prompt (temp.jpg); step 2 asks for a title, a description
# and SEO tags derived from the tool's caption.
prompt_CoT = """
    Image file path: temp.jpg

    1. Use the image_caption_tool directly to analyze the content of the given image.
       This tool returns a sentence that describes what is shown in the image.
       Do not guess without using the tool.

    2. Based on the generated description:
        - Create a short and creative product title.
        - Write a unique product description in 3–4 sentences.
        - Suggest 10 SEO-friendly tags that define the product.

    Format your response clearly, with properly aligned step numbers one by one and structured output.
    Use the tool’s output directly and indicate it.
    """

In [None]:
# Run the agent only when an image has been uploaded, so it never analyses a
# missing or stale 'temp.jpg'. The guard is written out explicitly because an
# indented block with no enclosing statement is an IndentationError.
# The try-except keeps Streamlit from crashing: any error raised while the
# agent runs is shown to the user instead.
if image:
    try:
        output = agent.invoke(prompt_CoT)
        st.success("✅ Analyze Over and Comment - Tags Generated")
        # The agent returns a mapping; the final answer is under "output".
        st.write(output["output"])
    except Exception as e:
        st.error(f"Error!{e}")

##### To run this project, open a terminal, type the command below, and press Enter:<br>
    streamlit run /project_Path/project_name

