diff --git a/ai/generative-ai-service/image-to-text/LICENSE b/ai/generative-ai-service/image-to-text/LICENSE
new file mode 100644
index 000000000..62c949c4e
--- /dev/null
+++ b/ai/generative-ai-service/image-to-text/LICENSE
@@ -0,0 +1,35 @@
+Copyright (c) 2024 Oracle and/or its affiliates.
+
+The Universal Permissive License (UPL), Version 1.0
+
+Subject to the condition set forth below, permission is hereby granted to any
+person obtaining a copy of this software, associated documentation and/or data
+(collectively the "Software"), free of charge and under any and all copyright
+rights in the Software, and any and all patent rights owned or freely
+licensable by each licensor hereunder covering either (i) the unmodified
+Software as contributed to or provided by such licensor, or (ii) the Larger
+Works (as defined below), to deal in both
+
+(a) the Software, and
+(b) any piece of software and/or hardware listed in the lrgrwrks.txt file if
+one is included with the Software (each a "Larger Work" to which the Software
+is contributed by such licensors),
+
+without restriction, including without limitation the rights to copy, create
+derivative works of, display, perform, and distribute the Software and make,
+use, sell, offer for sale, import, export, have made, and have sold the
+Software and the Larger Work(s), and to sublicense the foregoing rights on
+either these or other terms.
+
+This license is subject to the following condition:
+The above copyright notice and either this complete permission notice or at
+a minimum a reference to the UPL must be included in all copies or
+substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
diff --git a/ai/generative-ai-service/image-to-text/README.md b/ai/generative-ai-service/image-to-text/README.md
new file mode 100644
index 000000000..ef44ab1b1
--- /dev/null
+++ b/ai/generative-ai-service/image-to-text/README.md
@@ -0,0 +1,116 @@
+
+# Image-to-Text with Oracle OCI Gen AI
+
+This application is built using **Streamlit** and **Oracle OCI Generative AI**, allowing users to upload an image, input a prompt, and receive a text-based response generated by the AI model. It leverages Oracle's Gen AI Inference API for processing multimodal data (text and image).
+
+Reviewed: 19.11.2024
+
+
+
+
+---
+
+## Features
+
+- Upload an image file (`.png`, `.jpg`, `.jpeg`).
+- Provide a natural language prompt describing your query about the image.
+- Get a detailed response generated by Oracle's Generative AI model.
+- Easy-to-use interface built with Streamlit.
+
+---
+
+## Prerequisites
+
+1. **Oracle OCI Configuration**
+   - Set up your Oracle Cloud Infrastructure (OCI) account.
+   - Obtain the following:
+     - **Compartment OCID**
+     - **Generative AI Service Endpoint**
+     - **Model ID** (e.g., `meta.llama-3.2-90b-vision-instruct`).
+   - Configure your `~/.oci/config` file with your profile details (a sample config follows this list).
+
+2. **Python Environment**
+   - Install Python 3.8 or later.
+   - Install required dependencies (see below).
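+
+A minimal `~/.oci/config` sketch is shown below, assuming a `DEFAULT` profile and API-key authentication. Every value is a placeholder to be replaced with the details from your own tenancy; the OCI Console shows an equivalent snippet when you add an API key to your user.
+
+```ini
+[DEFAULT]
+user=ocid1.user.oc1..<your-user-ocid>
+fingerprint=<your-api-key-fingerprint>
+key_file=~/.oci/oci_api_key.pem
+tenancy=ocid1.tenancy.oc1..<your-tenancy-ocid>
+region=us-chicago-1
+```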
+
+---
+
+## Installation
+
+1. Clone the repository:
+
+
+2. Install dependencies:
+   ```bash
+   pip install -r requirements.txt
+   ```
+
+3. Configure OCI:
+   Ensure your `~/.oci/config` file is set up with the correct credentials and profile.
+
+---
+
+## Usage
+
+1. Run the application:
+   ```bash
+   streamlit run app.py
+   ```
+
+2. Open the web application in your browser at `http://localhost:8501`.
+
+3. Upload an image and provide a prompt in the text input field. Click **Generate Response** to receive the AI-generated output.
+
+---
+
+## File Structure
+
+```plaintext
+.
+├── app.py              # Main application file
+├── requirements.txt    # Python dependencies
+└── README.md           # Project documentation
+```
+
+---
+
+## Dependencies
+
+List of dependencies (found in `requirements.txt`):
+- **Streamlit**: For creating the web UI.
+- **oci**: Oracle Cloud Infrastructure SDK.
+- **Pillow**: Image handling support.
+
+(The `base64` module used to encode images is part of the Python standard library and requires no installation.)
+
+Install them using:
+```bash
+pip install -r requirements.txt
+```
+
+---
+
+## Notes
+
+- Ensure your OCI credentials and Compartment OCID are correct in the script.
+- Check the image format and size for compatibility.
+- Use the appropriate Generative AI service endpoint for your region.
+
+---
+
+## Troubleshooting
+
+- **Error: `oci.exceptions.ServiceError`**
+  - Check if your compartment OCID and API keys are configured correctly (a quick local config check follows this list).
+
+- **Streamlit does not load:**
+  - Verify that Streamlit is installed and the application is running on the correct port.
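+
+If the `ServiceError` persists, the short check below (a minimal sketch using only the standard `oci` SDK helpers) confirms that the SDK can read and validate your local `~/.oci/config` without calling the Generative AI service:
+
+```python
+import oci
+
+# Load the DEFAULT profile and verify that all required fields are present
+config = oci.config.from_file("~/.oci/config", "DEFAULT")
+oci.config.validate_config(config)
+print("OCI config looks valid; region:", config["region"])
+```
+
+If the check fails, fix the reported field before re-running the app.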
+
+
+---
+
+## Acknowledgments
+
+- [Oracle Cloud Infrastructure (OCI)](https://www.oracle.com/cloud/)
+- [Streamlit Documentation](https://docs.streamlit.io/)
+
+For questions or feedback, please contact [anshuman.p.panda@oracle.com].
diff --git a/ai/generative-ai-service/image-to-text/app.py b/ai/generative-ai-service/image-to-text/app.py
new file mode 100644
index 000000000..5b8f6cd69
--- /dev/null
+++ b/ai/generative-ai-service/image-to-text/app.py
@@ -0,0 +1,96 @@
+# Author: Ansh
+import streamlit as st
+import oci
+import base64
+from PIL import Image
+
+# OCI Configuration (put your compartment OCID below)
+compartmentId = "ocid1.compartment.oc1..***************************"
+llm_service_endpoint = "https://inference.generativeai.us-chicago-1.oci.oraclecloud.com"
+
+# Define functions
+def encode_image(image_path):
+    with open(image_path, "rb") as image_file:
+        return base64.b64encode(image_file.read()).decode("utf-8")
+
+def get_message(encoded_image, user_prompt):
+    content1 = oci.generative_ai_inference.models.TextContent()
+    content1.text = user_prompt
+
+    content2 = oci.generative_ai_inference.models.ImageContent()
+    image_url = oci.generative_ai_inference.models.ImageUrl()
+    image_url.url = f"data:image/jpeg;base64,{encoded_image}"
+    content2.image_url = image_url
+
+    message = oci.generative_ai_inference.models.UserMessage()
+    message.content = [content1, content2]
+    return message
+
+def get_chat_request(encoded_image, user_prompt):
+    chat_request = oci.generative_ai_inference.models.GenericChatRequest()
+    chat_request.messages = [get_message(encoded_image, user_prompt)]
+    chat_request.api_format = oci.generative_ai_inference.models.BaseChatRequest.API_FORMAT_GENERIC
+    chat_request.num_generations = 1
+    chat_request.is_stream = False
+    chat_request.max_tokens = 500
+    chat_request.temperature = 0.75
+    chat_request.top_p = 0.7
+    chat_request.top_k = -1
+    chat_request.frequency_penalty = 1.0
+    return chat_request
+
+def get_chat_detail(chat_request):
+    chat_detail = oci.generative_ai_inference.models.ChatDetails()
+    chat_detail.serving_mode = oci.generative_ai_inference.models.OnDemandServingMode(model_id="meta.llama-3.2-90b-vision-instruct")
+    chat_detail.compartment_id = compartmentId
+    chat_detail.chat_request = chat_request
+    return chat_detail
+
+# Streamlit UI
+st.title("Image to Text with OCI Gen AI")
+st.write("Upload an image, provide a prompt, and get a response from OCI Gen AI.")
+
+# Upload image
+uploaded_file = st.file_uploader("Upload an image", type=["png", "jpg", "jpeg"])
+
+# Prompt input
+user_prompt = st.text_input("Enter your prompt for the image:", value="Tell me about this image.")
+
+if uploaded_file:
+    # Save the uploaded image temporarily
+    temp_image_path = "temp_uploaded_image.jpg"
+    with open(temp_image_path, "wb") as f:
+        f.write(uploaded_file.getbuffer())
+
+    # Display the uploaded image
+    st.image(temp_image_path, caption="Uploaded Image", use_column_width=True)
+
+    # Process and call the model
+    if st.button("Generate Response"):
+        with st.spinner("Please wait while the model processes the image..."):
+            try:
+                # Encode the image
+                encoded_image = encode_image(temp_image_path)
+
+                # Setup OCI client
+                CONFIG_PROFILE = "DEFAULT"
+                config = oci.config.from_file('~/.oci/config', CONFIG_PROFILE)
+
+                llm_client = oci.generative_ai_inference.GenerativeAiInferenceClient(
+                    config=config,
+                    service_endpoint=llm_service_endpoint,
+                    retry_strategy=oci.retry.NoneRetryStrategy(),
+                    timeout=(10, 240)
+                )
+
+                # Get the chat request and response
+                llm_request = get_chat_request(encoded_image, user_prompt)
+                llm_payload = get_chat_detail(llm_request)
+                llm_response = llm_client.chat(llm_payload)
+
+                # Extract and display the response
+                llm_text = llm_response.data.chat_response.choices[0].message.content[0].text
+                st.success("Model Response:")
+                st.write(llm_text)
+            except Exception as e:
+                st.error(f"An error occurred: {str(e)}")
diff --git a/ai/generative-ai-service/image-to-text/image1.png b/ai/generative-ai-service/image-to-text/image1.png
new file mode 100644
index 000000000..bb0673b02
Binary files /dev/null and b/ai/generative-ai-service/image-to-text/image1.png differ
diff --git a/ai/generative-ai-service/image-to-text/requirements.txt b/ai/generative-ai-service/image-to-text/requirements.txt
new file mode 100644
index 000000000..51a642715
--- /dev/null
+++ b/ai/generative-ai-service/image-to-text/requirements.txt
@@ -0,0 +1,3 @@
+streamlit==1.33.0
+oci
+Pillow
\ No newline at end of file