In [None]:
# Replace any preinstalled transformers with the current development build from GitHub.
# %pip (rather than !pip) makes both commands act on the environment of the running kernel;
# in a "!pip ... && pip ..." chain each pip is simply whichever one is first on PATH.
# NOTE(review): this tracks the moving main branch - append @<tag-or-commit> to the URL
# if the notebook has to be reproducible.
%pip uninstall -y transformers
%pip install git+https://github.com/huggingface/transformers

In [None]:
# flash-attn is built with --no-build-isolation, i.e. against the packages already present
# in this environment, so its build helpers are installed first in this same cell.
# NOTE(review): the build presumably also needs an already-installed torch and a CUDA toolchain - confirm.
%pip install packaging wheel ninja
%pip install flash-attn --no-build-isolation

In [3]:
# Force a clean reinstall of ninja.
# Two %pip calls keep both steps inside the kernel's own environment.
%pip uninstall -y ninja
%pip install ninja

Found existing installation: ninja 1.11.1.1
Uninstalling ninja-1.11.1.1:
  Successfully uninstalled ninja-1.11.1.1
Collecting ninja
  Using cached ninja-1.11.1.1-py2.py3-none-manylinux1_x86_64.manylinux_2_5_x86_64.whl (307 kB)
Installing collected packages: ninja
Successfully installed ninja-1.11.1.1


In [2]:
# Show the installed NumPy version. The magic is spelled out so the cell does not depend on
# IPython's automagic being enabled (or on no variable named `pip` existing in the namespace).
%pip list | grep numpy

numpy                                    1.26.4
Note: you may need to restart the kernel to use updated packages.


In [4]:
# %pip installs into the environment of the running kernel, which !pip does not guarantee.
%pip install packaging

Note: you may need to restart the kernel to use updated packages.


In [5]:
# %pip installs into the environment of the running kernel, which !pip does not guarantee.
%pip install wheel



In [2]:
from PIL import Image 
import requests 
from transformers import AutoModelForCausalLM 
from transformers import AutoProcessor 

# Hugging Face Hub checkpoint shared by the model and its processor.
model_id = "microsoft/Phi-3-vision-128k-instruct"

# Load the weights once at cell level; process_image_with_phi3 reuses this global on every call.
# trust_remote_code=True runs Python code that ships with the checkpoint repository.
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map="cuda",
    trust_remote_code=True,
    torch_dtype="auto",
    _attn_implementation='flash_attention_2',
)

# Matching processor; process_image_with_phi3 uses it for the chat template,
# for turning prompt + image into model inputs, and for decoding the generated ids.
processor = AutoProcessor.from_pretrained(
    model_id,
    trust_remote_code=True,
)

def process_image_with_phi3(image_path, prompt_content, model_id="microsoft/Phi-3-vision-128k-instruct"):
    """Ask the already-loaded Phi-3 vision model one question about one image.

    Relies on the notebook-level globals ``model`` and ``processor``.

    Args:
        image_path: Path of the image file, passed to ``Image.open``.
        prompt_content: The instruction or question to put to the model about the image.
        model_id: Unused. Kept only so existing calls that pass it keep working;
            the checkpoint actually used is whatever the global ``model`` was loaded from.

    Returns:
        The decoded text of the first generated sequence, without the prompt tokens.
    """
    # Single-turn chat; "<|image_1|>" is the placeholder that refers to the first attached image.
    messages = [
        {"role": "user", "content": f"<|image_1|>\n{prompt_content}"},
    ]
    prompt = processor.tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)

    # Open the image in a context manager so the file handle is released after preprocessing,
    # and move the inputs to wherever the model actually lives instead of assuming "cuda:0".
    with Image.open(image_path) as image:
        inputs = processor(prompt, [image], return_tensors="pt").to(model.device)

    # Greedy decoding. No temperature is passed: it only applies when sampling.
    generation_args = {
        "max_new_tokens": 500,
        "do_sample": False,
    }
    generate_ids = model.generate(**inputs, eos_token_id=processor.tokenizer.eos_token_id, **generation_args)

    # generate() returns prompt + continuation; keep only the newly generated part.
    prompt_length = inputs['input_ids'].shape[1]
    generate_ids = generate_ids[:, prompt_length:]
    response = processor.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
    return response



Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]



In [19]:
# Text extraction: ask for the name and colour code in Images/paint-can.jpg.
print(
    process_image_with_phi3(
        "Images/paint-can.jpg",
        "Extract name and color code",
    )
)

The name of the product is 'FEATHER FALLS' and the color code is '6515'.


In [20]:
# Location question on Images/santa-barbara.jpg.
print(
    process_image_with_phi3(
        "Images/santa-barbara.jpg",
        "Where is this picture taken ?",
    )
)

The image shows an interior space with a large mural on the wall, a chandelier, and wooden pews. The specific location cannot be determined from the image alone without additional context or information.


In [17]:
# Ask for the average and peak heart rate in Images/rower_027.png.
print(
    process_image_with_phi3(
        "Images/rower_027.png",
        "Find Average and Peak heart rate",
    )
)

The average heart rate (Avg HR) is 129 and the peak heart rate (Peak HR) is 167.


In [21]:
# Same image as the heart-rate question, this time asking for the workout date.
print(
    process_image_with_phi3(
        "Images/rower_027.png",
        "Find Workout Date",
    )
)


The workout date is Sunday, March 24.


In [22]:
# Open-ended description of Images/Leaning-Tower-of-Pisa-Italy.webp.
print(
    process_image_with_phi3(
        "Images/Leaning-Tower-of-Pisa-Italy.webp",
        "Describe this Image",
    )
)

The image captures the iconic Leaning Tower of Pisa, a freestanding bell tower of the cathedral of the Italian city of Pisa. The tower is white with a black base and is tilted to one side. It is surrounded by a green lawn with people scattered around, enjoying the view. The sky is clear and blue, and there are trees and a building in the background. The perspective of the image is from a low angle, looking up at the tower, emphasizing its height and lean. The identifier 'sa_1760' doesn't provide additional information about the landmark.


In [23]:
# Open-ended description of Images/thali.jpg.
print(
    process_image_with_phi3(
        "Images/thali.jpg",
        "Describe this Image",
    )
)

The image shows a silver plate with a variety of Indian dishes. There are two bowls of rice, one with white rice and the other with yellow rice, both garnished with green herbs. There are also two small bowls of soup, one with a pinkish hue and the other with a red hue, both with green herbs on top. A large piece of bread is placed on the plate, and there is a round, golden-brown dish that appears to be a type of flatbread or pattie, garnished with a lime wedge. The plate is on a dark surface, and there is a piece of yellow paper to the left of the plate.


In [24]:
# Yes/no question about the meal in Images/thali.jpg.
print(
    process_image_with_phi3(
        "Images/thali.jpg",
        "Is this vegeterian meal?",
    )
)

Yes, the meal appears to be vegetarian as it includes rice, lentils, and what looks like a vegetable curry, all of which are plant-based ingredients.


In [25]:
# Problem-spotting question on Images/tirewithnail.jpg.
print(
    process_image_with_phi3(
        "Images/tirewithnail.jpg",
        "What is wrong?",
    )
)

There is no clear indication of what is wrong in the image. The tire appears to be in good condition without visible damage or flat spots.


In [28]:
# Location question on Images/santa_barbara_courthouse.jpg.
print(
    process_image_with_phi3(
        "Images/santa_barbara_courthouse.jpg",
        "Where is this?",
    )
)

This is a photograph of a building with a clock tower, likely a historical or cultural site, surrounded by palm trees and a clear blue sky.


In [31]:
# Ask for the event's date and place in Images/orangefitness_hike.jpg.
print(
    process_image_with_phi3(
        "Images/orangefitness_hike.jpg",
        "Extract event details. When and where",
    )
)

The event details are for a hiking experience on June 29th at Rancho San Antonio Open Space Preserve, Wildcat Loop Trail. The meet-up is at Deer Hollow Farm Hay Barn Picnic Tables at 8:45 am, and participants are instructed to take off their tents at 9 am. There is also an option to sign up with the studio for more details.


In [33]:
# Marathi-language input: same event-details prompt, applied to Images/patrika.png.
print(
    process_image_with_phi3(
        "Images/patrika.png",
        "Extract event details. When and where",
    )
)

The event details are not explicitly stated in the image, but it appears to be a religious or cultural ceremony, possibly related to the Hindu festival of Diwali, given the context of the text and the traditional attire of the figure in the image. The exact date and location are not provided.


In [34]:
# Structured output: ask for the menu in Images/matcha_menu.jpg as Markdown.
print(
    process_image_with_phi3(
        "Images/matcha_menu.jpg",
        "Extract menu in Markdown Format",
    )
)

The image displays a menu board for "COLD DRINKS" with various matcha latte options. Below is the markdown table conversion of the menu:

```markdown
| Drink Name                | Description                                                                 |
|---------------------------|-----------------------------------------------------------------------------|
| HONEY MATCHA LATTE        | [ Premium Matcha, natural honey, and whole milk ]                            |
| HONEY MATCHA TEA          | [ Premium Matcha, natural honey, and purified water ]                        |
| HONEY HOJICHA LATTE       | [ Premium HOJICHA, natural honey, and whole milk ]                           |
| LYCHEE MATCHA TEA         | [ Premium Matcha, real lychee, and cane sugar ]                              |
| RED DRAGON REFRESHER      | [ Real Red Dragon and Yuzu, and cane sugar ]                                 |
| HIBISCUS YUZU TEA W BASIL | [ Hibiscus Flower Tea, real Yuzu, Basil seed, and cane suga