In [3]:
import h5py
import numpy as np    
import matplotlib.pyplot as plt
import pandas as pd
import os
import torch
from PIL import Image

# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(torch.cuda.is_available())

######## Very basic access to the dataset - let's see what we are working with! #######
# The file is only read in this cell, so it is opened read-only ('r') instead of
# read/write ('r+'): this cannot modify the dataset by accident and also works
# when the file is write-protected or already opened elsewhere.
raw_dataset = h5py.File('climatevisions_2019_popular.h5', 'r')
dataset = raw_dataset['tweet_data']
image_directory = 'C:\\Users\\Admin\\Documents\\Dataset_small\\'
cols_to_strip = ['created_at', 'img_name', 'language', 'referenced_tweets', 'text', 'tweet_id']

# Read every entry of the 'tweet_data' group fully into memory ([:] copies the
# stored values), keyed by its name so each entry becomes one DataFrame column.
data_dict = {key: dataset[key][:] for key in dataset.keys()}

df = pd.DataFrame(data_dict)
# The text columns are cast to pandas strings and the regex then removes the
# leading "b'" (plus an optional ':') and the trailing "'" that the cast leaves behind.
df[cols_to_strip] = df[cols_to_strip].astype('string')
df[cols_to_strip] = df[cols_to_strip].replace(to_replace=r'^b\':?(.*)\'$', value=r'\1', regex=True)

print(df.shape)

## only keep images here
# drop all columns except the img_ columns
selected_columns = ['img_name']
df_selected = df.loc[:, selected_columns]

print(device)

True
(5000, 9)
cuda


In [5]:
## Write the per-image results into an Excel workbook (result.xlsx)
import json
import torchvision.transforms as transforms
import torchvision

from transformers import AutoModelForCausalLM, AutoTokenizer, AutoModel, AutoImageProcessor, AutoModelForImageClassification
from transformers import pipeline
from sklearn.metrics import pairwise
from PIL import Image
from ultralytics import YOLO
import cv2
from openpyxl import Workbook

# Use the GPU when one is available; most models below are moved to this device.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

# Workbook that receives one row of results per processed image.
wb = Workbook()
ws = wb.active

image_data = df["img_name"].tolist()

header = ["image_name", "object_detection_results", "aisak_description", "moondream_description", "gemini_description","facial_emotion", "confidence"]
moondream_captions = {}
aisak_captions = {}
gemini_captions = {}

# Pre-computed Gemini captions, looked up by image file name inside the loop.
# The encoding is given explicitly so decoding does not depend on the platform default.
with open("gemini-captions.json", "r", encoding="utf-8") as file:
    gemini_captions = json.load(file) ### read out captions

ws.append(header)

# Required lists and co
animals = ['bear', 'penguin', 'polar bear']


# All Models so they are only loaded once
yolo_model = YOLO("yolov8n.pt").to(device)
emotion_processor = AutoImageProcessor.from_pretrained("Rajaram1996/FacialEmoRecog")
emotion_model = AutoModelForImageClassification.from_pretrained("Rajaram1996/FacialEmoRecog").to(device)
long_desc_model_id = "vikhyatk/moondream2"
long_desc_revision = "2024-04-02"
long_desc_model = AutoModelForCausalLM.from_pretrained(
    long_desc_model_id, trust_remote_code=True, revision=long_desc_revision
).to(device)
long_desc_tokenizer = AutoTokenizer.from_pretrained(long_desc_model_id, revision=long_desc_revision)
short_desc_pipeline = pipeline("image-to-text", model="aisak-ai/aisak-visual")
# NOTE(review): the checkpoint name ends in "-pl" - confirm it is suited to the
# language of the captions that are compared below.
sbert_model = AutoModel.from_pretrained("Voicelab/sbert-base-cased-pl").to(device)
sbert_tokenizer = AutoTokenizer.from_pretrained("Voicelab/sbert-base-cased-pl")

MAX_IMAGES = 20 ### only temporary to test for some images and not all (locally)
i = 0  # number of images processed so far
for img_name in image_data[:MAX_IMAGES]:

    img_name = img_name.replace("\n","")
    # Convert to 3-channel RGB so grayscale / palette / RGBA files reach every
    # model in the same layout.
    image = Image.open(image_directory + img_name).convert("RGB")
    #image.show()
    results = yolo_model(image)
    names = yolo_model.names
    emotion = "---"

    # Collect the distinct class names YOLO detected, in order of first appearance.
    # The list is created before the loop so it always exists, even if `results`
    # were empty.
    detected_objects = []
    for result in results:
        for class_id in result.boxes.cls:
            name = names[int(class_id)]
            if name not in detected_objects:
                detected_objects.append(name)
    print(detected_objects)

    if 'person' in detected_objects:
        # Preprocess the image using the image processor
        inputs = emotion_processor(images=image, return_tensors="pt").to(device)

        # Inference only: no gradients are needed, so do not build the autograd graph.
        with torch.no_grad():
            logits = emotion_model(**inputs).logits

        # Index of the highest logit, mapped to its name through the model config.
        predicted_label = logits.argmax(dim=-1).item()
        emotion = emotion_model.config.id2label[predicted_label]

        # Print the predicted label
        print("Predicted label: ", emotion)

    elif any(animal in detected_objects for animal in animals):
        #TODO
        print('animals')
    else:
        #TODO
        print('else')
        print()

    # long description model
    enc_image = long_desc_model.encode_image(image).to(device)
    moonDreamResult = long_desc_model.answer_question(enc_image, "Describe this image.", long_desc_tokenizer)
    moondream_captions[img_name] = moonDreamResult

    # short description model (the raw pipeline output is kept in aisak_captions)
    aisakResult = short_desc_pipeline(image)
    aisak_captions[img_name] = aisakResult

    # gemini description (raises KeyError when the image has no stored caption)
    geminiResult = gemini_captions[img_name]

    # Comma-separated object list without a trailing separator.
    object_string = ', '.join(detected_objects)

    # The pipeline output is indexed as a list of mappings; take the first value
    # of the first entry as the caption text.
    aisakResult = next(iter(aisakResult[0].values()))

    ### Comparison of the models ###
    # Embed the three captions and average their pairwise cosine similarities
    # (gemini/moondream, gemini/aisak, moondream/aisak).
    tokens = sbert_tokenizer([geminiResult, moonDreamResult, aisakResult], padding=True, truncation=True, return_tensors='pt').to(device)
    with torch.no_grad():
        x = sbert_model(tokens["input_ids"], tokens["attention_mask"]).pooler_output
    similarity_matrix = pairwise.cosine_similarity(x.cpu().numpy())
    print("Similarities: " + str(similarity_matrix[0,1]) + ", " + str(similarity_matrix[0,2]) + ", " + str(similarity_matrix[1,2]))
    avg_sim = (similarity_matrix[0,1] + similarity_matrix[0,2] + similarity_matrix[1,2]) / 3
    print("Average: " , avg_sim)

    # Column order must match `header`.
    data_row = [img_name, object_string, aisakResult, moonDreamResult, geminiResult, emotion, avg_sim]
    ws.append(data_row)
    i += 1

wb.save("result.xlsx")



cuda


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.



0: 640x384 (no detections), 77.7ms
Speed: 2.6ms preprocess, 77.7ms inference, 1.0ms postprocess per image at shape (1, 3, 640, 384)
[]
else





Similarities: 0.96043956, 0.8199624, 0.84015167
Average:  0.8735178311665853

0: 416x640 (no detections), 20.7ms
Speed: 1.0ms preprocess, 20.7ms inference, 1.0ms postprocess per image at shape (1, 3, 416, 640)
[]
else





Similarities: 0.93199754, 0.83413255, 0.8501033
Average:  0.8720777829488119

0: 480x640 1 bird, 31.2ms
Speed: 3.0ms preprocess, 31.2ms inference, 10.8ms postprocess per image at shape (1, 3, 480, 640)
['bird']
else





Similarities: 0.9622147, 0.8615202, 0.88565075
Average:  0.9031285444895426

0: 640x608 1 person, 41.1ms
Speed: 2.5ms preprocess, 41.1ms inference, 2.0ms postprocess per image at shape (1, 3, 640, 608)
['person']
Predicted label:  disgust




Similarities: 0.94819474, 0.92981994, 0.87666273
Average:  0.9182257652282715

0: 512x640 1 person, 20.0ms
Speed: 3.0ms preprocess, 20.0ms inference, 2.0ms postprocess per image at shape (1, 3, 512, 640)
['person']
Predicted label:  surprise




Similarities: 0.9687861, 0.875968, 0.8803078
Average:  0.9083539644877116

0: 512x640 1 person, 23.2ms
Speed: 2.0ms preprocess, 23.2ms inference, 3.0ms postprocess per image at shape (1, 3, 512, 640)
['person']
Predicted label:  disgust




Similarities: 0.95590293, 0.85181236, 0.86298215
Average:  0.8902324835459391

0: 352x640 9 persons, 1 book, 171.3ms
Speed: 1.0ms preprocess, 171.3ms inference, 9.5ms postprocess per image at shape (1, 3, 352, 640)
['book', 'person']
Predicted label:  happy




Similarities: 0.96384436, 0.9152341, 0.88421476
Average:  0.9210977554321289

0: 416x640 1 person, 1 couch, 24.0ms
Speed: 1.0ms preprocess, 24.0ms inference, 2.0ms postprocess per image at shape (1, 3, 416, 640)
['person', 'couch']
Predicted label:  surprise




Similarities: 0.94020826, 0.85486245, 0.8784007
Average:  0.8911571502685547

0: 448x640 (no detections), 101.2ms
Speed: 2.0ms preprocess, 101.2ms inference, 0.0ms postprocess per image at shape (1, 3, 448, 640)
[]
else





Similarities: 0.96661425, 0.8814354, 0.8617691
Average:  0.9032729466756185

0: 480x640 1 person, 32.6ms
Speed: 1.5ms preprocess, 32.6ms inference, 1.0ms postprocess per image at shape (1, 3, 480, 640)
['person']
Predicted label:  disgust




Similarities: 0.9404683, 0.8889817, 0.8852731
Average:  0.9049077033996582

0: 480x640 7 persons, 14.8ms
Speed: 2.0ms preprocess, 14.8ms inference, 2.0ms postprocess per image at shape (1, 3, 480, 640)
['person']
Predicted label:  sadness




Similarities: 0.9499887, 0.80549806, 0.7977738
Average:  0.8510868549346924

0: 640x384 1 person, 22.7ms
Speed: 2.0ms preprocess, 22.7ms inference, 1.0ms postprocess per image at shape (1, 3, 640, 384)
['person']
Predicted label:  surprise




Similarities: 0.9447664, 0.88645184, 0.89222336
Average:  0.9078138669331869

0: 640x640 2 persons, 27.1ms
Speed: 3.0ms preprocess, 27.1ms inference, 1.0ms postprocess per image at shape (1, 3, 640, 640)
['person']
Predicted label:  disgust




Similarities: 0.92845845, 0.90658784, 0.8888881
Average:  0.9079781373341879

0: 640x384 1 person, 23.6ms
Speed: 1.0ms preprocess, 23.6ms inference, 2.0ms postprocess per image at shape (1, 3, 640, 384)
['person']
Predicted label:  sadness




Similarities: 0.9262829, 0.8808224, 0.8526229
Average:  0.8865760962168375

0: 448x640 7 persons, 3 airplanes, 8 ties, 22.0ms
Speed: 3.0ms preprocess, 22.0ms inference, 2.0ms postprocess per image at shape (1, 3, 448, 640)
['person', 'tie', 'airplane']
Predicted label:  neutral




Similarities: 0.9667012, 0.86994743, 0.8606601
Average:  0.8991029262542725

0: 640x544 3 persons, 2 ties, 94.1ms
Speed: 2.0ms preprocess, 94.1ms inference, 1.0ms postprocess per image at shape (1, 3, 640, 544)
['tie', 'person']
Predicted label:  anger




Similarities: 0.95445585, 0.9084374, 0.90367115
Average:  0.9221881230672201

0: 448x640 7 persons, 1 airplane, 9 ties, 22.0ms
Speed: 2.0ms preprocess, 22.0ms inference, 2.0ms postprocess per image at shape (1, 3, 448, 640)
['person', 'tie', 'airplane']
Predicted label:  neutral




Similarities: 0.26992, 0.32475388, 0.86637014
Average:  0.4870146910349528

0: 448x640 9 persons, 2 skiss, 1 snowboard, 22.6ms
Speed: 2.0ms preprocess, 22.6ms inference, 0.9ms postprocess per image at shape (1, 3, 448, 640)
['person', 'snowboard', 'skis']
Predicted label:  disgust




Similarities: 0.89004815, 0.8959971, 0.90223324
Average:  0.8960928916931152

0: 640x544 5 persons, 1 car, 35.4ms
Speed: 3.0ms preprocess, 35.4ms inference, 1.0ms postprocess per image at shape (1, 3, 640, 544)
['person', 'car']
Predicted label:  happy




Similarities: 0.9665592, 0.9200032, 0.89857966
Average:  0.9283806482950846

0: 480x640 1 person, 24.5ms
Speed: 6.5ms preprocess, 24.5ms inference, 1.5ms postprocess per image at shape (1, 3, 480, 640)
['person']
Predicted label:  surprise




Similarities: 0.93611336, 0.82761157, 0.8636832
Average:  0.8758026758829752


In [None]:

## Testing Playground for DINOv2 - Classifier not trained = fail ##

import torch
import torchvision.transforms as transforms
from PIL import Image
import torch.nn as nn
import requests

# Load the pretrained DINOv2 model
model = torch.hub.load('facebookresearch/dinov2', 'dinov2_vitg14')

# Set the model to evaluation mode
model.eval()

# Transformation to preprocess the input image
preprocess = transforms.Compose([
    transforms.Resize(256),
    transforms.CenterCrop(224),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
])

# Load an example image
image_test = "id_1088293078027374592_2019-01-24.jpg"
# Normalize above uses three per-channel statistics, so force a 3-channel RGB
# image; grayscale or RGBA files would otherwise fail in the transform.
input_image = Image.open(image_directory + image_test).convert("RGB")
input_tensor = preprocess(input_image)
input_batch = input_tensor.unsqueeze(0)  # Create a mini-batch as expected by the model

# Check if a GPU is available and if not, use a CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
input_batch = input_batch.to(device)
model = model.to(device)

with torch.no_grad():
    # Get the features from the model
    features = model(input_batch)

# Define a simple classifier on top of the features
class SimpleClassifier(nn.Module):
    """Single fully connected layer that maps a feature vector to class logits.

    Args:
        num_features: size of the last dimension of the input features.
        num_classes: number of logits produced per sample (defaults to 1000).
    """

    def __init__(self, num_features, num_classes=1000):
        super().__init__()
        # The only learnable component; it keeps the attribute name `fc`.
        self.fc = nn.Linear(in_features=num_features, out_features=num_classes)

    def forward(self, x):
        """Return the raw (un-normalised) logits for `x`."""
        logits = self.fc(x)
        return logits

# Instantiate and use the classifier
# NOTE: the linear head is freshly initialised and never trained, so the
# predicted class below is not meaningful.
num_classes = 1000  # Change this to the number of classes in your dataset
classifier = SimpleClassifier(features.size(1), num_classes)
classifier = classifier.to(device)

# Predict the object class
with torch.no_grad():
    class_logits = classifier(features)
    class_probabilities = torch.softmax(class_logits, dim=1)
    predicted_classes = torch.argmax(class_probabilities, dim=1)

# Download the ImageNet class labels
LABELS_URL = "https://raw.githubusercontent.com/anishathalye/imagenet-simple-labels/master/imagenet-simple-labels.json"
# A timeout keeps the cell from hanging on a stalled connection, and
# raise_for_status() surfaces an HTTP error instead of failing later in .json().
response = requests.get(LABELS_URL, timeout=10)
response.raise_for_status()
class_labels = response.json()

# Map predicted class indices to class names; .tolist() yields plain Python
# ints so the label list is indexed with integers rather than tensor elements.
predicted_class_names = [class_labels[idx] for idx in predicted_classes.tolist()]

print(predicted_class_names)


Using cache found in C:\Users\Admin/.cache\torch\hub\facebookresearch_dinov2_main


['St. Bernard']


In [5]:
##STEP 1: Load the captions JSON file to extract object labels.
import json
# Load the JSON file containing filename-caption pairs
# Read the filename -> caption mapping. The encoding is given explicitly so that
# decoding does not depend on the platform default.
with open('gemini-captions.json', 'r', encoding='utf-8') as f:
    captions_dict = json.load(f)

# Filter out empty captions
captions = [caption for caption in captions_dict.values() if caption]

#Step2: extract objects from captions using LLaMA3 model
import json
import torch
from PIL import Image
from transformers import AutoProcessor, AutoModelForZeroShotObjectDetection
import matplotlib.pyplot as plt
import matplotlib.patches as patches
import spacy

# Load LLaMA3 model and spaCy tokenizer
from transformers import AutoModel, AutoTokenizer

# MiniCPM-Llama3-V 2.5 checkpoint: half-precision weights placed on the GPU,
# plus the matching tokenizer.
_minicpm_checkpoint = 'openbmb/MiniCPM-Llama3-V-2_5'
model_llama3 = AutoModel.from_pretrained(
    _minicpm_checkpoint,
    trust_remote_code=True,
    torch_dtype=torch.float16,
).to(device='cuda')
model_llama3.eval()  # inference mode
tokenizer_llama3 = AutoTokenizer.from_pretrained(_minicpm_checkpoint, trust_remote_code=True)

# spaCy English pipeline, used for part-of-speech tagging of the generated text
nlp = spacy.load('en_core_web_sm')

# Function to load first n captions from JSON file
def load_first_n_captions(file_path, n=5):
    """Return the first `n` caption values stored in a JSON file.

    Args:
        file_path: path to a JSON file holding a mapping whose values are captions.
        n: number of captions to return, taken in file order (default 5).

    Returns:
        A list with at most `n` values of the mapping.
    """
    # Decode as UTF-8 explicitly so the result does not depend on the platform
    # default encoding.
    with open(file_path, 'r', encoding='utf-8') as f:
        captions_dict = json.load(f)
    captions = list(captions_dict.values())[:n]
    return captions

# Function to extract objects from caption using LLaMA3 model
def extract_objects_from_caption(caption):
    """Ask the chat model which objects a caption mentions and return the nouns.

    Uses the module-level `model_llama3`, `tokenizer_llama3` and `nlp` objects.

    Args:
        caption: caption text that is embedded in the question sent to the model.

    Returns:
        A list of lower-cased tokens that spaCy tags as NOUN in the model's
        answer (duplicates are kept).
    """
    # Prepare question for the model
    question = f'What objects are in the image? Caption: {caption}'
    msgs = [{'role': 'user', 'content': question}]

    # Generate response from the model
    res = model_llama3.chat(
        image=None,  # We don't use image input here
        msgs=msgs,
        tokenizer=tokenizer_llama3,
        sampling=True,
        temperature=0.7,
        stream=False  # Ensure streaming is off for single response
    )

    # With streaming off the answer is expected to be a plain string, and calling
    # next() on a string raises TypeError. Accept a string as-is and fall back to
    # joining the chunks if an iterator of text pieces is returned instead.
    generated_text = res if isinstance(res, str) else "".join(res)

    doc = nlp(generated_text)   # Tokenize generated text using spaCy
    objects = [token.text.lower() for token in doc if token.pos_ == 'NOUN']

    return objects

# Main function to process captions and extract objects
def main(
    file_path='subset-captions.json',
    image_path="C:\\Users\\User\\OneDrive\\Desktop\\ProjectCode\\Team_Project_ComputerVision-1\\Dataset_small\\id_1088293078027374592_2019-01-24.jpg",
    n_captions=5,
):
    """Extract object labels from captions and ground them in one image.

    Steps: load the first `n_captions` captions, collect the nouns returned by
    `extract_objects_from_caption`, run Grounding DINO with those labels as the
    text prompt, print the post-processed detections and plot the boxes.

    Args:
        file_path: JSON file with the captions (default: 'subset-captions.json').
        image_path: image to run the detector on (default: the original
            hard-coded sample image).
        n_captions: number of captions to read from `file_path` (default 5).

    Returns:
        None. Detections are printed and shown as a matplotlib figure.
    """
    captions = load_first_n_captions(file_path, n=n_captions)

    object_labels = set()
    for caption in captions:
        if not caption:
            # Nothing to extract from an empty caption.
            continue
        object_labels.update(extract_objects_from_caption(caption))

    # Sorted so the prompt is the same from run to run for the same label set.
    object_labels = sorted(object_labels)
    if not object_labels:
        print("No object labels could be extracted from the captions; skipping detection.")
        return

    # Grounding DINO expects lower-cased phrases separated by dots, each ending
    # with a dot (e.g. "a cat. a remote control."); a space-joined string would
    # be treated as one long phrase.
    object_labels_text = ". ".join(object_labels) + "."

    # step 3: Grounding DINO model for object detection
    # Define the model ID for grounding DINO
    model_id = "IDEA-Research/grounding-dino-tiny"

    # Initialize processor and model for grounding DINO
    processor = AutoProcessor.from_pretrained(model_id)
    model_dino = AutoModelForZeroShotObjectDetection.from_pretrained(model_id)

    # Load image from local file path (forced to 3-channel RGB)
    image = Image.open(image_path).convert("RGB")

    # Prepare text input with object labels extracted from the captions
    text = object_labels_text

    # Ensure the model is on the correct device (e.g., GPU)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model_dino.to(device)

    # Preprocess inputs
    inputs = processor(images=image, text=text, return_tensors="pt")
    inputs = {k: v.to(device) for k, v in inputs.items()}

    # Perform inference
    with torch.no_grad():
        outputs = model_dino(**inputs)

    # Post-process the outputs to get object detection results.
    # target_sizes is (height, width); PIL's image.size is (width, height).
    results = processor.post_process_grounded_object_detection(
        outputs,
        inputs["input_ids"],
        box_threshold=0.4,
        text_threshold=0.3,
        target_sizes=[image.size[::-1]]
    )

    # Print the detected objects and their confidence scores
    print(results)

    # Display the image with bounding boxes around detected objects
    fig, ax = plt.subplots(1)
    ax.imshow(image)

    # `results` holds one dict per input image with parallel "scores", "labels"
    # and "boxes" entries. Because target_sizes was passed, the boxes are already
    # in pixel coordinates (xmin, ymin, xmax, ymax) and must not be rescaled.
    for result in results:
        for score, label, box in zip(result["scores"], result["labels"], result["boxes"]):
            xmin, ymin, xmax, ymax = box.tolist()

            # Create a Rectangle patch
            rect = patches.Rectangle((xmin, ymin), xmax - xmin, ymax - ymin, linewidth=1, edgecolor='r', facecolor='none')
            ax.add_patch(rect)

            # Display class name and score
            ax.text(xmin, ymin, f"{label} {score.item():.2f}", bbox=dict(facecolor='red', alpha=0.5))

    plt.axis('off')  # Turn off axis labels
    plt.show()

# Standard entry-point guard: the dunder names are required (`_name_` is an
# undefined variable and raises NameError).
if __name__ == '__main__':
    main()

ModuleNotFoundError: No module named 'spacy'

In [5]:
## Testing Playground for DINOv2 - Classifier not trained = fail ##

import torch
import torchvision.transforms as transforms
from PIL import Image
import torch.nn as nn
import requests

# Load the pretrained DINOv2 model
model = torch.hub.load('facebookresearch/dinov2', 'dinov2_vitg14')

# Set the model to evaluation mode
model.eval()

# Transformation to preprocess the input image
preprocess = transforms.Compose([
    transforms.Resize(256),
    transforms.CenterCrop(224),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
])

# Load an example image
image_test = "id_1088293078027374592_2019-01-24.jpg"
# Normalize above uses three per-channel statistics, so force a 3-channel RGB
# image; grayscale or RGBA files would otherwise fail in the transform.
input_image = Image.open(image_directory + image_test).convert("RGB")
input_tensor = preprocess(input_image)
input_batch = input_tensor.unsqueeze(0)  # Create a mini-batch as expected by the model

# Check if a GPU is available and if not, use a CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
input_batch = input_batch.to(device)
model = model.to(device)

with torch.no_grad():
    # Get the features from the model
    features = model(input_batch)

# Define a simple classifier on top of the features
class SimpleClassifier(nn.Module):
    """Single fully connected layer that maps a feature vector to class logits.

    Args:
        num_features: size of the last dimension of the input features.
        num_classes: number of logits produced per sample (defaults to 1000).
    """

    def __init__(self, num_features, num_classes=1000):
        super().__init__()
        # The only learnable component; it keeps the attribute name `fc`.
        self.fc = nn.Linear(in_features=num_features, out_features=num_classes)

    def forward(self, x):
        """Return the raw (un-normalised) logits for `x`."""
        logits = self.fc(x)
        return logits

# Instantiate and use the classifier
# NOTE: the linear head is freshly initialised and never trained, so the
# predicted class below is not meaningful.
num_classes = 1000  # Change this to the number of classes in your dataset
classifier = SimpleClassifier(features.size(1), num_classes)
classifier = classifier.to(device)

# Predict the object class
with torch.no_grad():
    class_logits = classifier(features)
    class_probabilities = torch.softmax(class_logits, dim=1)
    predicted_classes = torch.argmax(class_probabilities, dim=1)

# Download the ImageNet class labels
LABELS_URL = "https://raw.githubusercontent.com/anishathalye/imagenet-simple-labels/master/imagenet-simple-labels.json"
# A timeout keeps the cell from hanging on a stalled connection, and
# raise_for_status() surfaces an HTTP error instead of failing later in .json().
response = requests.get(LABELS_URL, timeout=10)
response.raise_for_status()
class_labels = response.json()

# Map predicted class indices to class names; .tolist() yields plain Python
# ints so the label list is indexed with integers rather than tensor elements.
predicted_class_names = [class_labels[idx] for idx in predicted_classes.tolist()]

print(predicted_class_names)


PyTorch Version: 2.2.1+cu121
Torchvision Version: 0.17.1+cpu
