In [1]:
import h5py
import numpy as np    
import matplotlib.pyplot as plt
import pandas as pd
import os
import torch
from PIL import Image

# Pick the GPU when one is available; report both the flag and the chosen device.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(torch.cuda.is_available())
print(device)

######## Very basic access to the dataset - let's see what we are working with! #######
image_directory = '/home/tp-socialmedia/Dataset_small/'
# Columns that are converted to pandas 'string' dtype below.
cols_to_strip = ['created_at', 'img_name', 'language', 'referenced_tweets', 'text', 'tweet_id']


def _bytes_to_text(value):
    """Decode a bytes value to str; return any other value unchanged.

    Decoding the raw bytes (instead of stringifying them and stripping the
    ``b'...'`` wrapper with a regex) also handles values containing an
    apostrophe, whose repr is ``b"..."``, and keeps newlines / non-ASCII
    characters intact instead of leaving ``\\n`` / ``\\xNN`` escape text behind.
    """
    if not isinstance(value, bytes):
        return value
    text = value.decode('utf-8', errors='replace')
    # NOTE(review): the previous regex (^b':?(.*)'$) also dropped one leading ':'
    # from the value; that is preserved here -- confirm it is still wanted.
    return text[1:] if text.startswith(':') else text


# The file is only read, so open it read-only and close it as soon as every
# column has been copied into memory (the [:] slice materialises the data).
with h5py.File('climatevisions_2019_popular.h5', 'r') as raw_dataset:
    dataset = raw_dataset['tweet_data']
    # One entry per key of the 'tweet_data' group (assuming each key is a column name).
    data_dict = {key: dataset[key][:] for key in dataset.keys()}

df = pd.DataFrame(data_dict)
df[cols_to_strip] = (
    df[cols_to_strip]
    .apply(lambda column: column.map(_bytes_to_text))
    .astype('string')
)

print(df.shape)
print(df.dtypes)

## only keep images here
# drop all columns except the img_name column
selected_columns = ['img_name']
df_selected = df.loc[:, selected_columns]
# Last expression of the cell so the notebook actually renders the preview.
df_selected.head()

(5000, 9)


Unnamed: 0,img_name
0,id_1153283149360762880_2019-07-22.jpg
1,id_1163744643600637952_2019-08-20.jpg
2,id_1122574040936452097_2019-04-28.jpg
3,id_1188805167958974465_2019-10-28.jpg
4,id_1108042949449969666_2019-03-19.jpg


In [2]:
## Write the per-image results into an .xlsx workbook (result.xlsx)
import json
import torchvision.transforms as transforms
import torchvision

from transformers import AutoModelForCausalLM, AutoTokenizer, AutoModel, AutoImageProcessor, AutoModelForImageClassification
from transformers import pipeline
from sklearn.metrics import pairwise
from PIL import Image
from ultralytics import YOLO
import cv2
from openpyxl import Workbook

# Same device selection as in the first cell, repeated so this cell can be re-run on its own.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

# Workbook that receives the header row here and one data row per processed image later.
wb = Workbook()
ws = wb.active

image_data = df["img_name"].tolist()

header = ["image_name", "object_detection_results", "aisak_description", "moondream_description", "gemini_description","facial_emotion", "confidence"]
# Caption stores keyed by image name; the gemini captions are read from a JSON file.
moondream_captions = {}
aisak_captions = {}
gemini_captions = {}

# Explicit UTF-8 so non-ASCII captions load the same way regardless of the platform's default encoding.
with open("gemini-captions.json", "r", encoding="utf-8") as file:
    gemini_captions = json.load(file) ### read out captions

ws.append(header)

# Required lists and co
# NOTE(review): yolov8n.pt is presumably the COCO-pretrained checkpoint, whose label set
# contains 'bear' but (as far as I know) neither 'penguin' nor 'polar bear' -- confirm
# these labels can actually be produced by the detector.
animals = ['bear', 'penguin', 'polar bear']


# All Models so they are only loaded once
yolo_model = YOLO("yolov8n.pt").to(device)
emotion_processor = AutoImageProcessor.from_pretrained("Rajaram1996/FacialEmoRecog")
emotion_model = AutoModelForImageClassification.from_pretrained("Rajaram1996/FacialEmoRecog").to(device)
long_desc_model_id = "vikhyatk/moondream2"
long_desc_revision = "2024-04-02"
long_desc_model = AutoModelForCausalLM.from_pretrained(
    long_desc_model_id, trust_remote_code=True, revision=long_desc_revision
).to(device)
long_desc_tokenizer = AutoTokenizer.from_pretrained(long_desc_model_id, revision=long_desc_revision)
# Run the captioning pipeline on the same device as every other model in this cell
# (it was previously the only one left on the pipeline's default device).
short_desc_pipeline = pipeline("image-to-text", model="aisak-ai/aisak-visual", device=device)
sbert_model = AutoModel.from_pretrained("Voicelab/sbert-base-cased-pl").to(device)
sbert_tokenizer = AutoTokenizer.from_pretrained("Voicelab/sbert-base-cased-pl")

### only temporary to test for some images and not all (locally); set to None to process every image
max_images = 6
images_to_process = image_data if max_images is None else image_data[:max_images]

try:
    for img_name in images_to_process:

        img_name = img_name.replace("\n","")

        # Read the pixels and release the file handle right away; convert to RGB so every
        # downstream model receives a three-channel image regardless of the file's mode.
        with Image.open(os.path.join(image_directory, img_name)) as image_file:
            image = image_file.convert("RGB")

        results = yolo_model(image)
        names = yolo_model.names
        emotion = "---"

        # Unique class names detected in the image, in order of first appearance.
        detected_objects = []
        for result in results:
            for class_id in result.boxes.cls:
                name = names[int(class_id)]
                if name not in detected_objects:
                    detected_objects.append(name)
        print(detected_objects)

        if 'person' in detected_objects:
            # Preprocess the image using the image processor
            inputs = emotion_processor(images=image, return_tensors="pt").to(device)

            # Inference only: no autograd graph is needed for the logits.
            with torch.no_grad():
                logits = emotion_model(**inputs).logits

            # Get the predicted label index
            predicted_label = logits.argmax(dim=-1).item()

            # Get the predicted label name using the model's config
            emotion = emotion_model.config.id2label[predicted_label]

            # Print the predicted label
            print("Predicted label: ", emotion)

        elif any(animal in detected_objects for animal in animals):
            #TODO
            print('animals')
        else:
            #TODO
            print('else')
            print()

        # long description model
        enc_image = long_desc_model.encode_image(image).to(device)
        moonDreamResult = long_desc_model.answer_question(enc_image, "Describe this image.", long_desc_tokenizer)
        moondream_captions[img_name] = moonDreamResult

        # short description model; the raw pipeline output is stored, the caption text is extracted below
        aisakResult = short_desc_pipeline(image)
        aisak_captions[img_name] = aisakResult

        # gemini description
        geminiResult = gemini_captions[img_name]

        # Comma-separated list of detected classes without a trailing separator.
        object_string = ', '.join(detected_objects)

        # Take the first value of the first entry returned by the pipeline as the caption text.
        mapEntry = aisakResult[0]
        aisakResult = next(iter(mapEntry.values()))

        ### Comparison of the models ###
        tokens = sbert_tokenizer([geminiResult, moonDreamResult, aisakResult], padding=True, truncation=True, return_tensors='pt').to(device)
        with torch.no_grad():
            embeddings = sbert_model(tokens["input_ids"], tokens["attention_mask"]).pooler_output
        similarity_matrix = pairwise.cosine_similarity(embeddings.cpu().numpy())
        print("Similarities: " + str(similarity_matrix[0,1]) + ", " + str(similarity_matrix[0,2]) + ", " + str(similarity_matrix[1,2]))
        # Mean of the three pairwise caption similarities; written to the "confidence" column.
        avg_sim = (similarity_matrix[0,1] + similarity_matrix[0,2] + similarity_matrix[1,2]) / 3
        print("Average: " , avg_sim)

        data_row = [img_name, object_string, aisakResult, moonDreamResult, geminiResult, emotion, avg_sim]
        ws.append(data_row)
finally:
    # Save whatever rows were collected even if an image fails part-way through the run;
    # the exception itself still propagates.
    wb.save("result.xlsx")

  from .autonotebook import tqdm as notebook_tqdm



0: 640x384 (no detections), 89.9ms
Speed: 2.0ms preprocess, 89.9ms inference, 1376.3ms postprocess per image at shape (1, 3, 640, 384)
[]
else



Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Average:  0.873517910639445

0: 416x640 (no detections), 442.8ms
Speed: 2.0ms preprocess, 442.8ms inference, 0.0ms postprocess per image at shape (1, 3, 416, 640)
[]
else



Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Average:  0.8720777829488119

0: 480x640 1 bird, 453.6ms
Speed: 2.5ms preprocess, 453.6ms inference, 1.0ms postprocess per image at shape (1, 3, 480, 640)
['bird']
else



Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Average:  0.903128465016683

0: 640x608 1 person, 548.4ms
Speed: 2.0ms preprocess, 548.4ms inference, 1.0ms postprocess per image at shape (1, 3, 640, 608)
['person']
person


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Average:  0.9182259241739908

0: 512x640 1 person, 574.8ms
Speed: 2.5ms preprocess, 574.8ms inference, 1.0ms postprocess per image at shape (1, 3, 512, 640)
['person']
person


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Average:  0.9083538850148519

0: 512x640 1 person, 455.5ms
Speed: 2.0ms preprocess, 455.5ms inference, 1.0ms postprocess per image at shape (1, 3, 512, 640)
['person']
person


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Average:  0.8902325630187988

0: 352x640 9 persons, 1 book, 569.0ms
Speed: 1.0ms preprocess, 569.0ms inference, 2.8ms postprocess per image at shape (1, 3, 352, 640)
['book', 'person']
person


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Average:  0.9210977554321289

0: 416x640 1 person, 1 couch, 494.6ms
Speed: 1.0ms preprocess, 494.6ms inference, 1.0ms postprocess per image at shape (1, 3, 416, 640)
['person', 'couch']
person


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Average:  0.8911571502685547

0: 448x640 (no detections), 497.8ms
Speed: 2.0ms preprocess, 497.8ms inference, 0.0ms postprocess per image at shape (1, 3, 448, 640)
[]
else



Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Average:  0.9032729466756185

0: 480x640 1 person, 520.0ms
Speed: 1.0ms preprocess, 520.0ms inference, 1.0ms postprocess per image at shape (1, 3, 480, 640)
['person']
person


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
