In [1]:
import h5py
import numpy as np    
import matplotlib.pyplot as plt
import pandas as pd
import os
from PIL import Image


######## Very basic access to the dataset - let's see what we are working with! #######
raw_dataset = h5py.File('climatevisions_2019_popular.h5','r+') 
dataset = raw_dataset['tweet_data']
image_directory = 'C:\\Users\\Admin\\Documents\\Dataset_small\\'
cols_to_strip = ['created_at', 'img_name', 'language', 'referenced_tweets', 'text', 'tweet_id']   

data_dict = {}
# Iterate through the keys (assuming each key is a column name)
for key in dataset.keys():
     # Access the data for each column
     column_data = dataset[key][:]
        
     # Store the data in the dictionary with the column name as the key
     data_dict[key] = column_data
     
df = pd.DataFrame(data_dict)
df[cols_to_strip] = df[cols_to_strip].astype('string')
df[cols_to_strip] = df[cols_to_strip].replace(to_replace=r'^b\':?(.*)\'$', value=r'\1', regex=True)


print(df.shape)
df.dtypes

## only keep images here
# drop all columns exepct img_ columns
selected_columns = ['img_name']
df_selected = df.loc[:, selected_columns]
df_selected.head()

(5000, 9)


Unnamed: 0,img_name
0,id_1153283149360762880_2019-07-22.jpg
1,id_1163744643600637952_2019-08-20.jpg
2,id_1122574040936452097_2019-04-28.jpg
3,id_1188805167958974465_2019-10-28.jpg
4,id_1108042949449969666_2019-03-19.jpg


In [2]:
## Write into .csv file
import json
from transformers import AutoModelForCausalLM, AutoTokenizer, AutoModel
from transformers import pipeline
from sklearn.metrics import pairwise


from PIL import Image
from ultralytics import YOLO
import cv2
from openpyxl import Workbook

wb = Workbook()
ws = wb.active

image_data = df["img_name"].tolist()

header = ["image_name", "object_detection_results", "aisak_description", "moondream_description", "gemini_description" "confidence"]
moondream_captions = {}
aisak_captions = {}
gemini_captions = {}

with open("gemini-captions.json", "r") as file:
    gemini_captions = json.load(file) ### read out captions

ws.append(header)

i = 0 ### only temporary to test for some images and not all (locally)
for img_name in image_data: 
    # Load a model
    model = YOLO("yolov8n.pt")  # load a pretrained model (recommended for training)
    animals = ['bear', 'penguin', 'polar bear'] 
   
    img_name = img_name.replace("\n","")
 
    image = Image.open(image_directory + img_name)
   
    results = model(image)  # predict on an image or image directory
    names = model.names
    #bounding boxes and confidence scores
 
    for result in results:
        detected_objects = []
        detections = result.pred[0] if hasattr(result, 'pred') else None  # Get the detections if available
        result.save()
        #result.show()
        # Display the detected objects
        ids = result.boxes.cls
        for id in ids:
            name = names[int(id)]
            if (name not in detected_objects):
                detected_objects.append(name)
        print(detected_objects)
        
        if ('person' in detected_objects):
            #TODO FACE RECOGNITION
            print('person')
        elif (any(animal in detected_objects for animal in animals)):
            #TODO
            print('animals')
        else:
            #TODO
            print('else')
            print()
            
            
    
    # long description model
    model_id = "vikhyatk/moondream2"
    revision = "2024-04-02"
    model = AutoModelForCausalLM.from_pretrained(
        model_id, trust_remote_code=True, revision=revision
    )
    tokenizer = AutoTokenizer.from_pretrained(model_id, revision=revision)
 
    enc_image = model.encode_image(image)
    moonDreamResult = model.answer_question(enc_image, "Describe this image.", tokenizer)
    moondream_captions[img_name] = moonDreamResult
    
    # short description model
    imgToText = pipeline("image-to-text", model="aisak-ai/aisak-visual")
    aisakResult = imgToText(image)
    aisak_captions[img_name] = aisakResult
    
    # gemini description
    geminiResult = gemini_captions[img_name]
    
    object_string = ''
    for index, object in enumerate(detected_objects):
        if (index < len(detected_objects)):
            object_string += object + ',' + ' '
        else:
            object_string += object

    i+=1
    if (i == 10):
        break
    
    if object_string.endswith(','):
        object_string = object_string[:-1]
    
    mapEntry = aisakResult[0]
    aisakResult = next(iter(mapEntry.values()))
    
    ### Comparison of the models ###
    sbert = AutoModel.from_pretrained("Voicelab/sbert-base-cased-pl")
    tokenizer = AutoTokenizer.from_pretrained("Voicelab/sbert-base-cased-pl")
    tokens = tokenizer([geminiResult, moonDreamResult, aisakResult], padding=True, truncation=True, return_tensors='pt')
    x = sbert(tokens["input_ids"], tokens["attention_mask"]).pooler_output
    similarity_matrix = pairwise.cosine_similarity(x.detach().numpy())
    # print("Similarities: " + str(similarity_matrix[0,1]) + " " + similarity_matrix[0,2] + " " + similarity_matrix[1,2])
    avg_sim = (similarity_matrix[0,1] + similarity_matrix[0,2] + similarity_matrix[1,2]) / 3
    print("Average: " , avg_sim)


    data_row = [img_name, object_string, aisakResult, moonDreamResult, geminiResult, avg_sim]
    
    
    ws.append(data_row)
        
    detected_objects.clear
    
    
wb.save("result.xlsx")



  from .autonotebook import tqdm as notebook_tqdm



0: 640x384 (no detections), 89.9ms
Speed: 2.0ms preprocess, 89.9ms inference, 1376.3ms postprocess per image at shape (1, 3, 640, 384)
[]
else



Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Average:  0.873517910639445

0: 416x640 (no detections), 442.8ms
Speed: 2.0ms preprocess, 442.8ms inference, 0.0ms postprocess per image at shape (1, 3, 416, 640)
[]
else



Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Average:  0.8720777829488119

0: 480x640 1 bird, 453.6ms
Speed: 2.5ms preprocess, 453.6ms inference, 1.0ms postprocess per image at shape (1, 3, 480, 640)
['bird']
else



Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Average:  0.903128465016683

0: 640x608 1 person, 548.4ms
Speed: 2.0ms preprocess, 548.4ms inference, 1.0ms postprocess per image at shape (1, 3, 640, 608)
['person']
person


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Average:  0.9182259241739908

0: 512x640 1 person, 574.8ms
Speed: 2.5ms preprocess, 574.8ms inference, 1.0ms postprocess per image at shape (1, 3, 512, 640)
['person']
person


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Average:  0.9083538850148519

0: 512x640 1 person, 455.5ms
Speed: 2.0ms preprocess, 455.5ms inference, 1.0ms postprocess per image at shape (1, 3, 512, 640)
['person']
person


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Average:  0.8902325630187988

0: 352x640 9 persons, 1 book, 569.0ms
Speed: 1.0ms preprocess, 569.0ms inference, 2.8ms postprocess per image at shape (1, 3, 352, 640)
['book', 'person']
person


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Average:  0.9210977554321289

0: 416x640 1 person, 1 couch, 494.6ms
Speed: 1.0ms preprocess, 494.6ms inference, 1.0ms postprocess per image at shape (1, 3, 416, 640)
['person', 'couch']
person


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Average:  0.8911571502685547

0: 448x640 (no detections), 497.8ms
Speed: 2.0ms preprocess, 497.8ms inference, 0.0ms postprocess per image at shape (1, 3, 448, 640)
[]
else



Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Average:  0.9032729466756185

0: 480x640 1 person, 520.0ms
Speed: 1.0ms preprocess, 520.0ms inference, 1.0ms postprocess per image at shape (1, 3, 480, 640)
['person']
person


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [12]:

## Testing Playground - Classifier not trained = fail ##

import torch
import torchvision.transforms as transforms
from PIL import Image
import torch.nn as nn
import requests

# Load the pretrained DINOv2 model
model = torch.hub.load('facebookresearch/dinov2', 'dinov2_vitg14')

# Set the model to evaluation mode
model.eval()

# Transformation to preprocess the input image
preprocess = transforms.Compose([
    transforms.Resize(256),
    transforms.CenterCrop(224),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
])

# Load an example image
image_test = "id_1088293078027374592_2019-01-24.jpg"
input_image = Image.open(image_directory + image_test)
input_tensor = preprocess(input_image)
input_batch = input_tensor.unsqueeze(0)  # Create a mini-batch as expected by the model

# Check if a GPU is available and if not, use a CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
input_batch = input_batch.to(device)
model = model.to(device)

with torch.no_grad():
    # Get the features from the model
    features = model(input_batch)

# Define a simple classifier on top of the features
class SimpleClassifier(nn.Module):
    def __init__(self, num_features, num_classes=1000):  # Assume 1000 classes for illustration
        super(SimpleClassifier, self).__init__()
        self.fc = nn.Linear(num_features, num_classes)

    def forward(self, x):
        return self.fc(x)

# Instantiate and use the classifier
num_classes = 1000  # Change this to the number of classes in your dataset
classifier = SimpleClassifier(features.size(1), num_classes)
classifier = classifier.to(device)

# Predict the object class
with torch.no_grad():
    class_logits = classifier(features)
    class_probabilities = torch.softmax(class_logits, dim=1)
    predicted_classes = torch.argmax(class_probabilities, dim=1)

# Download the ImageNet class labels
LABELS_URL = "https://raw.githubusercontent.com/anishathalye/imagenet-simple-labels/master/imagenet-simple-labels.json"
response = requests.get(LABELS_URL)
class_labels = response.json()

# Map predicted class indices to class names
predicted_class_names = [class_labels[idx] for idx in predicted_classes]

print(predicted_class_names)


Using cache found in C:\Users\Admin/.cache\torch\hub\facebookresearch_dinov2_main


['St. Bernard']
