CV

In [3]:
from CV_train import EfficientNet, CVDataset
from torch.utils import data

In [6]:
en_b0 = EfficientNet()
animals_10_dataset = CVDataset("data/raw-img", 0.2, 0.1, en_b0.connector)
batch_size = 64

train_dataloader = data.DataLoader(
    data.Subset(animals_10_dataset, animals_10_dataset.train_index),
    batch_size=batch_size,
    shuffle=True,
    drop_last=True)
eval_dataloader = data.DataLoader(
    data.Subset(animals_10_dataset, animals_10_dataset.eval_index),
    batch_size=batch_size,
    shuffle=True,
    drop_last=True)
test_dataloader = data.DataLoader(
    data.Subset(animals_10_dataset, animals_10_dataset.test_index),
    batch_size=batch_size,
    shuffle=True,
    drop_last=True)

file, folder: e83db7072afd013ed1584d05fb1d4e9fe777ead218ac104497f5c978a4eebdbd_640.jpg, elephant


In [7]:
len(train_dataloader)

21

In [8]:
new_en_b0 = EfficientNet(new_head = True)
new_en_b0.train(train_dataloader, eval_dataloader, train_steps = 600)
new_en_b0.save("models/new_en_b0_first_run.pt")

In [9]:
mat, rep = new_en_b0.eval_statistics(test_dataloader)

In [10]:
print(mat)

[[18  0  0  0  0  0  0  0  0  0]
 [ 0 22  0  0  0  0  0  0  0  1]
 [ 0  0 27  0  0  0  0  0  0  0]
 [ 0  0  0 17  1  0  0  0  0  0]
 [ 0  0  0  0 18  0  0  0  0  0]
 [ 0  0  0  0  0 15  0  0  0  0]
 [ 0  0  0  0  0  0 21  0  0  0]
 [ 0  0  0  0  0  0  0 21  0  0]
 [ 0  0  0  0  0  0  0  0 12  0]
 [ 0  0  1  0  0  0  0  0  0 18]]


In [11]:
print(rep)

              precision    recall  f1-score   support

   butterfly       1.00      1.00      1.00        18
         cat       1.00      0.96      0.98        23
     chicken       0.96      1.00      0.98        27
         cow       1.00      0.94      0.97        18
         dog       0.95      1.00      0.97        18
    elephant       1.00      1.00      1.00        15
       horse       1.00      1.00      1.00        21
       sheep       1.00      1.00      1.00        21
      spider       1.00      1.00      1.00        12
    squirrel       0.95      0.95      0.95        19

    accuracy                           0.98       192
   macro avg       0.99      0.98      0.99       192
weighted avg       0.98      0.98      0.98       192



In [12]:
new_new_en_b0 = EfficientNet(new_head = True)
new_new_en_b0.load("models/new_en_b0_test.pt")

NER

ner
nli
clip-ner
clip-bow

Ner based approach

In [4]:
#I used LLM for generating presets and with prompt :" prompt here"
def generate_uni_test_template(animal_name):
    test_presets = [
        f"Look! A majestic {animal_name} is visible", #base
        f"I think there might be a {animal_name} in this photo",#base
        f"Is this a picture of a {animal_name}",#base
        f"The picture features a big, brown {animal_name}", #base
        f"I'm looking for a {animal_name}. Is it here?",#base
        f"It's a {animal_name}! I'm sure of it", #base
        f"An {animal_name} can be seen on the left side", #base
        f"There is a {animal_name} grazing in the field" #base*(if we want to answer on all possible questions including the env it's only possible with zero-shot image classifiers like CLIP)
        f"This animal is definitely a {animal_name}" #base
        f"I don't see a {animal_name} anywhere", #negative example that breaks our model because of NER structure
        f"A cute little {animal_name} is on the branch" #base
        f"The photo contains an animal; I believe it's a {animal_name}", #base
        f"This is a great picture of a human and an {animal_name}", #multiple ents
        f"Can you confirm that this isn't a {animal_name}?", #negative example that breaks our model because of NER structure
        f"This is a {'-'.join(animal_name)}", #misstypo
        f"{animal_name}." #base
    ]
    return test_presets

def denial_uni_test(animal_name):
    test_presets = [
        f"I don't see a {animal_name} anywhere", #negative example that breaks our model because of NER structure
        f"Can you confirm that this isn't a {animal_name}?", #negative example that breaks our model because of NER structure
    ]
    return test_presets

def generate_bi_test_template(label, decoy):
    test_presets = [
        f"There appears to be a {label} or a {decoy}", #multiple ents
        f"I saw a {label} on this picture and a small {decoy} earlier", #multiple ents
    ]
    return test_presets

In [5]:
negative_labels = ["apple", "plane", "axe", "student", "kurtosis"]

positive_labels = list(CVDataset.label_to_index.keys())
positive_labels.remove("NoF")
positive_labels

['butterfly',
 'cat',
 'chicken',
 'cow',
 'dog',
 'elephant',
 'horse',
 'sheep',
 'spider',
 'squirrel']

In [6]:
import numpy as np
import itertools

animals_combination = list(itertools.product(positive_labels, positive_labels))
entity_combination = list(itertools.product(positive_labels, negative_labels))
eval_corpus_features = []
eval_corpus_targets = []

for names in animals_combination:
    label = names[0]
    decoy = names[1]
    if label == decoy:
        uni_examples = generate_uni_test_template(label)
        eval_corpus_features.extend(uni_examples)
        eval_corpus_targets.extend(np.ones(len(uni_examples)) * CVDataset.label_to_index[label])

        uni_examples = denial_uni_test(label)
        eval_corpus_features.extend(uni_examples)
        eval_corpus_targets.extend(np.ones(len(uni_examples)) * CVDataset.label_to_index['NoF'])
    else:
        bi_examples = generate_bi_test_template(label, decoy)
        eval_corpus_features.extend(bi_examples)
        eval_corpus_targets.extend(np.ones(len(bi_examples)) * CVDataset.label_to_index[label])

for names in entity_combination:
    label = names[0]
    decoy = names[1]

    bi_examples = generate_bi_test_template(label, decoy)
    eval_corpus_features.extend(bi_examples)
    eval_corpus_targets.extend(np.ones(len(bi_examples)) * CVDataset.label_to_index[label])

In [7]:
ner_classes = positive_labels
nli_classes = []
for animal in ner_classes:
    nli_classes.extend([f"there is a {animal}", f"there is no {animal}"])

NER

In [6]:
from gliner import GLiNER

model = GLiNER.from_pretrained("gliner-community/gliner_large-v2.5", load_tokenizer=True)

import torch
model.to(torch.device("cuda:0"))
text = eval_corpus_features

predictions = []
labels = ner_classes
for line in text:
    prediction = model.predict_entities(line, labels, threshold = 0.75)
    predictions.append(prediction)

predicted_labels = []
for prediction in predictions:
    try:
        label = prediction[0]["label"]
        predicted_labels.append(label)
    except:
        predicted_labels.append("NoF")

  from .autonotebook import tqdm as notebook_tqdm
Fetching 10 files: 100%|██████████| 10/10 [00:00<?, ?it/s]
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


In [8]:
print(model.data_processor)

<gliner.data_processing.processor.SpanProcessor object at 0x0000024C388C5190>


In [None]:
model.model.token_rep_layer.encode_text() #encoder from GLiNER

In [35]:
predicted_ids = [CVDataset.label_to_index[label] for label in predicted_labels]

In [36]:
from sklearn.metrics import classification_report, confusion_matrix

conf_matrix = confusion_matrix(eval_corpus_targets, predicted_ids)
conf_report = classification_report(eval_corpus_targets, predicted_ids, target_names=CVDataset.label_to_index.keys())

In [38]:
conf_matrix

array([[38,  0,  0,  0,  0,  0,  0,  0,  0,  0,  3],
       [ 0, 38,  0,  0,  0,  0,  0,  0,  0,  0,  3],
       [ 0,  0, 38,  0,  0,  0,  0,  0,  0,  0,  3],
       [ 0,  0,  0, 39,  0,  0,  0,  0,  0,  0,  2],
       [ 0,  0,  0,  0, 38,  0,  0,  0,  0,  0,  3],
       [ 0,  0,  0,  0,  0, 38,  0,  0,  0,  0,  3],
       [ 0,  0,  0,  0,  0,  0, 38,  0,  0,  0,  3],
       [ 0,  0,  0,  0,  0,  0,  0, 38,  0,  0,  3],
       [ 0,  0,  0,  0,  0,  0,  0,  0, 39,  0,  2],
       [ 0,  0,  0,  0,  0,  0,  0,  0,  0, 38,  3],
       [ 2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  0]])

In [39]:
print(conf_report)

              precision    recall  f1-score   support

   butterfly       0.95      0.93      0.94        41
         cat       0.95      0.93      0.94        41
     chicken       0.95      0.93      0.94        41
         cow       0.95      0.95      0.95        41
         dog       0.95      0.93      0.94        41
    elephant       0.95      0.93      0.94        41
       horse       0.95      0.93      0.94        41
       sheep       0.95      0.93      0.94        41
      spider       0.95      0.95      0.95        41
    squirrel       0.95      0.93      0.94        41
         NoF       0.00      0.00      0.00        20

    accuracy                           0.89       430
   macro avg       0.86      0.85      0.86       430
weighted avg       0.91      0.89      0.90       430



NLI based approach

In [50]:
from transformers import pipeline
classifier = pipeline("zero-shot-classification",
                      model="facebook/bart-large-mnli")

text = eval_corpus_features

predictions = []
labels = nli_classes
line = text[2]

for line in text:
    prediction = classifier(line, labels)
    prediction = prediction["labels"][0]
    predictions.append(prediction)

Device set to use cuda:0


In [55]:
predicted_nli_ids = [nli_classes.index(label) for label in predictions]

In [65]:
corrected_predictions = []
for target, predicted in zip(eval_corpus_targets, predicted_nli_ids):
    if target == predicted/2:
        corrected_predictions.append(target)
    elif target == (predicted-1)/2:
        corrected_predictions.append(10)
    elif predicted%2 == 0:
        corrected_predictions.append(predicted/2)
    else:
        corrected_predictions.append(target)

In [66]:
from sklearn.metrics import classification_report, confusion_matrix

conf_matrix = confusion_matrix(eval_corpus_targets, corrected_predictions)
conf_report = classification_report(eval_corpus_targets, corrected_predictions, target_names=CVDataset.label_to_index.keys())
conf_matrix

array([[40,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1],
       [ 0, 40,  0,  0,  0,  0,  0,  0,  0,  0,  1],
       [ 0,  0, 40,  0,  0,  0,  0,  0,  0,  0,  1],
       [ 0,  0,  0, 40,  0,  0,  0,  0,  0,  0,  1],
       [ 1,  0,  0,  0, 38,  0,  0,  0,  1,  0,  1],
       [ 0,  0,  0,  0,  0, 40,  0,  0,  0,  0,  1],
       [ 0,  0,  0,  0,  0,  0, 40,  0,  0,  0,  1],
       [ 0,  0,  0,  0,  0,  0,  0, 39,  1,  0,  1],
       [ 0,  0,  0,  0,  0,  0,  0,  0, 40,  0,  1],
       [ 0,  0,  0,  0,  0,  0,  0,  0,  0, 40,  1],
       [ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0, 20]])

In [67]:
print(conf_report)

              precision    recall  f1-score   support

   butterfly       0.98      0.98      0.98        41
         cat       1.00      0.98      0.99        41
     chicken       1.00      0.98      0.99        41
         cow       1.00      0.98      0.99        41
         dog       1.00      0.93      0.96        41
    elephant       1.00      0.98      0.99        41
       horse       1.00      0.98      0.99        41
       sheep       1.00      0.95      0.97        41
      spider       0.95      0.98      0.96        41
    squirrel       1.00      0.98      0.99        41
         NoF       0.67      1.00      0.80        20

    accuracy                           0.97       430
   macro avg       0.96      0.97      0.96       430
weighted avg       0.98      0.97      0.97       430



CLIP (2nd head for ner and CV) based approach

In [180]:
from gliner import GLiNER
from transformers import AutoTokenizer
gliner = GLiNER.from_pretrained("gliner-community/gliner_large-v2.5", load_tokenizer=True)

tokenizer = AutoTokenizer.from_pretrained(gliner.config.model_name) #tokenizer for gliner
encoder = gliner.model.token_rep_layer#.encode_text() #encoder from GLiNER



Fetching 10 files: 100%|██████████| 10/10 [00:00<?, ?it/s]


In [181]:
eval_corpus_features[0]

'Look! A majestic butterfly is visible'

In [185]:
tokenized_input = tokenizer(eval_corpus_features[0])
        #     is_split_into_words=True,
        #     return_tensors="pt",
        #     truncation=True,
        #     padding="longest",
        # )

In [186]:
tokenized_input = tokenized_input


In [187]:
tokenized_input

{'input_ids': [1, 3413, 300, 336, 17889, 13843, 269, 3979, 2], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [39]:
tokenized_input.input_ids

[1, 3413, 300, 336, 17889, 13843, 269, 3979, 2]

In [194]:
from gliner.data_processing.collator import DataCollator

def run(
    self, texts, labels = [''], flat_ner=True, threshold=0.5, multi_label=False, batch_size=1,
    gen_constraints = None, num_gen_sequences = 1, **gen_kwargs
):
    """
    Predict entities for a batch of texts.

    Args:
        texts (List[str]): A list of input texts to predict entities for.
        labels (List[str]): A list of labels to predict.
        flat_ner (bool, optional): Whether to use flat NER. Defaults to True.
        threshold (float, optional): Confidence threshold for predictions. Defaults to 0.5.
        multi_label (bool, optional): Whether to allow multiple labels per token. Defaults to False.

    Returns:
        The list of lists with predicted entities.
    """
    self.eval()
    tokenizer = AutoTokenizer.from_pretrained(gliner.config.model_name)
    tokenized_input = tokenizer(texts[0])
    batch = tokenized_input
    print(batch)
    outputs = []
        #model_output = self.model(**batch, threshold=threshold, output_hidden_states = True)


                        #gliner(->).SpanModel(-).Encoder(<-).Transformer(<-).DebertaV2Model(<-).DebertaEmbeddings.forward()
        #meanwhile what I want is gliner(->).SpanModel(-).Encoder(<-).Transformer(<-).DebertaV2Model(<-).forward() -> self.linear(self.pooler())
    model_output = self.model.token_rep_layer.bert_layer.model(
        input_ids = torch.tensor([batch["input_ids"]]),
        attention_mask = torch.tensor([batch["attention_mask"]]),
        token_type_ids = torch.tensor([batch["token_type_ids"]]),
        #threshold=threshold,
    ), #output_hidden_states = True,
                                        #return_dict = True)
    outputs.append(model_output)
    return outputs

In [195]:
outputs = run(gliner, [eval_corpus_features[0]])

{'input_ids': [1, 3413, 300, 336, 17889, 13843, 269, 3979, 2], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1]}


In [196]:
outputs[0][0].last_hidden_state[:,0].shape

torch.Size([1, 1024])

In [115]:
[eval_corpus_features[0]]

['Look! A majestic butterfly is visible']

In [116]:
outputs[0]["logits"].shape

torch.Size([1, 7, 12, 1])

In [117]:
outputs[0]["prompts_embedding"].shape

torch.Size([1, 1, 768])

In [118]:
outputs[0]["words_embedding"].shape

torch.Size([1, 7, 768])