In [65]:
import torch
import torch.nn as nn
from torch.optim import Adam
from torchvision.transforms import ToTensor
from transformers import VisionEncoderDecoderModel, BertTokenizer
import pytorch_lightning as pl
import os
import seaborn as sns
import matplotlib.pyplot as plt
import copy
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader
from pytorch_lightning.loggers import WandbLogger
from dataset import NWPU_Captions
from torchmetrics import BLEUScore,  MetricCollection, SacreBLEUScore, CHRFScore
from torchmetrics.multimodal import CLIPScore
import torchmetrics

In [64]:
metrics = MetricCollection([ BLEUScore(n_gram=3), SacreBLEUScore(), CHRFScore()
        ]).clone(prefix='train/')

In [51]:
metrics.update(['The planes are parked on the runway lot. to the water.<|endoftext|><|endoftext|><|endoftext|>'],['Three planes are parked on the open space next to the terminal .'])

  total_n_grams[n] = tensor(sum(n_grams_counts[n].values()))
  matching_n_grams[n] = tensor(


In [52]:
metrics.compute()

{'train/BLEUScore': tensor(0.),
 'train/SacreBLEUScore': tensor(0.),
 'train/CHRFScore': tensor(0.4733)}

In [3]:
os.environ['CUDA_LAUNCH_BLOCKING'] = "1"
torch.multiprocessing.set_sharing_strategy('file_system')

In [4]:
p = {
    'device': 'cpu',
    'sample_method': 'random',
    'bs': 10,
    'lr': 0.0001,
    'epochs': 1,
    'epochs_total': 10,
    'maxcycles': 5,
    'init_set_size': .05,
    'new_data_size': .05,
}

In [5]:
class ImageCaptioningSystem(pl.LightningModule):
    def __init__(self, batchsize, lr, nepochs):
        super().__init__()
        self.model = VisionEncoderDecoderModel.from_encoder_decoder_pretrained(
            "google/vit-base-patch16-224-in21k", "bert-base-uncased"
        )
        self.bert_tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

        self.model.config.decoder_start_token_id = self.bert_tokenizer.cls_token_id
        self.model.config.pad_token_id = self.bert_tokenizer.pad_token_id
        self.model.config.vocab_size = self.model.config.decoder.vocab_size
        
        self.batch_size = batchsize
        self.lr = lr
        self.epochs = nepochs

    def forward(self, x):
        return self.model(x)

    def training_step(self, batch, batch_idx):
        pixel_values, sentences, _ = batch

        pixel_values = pixel_values.squeeze()        
        tokens = self.bert_tokenizer(sentences, return_tensors = 'pt', padding='longest').input_ids.to(self.device)

        outputs = self.model(pixel_values, labels=tokens)
        loss = outputs.loss
        return loss
    
    def validation_step(self, batch, batch_idx):
        pixel_values, sentences, _ = batch

        pixel_values = pixel_values.squeeze()        
        tokens = self.bert_tokenizer(sentences, return_tensors = 'pt', padding='longest').input_ids.to(self.device)

        outputs = self.forward(pixel_values, labels=tokens)
        loss = outputs.loss
        return loss

    def configure_optimizers(self):
        return torch.optim.Adam(self.parameters(), lr=self.lr)

In [6]:
# Sampling Methods

def random_selection(dataset, new_items):
    new_unlabeled_set, new_train_set = train_test_split(dataset, test_size=new_items, shuffle=True)
    return new_train_set, new_unlabeled_set

def least_confidence(model, dataloader, new_items):
    confidences = []
    softmax = nn.Softmax()
    for batch in iter(dataloader):
        imgs, _, ids = batch
        with torch.no_grad():
            model.eval()
            logits = model.forward(imgs.to(p['device']))
        logits_softmax = softmax(logits)
        predicted, _ = torch.max(logits_softmax, dim=1)
        for i in range(len(predicted) - 1):
            confidences.append((predicted[i], ids[i]))  
    sorted_confidences = sorted(confidences, key=lambda x: x[0])
    identifiers = [i[1].item() for i in sorted_confidences]
    identifiers, _ = train_test_split(identifiers, test_size=1-new_items, shuffle=False)
    return identifiers

def label_new_data(unlabeled_set, ids):
    new_data = [elem for elem in unlabeled_set if elem[2] in ids]
    # Remove datapoints from unlabeled set
    new_unlabeled_set = [elem for elem in unlabeled_set if elem[2] not in ids]
    return new_train, new_unlabeled_set

In [7]:
def train_select_loop(ic_model, train_set, unlabeled_set, val_loader, sample_method, maxcycles):
    print(sample_method)
    model = copy.deepcopy(ic_model)
    logger = None
    cycle = 0
    while cycle < maxcycles:
        print('-' * 110 + f"\nRun {cycle}/{maxcycles-1} with {len(train_set)} training images, {len(val_set)} validation images, and {len(unlabeled_set)} unlabeled images.\n" + '-' * 110)
        trainer = pl.Trainer(fast_dev_run=True, max_epochs=p['epochs'], num_sanity_val_steps=0, enable_model_summary=False,
                             enable_progress_bar=False, accelerator=p['device'], logger=logger)
        # Dataloaders for unlabeled and labeled training sets get re-initialized at every cycle
        train_loader = DataLoader(train_set, batch_size=20, num_workers=os.cpu_count(), drop_last=False)
        inference_loader = DataLoader(unlabeled_set, batch_size=100, num_workers=os.cpu_count(), drop_last=False)

        trainer.fit(model, train_loader, val_loader)
        
        match sample_method:
            case 'uncertainty':
                identifiers = least_confidence(model, inference_loader, p['new_data_size'])
                new_train, unlabeled_set = label_new_data(unlabeled_set, identifiers)
            case 'random':
                new_train, unlabeled_set = random_selection(unlabeled_set, p['new_data_size'])
            case 'diversity':
                ...
        
        train_set = train_set + new_train
        
        cycle += 1

In [68]:
%cd "c:\Users\Lorenz\EigeneDokumente\Universität\Semester 11\Machine Learning for Remote Sensing\image captioning\activelearning_ic"

c:\Users\Lorenz\EigeneDokumente\Universität\Semester 11\Machine Learning for Remote Sensing\image captioning\activelearning_ic


In [70]:

val_set = NWPU_Captions(root='NWPU-Captions/NWPU_images', annotations_file='NWPU-Captions/dataset_nwpu.json', split='val', transform=ToTensor())
# val_set = NWPU_Captions(root='NWPU_images', annotations_file='dataset_nwpu.json', split='val', transform=ToTensor())
# test_set = NWPU_Captions(root='NWPU_images', annotations_file='dataset_nwpu.json', split='test', transform=ToTensor())
# print(len(train_set), len(val_set), len(test_set))

In [71]:
val_loader = DataLoader(val_set, batch_size=20, num_workers=os.cpu_count(), drop_last=False, shuffle=False)

In [72]:
it = iter(val_loader)

In [73]:
batch = next(it)

In [74]:
images, sentences, _ = batch

In [77]:
from transformers import ViTFeatureExtractor

In [79]:
image_processor = ViTFeatureExtractor.from_pretrained("nlpconnect/vit-gpt2-image-captioning")

In [112]:
image_processor([i for i in images]).pixel_values

[array([[[-0.09019601, -0.09019601, -0.09803915, ..., -0.19215679,
          -0.47450978, -0.5921568 ],
         [-0.09803915, -0.09803915, -0.09803915, ..., -0.09019601,
          -0.29411763, -0.41176468],
         [-0.11372542, -0.10588229, -0.09803915, ..., -0.13725483,
           0.00392163, -0.2156862 ],
         ...,
         [-0.49019605, -0.2156862 , -0.01176465, ..., -0.11372542,
          -0.11372542, -0.11372542],
         [-0.07450974,  0.05098045,  0.09019613, ..., -0.12156856,
          -0.11372542, -0.10588229],
         [-0.00392151,  0.06666672,  0.05098045, ..., -0.11372542,
          -0.09803915, -0.09019601]],
 
        [[-0.09019601, -0.09019601, -0.09803915, ..., -0.19999993,
          -0.4823529 , -0.59999996],
         [-0.09803915, -0.09803915, -0.09803915, ..., -0.09019601,
          -0.30196077, -0.41960782],
         [-0.11372542, -0.10588229, -0.09803915, ..., -0.13725483,
           0.00392163, -0.2156862 ],
         ...,
         [-0.5921568 , -0.3098039

In [86]:
images[]

torch.Size([3, 256, 256])

In [21]:
train_set_unlabeled, train_set_labeled = train_test_split(train_set, test_size=p['init_set_size'], shuffle=True)

In [35]:
val_loader = DataLoader(val_set, batch_size=20, num_workers=os.cpu_count(), drop_last=False, shuffle=False)
test_loader = DataLoader(test_set, batch_size=20, num_workers=os.cpu_count(), drop_last=False, shuffle=False)

In [41]:
ic_model = ImageCaptioningSystem(p['bs'],p['lr'], p['epochs'])

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertLMHeadModel: ['cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertLMHeadModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertLMHeadModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertLMHeadModel were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['bert.encoder.layer.5.crossattention.output.LayerNorm.weight', 'bert.encoder.layer.5.crossattention.self.key.bias', 'bert.encoder.layer.1.crossattention.output.LayerNorm.bias', 'bert.encoder.layer.6.crossattention.self.key.weight', 'bert.e

In [None]:
if p['sample_method'] is not None:
    # train with active learning rule
    train_select_loop(ic_model, train_set_labeled, train_set_unlabeled, val_loader, p['sample_method'], p['maxcycles'])
else:
    # train on whole training set
    print('-' * 80 + f"\nTraining in the whole dataset with {len(train_set)} training images and {len(val_set)} validation images\n" + '-' * 80)
    train_loader = DataLoader(train_set, batch_size=20, num_workers=os.cpu_count(), drop_last=False, shuffle=False)
    model = copy.deepcopy(ic_model)
    trainer = pl.Trainer(fast_dev_run=True, max_epochs=p['epochs_total'], num_sanity_val_steps=0, 
                         enable_progress_bar=False, accelerator=p['device'], logger=logger)
    trainer.fit(model, train_loader, val_loader)

random


GPU available: False, used: False
TPU available: None, using: 0 TPU cores
Running in fast_dev_run mode: will run a full train, val and test loop using 1 batch(es).

  | Name  | Type                      | Params
----------------------------------------------------
0 | model | VisionEncoderDecoderModel | 224 M 
----------------------------------------------------
224 M     Trainable params
0         Non-trainable params
224 M     Total params


--------------------------------------------------------------------------------------------------------------
Run 0/4 with 280 training images, 5600 validation images, and 5320 unlabeled images.
--------------------------------------------------------------------------------------------------------------


Training: 0it [00:00, ?it/s]