<a href="https://colab.research.google.com/github/pbamotra/15513/blob/master/sota_classifier.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# %pip (unlike !pip) installs into the environment of the running kernel.
# The cells below were last executed against transformers 4.9.0.
%pip install "transformers>=4.9.0"

Collecting transformers
  Downloading transformers-4.9.0-py3-none-any.whl (2.6 MB)
[K     |████████████████████████████████| 2.6 MB 7.8 MB/s 
Collecting huggingface-hub==0.0.12
  Downloading huggingface_hub-0.0.12-py3-none-any.whl (37 kB)
Collecting sacremoses
  Downloading sacremoses-0.0.45-py3-none-any.whl (895 kB)
[K     |████████████████████████████████| 895 kB 61.1 MB/s 
[?25hCollecting pyyaml>=5.1
  Downloading PyYAML-5.4.1-cp37-cp37m-manylinux1_x86_64.whl (636 kB)
[K     |████████████████████████████████| 636 kB 60.3 MB/s 
[?25hCollecting tokenizers<0.11,>=0.10.1
  Downloading tokenizers-0.10.3-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (3.3 MB)
[K     |████████████████████████████████| 3.3 MB 61.0 MB/s 
Installing collected packages: tokenizers, sacremoses, pyyaml, huggingface-hub, transformers
  Attempting uninstall: pyyaml
    Found existing installation: PyYAML 3.13
    Uninstalling PyYAML-3.13:
      Successfully u

In [None]:
# Skip the ~4.7 GB download and extraction when food-101/ already exists;
# -c resumes a partially downloaded archive instead of starting over.
!test -d food-101 || (wget -c https://data.vision.ee.ethz.ch/cvl/food-101.tar.gz && tar xzf food-101.tar.gz)

--2021-07-26 04:26:00--  http://data.vision.ee.ethz.ch/cvl/food-101.tar.gz
Resolving data.vision.ee.ethz.ch (data.vision.ee.ethz.ch)... 129.132.52.178, 2001:67c:10ec:36c2::178
Connecting to data.vision.ee.ethz.ch (data.vision.ee.ethz.ch)|129.132.52.178|:80... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://data.vision.ee.ethz.ch/cvl/food-101.tar.gz [following]
--2021-07-26 04:26:01--  https://data.vision.ee.ethz.ch/cvl/food-101.tar.gz
Connecting to data.vision.ee.ethz.ch (data.vision.ee.ethz.ch)|129.132.52.178|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 4996278331 (4.7G) [application/x-gzip]
Saving to: ‘food-101.tar.gz’


2021-07-26 04:30:18 (18.6 MB/s) - ‘food-101.tar.gz’ saved [4996278331/4996278331]



In [None]:
# List every .jpg under food-101/ (literal extension match) in sorted order,
# so that dataset indices are the same on every run.
!find food-101 -type f -name '*.jpg' | sort > food101_data.txt

In [None]:
# Randomly pick 3000 paths out of the first 5000 lines of the full listing
# and store them in small_data.txt.
!head -n 5000 food101_data.txt | shuf -n 3000 > 'small_data.txt'

In [None]:
from transformers import AutoTokenizer

# Name of the pretrained checkpoint whose tokenizer is loaded below.
KOELECTRA_CHECKPOINT = "monologg/koelectra-base-v3-discriminator"

kotokenizer = AutoTokenizer.from_pretrained(KOELECTRA_CHECKPOINT)

In [None]:
from torchvision import transforms

# Per-channel mean / std handed to the Normalize step.
NORMALIZE_MEAN = [0.485, 0.456, 0.406]
NORMALIZE_STD = [0.229, 0.224, 0.225]

# Preprocessing applied to every image: resize, centre crop, convert to a
# tensor, then normalize each channel.
image_tranforms = transforms.Compose(
    [
        transforms.Resize(256),
        transforms.CenterCrop(224),
        transforms.ToTensor(),
        transforms.Normalize(mean=NORMALIZE_MEAN, std=NORMALIZE_STD),
    ]
)

In [None]:
import os
import numpy as np
from PIL import Image

import torch
import torchvision
import torchvision.datasets
from torchvision import transforms
from torch.utils.data import Dataset

from tqdm import tqdm
from sklearn.preprocessing import LabelEncoder


class LT_Dataset(Dataset):
    """Image dataset built from a text file that lists one image path per line.

    The class label of an image is the third '/'-separated component of its
    listed path (e.g. ``food-101/images/<label>/<file>.jpg``).  Labels are
    first numbered alphabetically and then re-numbered by descending sample
    count, so target 0 is the most frequent class.

    Attributes set by ``__init__``:
        img_path: list of ``os.path.join(root, listed_path)`` strings.
        targets: list of count-ranked integer class ids, aligned with img_path.
        label_decoder / label_encoder: alphabetical id <-> class name.
        count_based_label_encoder / count_based_label_decoder:
            alphabetical id <-> count-ranked id.
        cls_count_list: samples per class, indexed by count-ranked id
            (therefore in descending order).
        category_partitions: four contiguous half-open ``(start, end)`` ranges
            of count-ranked ids, split at ``partition_thresholds``.
    """

    # Placeholder; replaced per instance by the number of distinct labels found.
    num_classes = 1000

    # Minimum per-class sample counts that delimit the first three partitions;
    # the fourth partition holds every remaining class.
    partition_thresholds = (1000, 100, 20)

    def __init__(self, root, txt, transform=None):
        """Read the listing and build label mappings and class statistics.

        Args:
            root: directory prepended to every path read from ``txt``.
            txt: path of the listing file (first whitespace-separated token of
                each non-blank line is used as the image path).
            transform: optional callable applied to each loaded PIL image.
        """
        self.img_path = []
        self.transform = transform

        labels = []
        with open(txt) as f:
            for line in f:
                fields = line.split()
                if not fields:
                    # Tolerate blank lines instead of failing with IndexError.
                    continue
                filepath = fields[0]
                self.img_path.append(os.path.join(root, filepath))
                labels.append(filepath.split('/')[2])

        # One pass yields the sorted class names, each sample's alphabetical
        # id and the per-class sample counts.
        class_names, alpha_ids, counts = np.unique(
            labels, return_inverse=True, return_counts=True)

        # Original assigned category -> 0, ... K
        self.label_decoder = {i: str(name) for i, name in enumerate(class_names)}
        self.label_encoder = {v: k for k, v in self.label_decoder.items()}
        self.num_classes = len(self.label_encoder)

        # order[j] is the alphabetical id of the class with the j-th largest
        # count; rank is the inverse permutation (alphabetical id -> rank).
        order = np.argsort(-counts)
        rank = np.argsort(order)

        # New assigned category based on descending order of sample count
        self.count_based_label_encoder = {i: int(r) for i, r in enumerate(rank)}
        self.count_based_label_decoder = {
            v: k for k, v in self.count_based_label_encoder.items()}

        self.targets = rank[alpha_ids].tolist()
        sorted_counts = counts[order]
        self.cls_count_list = sorted_counts.tolist()

        # Counts are in descending order, so the number of classes reaching a
        # threshold is also the index of the first class that falls below it.
        head_min, torso_min, many_shot_min = self.partition_thresholds
        head_index = int(np.count_nonzero(sorted_counts >= head_min))
        torso_index = int(np.count_nonzero(sorted_counts >= torso_min))
        many_shot_index = int(np.count_nonzero(sorted_counts >= many_shot_min))

        # Contiguous half-open ranges: every class belongs to exactly one.
        self.category_partitions = [(0, head_index),
                                    (head_index, torso_index),
                                    (torso_index, many_shot_index),
                                    (many_shot_index, self.num_classes)]

    def __len__(self):
        """Return the number of samples in the listing."""
        return len(self.targets)

    def __getitem__(self, index):
        """Return ``(image, text, target)`` for the sample at ``index``.

        The image is opened as RGB and passed through ``self.transform`` when
        one was given.  ``text`` is the same fixed Korean sentence for every
        sample; ``target`` is the count-ranked class id.
        """
        path = self.img_path[index]
        target = self.targets[index]

        with open(path, 'rb') as f:
            sample = Image.open(f).convert('RGB')
        if self.transform is not None:
            sample = self.transform(sample)

        text = '모두의 말뭉치를 추가적으로 사용하여 KoELECTRA-v3를 제작하였습니다. 제작하였습니다'
        return sample, text, target 

In [None]:
def pad_input(input):
    """Zero-pad a batch of tensors to a common shape and return it as int64.

    ``input`` is a sequence of tensors that are either all 1-D (padded to the
    longest length) or all 2-D (padded to the largest row and column counts).
    The first element decides which case applies.
    """
    batch_size = len(input)

    if input[0].dim() == 1:
        lengths = [len(seq) for seq in input]
        padded = torch.zeros(batch_size, max(lengths), dtype=torch.long)
        for row, (seq, length) in enumerate(zip(input, lengths)):
            padded[row, :length] = seq[:length]
        return padded

    row_counts = [mat.size(0) for mat in input]
    col_counts = [mat.size(1) for mat in input]
    padded = torch.zeros(batch_size, max(row_counts), max(col_counts), dtype=torch.long)
    for idx, mat in enumerate(input):
        padded[idx, :row_counts[idx], :col_counts[idx]] = mat
    return padded


def collate_fn(data, tokenizer, max_length=100):
    """Collate ``(image, text, target)`` samples into batched tensors.

    Args:
        data: iterable of ``(image_tensor, text, target)`` triples; all image
            tensors must share one shape so they can be stacked.
        tokenizer: object exposing ``batch_encode_plus`` (e.g. a Hugging Face
            tokenizer).
        max_length: texts are truncated to at most this many tokens.

    Returns:
        ``(image, input_ids, attention_mask, target)`` where ``image`` is the
        stacked image batch, ``input_ids`` / ``attention_mask`` are int64
        tensors padded to the longest text in the batch, and ``target`` is an
        int64 tensor of class ids.
    """
    image, texts, target = zip(*data)

    image = torch.stack(image, 0)

    # padding=True pads every text to the longest one in the batch.  Without
    # it the tokenizer cannot build a rectangular "pt" tensor as soon as two
    # texts differ in token length.
    encoded_input = tokenizer.batch_encode_plus(list(texts),
                                                add_special_tokens=True,
                                                max_length=max_length,
                                                padding=True,
                                                truncation=True,
                                                return_tensors="pt")

    input_ids = encoded_input['input_ids'].long()
    attention_mask = encoded_input['attention_mask'].long()

    target = torch.tensor(list(target), dtype=torch.int64)
    return image, input_ids, attention_mask, target

In [None]:
from functools import partial

In [None]:
# Bind the tokenizer so the result has the single-argument signature expected
# of a DataLoader collate function.
collate_fn_with_tokenizer = partial(collate_fn, tokenizer=kotokenizer)

In [None]:
# Build the dataset from the full image listing; listed paths are resolved
# relative to the current directory.
dataset = LT_Dataset(root='.', txt='food101_data.txt', transform=image_tranforms)

In [None]:
# Spot-check one sample: (transformed image tensor, text, target).
dataset[1900]

(tensor([[[-1.6042, -1.6555, -1.6727,  ..., -0.9705, -0.9363, -0.9363],
          [-1.7412, -1.7069, -1.7069,  ..., -0.9534, -0.9363, -0.9363],
          [-1.7754, -1.7412, -1.7240,  ..., -0.9020, -0.9020, -0.9020],
          ...,
          [-1.0219, -1.0562, -1.0904,  ...,  0.4166,  0.4679,  0.4337],
          [-1.0219, -1.0562, -1.0390,  ...,  0.4166,  0.4679,  0.4679],
          [-0.8507, -1.0048, -1.0562,  ...,  0.4166,  0.4337,  0.4508]],
 
         [[-1.5455, -1.6155, -1.6331,  ..., -0.8803, -0.8452, -0.8452],
          [-1.6856, -1.6681, -1.6681,  ..., -0.8627, -0.8452, -0.8452],
          [-1.7381, -1.7031, -1.6856,  ..., -0.8102, -0.8102, -0.8102],
          ...,
          [-0.9678, -1.0203, -1.0553,  ...,  0.5378,  0.5903,  0.5553],
          [-0.9503, -1.0203, -1.0028,  ...,  0.5378,  0.5903,  0.5903],
          [-0.7577, -0.9503, -1.0203,  ...,  0.5378,  0.5553,  0.5728]],
 
         [[-1.5256, -1.5953, -1.6127,  ..., -0.6890, -0.6541, -0.6541],
          [-1.6302, -1.6127,

In [None]:
# Sanity check: count-ranked id 67 -> alphabetical id -> class name.
dataset.label_decoder[dataset.count_based_label_decoder[67]]

'beignets'

In [None]:
# Inverse of the check above: class name -> alphabetical id -> count-ranked id.
dataset.count_based_label_encoder[dataset.label_encoder['beignets']]

67

In [None]:
import multiprocessing

In [None]:
# Shuffled batches of 64 samples, loaded with one worker per CPU core and
# collated with the tokenizer-bound collate function.
dl = torch.utils.data.DataLoader(
            dataset,
            batch_size=64, 
            shuffle=True,
            num_workers=multiprocessing.cpu_count(), 
            pin_memory=True,
            collate_fn=collate_fn_with_tokenizer)

In [None]:
class PrefetchLoader:
    """Wrap a loader and copy each batch to the GPU on a separate CUDA stream.

    Each batch from the wrapped loader must be indexable as
    ``(image, input_ids, attention_mask, target)``.  While the consumer works
    on one batch, the copy of the following batch has already been queued on
    ``self.stream``.

    The tensors currently being staged are exposed as ``next_image``,
    ``next_input_ids``, ``next_attention_masks`` and ``next_target``; they are
    reset to ``None`` whenever iteration ends, including when the consumer
    stops early.
    """

    def __init__(self, loader):
        self.loader = loader
        self.stream = torch.cuda.Stream()
        self._clear_staged()

    def _clear_staged(self):
        """Drop the references to the staged GPU tensors."""
        self.next_image = None
        self.next_input_ids = None
        self.next_attention_masks = None
        self.next_target = None

    def __iter__(self):
        """Yield ``(image, input_ids, attention_masks, target)`` GPU tuples.

        Yields nothing when the wrapped loader is empty.
        """
        ready = None  # batch already on the GPU, waiting to be handed out
        try:
            for batch in self.loader:
                # Queue the host-to-device copies on the side stream.
                with torch.cuda.stream(self.stream):
                    self.next_image = batch[0].cuda(non_blocking=True)
                    self.next_input_ids = batch[1].cuda(non_blocking=True)
                    self.next_attention_masks = batch[2].cuda(non_blocking=True)
                    self.next_target = batch[3].cuda(non_blocking=True)

                # Hand out the previous batch while the copies are in flight.
                if ready is not None:
                    yield ready

                # The current stream must not touch the new tensors before the
                # side stream has finished copying them.
                torch.cuda.current_stream().wait_stream(self.stream)
                ready = (self.next_image,
                         self.next_input_ids,
                         self.next_attention_masks,
                         self.next_target)

                # Ensures that the tensor memory is not reused for another tensor until all current work queued on stream are complete.
                for tensor in ready:
                    tensor.record_stream(torch.cuda.current_stream())

            # final batch (absent when the loader produced nothing)
            if ready is not None:
                yield ready
        finally:
            # cleaning at the end of the epoch, or when the consumer stops early
            self._clear_staged()

    def __len__(self):
        """Return the number of batches of the wrapped loader."""
        return len(self.loader)

    @property
    def dataset(self):
        """Dataset of the wrapped loader."""
        return self.loader.dataset

In [None]:
# Wrap the DataLoader so that batches are moved to the GPU while iterating.
loader = PrefetchLoader(dl)

In [None]:
%%writefile gpu_usage.sh
#!/bin/bash
# bash is required: the SECONDS timer used below is a bash variable.

# Tracks GPU usage for 400 seconds, change it as per your use
end=$((SECONDS+400))

while [ "$SECONDS" -lt "$end" ]; do
    nvidia-smi --format=csv --query-gpu=power.draw,utilization.gpu,memory.used,memory.free,fan.speed,temperature.gpu >> gpu.log
    # Take roughly one sample per second instead of spawning nvidia-smi back-to-back.
    sleep 1
done

Overwriting gpu_usage.sh


In [None]:
%%bash --bg

# Start the GPU logging script; the --bg flag on this cell runs it in the background.
bash gpu_usage.sh

Starting job # 2 in a separate thread.


In [None]:
# Smoke test: iterate once over the prefetching loader.
# PrefetchLoader defines __len__, so tqdm infers the true number of batches
# (including a final partial batch) without repeating the batch size here.
for batch in tqdm(loader):
    assert batch is not None, "batch was none"
    # Each batch is the 4-tuple yielded by PrefetchLoader.
    assert len(batch) == 4, "expected (image, input_ids, attention_masks, target)"