<a href="https://colab.research.google.com/github/mobarakol/tutorial_notebooks/blob/main/VIT_ALL.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# sentence-transformers vs. transformers

## sentence-transformers by UKPLab (to embed and classify text)

page: https://www.libhunt.com/r/sentence-transformers

GitHub: https://github.com/UKPLab/sentence-transformers

Installation

In [1]:
! pip -q install sentence-transformers

Text Feature Extraction

In [2]:
from sentence_transformers import SentenceTransformer
# Load a pretrained Sentence-BERT encoder by its model name.
sbert_model = SentenceTransformer('bert-base-nli-mean-tokens')

# Encode a one-sentence batch and keep the embedding of its only element.
query = "I ate dinner"
query_embeddings = sbert_model.encode([query])
query_vec = query_embeddings[0]
print('Sample BERT embedding vector - length', len(query_vec))

## transformers by Hugging Face (ViT: to classify images)
page: https://www.libhunt.com/r/transformers

GitHub: https://github.com/huggingface/transformers

Installation

In [4]:
! pip -q install transformers

In [5]:
from PIL import Image
import numpy as np
import torchvision

from transformers import ViTFeatureExtractor, ViTForImageClassification, BatchFeature
from torch.utils.data import Dataset, DataLoader
from torchvision.transforms import ToTensor, Normalize, Resize, Compose

import torch


import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import torchvision
from torchvision import models
import torchvision.transforms as transforms
import os
import argparse
import copy
import random
import numpy as np
# Device string used by train()/test()/main(): the GPU when CUDA is usable,
# otherwise the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
def seed_everything(seed=12):
    """Seed every random number generator this script relies on.

    Seeds Python's ``random``, NumPy, and PyTorch (CPU and all CUDA devices),
    exports ``PYTHONHASHSEED``, and switches cuDNN to deterministic mode with
    benchmarking disabled.

    Args:
        seed: Integer seed applied to all generators.
    """
    # Exported as a string because environment values must be text.
    os.environ['PYTHONHASHSEED'] = str(seed)

    # Python, NumPy and PyTorch each keep their own generator state.
    np.random.seed(seed)
    random.seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.manual_seed(seed)

    # Trade cuDNN autotuning for repeatable kernel selection.
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True
# Hyper-parameters for the ViT fine-tuning run. The description previously
# said "CIFAR-10H" although main() trains on CIFAR-100, and both batch-size
# options carried the same help text; option names and defaults are unchanged.
parser = argparse.ArgumentParser(description='CIFAR-100 ViT Training')
parser.add_argument('--lr', default=0.1, type=float, help='learning rate')
parser.add_argument('--lr_schedule', default=0, type=int, help='lr scheduler')
parser.add_argument('--batch_size', default=58, type=int, help='training batch size')
parser.add_argument('--test_batch_size', default=64, type=int, help='test batch size')
parser.add_argument('--num_epoch', default=2, type=int, help='epoch number')
parser.add_argument('--num_classes', type=int, default=100, help='number classes')
# Parse an empty argument list so the notebook kernel's own argv is ignored
# and every option takes its default.
args = parser.parse_args(args=[])

def train(model, trainloader, criterion, optimizer):
    """Run one optimisation pass over ``trainloader``.

    Args:
        model: Network whose forward output exposes a ``.logits`` attribute.
        trainloader: Iterable yielding ``(inputs, targets)`` batches.
        criterion: Loss callable applied to ``(logits, targets)``.
        optimizer: Optimizer stepped once per batch.
    """
    model.train()
    for step, batch in enumerate(trainloader):
        images, labels = batch
        images = images.to(device)
        labels = labels.to(device)

        # Clear gradients left over from the previous step before backprop.
        optimizer.zero_grad()
        logits = model(images).logits
        batch_loss = criterion(logits, labels)
        batch_loss.backward()
        optimizer.step()

        # Report progress on every 100th batch only.
        if step % 100 != 0:
            continue
        print('Iter:', step, '/', len(trainloader), ' Loss:', batch_loss.item())

def test(model, testloader):
    """Return the top-1 accuracy of ``model`` over ``testloader``.

    Args:
        model: Network whose forward output exposes a ``.logits`` attribute.
        testloader: Iterable yielding ``(inputs, targets)`` batches.

    Returns:
        Fraction of correctly classified samples as a float, or ``0.0`` when
        the loader yields no samples (previously a ``ZeroDivisionError``).
    """
    model.eval()
    correct = 0
    total = 0
    # Gradients are not needed for evaluation.
    with torch.no_grad():
        for inputs, targets in testloader:
            inputs, targets = inputs.to(device), targets.to(device)
            outputs = model(inputs).logits
            # Index of the largest logit along dim 1 is the predicted class.
            _, predicted = outputs.max(1)
            total += targets.size(0)
            correct += predicted.eq(targets).sum().item()
    if total == 0:
        return 0.0
    return correct / total

class ViTFeatureExtractorTransforms:
    """Torchvision transform built from a pretrained ViT feature extractor's config.

    The pipeline is: optional ``Resize`` (when ``do_resize`` is set), then
    ``ToTensor``, then optional ``Normalize`` with the extractor's
    ``image_mean`` / ``image_std`` (when ``do_normalize`` is set).

    Args:
        model_name_or_path: Identifier passed to
            ``ViTFeatureExtractor.from_pretrained``.
    """

    def __init__(self, model_name_or_path):
        feature_extractor = ViTFeatureExtractor.from_pretrained(model_name_or_path)
        transform = []

        if feature_extractor.do_resize:
            # ``size`` may be an int or a dict depending on the installed
            # transformers release; normalise it before handing it to Resize.
            transform.append(Resize(self._resize_target(feature_extractor.size)))

        transform.append(ToTensor())

        if feature_extractor.do_normalize:
            transform.append(Normalize(feature_extractor.image_mean, feature_extractor.image_std))

        self.transform = Compose(transform)

    @staticmethod
    def _resize_target(size):
        """Convert a feature-extractor ``size`` into a ``Resize`` argument.

        Args:
            size: An int, a dict with ``height``/``width`` or
                ``shortest_edge`` keys, or any other value.

        Returns:
            ``(height, width)`` for a height/width dict, ``(size, size)`` for
            an int so the output is a fixed square, the bare int for a
            ``shortest_edge`` dict, and ``size`` unchanged otherwise.

        Raises:
            ValueError: If ``size`` is a dict without recognised keys.
        """
        if isinstance(size, dict):
            if 'height' in size and 'width' in size:
                return (size['height'], size['width'])
            if 'shortest_edge' in size:
                # An int makes Resize scale the shorter image edge.
                return size['shortest_edge']
            raise ValueError('Unsupported feature extractor size: {!r}'.format(size))
        if isinstance(size, int):
            # A bare int would only fix the shorter edge; force a square.
            return (size, size)
        # NOTE(review): other sequence forms are passed through untouched —
        # confirm their (height, width) ordering against the extractor docs.
        return size

    def __call__(self, x):
        """Apply the composed transform to ``x`` and return the result."""
        return self.transform(x)


def main():
    """Fine-tune a pretrained ViT on CIFAR-100 and checkpoint the best epoch.

    Seeds the RNGs, builds train/test loaders that share the ViT
    preprocessing transform, trains for ``args.num_epoch`` epochs with SGD,
    evaluates after every epoch, and saves the model's ``state_dict``
    whenever test accuracy improves.
    """
    seed_everything()
    model_name_or_path = 'google/vit-base-patch16-224-in21k'
    transform_vit = ViTFeatureExtractorTransforms(model_name_or_path)
    train_dataset = torchvision.datasets.CIFAR100(root='./data', train=True, download=True, transform=transform_vit)
    test_dataset = torchvision.datasets.CIFAR100(root='./data', train=False, download=True, transform=transform_vit)

    train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=args.batch_size, shuffle=True, num_workers=2)
    test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=args.test_batch_size, shuffle=False, num_workers=2)

    model = ViTForImageClassification.from_pretrained(model_name_or_path, num_labels=args.num_classes)
    model = model.to(device)

    optimizer = optim.SGD(model.parameters(), lr=args.lr, momentum=0.9, nesterov=False, weight_decay=0.0001)
    criterion = nn.CrossEntropyLoss()

    # best_epoch is an epoch index (int); best_acc is a fraction (float).
    best_epoch, best_acc = 0, 0.0
    for epoch in range(args.num_epoch):
        train(model, train_loader, criterion, optimizer)
        accuracy = test(model, test_loader)
        if accuracy > best_acc:
            best_acc = accuracy
            best_epoch = epoch
            # Save the weights directly; deep-copying the whole model first
            # only duplicated it in memory without changing what is written.
            # NOTE(review): the file name says "cifar10h" although the data
            # is CIFAR-100; kept unchanged so existing checkpoints still match.
            torch.save(model.state_dict(), 'best_model_cifar10h_vit.pth.tar')
        print('epoch: {}  acc: {:.4f}  best epoch: {}  best acc: {:.4f}'.format(
                epoch, accuracy, best_epoch, best_acc))
        
# Run training only when executed as a script or notebook cell, so importing
# this file does not start a training run as a side effect.
if __name__ == '__main__':
    main()

Files already downloaded and verified
Files already downloaded and verified


Some weights of the model checkpoint at google/vit-base-patch16-224-in21k were not used when initializing ViTForImageClassification: ['pooler.dense.weight', 'pooler.dense.bias']
- This IS expected if you are initializing ViTForImageClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing ViTForImageClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of ViTForImageClassification were not initialized from the model checkpoint at google/vit-base-patch16-224-in21k and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Iter: 0 / 863  Loss: 4.627674579620361
Iter: 100 / 863  Loss: 2.7157750129699707
Iter: 200 / 863  Loss: 1.976897954940796
Iter: 300 / 863  Loss: 2.129338264465332
Iter: 400 / 863  Loss: 1.4080332517623901
Iter: 500 / 863  Loss: 1.377852201461792
Iter: 600 / 863  Loss: 1.5307177305221558
Iter: 700 / 863  Loss: 1.1656787395477295
Iter: 800 / 863  Loss: 1.4090977907180786
epoch: 0  acc: 0.6377  best epoch: 0  best acc: 0.6377
Iter: 0 / 863  Loss: 1.3454694747924805
Iter: 100 / 863  Loss: 1.5767067670822144
Iter: 200 / 863  Loss: 0.8645036220550537
Iter: 300 / 863  Loss: 0.9081160426139832
Iter: 400 / 863  Loss: 0.9121677875518799
Iter: 500 / 863  Loss: 0.9224331974983215
Iter: 600 / 863  Loss: 0.9078149199485779
Iter: 700 / 863  Loss: 0.7008175253868103
Iter: 800 / 863  Loss: 0.7572551965713501
epoch: 1  acc: 0.7246  best epoch: 1  best acc: 0.7246
