In [None]:
import tensorflow as tf
import matplotlib.pyplot as plt
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.losses import SparseCategoricalCrossentropy
from tensorflow.keras.optimizers import Adam

# Load CIFAR-10 through Keras; the loader returns (train, test) pairs of
# (images, labels).
train_split, test_split = keras.datasets.cifar10.load_data()
x_train, y_train = train_split
x_test, y_test = test_split

# Input / task description.
img_siz = (32, 32, 3)  # input shape handed to layers.Input
n_class = 10           # units of the final classification layer

# Vision Transformer hyper-parameters.
patch_siz = 4                               # side length of one square patch
patches_per_side = img_siz[0] // patch_siz
p2 = patches_per_side ** 2                  # patches per image (uses only img_siz[0])
d_model = 64                                # embedding width of every token
h = 8                                       # number of attention heads
N = 6                                       # iteration count of the encoder-block loop


class Patches(layers.Layer):
    """Cut a batch of images into square patches and flatten each patch.

    Args:
        patch_size: Side length (in pixels) of each square patch; also used
            as the stride, so neighbouring patches do not overlap.
    """

    def __init__(self, patch_size):
        super().__init__()
        self.p_siz = patch_size

    def call(self, img):
        """Return a tensor reshaped to (batch, num_patches, patch_dims)."""
        # The same [1, p, p, 1] window serves as both patch size and stride.
        window = [1, self.p_siz, self.p_siz, 1]
        tiles = tf.image.extract_patches(
            images=img,
            sizes=window,
            strides=window,
            rates=[1, 1, 1, 1],
            padding="VALID",
        )
        # Batch size is taken dynamically; the last axis is the static
        # per-patch feature length produced by extract_patches.
        target_shape = [tf.shape(img)[0], -1, tiles.shape[-1]]
        return tf.reshape(tiles, target_shape)


class PatchEncoder(layers.Layer):
    """Project flattened patches to ``d_model`` and add a learned position embedding.

    Args:
        p2: Number of patches per image (size of the position vocabulary).
        d_model: Output width of both the projection and the embedding.
    """

    def __init__(self, p2, d_model):
        super().__init__()
        self.p2 = p2
        self.projection = layers.Dense(units=d_model)
        self.position_embedding = layers.Embedding(input_dim=p2, output_dim=d_model)

    def call(self, patch):
        """Return ``projection(patch)`` plus the embedding of positions 0..p2-1."""
        position_ids = tf.range(self.p2)
        projected = self.projection(patch)
        # The (p2, d_model) embedding is added to every item of the batch.
        return projected + self.position_embedding(position_ids)


def create_vit_classifier():
    """Build the Vision Transformer classifier as a Keras functional model.

    Pipeline: input of shape ``img_siz`` -> Normalization -> Patches ->
    PatchEncoder -> ``N`` pre-norm Transformer encoder blocks -> flattened
    MLP head ending in a softmax over ``n_class`` classes.

    Reads the module-level settings ``img_siz``, ``patch_siz``, ``p2``,
    ``d_model``, ``h``, ``N`` and ``n_class``.

    Returns:
        keras.Model: maps an image batch to class probabilities (the last
        layer applies softmax, so the outputs are not logits).
    """
    inputs = layers.Input(shape=img_siz)
    nor = layers.Normalization()(inputs)

    patches = Patches(patch_siz)(nor)
    x = PatchEncoder(p2, d_model)(patches)

    # Stack N encoder blocks; each block feeds the next through ``x``.
    for _ in range(N):
        # Self-attention sub-layer with a residual connection.
        x1 = layers.LayerNormalization(epsilon=1e-6)(x)
        x2 = layers.MultiHeadAttention(num_heads=h, key_dim=d_model // h, dropout=0.1)(x1, x1)
        x3 = layers.Add()([x2, x])
        # Position-wise feed-forward sub-layer with a residual connection.
        x4 = layers.LayerNormalization(epsilon=1e-6)(x3)
        x5 = layers.Dense(d_model * 2, activation=tf.nn.gelu)(x4)
        x6 = layers.Dropout(0.1)(x5)
        x7 = layers.Dense(d_model, activation=tf.nn.gelu)(x6)
        x8 = layers.Dropout(0.1)(x7)
        x = layers.Add()([x8, x3])

    # Classification head. It must sit AFTER the loop: when nested inside it,
    # the function returned after the first encoder block and ignored N.
    x = layers.LayerNormalization(epsilon=1e-6)(x)
    x = layers.Flatten()(x)
    x = layers.Dropout(0.5)(x)
    x = layers.Dense(2048, activation=tf.nn.gelu)(x)
    x = layers.Dropout(0.5)(x)
    x = layers.Dense(1024, activation=tf.nn.gelu)(x)
    x = layers.Dropout(0.5)(x)
    output = layers.Dense(n_class, activation='softmax')(x)

    return keras.Model(inputs=inputs, outputs=output)


# Build the network, then fit the statistics of its input Normalization layer.
model = create_vit_classifier()
# Index 1 is expected to be the Normalization layer (index 0 being the input).
model.layers[1].adapt(x_train)

# The model's last layer already applies softmax, so its outputs are
# probabilities; the loss must therefore NOT treat them as logits.
model.compile(optimizer=Adam(), loss=SparseCategoricalCrossentropy(from_logits=False), metrics=['accuracy'])
# NOTE: the test split doubles as the validation data here.
hist = model.fit(x_train, y_train, batch_size=128, epochs=100, validation_data=(x_test, y_test), verbose=1)

# evaluate() returns [loss, accuracy]; report the accuracy as a percentage.
res = model.evaluate(x_test, y_test, verbose=0)
print('정확률 = ', res[1] * 100)

# One figure per metric: training curve versus validation curve.
for metric, label in (('accuracy', 'Accuracy'), ('loss', 'Loss')):
    plt.plot(hist.history[metric])
    plt.plot(hist.history['val_' + metric])
    plt.title(label + ' graph')
    plt.ylabel(label)
    plt.xlabel('Epoch')
    plt.legend(['Train', 'Validation'])
    plt.grid()
    plt.show()

In [None]:
import tensorflow as tf
import matplotlib.pyplot as plt
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.losses import SparseCategoricalCrossentropy
from tensorflow.keras.optimizers import Adam

# Load CIFAR-10 through Keras; the loader returns (train, test) pairs of
# (images, labels).
train_split, test_split = keras.datasets.cifar10.load_data()
x_train, y_train = train_split
x_test, y_test = test_split

# Input / task description.
img_siz = (32, 32, 3)           # input shape handed to layers.Input
img_expanded_siz = (72, 72, 3)  # size the images are resized to before patching
n_class = 10                    # units of the final classification layer

# Vision Transformer hyper-parameters.
patch_siz = 6                                        # side length of one square patch
patches_per_side = img_expanded_siz[0] // patch_siz
p2 = patches_per_side ** 2                           # patches per resized image
d_model = 64                                         # embedding width of every token
h = 8                                                # number of attention heads
N = 6                                                # iteration count of the encoder-block loop


class Patches(layers.Layer):
    """Cut a batch of images into square patches and flatten each patch.

    Args:
        patch_size: Side length (in pixels) of each square patch; also used
            as the stride, so neighbouring patches do not overlap.
    """

    def __init__(self, patch_size):
        super().__init__()
        self.p_siz = patch_size

    def call(self, img):
        """Return a tensor reshaped to (batch, num_patches, patch_dims)."""
        # The same [1, p, p, 1] window serves as both patch size and stride.
        window = [1, self.p_siz, self.p_siz, 1]
        tiles = tf.image.extract_patches(
            images=img,
            sizes=window,
            strides=window,
            rates=[1, 1, 1, 1],
            padding="VALID",
        )
        # Batch size is taken dynamically; the last axis is the static
        # per-patch feature length produced by extract_patches.
        target_shape = [tf.shape(img)[0], -1, tiles.shape[-1]]
        return tf.reshape(tiles, target_shape)


class PatchEncoder(layers.Layer):
    """Project flattened patches to ``d_model`` and add a learned position embedding.

    Args:
        p2: Number of patches per image (size of the position vocabulary).
        d_model: Output width of both the projection and the embedding.
    """

    def __init__(self, p2, d_model):
        super().__init__()
        self.p2 = p2
        self.projection = layers.Dense(units=d_model)
        self.position_embedding = layers.Embedding(input_dim=p2, output_dim=d_model)

    def call(self, patch):
        """Return ``projection(patch)`` plus the embedding of positions 0..p2-1."""
        position_ids = tf.range(self.p2)
        projected = self.projection(patch)
        # The (p2, d_model) embedding is added to every item of the batch.
        return projected + self.position_embedding(position_ids)


def create_vit_classifier():
    """Build the Vision Transformer classifier with resizing and augmentation.

    Pipeline: input of shape ``img_siz`` -> Normalization -> Resizing to
    ``img_expanded_siz`` -> random flip / rotation / zoom -> Patches ->
    PatchEncoder -> ``N`` pre-norm Transformer encoder blocks -> flattened
    MLP head ending in a softmax over ``n_class`` classes.

    Reads the module-level settings ``img_siz``, ``img_expanded_siz``,
    ``patch_siz``, ``p2``, ``d_model``, ``h``, ``N`` and ``n_class``.

    Returns:
        keras.Model: maps an image batch to class probabilities (the last
        layer applies softmax, so the outputs are not logits).
    """
    inputs = layers.Input(shape=img_siz)
    nor = layers.Normalization()(inputs)
    exp = layers.Resizing(img_expanded_siz[0], img_expanded_siz[1])(nor)

    # Data augmentation layers.
    x = layers.RandomFlip('horizontal')(exp)
    x = layers.RandomRotation(factor=0.02)(x)
    x = layers.RandomZoom(height_factor=0.2, width_factor=0.2)(x)

    patches = Patches(patch_siz)(x)
    x = PatchEncoder(p2, d_model)(patches)

    # Stack N encoder blocks; each block feeds the next through ``x``.
    for _ in range(N):
        # Self-attention sub-layer with a residual connection.
        x1 = layers.LayerNormalization(epsilon=1e-6)(x)
        x2 = layers.MultiHeadAttention(num_heads=h, key_dim=d_model // h, dropout=0.1)(x1, x1)
        x3 = layers.Add()([x2, x])
        # Position-wise feed-forward sub-layer with a residual connection.
        x4 = layers.LayerNormalization(epsilon=1e-6)(x3)
        x5 = layers.Dense(d_model * 2, activation=tf.nn.gelu)(x4)
        x6 = layers.Dropout(0.1)(x5)
        x7 = layers.Dense(d_model, activation=tf.nn.gelu)(x6)
        x8 = layers.Dropout(0.1)(x7)
        x = layers.Add()([x8, x3])

    # Classification head. It must sit AFTER the loop: when nested inside it,
    # the function returned after the first encoder block and ignored N.
    x = layers.LayerNormalization(epsilon=1e-6)(x)
    x = layers.Flatten()(x)
    x = layers.Dropout(0.5)(x)
    x = layers.Dense(2048, activation=tf.nn.gelu)(x)
    x = layers.Dropout(0.5)(x)
    x = layers.Dense(1024, activation=tf.nn.gelu)(x)
    x = layers.Dropout(0.5)(x)
    output = layers.Dense(n_class, activation='softmax')(x)

    return keras.Model(inputs=inputs, outputs=output)


# Build the network, then fit the statistics of its input Normalization layer.
model = create_vit_classifier()
# Index 1 is expected to be the Normalization layer (index 0 being the input).
model.layers[1].adapt(x_train)

# The model's last layer already applies softmax, so its outputs are
# probabilities; the loss must therefore NOT treat them as logits.
model.compile(optimizer=Adam(), loss=SparseCategoricalCrossentropy(from_logits=False), metrics=['accuracy'])
# NOTE: the test split doubles as the validation data here.
hist = model.fit(x_train, y_train, batch_size=128, epochs=100, validation_data=(x_test, y_test), verbose=1)

# evaluate() returns [loss, accuracy]; report the accuracy as a percentage.
res = model.evaluate(x_test, y_test, verbose=0)
print('정확률 = ', res[1] * 100)

# One figure per metric: training curve versus validation curve.
for metric, label in (('accuracy', 'Accuracy'), ('loss', 'Loss')):
    plt.plot(hist.history[metric])
    plt.plot(hist.history['val_' + metric])
    plt.title(label + ' graph')
    plt.ylabel(label)
    plt.xlabel('Epoch')
    plt.legend(['Train', 'Validation'])
    plt.grid()
    plt.show()

In [None]:
from transformers import ViTFeatureExtractor, TFViTForImageClassification
from PIL import Image
import tensorflow as tf
import matplotlib.pyplot as plt

# Images to classify with the pretrained ViT checkpoint.
img = [
    Image.open('BSDS_242078.jpg'),
    Image.open('BSDS_361010.jpg'),
    Image.open('BSDS_376001.jpg'),
]

# The preprocessor and the classifier are loaded from the same checkpoint.
checkpoint = 'google/vit-base-patch16-224'
feature_extractor = ViTFeatureExtractor.from_pretrained(checkpoint)
model = TFViTForImageClassification.from_pretrained(checkpoint)

# Run all images through the model as one batch.
inputs = feature_extractor(img, return_tensors='tf')
res = model(**inputs)

# Show each image, then print its top-1 class and that class's softmax score (%).
for i in range(res.logits.shape[0]):
    plt.imshow(img[i])
    plt.xticks([])
    plt.yticks([])
    plt.show()

    scores = res.logits[i]
    predicted_label = int(tf.math.argmax(scores, axis=-1))
    percentages = tf.nn.softmax(scores) * 100.0
    prob = float(percentages[predicted_label])
    print(i, '번째 영상의 1순위 부류 : ', model.config.id2label[predicted_label], prob)

In [None]:
from transformers import DetrFeatureExtractor, DetrForObjectDetection
from PIL import Image
import numpy as np
import cv2 as cv

# Load the image as 3-channel RGB so the channel swap below is always valid.
img = Image.open('BSDS_361010.jpg').convert('RGB')
# PIL reports size as (width, height); used to scale the predicted boxes to
# pixels instead of hard-coding one particular image's dimensions.
img_w, img_h = img.size

feature_extractor = DetrFeatureExtractor.from_pretrained('facebook/detr-resnet-50')
model = DetrForObjectDetection.from_pretrained('facebook/detr-resnet-50')

inputs = feature_extractor(img, return_tensors='pt')
res = model(**inputs)

# The last logit index is DETR's "no object" class (91 for this checkpoint).
n_logits = res.logits.shape[-1]
no_object = n_logits - 1

# One random color per class id.
colors = np.random.uniform(0, 255, size=(n_logits, 3))
# PIL gives RGB; OpenCV drawing and imshow expect BGR.
im = cv.cvtColor(np.array(img), cv.COLOR_RGB2BGR)

# Draw every query whose top-scoring class is a real object.
for i in range(res.logits.shape[1]):
    predicted_label = res.logits[0, i].argmax(-1).item()
    if predicted_label != no_object:
        name = model.config.id2label[predicted_label]
        prob = '{:.2f}'.format(float(res.logits[0, i].softmax(dim=0)[predicted_label]))
        # pred_boxes are treated as normalized (center-x, center-y, width, height).
        cx, cy = int(img_w * res.pred_boxes[0, i, 0]), int(img_h * res.pred_boxes[0, i, 1])
        w, h = int(img_w * res.pred_boxes[0, i, 2]), int(img_h * res.pred_boxes[0, i, 3])
        cv.rectangle(im, (cx - w // 2, cy - h // 2), (cx + w // 2, cy + h // 2), colors[predicted_label], 2)
        cv.putText(im, name + str(prob), (cx - w // 2, cy - h // 2 - 5), cv.FONT_HERSHEY_SIMPLEX, 0.6,
                   colors[predicted_label], 1)

cv.imshow('DETR', im)
cv.waitKey()
cv.destroyAllWindows()

In [None]:
from transformers import CLIPProcessor, CLIPModel
from PIL import Image
import matplotlib.pyplot as plt

# Image to be matched against the candidate captions.
img = Image.open('BSDS_361010.jpg')

# The processor and the model are loaded from the same CLIP checkpoint.
checkpoint = 'openai/clip-vit-base-patch32'
processor = CLIPProcessor.from_pretrained(checkpoint)
model = CLIPModel.from_pretrained(checkpoint)

captions = [
    'Two horses are running on grass',
    'Students are eating',
    'Croquet playing on horses',
    'Golf playing on horses',
]
inputs = processor(text=captions, images=img, return_tensors='pt', padding=True)
res = model(**inputs)

# Display the image without axis ticks.
plt.imshow(img)
plt.xticks([])
plt.yticks([])
plt.show()

# Softmax over the caption axis, then print each caption's score as a percentage.
logits = res.logits_per_image
probs = logits.softmax(dim=1)
for i, caption in enumerate(captions):
    percent = float(probs[0, i] * 100.0)
    print(caption, ' : ', '{:.2f}'.format(percent))