In [4]:
import os
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf
from tensorflow.keras import layers, models
from keras_preprocessing.image import ImageDataGenerator
from sklearn.model_selection import train_test_split
from keras.callbacks import EarlyStopping
from tensorflow.keras import optimizers
from transformers import TFViTModel, ViTConfig

# Hugging Face의 transformers에서 ViT 모델과 기능 추출기를 불러옵니다.
feature_extractor = ViTFeatureExtractor.from_pretrained('google/vit-base-patch16-224-in21k')

class ViTForImageClassification(tf.keras.Model):
    def __init__(self, num_labels):
        super(ViTForImageClassification, self).__init__()
        self.vit = ViTModel.from_pretrained('google/vit-base-patch16-224-in21k')
        self.classifier = layers.Dense(num_labels, activation='softmax')

    def call(self, inputs):
        outputs = self.vit(inputs)
        logits = outputs.last_hidden_state[:, 0, :]
        return self.classifier(logits)

# 데이터셋 경로
train_dir = r'E:\AI\dataset_skeleton_sep\face\BicycleCrunch\training'
val_dir = r'E:\AI\dataset_skeleton_sep\face\BicycleCrunch\validation'
test_dir = r'E:\AI\dataset_skeleton_sep\face\BicycleCrunch\test'

# ImageDataGenerator 초기화
datagen = ImageDataGenerator(rescale=1./255)  # 이미지를 0과 1 사이의 값으로 정규화

# 훈련, 검증, 테스트 데이터셋을 위한 제너레이터 생성
train_generator = datagen.flow_from_directory(
    train_dir,
    target_size=(224, 224),  # ViT 모델에 맞게 이미지 크기 조정
    batch_size=32,
    class_mode='categorical',
    shuffle=True)

validation_generator = datagen.flow_from_directory(
    val_dir,
    target_size=(224, 224),  # ViT 모델에 맞게 이미지 크기 조정
    batch_size=32,
    class_mode='categorical',
    shuffle=False)

test_generator = datagen.flow_from_directory(
    test_dir,
    target_size=(224, 224),  # ViT 모델에 맞게 이미지 크기 조정
    batch_size=32,
    class_mode='categorical',
    shuffle=False)

Found 7874 images belonging to 8 classes.
Found 1691 images belonging to 8 classes.
Found 1687 images belonging to 8 classes.


In [6]:
num_labels = 8  # 예시로 사용할 레이블의 수

# ViT 모델 구성 설정
config = ViTConfig.from_pretrained('google/vit-base-patch16-224-in21k')
config.num_labels = num_labels

# ViT 모델 로드
vit_model = TFViTModel.from_pretrained('google/vit-base-patch16-224-in21k', config=config)

# 맞춤형 모델 정의
input_layer = layers.Input(shape=(None, 224, 224, 3))  # 입력 레이어 정의
vit_outputs = vit_model(input_layer)[0]  # ViT 모델의 출력
x = layers.GlobalAveragePooling1D()(vit_outputs)  # 평균 풀링 레이어
output_layer = layers.Dense(num_labels, activation='softmax')(x)  # 분류기 레이어

# 최종 모델 생성
model = models.Model(inputs=input_layer, outputs=output_layer)

# 모델 컴파일
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# 모델 요약 출력
model.summary()

# 조기 종료 콜백
early_stopping = EarlyStopping(monitor='val_loss', patience=5, verbose=1)


All PyTorch model weights were used when initializing TFViTModel.

All the weights of TFViTModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFViTModel for predictions without further training.


ValueError: Exception encountered when calling layer "tf_vi_t_model_1" (type TFViTModel).

in user code:

    File "C:\Users\ajhoo\anaconda3\envs\tensorGPU\lib\site-packages\transformers\modeling_tf_utils.py", line 764, in run_call_with_unpacked_inputs  *
        return func(self, **unpacked_inputs)
    File "C:\Users\ajhoo\anaconda3\envs\tensorGPU\lib\site-packages\transformers\models\vit\modeling_tf_vit.py", line 766, in call  *
        outputs = self.vit(
    File "C:\Users\ajhoo\anaconda3\envs\tensorGPU\lib\site-packages\keras\utils\traceback_utils.py", line 70, in error_handler  **
        raise e.with_traceback(filtered_tb) from None
    File "C:\Users\ajhoo\AppData\Local\Temp\__autograph_generated_filevskrz7j7.py", line 37, in tf__run_call_with_unpacked_inputs
        retval_ = ag__.converted_call(ag__.ld(func), (ag__.ld(self),), dict(**ag__.ld(unpacked_inputs)), fscope)
    File "C:\Users\ajhoo\AppData\Local\Temp\__autograph_generated_file5ffkzam9.py", line 24, in tf__call
        embedding_output = ag__.converted_call(ag__.ld(self).embeddings, (), dict(pixel_values=ag__.ld(pixel_values), interpolate_pos_encoding=ag__.ld(interpolate_pos_encoding), training=ag__.ld(training)), fscope)
    File "C:\Users\ajhoo\AppData\Local\Temp\__autograph_generated_filexuv3jyzp.py", line 11, in tf__call
        (batch_size, num_channels, height, width) = ag__.converted_call(ag__.ld(shape_list), (ag__.ld(pixel_values),), None, fscope)

    ValueError: Exception encountered when calling layer "vit" "                 f"(type TFViTMainLayer).
    
    in user code:
    
        File "C:\Users\ajhoo\anaconda3\envs\tensorGPU\lib\site-packages\transformers\modeling_tf_utils.py", line 764, in run_call_with_unpacked_inputs  *
            return func(self, **unpacked_inputs)
        File "C:\Users\ajhoo\anaconda3\envs\tensorGPU\lib\site-packages\transformers\models\vit\modeling_tf_vit.py", line 596, in call  *
            embedding_output = self.embeddings(
        File "C:\Users\ajhoo\anaconda3\envs\tensorGPU\lib\site-packages\keras\utils\traceback_utils.py", line 70, in error_handler  **
            raise e.with_traceback(filtered_tb) from None
        File "C:\Users\ajhoo\AppData\Local\Temp\__autograph_generated_filexuv3jyzp.py", line 11, in tf__call
            (batch_size, num_channels, height, width) = ag__.converted_call(ag__.ld(shape_list), (ag__.ld(pixel_values),), None, fscope)
    
        ValueError: Exception encountered when calling layer "embeddings" "                 f"(type TFViTEmbeddings).
        
        in user code:
        
            File "C:\Users\ajhoo\anaconda3\envs\tensorGPU\lib\site-packages\transformers\models\vit\modeling_tf_vit.py", line 129, in call  *
                batch_size, num_channels, height, width = shape_list(pixel_values)
        
            ValueError: too many values to unpack (expected 4)
        
        
        Call arguments received by layer "embeddings" "                 f"(type TFViTEmbeddings):
          • pixel_values=tf.Tensor(shape=(None, None, 224, 224, 3), dtype=float32)
          • interpolate_pos_encoding=None
          • training=False
    
    
    Call arguments received by layer "vit" "                 f"(type TFViTMainLayer):
      • self=tf.Tensor(shape=(None, None, 224, 224, 3), dtype=float32)
      • pixel_values=None
      • head_mask=None
      • output_attentions=False
      • output_hidden_states=False
      • interpolate_pos_encoding=None
      • return_dict=True
      • training=False


Call arguments received by layer "tf_vi_t_model_1" (type TFViTModel):
  • self=tf.Tensor(shape=(None, None, 224, 224, 3), dtype=float32)
  • pixel_values=None
  • head_mask=None
  • output_attentions=None
  • output_hidden_states=None
  • interpolate_pos_encoding=None
  • return_dict=None
  • training=False

In [5]:
# 모델 학습
history = model.fit(train_generator,
                    steps_per_epoch=train_generator.samples // train_generator.batch_size,
                    validation_data=validation_generator, 
                    validation_steps=validation_generator.samples // validation_generator.batch_size,
                    epochs = 25, 
                    verbose=1, 
                    callbacks=[early_stopping])

Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 11: early stopping

테스트 정확도: 0.614182710647583
