In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# for dirname, _, filenames in os.walk('/kaggle/input'):
#     for filename in filenames:
#         print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import pandas as pd

# Root folder of the competition data (CSV tables plus the images/ directory).
data_path = './input/plant-pathology-2020-fgvc7/'

# Read the three competition tables in a single unpacking expression.
train, test, submission = (
    pd.read_csv(data_path + csv_name)
    for csv_name in ('train.csv', 'test.csv', 'sample_submission.csv')
)

In [None]:
train.shape, test.shape

In [None]:
train.head()

In [None]:
test.head()

In [None]:
submission.head()

### 12.2.2 데이터 시각화

타깃값 분포

In [None]:
# One sub-frame per target column, keeping only the rows flagged with 1.
healthy, multiple_diseases, rust, scab = (
    train.loc[train[class_name] == 1]
    for class_name in ('healthy', 'multiple_diseases', 'rust', 'scab')
)

In [None]:
scab

In [None]:
healthy.shape, multiple_diseases.shape, rust.shape, scab.shape

In [None]:
import matplotlib.pyplot as plt
import matplotlib as mpl
%matplotlib inline

mpl.rc('font', size=15)
fig, ax = plt.subplots(figsize=(7, 7))

label = ['healthy', 'multiple diseases', 'rust', 'scab']
# Slice sizes are the row counts of the per-class frames, in label order.
class_counts = [len(frame) for frame in (healthy, multiple_diseases, rust, scab)]
ax.pie(class_counts,
       labels=label,
       autopct='%.1f%%');

이미지 출력

In [None]:
import matplotlib.gridspec as gridspec
import cv2

def show_image(img_ids, rows=2, cols=3):
    """Draw the images named by ``img_ids`` on a ``rows`` x ``cols`` grid.

    Each id is resolved to ``<data_path>/images/<id>.jpg``, read with OpenCV,
    converted from BGR to RGB and shown in the next free grid cell.

    Args:
        img_ids: Sized iterable of image ids (file names without extension).
        rows: Number of rows in the subplot grid.
        cols: Number of columns in the subplot grid.

    Raises:
        AssertionError: If there are more ids than grid cells.
        FileNotFoundError: If an image file cannot be read.
    """
    assert len(img_ids) <= rows * cols # more images than grid cells is an error

    plt.figure(figsize=(15,8))
    grid = gridspec.GridSpec(rows, cols)

    for idx, img_id in enumerate(img_ids):
        # os.path.join avoids the doubled '/' that plain string formatting
        # produced when data_path already ends with a separator.
        img_path = os.path.join(data_path, 'images', f'{img_id}.jpg')
        image = cv2.imread(img_path)
        if image is None:
            # cv2.imread signals failure by returning None instead of raising.
            raise FileNotFoundError(f'Could not read image: {img_path}')
        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
        ax = plt.subplot(grid[idx])
        ax.imshow(image)

In [None]:
num_of_imgs = 6

# Take the trailing `num_of_imgs` ids of every class by explicit position.
last_healthy_img_ids = healthy['image_id'].iloc[-num_of_imgs:]
last_multiple_diseases_img_ids = multiple_diseases['image_id'].iloc[-num_of_imgs:]
last_rust_img_ids = rust['image_id'].iloc[-num_of_imgs:]
last_scab_img_ids = scab['image_id'].iloc[-num_of_imgs:]

In [None]:
show_image(last_healthy_img_ids)

In [None]:
show_image(last_multiple_diseases_img_ids)

In [None]:
show_image(last_rust_img_ids)

In [None]:
show_image(last_scab_img_ids)

In [None]:
show_image(last_healthy_img_ids, rows=4, cols=3)

### 12.3.1 시드값 고정 및 GPU 장비 설정

시드값 고정

In [None]:
import torch
import random
import numpy as np
import os

seed = 42
os.environ['PYTHONHASHSEED'] = str(seed)

# Seed every random number generator used in the notebook, in the same
# order as before: Python, NumPy, then the torch CPU and CUDA generators.
for seed_everything in (random.seed, np.random.seed, torch.manual_seed,
                        torch.cuda.manual_seed, torch.cuda.manual_seed_all):
    seed_everything(seed)

# cuDNN switches: deterministic kernels on, autotuner off, cuDNN itself off.
cudnn_backend = torch.backends.cudnn
cudnn_backend.deterministic = True
cudnn_backend.benchmark = False
cudnn_backend.enabled = False

GPU 장비 설정

In [None]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device

### 12.3.2 데이터 준비

In [None]:
import pandas as pd

# Root folder of the competition data, restated for the modeling section.
data_path = './input/plant-pathology-2020-fgvc7/'

# Re-read the three tables so this section starts from freshly loaded frames.
train, test, submission = (
    pd.read_csv(data_path + csv_name)
    for csv_name in ('train.csv', 'test.csv', 'sample_submission.csv')
)

훈련 데이터, 검증 데이터 분리

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% for validation, stratified on the four one-hot target columns.
target_columns = ['healthy', 'multiple_diseases', 'rust', 'scab']
train, valid = train_test_split(
    train,
    test_size=0.2,
    stratify=train[target_columns],
    random_state=42,
)

데이터셋 클래스 정의

In [None]:
import cv2
from torch.utils.data import Dataset
import numpy as np

class ImageDataset(Dataset):
    """Dataset that loads ``<img_dir><image_id>.jpg`` for each row of ``df``.

    The first column of ``df`` holds the image id. For non-test data, columns
    1-4 hold the one-hot targets, which are collapsed to a class index.

    Args:
        df: DataFrame whose column 0 is the image id (and columns 1-4 the
            targets when ``is_test`` is False).
        img_dir: Prefix prepended to the image id; must end with a separator
            because the path is built by plain concatenation.
        transform: Optional callable invoked as ``transform(image=...)`` that
            returns a mapping with an ``'image'`` entry.
        is_test: When True, ``__getitem__`` returns only the image.
    """

    def __init__(self, df, img_dir='./', transform=None, is_test=False):
        super().__init__()
        self.df = df
        self.img_dir = img_dir
        self.transform = transform
        self.is_test = is_test

    def __len__(self):
        """Return the number of rows (samples) in the backing DataFrame."""
        return len(self.df)

    def _label_at(self, idx):
        """Return the class index (0-3) of the largest target value in row ``idx``."""
        # Convert to a plain numeric array first: a row pulled from a frame
        # with a string id column is object-dtype, and argmax on a pandas
        # object has varied between versions.
        targets = np.asarray(self.df.iloc[idx, 1:5], dtype=np.float64)
        return int(targets.argmax())

    def __getitem__(self, idx):
        """Return the transformed image, plus its label unless ``is_test``.

        Raises:
            FileNotFoundError: If the image file cannot be read.
        """
        img_id = self.df.iloc[idx, 0]
        img_path = self.img_dir + img_id + '.jpg'
        image = cv2.imread(img_path)
        if image is None:
            # cv2.imread returns None on failure; fail with a clear message
            # instead of an opaque cvtColor error.
            raise FileNotFoundError(f'Could not read image: {img_path}')
        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)

        if self.transform is not None:
            image = self.transform(image=image)['image']

        # Test data has no targets: return the image alone.
        if self.is_test:
            return image

        return image, self._label_at(idx)

이미지 변환기 정의

In [None]:
import albumentations as A
from albumentations.pytorch import ToTensorV2

In [None]:
# Training-time augmentation pipeline. Transforms run in the listed order;
# each `p` is the probability that the transform is applied to a sample.
transform_train = A.Compose([
    #A.Resize(450, 650),  # alternative (larger) input size, currently disabled
    A.Resize(224, 224),
    A.RandomBrightnessContrast(brightness_limit=0.2, # random brightness / contrast adjustment
                               contrast_limit=0.2, p=0.3),
    A.VerticalFlip(p=0.2),
    A.HorizontalFlip(p=0.5),
    A.ShiftScaleRotate(
        shift_limit=0.1,
        scale_limit=0.2,
        rotate_limit=30, p = 0.3),
    A.OneOf([A.Emboss(p=1), # exactly one of: emboss, sharpen, blur
             A.Sharpen(p=1),
             A.Blur(p=1)], p=0.3),
    A.PiecewiseAffine(p=0.3), # piecewise affine warp
    A.Normalize(), # normalization with the library's default parameters
    ToTensorV2() # convert to a torch tensor
])

In [None]:
# Deterministic pipeline for validation / test images: resize, normalize
# (library defaults) and convert to a tensor, with no random augmentation.
transform_test = A.Compose([
    #A.Resize(450,650),  # alternative (larger) input size, currently disabled
    A.Resize(224, 224),
    A.Normalize(),
    ToTensorV2()
])

데이터셋 및 데이터 로더 생성

In [None]:
img_dir = './input/plant-pathology-2020-fgvc7/images/'

dataset_train = ImageDataset(train, img_dir=img_dir, transform=transform_train)
dataset_valid = ImageDataset(valid, img_dir=img_dir, transform=transform_test)

In [None]:
def seed_worker(worker_id):
    """Re-seed NumPy and ``random`` from torch's current initial seed.

    Intended as a ``DataLoader`` ``worker_init_fn``; ``worker_id`` is accepted
    for that signature but not used.
    """
    # Fold the 64-bit torch seed into the 32-bit range NumPy accepts.
    derived_seed = torch.initial_seed() % (1 << 32)
    for reseed in (np.random.seed, random.seed):
        reseed(derived_seed)
    
g = torch.Generator()
g.manual_seed(0)

In [None]:
from torch.utils.data import DataLoader

batch_size = 4

# Settings shared by both loaders; only the shuffling differs between them.
common_loader_args = dict(batch_size=batch_size,
                          worker_init_fn=seed_worker,
                          generator=g,
                          num_workers=0)

trainloader = DataLoader(dataset_train, shuffle=True, **common_loader_args)
validloader = DataLoader(dataset_valid, shuffle=False, **common_loader_args)

### 12.3.3 모델 생성

In [None]:
# !pip install efficientnet-pytorch==0.7.1

In [None]:
from efficientnet_pytorch import EfficientNet

In [None]:
# Load the pretrained efficientnet-b4 weights with a 4-class head and move
# the model to the selected device in the same expression.
model = EfficientNet.from_pretrained('efficientnet-b4', num_classes=4).to(device)

In [None]:
device

In [None]:
# import torch.nn as nn

# # 사전 훈련된 efficient-b7 모델 불러오기
# model = EfficientNet.from_pretrained('efficientnet-b7')

# # 불러온 efficientnet-b7 모델의 마지막 계층 수정
# model._fc = nn.Sequential(
#                 nn.Linear(model._fc.in_features, model._fc.out_features), # 2560 --> 1000
#                 nn.ReLU(), # 활성화함수
#                 nn.Dropout(p=0.5), # 50% 드롭아웃
#                 nn.Linear(model._fc.out_features, 4) # 1000 --> 4
# )
# model = model.to(device)

### 12.3.4 모델 훈련 및 성능 검증

손실 함수와 옵티마이저 설정

In [None]:
import torch.nn as nn
from torch import optim # 옵티마이저 (경사하강법...)

criterion = nn.CrossEntropyLoss()

In [None]:
optimizer = torch.optim.AdamW(model.parameters(), lr=0.00006, weight_decay=0.0001)

In [None]:
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, 'min', patience=7, factor=0.1, verbose=True)

훈련 및 성능 검증

In [None]:
def validation(model, validloader, criterion):
    """Run ``model`` over ``validloader`` and compute loss, ROC AUC and accuracy.

    The caller is responsible for putting the model in eval mode. Batches are
    moved to the module-level ``device``; metrics are computed on CPU copies
    so the function works on both GPU and CPU-only machines.

    Args:
        model: Network returning one logit per class (4 classes).
        validloader: Iterable yielding ``(images, labels)`` batches, where
            ``labels`` holds integer class indices.
        criterion: Loss callable invoked as ``criterion(outputs, labels)``.

    Returns:
        Tuple ``(valid_loss, valid_auc, valid_accuracy)``: the per-batch loss
        summed over all batches (not averaged), the ROC AUC of the softmax
        probabilities against one-hot targets, and the accuracy of the
        argmax predictions.
    """
    num_classes = 4
    valid_loss = 0
    preds_auc_list = []    # softmax probabilities, one row per sample
    preds_acc_list = []    # argmax class predictions
    true_list = []         # integer class labels
    true_onehot_list = []  # one-hot encoded labels for roc_auc_score

    # No gradients are needed for a pure forward pass.
    with torch.no_grad():
        for images, labels in validloader:
            images = images.to(device)
            labels = labels.to(device)

            outputs = model(images)
            loss = criterion(outputs, labels)

            outputs_cpu = outputs.cpu()
            labels_cpu = labels.cpu()

            # Accuracy inputs: predicted class index vs. true class index.
            preds_acc = torch.max(outputs_cpu, dim=1)[1].numpy()
            true = labels_cpu.numpy()

            # AUC inputs: class probabilities vs. one-hot targets. The
            # identity matrix is built on the CPU so no CUDA device is
            # required (the original hard-coded device='cuda').
            preds_auc = torch.softmax(outputs_cpu, dim=1).numpy()
            true_onehot = torch.eye(num_classes)[labels_cpu].numpy()

            preds_acc_list.extend(preds_acc)
            preds_auc_list.extend(preds_auc)
            true_list.extend(true)
            true_onehot_list.extend(true_onehot)

            valid_loss += loss.item()  # pull the Python float out of the tensor

    valid_auc = roc_auc_score(true_onehot_list, preds_auc_list)
    valid_accuracy = accuracy_score(true_list, preds_acc_list)
    return valid_loss, valid_auc, valid_accuracy

In [None]:
from torch.utils.tensorboard import SummaryWriter
from sklearn.metrics import roc_auc_score
from sklearn.metrics import accuracy_score
from tqdm.notebook import tqdm # 진행률 표시 막대
writer  = SummaryWriter()

In [None]:
train_loss_list = []
valid_loss_list = []
val_auc_list = []
val_acc_list = []

In [None]:
def train(model, epochs, criterion, optimizer):
    """Train ``model`` for up to ``epochs`` epochs, validating after each one.

    Besides its arguments, the function reads the module-level names
    ``trainloader``, ``validloader``, ``device``, ``writer``, ``scheduler``
    and ``validation``, and appends to the module-level history lists
    ``train_loss_list``, ``valid_loss_list``, ``val_acc_list`` and
    ``val_auc_list``.

    After every epoch it logs losses/AUC/accuracy to ``writer``, saves the
    model state to 'best_checkpoint_effb7.pth' whenever validation accuracy
    improves, and calls ``scheduler.step(valid_loss)``. Training returns early
    once the summed validation loss has failed to improve for more than
    ``patience`` consecutive epochs.

    Args:
        model: Network to train (already on ``device``).
        epochs: Maximum number of epochs.
        criterion: Loss callable invoked as ``criterion(outputs, labels)``.
        optimizer: Optimizer updating the model's parameters.
    """
    steps = 0
    min_loss = 10000
    max_accuracy = 0
    trigger = 0
    patience = 5 # for Early stopping
    num_classes = 4
  
    steps_per_epoch = len(trainloader) 
  
    for epoch in range(epochs):
        model.train()
        train_loss = 0
        for images, labels in tqdm(trainloader): # show a progress bar
            steps += 1
            # 1. Prepare the input batch
            images = images.to(device)
            labels = labels.to(device)
            
            # 2. Forward pass
            outputs = model(images) # predictions
            loss = criterion(outputs, labels) # loss between predictions and labels
      
            # 3. Backward pass (gradient propagation)
            optimizer.zero_grad() # clear gradients so they do not accumulate across batches
            loss.backward()
      
            # 4. Update the model parameters
            optimizer.step() # W <- W -lr*Gradient
      
            train_loss += loss.item()
        
            if (steps % steps_per_epoch) == 0: # true on the last batch of every epoch
                model.eval() # batch norm / dropout behave differently at evaluation time, so switch modes
                valid_loss, valid_auc, valid_accuracy = validation(model, validloader, criterion)
        
                # Register log events for TensorBoard
                writer.add_scalar("Loss/train", train_loss/len(trainloader), epoch)
                writer.add_scalar("Loss/valid", valid_loss/len(validloader), epoch)
                writer.add_scalars("Loss/train and valid",
                                  {'train' : train_loss/len(trainloader),
                                  'valid' : valid_loss/len(validloader)}, epoch)
                
                # writer.add_scalar("Valid Accuracy", valid_accuracy/len(validloader), epoch)
                writer.add_scalar("Valid AUC", valid_auc, epoch)
                writer.add_scalar("Valid ACC", valid_accuracy, epoch)
                
                # Keep per-epoch history in the module-level lists for later plotting
                train_loss_list.append(train_loss/len(trainloader))
                valid_loss_list.append(valid_loss/len(validloader))
                val_acc_list.append(valid_accuracy)
                val_auc_list.append(valid_auc)
                
                print('Epoch : {}/{}...'.format(epoch+1, epochs),
                      'Train Loss : {:.3f} / '.format(train_loss/len(trainloader)),
                      'Valid Loss : {:.3f} / '.format(valid_loss/len(validloader)),
                      'Valid AUC : {:.3f} / '.format(valid_auc),
                      'Valid Accuracy : {:.3f}'.format(valid_accuracy))
              
                # Checkpoint whenever validation accuracy reaches a new maximum
                if valid_accuracy > max_accuracy: 
                    max_accuracy = valid_accuracy
                    torch.save(model.state_dict(), 'best_checkpoint_effb7.pth')
        
                # Early stopping
                if valid_loss > min_loss:
                    trigger += 1 # incremented each time valid loss fails to beat min_loss
                    print('trigger : ', trigger )
                    if trigger > patience:
                        print('Early Stopping!!!')
                        print('Traning step is finished!!')
                        writer.flush()  
                        return   
                else:
                    trigger = 0
                    min_loss = valid_loss
        
                # Reset the running loss and return to training mode for the next epoch
                train_loss = 0
                model.train()
                scheduler.step(valid_loss)
  
    writer.flush()  

In [None]:
torch.cuda.empty_cache()

In [None]:
# img size 320, 512
epochs=50
train(model, epochs, criterion, optimizer)

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Per-epoch training vs. validation loss recorded by train().
plt.figure()
plt.ylim(0,1.5)
# x/y are passed by keyword: recent seaborn releases accept only `data`
# positionally, so the positional form raises a TypeError there.
sns.lineplot(x=list(range(len(train_loss_list))), y=train_loss_list, label='Train')
sns.lineplot(x=list(range(len(valid_loss_list))), y=valid_loss_list, label='Val')
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.legend();

In [None]:
%load_ext tensorboard

In [None]:
!taskkill /im tensorboard.exe /f
!del /q %TMP%\.tensorboard-info\*

In [None]:
%tensorboard --logdir=runs

In [None]:
writer.close()

### 12.3.5 예측 및 결과 제출

In [None]:
dataset_test = ImageDataset(test, img_dir=img_dir, transform=transform_test, is_test=True)
 
testloader = DataLoader(dataset_test, batch_size=batch_size, shuffle=False, worker_init_fn=seed_worker,
                         generator=g, num_workers=0)

예측

In [None]:
model.eval() # switch the model to evaluation mode

preds = np.zeros((len(test), 4)) # one row of class probabilities per test image

row_start = 0
with torch.no_grad():
    for images in testloader:
        row_stop = row_start + images.size(0)
        logits = model(images.to(device))

        # Softmax turns the logits into per-class probabilities; each batch
        # fills the next consecutive rows of `preds`.
        preds[row_start:row_stop] += torch.softmax(logits.cpu(), dim=1).numpy()
        row_start = row_stop

In [None]:
submission[['healthy', 'multiple_diseases', 'rust', 'scab']] = preds
submission.to_csv('submission.csv', index=False)

## 12.4 성능 개선
1. 에폭 늘리기
2. 스케줄러 추가
3. TTA(테스트 단계 데이터 증강) 기법
4. 레이블 스무딩 적용

### 12.4.1 모델 훈련 및 성능 검증

스케줄러 변경 및 에폭 수 조정

In [None]:
from transformers import get_cosine_schedule_with_warmup
epochs = 20

# Both step counts are expressed in optimizer steps (batches): three epochs
# of warmup, then cosine decay over the whole run.
batches_per_epoch = len(trainloader)
scheduler = get_cosine_schedule_with_warmup(
    optimizer,
    num_warmup_steps=batches_per_epoch * 3,
    num_training_steps=batches_per_epoch * epochs,
)

In [None]:
def train(model, epochs, criterion, optimizer):
    """Train ``model`` with a per-batch LR schedule and per-epoch validation.

    Besides its arguments, the function reads the module-level names
    ``trainloader``, ``validloader``, ``device``, ``writer``, ``scheduler``
    and ``validation``.

    ``scheduler`` is expected to be sized in optimizer steps (its warmup and
    total step counts are multiples of ``len(trainloader)``), so it is
    advanced once per batch right after ``optimizer.step()``. Stepping it
    once per epoch would leave the learning rate stuck in warmup.

    After every epoch the function logs metrics to ``writer``, saves the
    model state to 'best_checkpoint.pth' whenever validation AUC improves,
    and returns early once the summed validation loss has failed to improve
    for more than ``patience`` consecutive epochs.

    Args:
        model: Network to train (already on ``device``).
        epochs: Maximum number of epochs.
        criterion: Loss callable invoked as ``criterion(outputs, labels)``.
        optimizer: Optimizer updating the model's parameters.
    """
    steps = 0
    min_loss = 10000
    best_auc = 0
    trigger = 0
    patience = 5  # early-stopping tolerance, in epochs

    steps_per_epoch = len(trainloader)

    for epoch in range(epochs):
        model.train()
        train_loss = 0
        for images, labels in tqdm(trainloader):  # one mini-batch per iteration
            steps += 1
            # 1. Prepare the input batch
            images = images.to(device)
            labels = labels.to(device)

            # 2. Forward pass
            outputs = model(images)
            loss = criterion(outputs, labels)

            # 3. Backward pass (clear stale gradients first)
            optimizer.zero_grad()
            loss.backward()

            # 4. Parameter update, then advance the per-batch LR schedule
            optimizer.step()
            scheduler.step()

            train_loss += loss.item()

            if (steps % steps_per_epoch) == 0:  # true on the last batch of every epoch
                # Batch norm / dropout behave differently at evaluation time.
                model.eval()
                valid_loss, valid_auc, valid_accuracy = validation(model, validloader, criterion)

                # Register log events for TensorBoard
                writer.add_scalar("Loss/train", train_loss/len(trainloader), epoch)
                writer.add_scalar("Loss/valid", valid_loss/len(validloader), epoch)
                writer.add_scalars("Loss/train and valid",
                                  {'train' : train_loss/len(trainloader),
                                  'valid' : valid_loss/len(validloader)}, epoch)
                writer.add_scalar("Valid AUC", valid_auc, epoch)
                writer.add_scalar("Valid ACC", valid_accuracy, epoch)

                print('Epoch : {}/{}.....'.format(epoch+1, epochs),
                      'Train Loss : {:.3f}'.format(train_loss/len(trainloader)),
                      'Valid Loss : {:.3f}'.format(valid_loss/len(validloader)),
                      'Valid AUC : {:.3f}'.format(valid_auc),
                      'Valid Accuracy : {:.3f}'.format(valid_accuracy))

                # Checkpoint whenever validation AUC reaches a new maximum
                if valid_auc > best_auc:
                    best_auc = valid_auc
                    torch.save(model.state_dict(), 'best_checkpoint.pth')

                # Early stopping on the summed validation loss
                if valid_loss > min_loss:
                    trigger += 1  # one more epoch without improvement
                    print('trigger : ', trigger )
                    if trigger > patience:
                        print('Early Stopping!!!')
                        print('Traning step is finished!!')
                        writer.flush()
                        return
                else:
                    trigger = 0
                    min_loss = valid_loss

                # Reset the running loss and go back to training mode
                train_loss = 0
                model.train()

    writer.flush()

훈련 및 성능 검증

In [None]:
torch.cuda.empty_cache()

In [None]:
epochs = 20
train(model, epochs, criterion, optimizer)

In [None]:
%reload_ext tensorboard

In [None]:
%tensorboard --logdir=runs

In [None]:
dataset_test = ImageDataset(test, img_dir=img_dir, transform=transform_test, is_test=True)

loader_test = DataLoader(dataset_test, batch_size=batch_size, shuffle=False, worker_init_fn=seed_worker, generator=g, num_workers=0)

dataset_TTA = ImageDataset(test, img_dir=img_dir, transform=transform_train, is_test=True)

loader_TTA = DataLoader(dataset_TTA, batch_size=batch_size, shuffle=False, worker_init_fn=seed_worker, generator=g, num_workers=0)

예측

- 테스트 데이터 원본으로 예측한 타깃값

In [None]:
model.eval()  # evaluation mode: no dropout, fixed batch-norm statistics

preds_test = np.zeros((len(test), 4))  # class probabilities for the un-augmented test images

row_start = 0
with torch.no_grad():
    # Iterate the loader built for this section (loader_test) rather than the
    # older `testloader` left over from the baseline section.
    for images in loader_test:
        row_stop = row_start + images.size(0)
        outputs = model(images.to(device))

        # Per-class probabilities for this batch, written to the next rows
        preds_test[row_start:row_stop] += torch.softmax(outputs.cpu(), dim=1).numpy()
        row_start = row_stop

In [None]:
def apply_label_smoothing(df, target, alpha, threshold):
    """Return ``df[target]`` with over-confident rows pulled toward uniform.

    A row is smoothed only when at least one of its target values exceeds
    ``threshold``; smoothed values are ``(1 - alpha) * p + alpha / k`` with
    ``k = len(target)``. Other rows are returned unchanged.

    Args:
        df: DataFrame containing the ``target`` probability columns.
        target: List of target column names.
        alpha: Smoothing strength in [0, 1].
        threshold: Probability above which a row counts as over-confident.

    Returns:
        A new DataFrame holding only the ``target`` columns.
    """
    df_target = df[target].copy()
    k = len(target)
    confident = (df_target > threshold).any(axis=1)
    df_target.loc[confident] = (1 - alpha) * df_target.loc[confident] + alpha / k
    return df_target


target = ['healthy', 'multiple_diseases', 'rust', 'scab']

# Test-time augmentation: average the predictions over several randomly
# augmented passes of the test set. The number of passes is a tunable choice.
num_tta = 7

model.eval()
preds_tta = np.zeros((len(test), len(target)))
with torch.no_grad():
    for _ in range(num_tta):
        row_start = 0
        for images in loader_TTA:
            row_stop = row_start + images.size(0)
            outputs = model(images.to(device))
            preds_tta[row_start:row_stop] += torch.softmax(outputs.cpu(), dim=1).numpy()
            row_start = row_stop
preds_tta /= num_tta

# Submission frames for the plain and the TTA predictions. These were
# referenced below but never created, which raised NameError on a fresh run.
submission_test = submission.copy()
submission_test[target] = preds_test
submission_tta = submission.copy()
submission_tta[target] = preds_tta

alpha = 0.001
threshold = 0.999

submission_test_ls = submission_test.copy()
submission_tta_ls = submission_tta.copy()

submission_test_ls[target] = apply_label_smoothing(submission_test_ls, target, alpha, threshold)

submission_tta_ls[target] = apply_label_smoothing(submission_tta_ls, target, alpha, threshold)

submission_test_ls.to_csv('submission_test_ls.csv', index=False)
submission_tta_ls.to_csv('submission_tta_ls.csv', index=False)