# 41_데이터 불러오기 / 파일 정리

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
!unzip -qq '/content/drive/MyDrive/Colab Notebooks/DL/41_dataset.zip' -d './dataset'

--------------------

In [4]:
import os

original_dataset_dir = './dataset'
classes_list = os.listdir(original_dataset_dir)


# colab 재가동시에 이 폴더는 사라질 것이다.
base_dir = './splitted'
    # train, test, validate 폴더로 나누기 위해 지정
os.mkdir(base_dir)
    # os.mkdir : 새로운 폴더를 만든다.

In [5]:
import shutil

train_dir = os.path.join(base_dir, 'train')
os.mkdir(train_dir)

validation_dir = os.path.join(base_dir, 'val')
os.mkdir(validation_dir)

test_dir = os.path.join(base_dir, 'test')
os.mkdir(test_dir)

for cls in classes_list:        # 폴더 명을 통일시킴.
    os.mkdir(os.path.join(train_dir, cls))
    os.mkdir(os.path.join(validation_dir, cls))
    os.mkdir(os.path.join(test_dir, cls))

# 데이터 현황 확인

In [7]:
import math

for cls in classes_list:
    path = os.path.join(original_dataset_dir, cls)
    fnames = os.listdir(path)

    train_size = math.floor(len(fnames) * 0.6)
                                            # 60% 로 분할
                        # floor : 정수형, 내림
    validation_size = math.floor(len(fnames) * 0.2)
                                            # 20% 로 분할
    test_size = math.floor(len(fnames) * 0.2)
                                            # 20% 로 분할

    train_fnames = fnames[:train_size]
    print('Train size(', cls, ') : ',len(train_fnames))
    
    # 파일 복사하기
    for fname in train_fnames:
        src = os.path.join(path, fname)
        dst = os.path.join(os.path.join(train_dir, cls), fname)
        shutil.copyfile(src, dst)

    validation_fnames = fnames[train_size:(validation_size + train_size)]
    print('Validation size(',cls,'): ', len(validation_fnames))
    for fname in validation_fnames:
        src = os.path.join(path, fname)
        dst = os.path.join(os.path.join(validation_dir, cls), fname)
        shutil.copyfile(src, dst)

    test_fnames = fnames[(train_size+validation_size):(validation_size + train_size + test_size)]

    print('Test size(',cls,'): ', len(test_fnames))
    for fname in test_fnames:
        src = os.path.join(path, fname)
        dst = os.path.join(os.path.join(test_dir, cls), fname)
        shutil.copyfile(src, dst)

Train size( Pepper,_bell___healthy ) :  886
Validation size( Pepper,_bell___healthy ):  295
Test size( Pepper,_bell___healthy ):  295
Train size( Potato___Early_blight ) :  600
Validation size( Potato___Early_blight ):  200
Test size( Potato___Early_blight ):  200
Train size( Grape___Black_rot ) :  708
Validation size( Grape___Black_rot ):  236
Test size( Grape___Black_rot ):  236
Train size( Tomato___Spider_mites Two-spotted_spider_mite ) :  1005
Validation size( Tomato___Spider_mites Two-spotted_spider_mite ):  335
Test size( Tomato___Spider_mites Two-spotted_spider_mite ):  335
Train size( Tomato___Late_blight ) :  1145
Validation size( Tomato___Late_blight ):  381
Test size( Tomato___Late_blight ):  381
Train size( Peach___healthy ) :  216
Validation size( Peach___healthy ):  72
Test size( Peach___healthy ):  72
Train size( Pepper,_bell___Bacterial_spot ) :  598
Validation size( Pepper,_bell___Bacterial_spot ):  199
Test size( Pepper,_bell___Bacterial_spot ):  199
Train size( Tomat

--------------------------

# 42_식물잎의 사진으로 질병분류 - 학습하기

In [8]:
import torch
import os

USE_CUDA = torch.cuda.is_available()
DEVICE = torch.device('cuda' if USE_CUDA else 'cpu')
BATCH_SIZE = 256
        # batch_size는 그래픽 메모리의 최대에 도달하면
        # 더 올라갈 수 없음.
EPOCH = 30

In [9]:
import torchvision.transforms as transforms
from torchvision.datasets import ImageFolder
                                # ImageFolder : 폴더 이름을 라벨로 설정하는 것.

transform_base = transforms.Compose([transforms.Resize((64, 64)), transforms.ToTensor()])
                            # transform.Compose : 입력받는 사진을 변환한다.
                                                # Resize : 사진 크기를 변환
                                                                            # ToTensor() : 텐서로 변환
train_dataset = ImageFolder(root='./splitted/train', transform=transform_base)
val_dataset = ImageFolder(root='./splitted/val', transform=transform_base)

In [11]:
from torch.utils.data import DataLoader
                                # DataLoader : Combines a dataset and a sampler, and provides an iterable over the given dataset.

train_loader = torch.utils.data.DataLoader(train_dataset,
                                           batch_size = BATCH_SIZE,
                                           shuffle=True,
                                           num_workers=4)

val_loader = torch.utils.data.DataLoader(val_dataset,
                                         batch_size=BATCH_SIZE,
                                         shuffle=True,
                                         num_workers=4)

  cpuset_checked))


In [17]:
# 모델 구조

import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

class Net(nn.Module):

    def __init__(self):

        super(Net, self).__init__()

        self.conv1 = nn.Conv2d(3, 32, 3, padding=1)
        self.pool = nn.MaxPool2d(2, 2)
        self.conv2 = nn.Conv2d(32, 64, 3, padding = 1)
        self.conv3 = nn.Conv2d(64, 64, 3, padding = 1)

        self.fc1 = nn.Linear(4096, 512)
            # fc : fully connected layer
        self.fc2 = nn.Linear(512, 33)

    def forward(self, x):
        x = self.conv1(x)
        x = F.relu(x)
        x = self.pool(x)
        x = F.dropout(x, p=0.25, training=self.training)
                                # training : 훈련할 때는 사용하고, evaluation (평가) 할때는 사용하지 않겠다고 설정

        x = self.conv2(x)
        x = F.relu(x)
        x = self.pool(x)
        x = F.dropout(x, p=0.25, training=self.training)

        x = self.conv3(x)
        x = F.relu(x)
        x = self.pool(x)
        x = F.dropout(x, p=0.25, training=self.training)

        x = x.view(-1, 4096)        # flatten 의 과정
        x = self.fc1(x)
        x = F.relu(x)
        x = F.dropout(x, p=0.5, training = self.training)
        x = self.fc2(x)

        return F.log_softmax(x, dim=1)

model_base = Net().to(DEVICE)
optimizer = optim.Adam(model_base.parameters(), lr=0.001)

In [18]:
def train(model, train_loader, optimizer):
    model.train()
    for batch_idx, (data, target) in enumerate(train_loader):
        data, target = data.to(DEVICE), target.to(DEVICE)
        optimizer.zero_grad()       # weight 초기화
        output = model(data)
        loss = F.cross_entropy(output, target)
        loss.backward()     # backpropogation
        optimizer.step()        # weight update

In [19]:
def evaluate(model, test_loader):
    model.eval()
    test_loss=0
    correct = 0

    with torch.no_grad():           # with 구문 : https://pythondocs.net/uncategorized/%ED%8C%8C%EC%9D%B4%EC%8D%AC-with-%EC%A0%95%EB%A6%AC/
        # gradient 가 없는지 확인하는 과정  
        for data, target in test_loader:
            data, target = data.to(DEVICE), target.to(DEVICE)
            output = model(data)

            test_loss += F.cross_entropy(output,target, reduction='sum').item()

            pred = output.max(1, keepdim=True)[1]
            correct += pred.eq(target.view_as(pred)).sum().item()

    test_loss /= len(test_loader.dataset)
    test_accuracy = 100. * correct / len(test_loader.dataset)
    return test_loss, test_accuracy

In [21]:
import time
import copy

def train_baseline(model, train_loader, val_loader, optimizer, num_epochs = 30):
    best_acc = 0.0
    best_model_wts = copy.deepcopy(model.state_dict())
        # 정확도가 가장 높은 모델의 Weight를 저장할 것임

    for epoch in range(1, num_epochs + 1):
        since = time.time()
        train(model, train_loader, optimizer)
        train_loss, train_acc = evaluate(model, train_loader)
        val_loss, val_acc = evaluate(model, val_loader)

        if val_acc > best_acc:
            best_acc = val_acc
            best_model_wts = copy.deepcopy(model.state_dict())
                                    # 당 모델의 acc 가 높으면 model.state_dict()의 정보를 딥카피 한다.

        time_elapsed = time.time() - since
        print('------------- epoch {} ---------------'.format(epoch))
        print('train Loss : {:.4f}, Accuracy : {:.2f}%'.format(train_loss, train_acc))
        print('val Loss: {:.4f}, Accuracy : {:.2f}%'.format(val_loss, val_acc))
        print('Completed in {:.0f}m {:.0f}s'.format(time_elapsed // 60, time_elapsed % 60))
    model.load_state_dict(best_model_wts)
                    # 딥카피 된 model.state_dict()를 불러온다
    return model

base = train_baseline(model_base, train_loader, val_loader, optimizer, EPOCH)
torch.save(base, 'baseline.pt')

  cpuset_checked))


------------- epoch 1 ---------------
train Loss : 1.1003, Accuracy : 65.93%
val Loss: 1.1374, Accuracy : 64.81%
Completed in 1m 14s
------------- epoch 2 ---------------
train Loss : 0.8629, Accuracy : 74.44%
val Loss: 0.9171, Accuracy : 72.35%
Completed in 1m 16s
------------- epoch 3 ---------------
train Loss : 0.6618, Accuracy : 79.64%
val Loss: 0.7295, Accuracy : 77.77%
Completed in 1m 15s
------------- epoch 4 ---------------
train Loss : 0.5277, Accuracy : 83.62%
val Loss: 0.6104, Accuracy : 80.95%
Completed in 1m 16s
------------- epoch 5 ---------------
train Loss : 0.4577, Accuracy : 85.50%
val Loss: 0.5479, Accuracy : 82.36%
Completed in 1m 14s
------------- epoch 6 ---------------
train Loss : 0.3821, Accuracy : 88.40%
val Loss: 0.4744, Accuracy : 85.19%
Completed in 1m 14s
------------- epoch 7 ---------------
train Loss : 0.3430, Accuracy : 89.49%
val Loss: 0.4419, Accuracy : 85.99%
Completed in 1m 16s
------------- epoch 8 ---------------
train Loss : 0.2885, Accuracy :

-------------------

# 43_전이학습 개념 - 식물잎의 사진으로 질병분류

In [None]:
# 전이학습 transfer learning

# 44_전이학습 모델불러오기 - 식물잎의 사진으로 질병분류

In [22]:
data_transforms = {
    'train' : transforms.Compose([transforms.Resize([64, 64]), 
                                  transforms.RandomHorizontalFlip(),        # 기하학적 증강
                                  transforms.RandomVerticalFlip(),          # 기하학적 증강 (augmentation) >> 과적합을 고려하여 상하좌우로 무작위로 뒤집는 것.
                                  transforms.RandomCrop(52),
                                                        # 52 * 52 사이즈로 크롭
                                  transforms.ToTensor(),
                                  transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]) ]),
                                                        # R  ,  G ,   B
                                                        # 평균
                                                                                # 표준편차
                   
    'val' : transforms.Compose([transforms.Resize([64, 64]),
                                transforms.RandomCrop(52),
                                transforms.ToTensor(),
                                transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]) ])
}

In [24]:
data_dir = './splitted'
image_datasets = {x: ImageFolder(root = os.path.join(data_dir, x),
                                 transform = data_transforms[x]) for x in ['train', 'val']}
dataloaders = {x : torch.utils.data.DataLoader(image_datasets[x],
                                               batch_size=BATCH_SIZE,
                                               shuffle=True,
                                               num_workers=4) for x in ['train', 'val']}
dataset_sizes = {x: len(image_datasets[x]) for x in ['train', 'val']}

class_names = image_datasets['train'].classes

  cpuset_checked))


## pre-trained model load

In [25]:
from torchvision import models

resnet = models.resnet50(pretrained=True)
                        # resnet50 이 가지는 클래스 숫자가 현재 모델의 출력 숫자와 다르기 때문에,
                        # 마지막 레이어의 출력 숫자를 바꿔줘야 한다.
num_ftrs = resnet.fc.in_features
                    # 마지막 레이어 지정
resnet.fc = nn.Linear(num_ftrs, 33)
                                # 33개 출력으로 변경
                                
resnet = resnet.to(DEVICE)

criterion = nn.CrossEntropyLoss()
optimizer_ft = optim.Adam(filter(lambda p: p.requires_grad, resnet.parameters()), lr=0.001)
                                            # 맨 마지막 출력단은 weight가 없기 때문에 해당 옵션을 올려줘야 한다.

from torch.optim import lr_scheduler
exp_lr_scheduler = lr_scheduler.StepLR(optimizer_ft, step_size=7, gamma=0.1)
                                                    # 7 Epochs 마다 Learning Rate를 0.1 감소시킨다.

Downloading: "https://download.pytorch.org/models/resnet50-0676ba61.pth" to /root/.cache/torch/hub/checkpoints/resnet50-0676ba61.pth


  0%|          | 0.00/97.8M [00:00<?, ?B/s]

----------------

# 45_전이학습 모델 수정 후 학습시키기 - 식물잎의 사진으로 질병 분류

In [26]:
ct = 0
for child in resnet.children():
    ct += 1
    if ct < 6:
        for param in child.parameters():
            param.requires_grad =False
                        # 0 ~ 5번 레이어를 Freeze 함 (고정)
    # 6 ~ 9번 레이어까지는 학습을 진행함!

In [27]:
def train_resnet(model, criterion, optimizer, scheduler, num_epochs = 25):

    best_model_wts = copy.deepcopy(model.state_dict())
    best_acc = 0.0

    for epoch in range(num_epochs):
        print('--------------- epoch {} ----------------'.format(epoch+1))
        since = time.time()
        for phase in ['train', 'val']:
            if phase == 'train':
                model.train()
            else:
                model.eval()

            running_loss = 0.0
            running_corrects = 0


            for inputs, labels in dataloaders[phase]:
                inputs = inputs.to(DEVICE)
                labels = labels.to(DEVICE)

                optimizer.zero_grad()

                with torch.set_grad_enabled(phase == 'train'):      # 만약 train이면 gradient를 적용하도록 설정한다.
                    outputs = model(inputs)
                    _, preds = torch.max(outputs, 1)
                                        # one hot encoding 형태로 뽑힘
                                        # 출력 33개를 그대로 가져와서
                    loss = criterion(outputs, labels)

                    if phase == 'train' :
                        loss.backward() # backpropogation
                        optimizer.step()    # weight update

                running_loss += loss.item() * inputs.size(0)
                                                # inputs.size(0) : 설정된 배치사이즈를 잡아주는 것.
                running_corrects += torch.sum(preds == labels.data)
            if phase == 'train':
                scheduler.step()        # scheduler 를 통해 lr 도 업데이트 시켜줌

            epoch_loss = running_loss / dataset_sizes[phase]
            epoch_acc = running_corrects.double() / dataset_sizes[phase]

            print('{} Loss : {:.4f} ACC : {:.4f}'.format(phase, epoch_loss, epoch_acc))


            if phase == 'val' and epoch_acc > best_acc:
                best_acc = epoch_acc
                best_model_wts = copy.deepcopy(model.state_dict())

        time_elapsed = time.time() - since
        print('Completed in {:.0f}m {:.0f}s'.format(time_elapsed // 60, time_elapsed % 60))
    print('Best val Acc : {:4f}'.format(best_acc))

    model.load_state_dict(best_model_wts)

    return model

In [28]:
model_resnet50 = train_resnet(resnet, criterion, optimizer_ft, 
                              exp_lr_scheduler, num_epochs = EPOCH)

torch.save(model_resnet50, 'resnet50.pt')

--------------- epoch 1 ----------------


  cpuset_checked))


train Loss : 0.6102 ACC : 0.8155
val Loss : 0.2746 ACC : 0.9096
Completed in 0m 54s
--------------- epoch 2 ----------------
train Loss : 0.2166 ACC : 0.9294
val Loss : 0.2271 ACC : 0.9304
Completed in 0m 52s
--------------- epoch 3 ----------------
train Loss : 0.1737 ACC : 0.9454
val Loss : 0.1881 ACC : 0.9404
Completed in 0m 52s
--------------- epoch 4 ----------------
train Loss : 0.1353 ACC : 0.9566
val Loss : 0.1568 ACC : 0.9492
Completed in 0m 52s
--------------- epoch 5 ----------------
train Loss : 0.1137 ACC : 0.9621
val Loss : 0.1410 ACC : 0.9547
Completed in 0m 53s
--------------- epoch 6 ----------------
train Loss : 0.0919 ACC : 0.9692
val Loss : 0.1600 ACC : 0.9531
Completed in 0m 52s
--------------- epoch 7 ----------------
train Loss : 0.0819 ACC : 0.9738
val Loss : 0.1018 ACC : 0.9682
Completed in 0m 52s
--------------- epoch 8 ----------------
train Loss : 0.0446 ACC : 0.9854
val Loss : 0.0578 ACC : 0.9815
Completed in 0m 52s
--------------- epoch 9 ----------------


-------------------------
# 46_전이학습 평가하기 - 식물잎의 사진으로 질병분류

In [29]:
transform_resNet = transforms.Compose([
                                       transforms.Resize([64, 64]),
                                       transforms.RandomCrop(52),
                                       transforms.ToTensor(),
                                       transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
])

test_resNet = ImageFolder(root='./splitted/test', transform = transform_resNet)
test_loader_resNet = torch.utils.data.DataLoader(test_resNet,
                                                 batch_size = BATCH_SIZE,
                                                 shuffle=True,
                                                 num_workers=4)

  cpuset_checked))


In [30]:
resnet50 = torch.load('resnet50.pt')
resnet50.eval()
test_loss, test_accuracy = evaluate(resnet50, test_loader_resNet)

print('ResNet test acc : ', test_accuracy)

  cpuset_checked))


ResNet test acc :  99.03617474026787
