### Dataset & DataLoader 살펴보기
- Pytorch에서 배치크기만큼 데이터를 조절하기 위한 메카니즘
- Dataset : 사용 데이터를 기반으로 사용자정의 클래스 작성
- DataLoader : 지정된 Dataset에서 지정된 batch size만큼 피쳐와 타겟을 추출하여 전달

[1] 모듈 로딩 및 데이터 준비

In [1]:
### ===> 모듈 로딩
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import numpy as np
import pandas as pd

In [2]:
### ===> 데이터 준비
x_data = torch.IntTensor([[10,20,30], [20,30,40], [30,40,50], [40,50,60], [50,60,70]])
y_data = torch.FloatTensor([[20], [30], [40], [50], [60]])

print(f'x_data => {x_data.shape} {x_data.ndim}D')
print(f'y_data => {y_data.shape} {y_data.ndim}D')

x_data => torch.Size([5, 3]) 2D
y_data => torch.Size([5, 1]) 2D


[2] 데이터셋 생성

- [2-1] TensorDataset 활용 : Dataset의 sub_class

In [3]:
# TensorDataset 클래스 로딩
from torch.utils.data import TensorDataset

In [4]:
dataset = TensorDataset(x_data, y_data)
dataset

<torch.utils.data.dataset.TensorDataset at 0x15603931c40>

In [5]:
# __getitems__() 호출
dataset[0]

(tensor([10, 20, 30], dtype=torch.int32), tensor([20.]))

In [6]:
len(dataset)

5

- [2-2] 사용자정의 데이터셋 생성

In [7]:
### 데이터 준비
filename = '../data/iris.csv'

In [8]:
irisDF = pd.read_csv(filename)
irisDF.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   sepal_length  150 non-null    float64
 1   sepal_width   150 non-null    float64
 2   petal_length  150 non-null    float64
 3   petal_width   150 non-null    float64
 4   species       150 non-null    object 
dtypes: float64(4), object(1)
memory usage: 6.0+ KB


In [9]:
### 사용자정의 DataSet 클래스
# - 데이터의 Tensor 변환
class DLDataset(Dataset):
    # 초기화 함수
    def __init__(self, x_data, y_data):
        super().__init__()
        
        # x, y 데이터 ==> ndarray
        x_data = x_data.values if isinstance(x_data, pd.DataFrame) else x_data
        y_data = y_data.values if isinstance(y_data, pd.DataFrame) else y_data
        
        # ndarray ==> tensor
        self.feature = torch.tensor(x_data)
        self.target = torch.tensor(y_data)
        
    # 데이터셋의 갯수 체크 함수
    def __len__(self):
        return len(self.target)
    
    # 특정 인덱스 데이터+라벨 반환 콜백 함수(callback function)
    def __getitem__(self, index):
        return self.feature[index], self.target[index]
    
    

In [10]:
if irisDF.__class__.__name__ == 'DataFrame':
    print('DF')

DF


In [11]:
isinstance(irisDF, pd.DataFrame), isinstance({'A':22}, list)

(True, False)

In [12]:
irisDF[irisDF.columns[-1]].unique()

array(['setosa', 'versicolor', 'virginica'], dtype=object)

In [13]:
# 피쳐와 라벨로 분리
featureDF = irisDF[irisDF.columns[:-1]]
targetDF = irisDF[[irisDF.columns[-1]]].replace({'setosa':0, 'versicolor':1, 'virginica':2})

print(f'featureDF => {featureDF.shape}, {featureDF.ndim}D')
print(f'targetDF => {targetDF.shape}, {targetDF.ndim}D')

featureDF => (150, 4), 2D
targetDF => (150, 1), 2D


In [14]:
# 데이터셋 생성
my_dataset = DLDataset(featureDF.values, targetDF.values)

In [15]:
my_dataset = DLDataset(featureDF, targetDF.values)

In [16]:
my_dataset

<__main__.DLDataset at 0x1560495c4f0>

- [2-3] 학습용, 검증용, 테스트용 Dataset

In [17]:
### ===> 파이토치
from torch.utils.data import random_split

# 학습용, 검증용, 테스트 데이터 비율
seed = torch.Generator().manual_seed(42)

trainDS, validDS, testDS = random_split(my_dataset, [0.7, 0.1, 0.2], generator=seed)

print(f'trainDS => {len(trainDS)}개, validDS => {len(validDS)}개, testDS => {len(testDS)}개')

print(f'Subset 속성=>\nindices : {trainDS.indices} \ndataset : {trainDS.dataset}')
print(f'Subset 속성=>\nindices : {validDS.indices} \ndataset : {validDS.dataset}')

trainDS => 105개, validDS => 15개, testDS => 30개
Subset 속성=>
indices : [42, 95, 30, 64, 52, 35, 130, 40, 82, 17, 108, 94, 68, 97, 117, 127, 41, 44, 57, 140, 149, 32, 23, 102, 16, 113, 71, 18, 67, 66, 0, 25, 101, 112, 91, 3, 59, 116, 86, 84, 106, 142, 43, 39, 26, 98, 93, 20, 87, 19, 120, 114, 7, 63, 76, 89, 36, 45, 37, 56, 58, 122, 51, 145, 24, 21, 105, 62, 15, 11, 48, 133, 88, 50, 6, 134, 111, 8, 49, 75, 69, 124, 4, 147, 80, 100, 99, 141, 47, 107, 13, 109, 129, 28, 38, 53, 121, 5, 55, 31, 73, 74, 54, 29, 12] 
dataset : <__main__.DLDataset object at 0x000001560495C4F0>
Subset 속성=>
indices : [22, 104, 81, 1, 103, 125, 85, 2, 96, 128, 27, 118, 77, 110, 146] 
dataset : <__main__.DLDataset object at 0x000001560495C4F0>


[3] DataLoader 생성 : 학습용, 검증용, 테스트용 

In [18]:
# DataLoader 생성
# drop_last 매개변수 : 배치사이즈로 데이터셋 분리 후 남는 데이터 처리 방법 설정 [기본: False]
batch = 5
trainDL = DataLoader(trainDS, batch_size=batch)
validDL = DataLoader(validDS, batch_size=batch)
testDL = DataLoader(testDS, batch_size=batch)

len(trainDL), len(validDL), len(testDL)

(21, 3, 6)

In [19]:
# Epoch당 반복 단위
print('batch_size :', batch)
print(f'trainDS => {len(trainDS)}개, validDS => {len(validDS)}개, testDS => {len(testDS)}개')
print(f'trainDL => {len(trainDL)}개, validDS => {len(validDL)}개, testDS => {len(testDL)}개')

batch_size : 5
trainDS => 105개, validDS => 15개, testDS => 30개
trainDL => 21개, validDS => 3개, testDS => 6개


In [20]:
# DataLoader 속성
for _, (feature, target) in enumerate(trainDL):
    print(f'[{_}] feature {feature.shape}')

[0] feature torch.Size([5, 4])
[1] feature torch.Size([5, 4])
[2] feature torch.Size([5, 4])
[3] feature torch.Size([5, 4])
[4] feature torch.Size([5, 4])
[5] feature torch.Size([5, 4])
[6] feature torch.Size([5, 4])
[7] feature torch.Size([5, 4])
[8] feature torch.Size([5, 4])
[9] feature torch.Size([5, 4])
[10] feature torch.Size([5, 4])
[11] feature torch.Size([5, 4])
[12] feature torch.Size([5, 4])
[13] feature torch.Size([5, 4])
[14] feature torch.Size([5, 4])
[15] feature torch.Size([5, 4])
[16] feature torch.Size([5, 4])
[17] feature torch.Size([5, 4])
[18] feature torch.Size([5, 4])
[19] feature torch.Size([5, 4])
[20] feature torch.Size([5, 4])


[4] Model 클래스 정의 : 입/출력 피쳐수, 층 수, 은닉층의 노드수
- 구조 설계
    * 입력층 : 입력 <= 피쳐 갯수, iris 4개
    * 은닉층 : 마음대로
    * 출력층 : 출력 <= [분류] 타겟 클래스 갯수 [회귀] 1개

In [21]:
class IrisModel(nn.Module):
    def __init__(self, idim, odim):
        super().__init__()
        self.seq = nn.Sequential(
            nn.Linear(idim, 8),
            nn.ReLU(),
            nn.Linear(8, 4),
            nn.Linear(4, 2),
            nn.ReLU(),
            nn.Linear(2, odim)
        )
        
    def forward(self, x):
        return self.seq(x)
    

In [22]:
model = IrisModel(4, 3)

In [23]:
from torch import optim

optim = optim.Adam(model.parameters(), lr=0.01)

In [24]:
for i in trainDL:
    print(i[0]), print(i[1])

tensor([[4.4000, 3.2000, 1.3000, 0.2000],
        [5.7000, 3.0000, 4.2000, 1.2000],
        [4.8000, 3.1000, 1.6000, 0.2000],
        [5.6000, 2.9000, 3.6000, 1.3000],
        [6.9000, 3.1000, 4.9000, 1.5000]], dtype=torch.float64)
tensor([[0],
        [1],
        [0],
        [1],
        [1]])
tensor([[5.0000, 3.2000, 1.2000, 0.2000],
        [7.4000, 2.8000, 6.1000, 1.9000],
        [5.0000, 3.5000, 1.3000, 0.3000],
        [5.8000, 2.7000, 3.9000, 1.2000],
        [5.1000, 3.5000, 1.4000, 0.3000]], dtype=torch.float64)
tensor([[0],
        [2],
        [0],
        [1],
        [0]])
tensor([[6.7000, 2.5000, 5.8000, 1.8000],
        [5.6000, 2.7000, 4.2000, 1.3000],
        [6.2000, 2.2000, 4.5000, 1.5000],
        [6.2000, 2.9000, 4.3000, 1.3000],
        [7.7000, 3.8000, 6.7000, 2.2000]], dtype=torch.float64)
tensor([[2],
        [1],
        [1],
        [1],
        [2]])
tensor([[6.1000, 3.0000, 4.9000, 1.8000],
        [4.5000, 2.3000, 1.3000, 0.3000],
        [5.1000, 3.800

In [25]:
target.squeeze()

tensor([1, 1, 1, 0, 0])

In [26]:
for i in validDL:
    print(i[0]), print(i[1])

tensor([[4.6000, 3.6000, 1.0000, 0.2000],
        [6.5000, 3.0000, 5.8000, 2.2000],
        [5.5000, 2.4000, 3.7000, 1.0000],
        [4.9000, 3.0000, 1.4000, 0.2000],
        [6.3000, 2.9000, 5.6000, 1.8000]], dtype=torch.float64)
tensor([[0],
        [2],
        [1],
        [0],
        [2]])
tensor([[7.2000, 3.2000, 6.0000, 1.8000],
        [6.0000, 3.4000, 4.5000, 1.6000],
        [4.7000, 3.2000, 1.3000, 0.2000],
        [5.7000, 2.9000, 4.2000, 1.3000],
        [6.4000, 2.8000, 5.6000, 2.1000]], dtype=torch.float64)
tensor([[2],
        [1],
        [0],
        [1],
        [2]])
tensor([[5.2000, 3.5000, 1.5000, 0.2000],
        [7.7000, 2.6000, 6.9000, 2.3000],
        [6.7000, 3.0000, 5.0000, 1.7000],
        [6.5000, 3.2000, 5.1000, 2.0000],
        [6.3000, 2.5000, 5.0000, 1.9000]], dtype=torch.float64)
tensor([[0],
        [2],
        [1],
        [2],
        [2]])


In [27]:
max_epoch = 1000
for e in range(1, max_epoch+1):
    for feature, target in trainDL:
        model.train()
        h = model(feature.float())
        
        cost = nn.functional.cross_entropy(h, target.squeeze())
        
        optim.zero_grad()
        cost.backward()
        optim.step()
    
    print(f'Epoch [{e:2} / {max_epoch}] ---')    
    model.eval()
    for i, (feature, target) in enumerate(validDL):
        v = model(feature.float())
        print(f'validset [{i}] cost : {cost.item():.6} accuracy : {(sum(v.argmax(dim=1) == target.squeeze(dim=1)) / len(v) * 100)}%')

Epoch [ 1 / 1000] ---
validset [0] cost : 1.26282 accuracy : 40.0%
validset [1] cost : 1.26282 accuracy : 40.0%
validset [2] cost : 1.26282 accuracy : 60.000003814697266%
Epoch [ 2 / 1000] ---
validset [0] cost : 1.18205 accuracy : 40.0%
validset [1] cost : 1.18205 accuracy : 40.0%
validset [2] cost : 1.18205 accuracy : 60.000003814697266%
Epoch [ 3 / 1000] ---
validset [0] cost : 1.1287 accuracy : 40.0%
validset [1] cost : 1.1287 accuracy : 40.0%
validset [2] cost : 1.1287 accuracy : 60.000003814697266%
Epoch [ 4 / 1000] ---
validset [0] cost : 1.09456 accuracy : 20.0%
validset [1] cost : 1.09456 accuracy : 40.0%
validset [2] cost : 1.09456 accuracy : 20.0%
Epoch [ 5 / 1000] ---
validset [0] cost : 1.07271 accuracy : 40.0%
validset [1] cost : 1.07271 accuracy : 20.0%
validset [2] cost : 1.07271 accuracy : 20.0%
Epoch [ 6 / 1000] ---
validset [0] cost : 1.05853 accuracy : 40.0%
validset [1] cost : 1.05853 accuracy : 20.0%
validset [2] cost : 1.05853 accuracy : 20.0%
Epoch [ 7 / 1000] -

In [28]:
import torchmetrics.functional as metrics

In [29]:
model.eval()
for i, (feature, target) in enumerate(testDL):
    v = model(feature.float())
    print(f'testset [{i}] accuracy : {metrics.accuracy(v.argmax(dim=1), target.squeeze(dim=1), task="multiclass", num_classes=3)}\n'
          f'              precision: {metrics.precision(v.argmax(dim=1), target.squeeze(dim=1), task="multiclass", num_classes=3)}\n'
          f'              recall   : {metrics.recall(v.argmax(dim=1), target.squeeze(dim=1), task="multiclass", num_classes=3)}\n'
          f'              f1-score : {metrics.f1_score(v.argmax(dim=1), target.squeeze(dim=1), task="multiclass", num_classes=3)}')

testset [0] accuracy : 0.0
              precision: 0.0
              recall   : 0.0
              f1-score : 0.0
testset [1] accuracy : 0.4000000059604645
              precision: 0.4000000059604645
              recall   : 0.4000000059604645
              f1-score : 0.4000000059604645
testset [2] accuracy : 0.4000000059604645
              precision: 0.4000000059604645
              recall   : 0.4000000059604645
              f1-score : 0.4000000059604645
testset [3] accuracy : 0.0
              precision: 0.0
              recall   : 0.0
              f1-score : 0.0
testset [4] accuracy : 0.0
              precision: 0.0
              recall   : 0.0
              f1-score : 0.0
testset [5] accuracy : 0.4000000059604645
              precision: 0.4000000059604645
              recall   : 0.4000000059604645
              f1-score : 0.4000000059604645
