# 머신 러닝 교과서 - 파이토치편

<table align="left"><tr><td>
<a href="https://colab.research.google.com/github/rickiepark/ml-with-pytorch/blob/main/ch13/ch13_part2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="코랩에서 실행하기"/></a>
</td></tr></table>

## 패키지 버전 체크

권장 패키지 버전을 확인하세요:

In [6]:
from python_environment_check import check_packages


# Recommended package versions for this chapter, keyed by import name.
d = dict(
    numpy='1.21.2',
    pandas='1.3.2',
    sklearn='1.0',
    torch='1.8',
    torchvision='0.9.0',
)
check_packages(d)

[OK] numpy 2.3.5
[OK] pandas 2.3.3
[OK] sklearn 1.8.0
[OK] torch 2.9.1+cpu
[OK] torchvision 0.24.1+cpu


torchvision은 PyTorch와 짝을 이루는 공식 패키지로, 다음 기능을 제공합니다:

- 이미지 데이터셋 (MNIST, CIFAR, ImageNet 등)
- 이미지 전처리 (transform)
- CNN 모델 (ResNet, VGG 등)

torchvision이 설치되어 있지 않다면 터미널을 열고(단축키 Ctrl+~) 다음 명령으로 설치하세요:

    python -m pip install torchvision


In [7]:
from python_environment_check import check_packages


# Same version check as in the earlier cell, run again
# (presumably after installing torchvision — confirm).
d = dict(
    numpy='1.21.2',
    pandas='1.3.2',
    sklearn='1.0',
    torch='1.8',
    torchvision='0.9.0',
)
check_packages(d)

[OK] numpy 2.3.5
[OK] pandas 2.3.3
[OK] sklearn 1.8.0
[OK] torch 2.9.1+cpu
[OK] torchvision 0.24.1+cpu


# 13장 - 파이토치 구조 자세히 알아보기 (파트 2/3)

**목차**

- 프로젝트 1 - 자동차 연비 예측하기
  - 특성 열 사용하기
  - DNN 회귀 모델 훈련하기
- 프로젝트 2 - MNIST 손글씨 숫자 분류하기

In [8]:
import numpy as np
import torch
import torch.nn as nn
import pandas as pd

from IPython.display import Image

## 13.6 프로젝트 1 - 자동차 연비 예측하기

### 13.6.1  특성 열 사용하기

In [9]:
# Display figure 13_07 from the book's repository; as the last expression of
# the cell, the Image object is rendered by the notebook.
Image(
    url='https://raw.githubusercontent.com/rickiepark/ml-with-pytorch/main/ch13/figures/13_07.png',
    width=700,
)

In [10]:
import pandas as pd

In [11]:
# Raw Auto MPG data file in the UCI machine learning repository.
url = 'http://archive.ics.uci.edu/ml/machine-learning-databases/auto-mpg/auto-mpg.data'

# Column names are passed explicitly through `names=`.
column_names = [
    'MPG',
    'Cylinders',
    'Displacement',
    'Horsepower',
    'Weight',
    'Acceleration',
    'Model Year',
    'Origin',
]

# Fields are separated by spaces (extra spaces after a delimiter are skipped),
# "?" marks a missing value, and everything after a tab on a line is treated
# as a comment (presumably the trailing car-name field — confirm against the
# data file).
df = pd.read_csv(
    url,
    sep=" ",
    skipinitialspace=True,
    comment='\t',
    na_values="?",
    names=column_names,
)

# Show the last five rows.
df.tail()

Unnamed: 0,MPG,Cylinders,Displacement,Horsepower,Weight,Acceleration,Model Year,Origin
393,27.0,4,140.0,86.0,2790.0,15.6,82,1
394,44.0,4,97.0,52.0,2130.0,24.6,82,2
395,32.0,4,135.0,84.0,2295.0,11.6,82,1
396,28.0,4,120.0,79.0,2625.0,18.6,82,1
397,31.0,4,119.0,82.0,2720.0,19.4,82,1


In [12]:
# Report how many missing values each column contains.
print(df.isna().sum())

# Drop every row that has at least one NaN (row-wise, axis=0), then renumber
# the index from 0 because dropna() leaves gaps in the original index.
df = df.dropna().reset_index(drop=True)

# Show the last five rows of the cleaned frame.
df.tail()

MPG             0
Cylinders       0
Displacement    0
Horsepower      6
Weight          0
Acceleration    0
Model Year      0
Origin          0
dtype: int64


Unnamed: 0,MPG,Cylinders,Displacement,Horsepower,Weight,Acceleration,Model Year,Origin
387,27.0,4,140.0,86.0,2790.0,15.6,82,1
388,44.0,4,97.0,52.0,2130.0,24.6,82,2
389,32.0,4,135.0,84.0,2295.0,11.6,82,1
390,28.0,4,120.0,79.0,2625.0,18.6,82,1
391,31.0,4,119.0,82.0,2720.0,19.4,82,1


In [13]:
import sklearn
import sklearn.model_selection


# Keep 80% of the rows for training and the rest for testing; random_state
# makes the split reproducible.
df_train, df_test = sklearn.model_selection.train_test_split(
    df,
    train_size=0.8,
    random_state=1,
)

# Summary statistics of the training split, transposed so that each original
# column becomes a row (count, mean, std, ...).
train_stats = df_train.describe().T
train_stats

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
MPG,313.0,23.404153,7.666909,9.0,17.5,23.0,29.0,46.6
Cylinders,313.0,5.402556,1.701506,3.0,4.0,4.0,8.0,8.0
Displacement,313.0,189.51278,102.675646,68.0,104.0,140.0,260.0,455.0
Horsepower,313.0,102.929712,37.919046,46.0,75.0,92.0,120.0,230.0
Weight,313.0,2961.198083,848.602146,1613.0,2219.0,2755.0,3574.0,5140.0
Acceleration,313.0,15.704473,2.725399,8.5,14.0,15.5,17.3,24.8
Model Year,313.0,75.929712,3.675305,70.0,73.0,76.0,79.0,82.0
Origin,313.0,1.591054,0.807923,1.0,1.0,1.0,2.0,3.0


In [14]:
numeric_column_names = ['Cylinders', 'Displacement', 'Horsepower', 'Weight', 'Acceleration']

# Work on float copies of the numeric columns. 'Cylinders' is loaded as an
# integer column, and writing z-scores into it in place through .loc makes
# pandas emit a "Setting an item of incompatible dtype" FutureWarning, which
# pandas announces will become an error. astype() returns a new frame, so
# df_train / df_test themselves are left untouched.
float_dtypes = {col_name: 'float64' for col_name in numeric_column_names}
df_train_norm = df_train.astype(float_dtypes)
df_test_norm = df_test.astype(float_dtypes)

# Standardize both splits with the *training* mean and standard deviation so
# that no test-set statistics are used during preprocessing.
for col_name in numeric_column_names:
    mean = train_stats.loc[col_name, 'mean']
    std = train_stats.loc[col_name, 'std']
    df_train_norm[col_name] = (df_train_norm[col_name] - mean) / std
    df_test_norm[col_name] = (df_test_norm[col_name] - mean) / std

df_train_norm.tail()

  0.3511267  -0.8243028  -0.8243028  -0.8243028   0.3511267  -0.8243028
  0.3511267   1.52655621  1.52655621  1.52655621  0.3511267   1.52655621
 -0.8243028   0.3511267   1.52655621 -0.8243028  -0.8243028   0.3511267
 -0.8243028  -0.8243028  -0.8243028   0.3511267  -0.8243028   1.52655621
  0.3511267  -0.8243028   0.3511267  -0.8243028  -0.8243028   1.52655621
 -0.8243028   1.52655621  1.52655621 -0.8243028  -0.8243028  -0.8243028
 -0.8243028   0.3511267  -0.8243028   1.52655621 -0.8243028  -0.8243028
  1.52655621 -0.8243028  -0.8243028  -0.8243028   1.52655621  1.52655621
  0.3511267   0.3511267   1.52655621 -0.8243028  -0.8243028   1.52655621
  1.52655621 -0.8243028  -0.8243028   0.3511267   1.52655621 -0.8243028
  0.3511267  -0.8243028   1.52655621  1.52655621 -0.8243028  -0.8243028
 -1.41201755  1.52655621  0.3511267   1.52655621 -0.8243028  -0.8243028
 -0.8243028   1.52655621  1.52655621  0.3511267   0.3511267   1.52655621
 -0.8243028   1.52655621 -0.23658805 -0.8243028  -0.824302

Unnamed: 0,MPG,Cylinders,Displacement,Horsepower,Weight,Acceleration,Model Year,Origin
203,28.0,-0.824303,-0.90102,-0.736562,-0.950031,0.255202,76,3
255,19.4,0.351127,0.4138,-0.340982,0.29319,0.548737,78,1
72,13.0,1.526556,1.144256,0.713897,1.339617,-0.625403,72,1
235,30.5,-0.824303,-0.89128,-1.053025,-1.072585,0.475353,77,1
37,14.0,1.526556,1.563051,1.636916,1.47042,-1.35924,71,1


In [15]:
import torch

In [16]:
# Bucket edges for the model year. With right=True a value equal to an edge
# falls into the upper bucket: <73 -> 0, 73-75 -> 1, 76-78 -> 2, >=79 -> 3.
boundaries = torch.tensor([73, 76, 79])

v = torch.tensor(df_train_norm['Model Year'].values)
df_train_norm['Model Year Bucketed'] = torch.bucketize(v, boundaries, right=True)

v = torch.tensor(df_test_norm['Model Year'].values)
df_test_norm['Model Year Bucketed'] = torch.bucketize(v, boundaries, right=True)

# Register the bucketed column as a model input exactly once, so that
# re-running this cell does not duplicate the feature.
if 'Model Year Bucketed' not in numeric_column_names:
    numeric_column_names.append('Model Year Bucketed')

In [17]:
# Categorical feature handling: one-hot encode 'Origin'.

from torch.nn.functional import one_hot


# Number of distinct origins seen in the training split.
total_origin = len(set(df_train_norm['Origin']))

# The modulo maps the origin codes onto class indices in [0, total_origin).
# num_classes is passed explicitly so that both splits always produce the same
# number of one-hot columns; without it the width is inferred from the largest
# index present, and a split that lacks that index would come out narrower.
origin_encoded = one_hot(
    torch.from_numpy(df_train_norm['Origin'].values) % total_origin,
    num_classes=total_origin,
)
x_train_numeric = torch.tensor(df_train_norm[numeric_column_names].values)
# Concatenate numeric and one-hot features column-wise and cast to float32.
x_train = torch.cat([x_train_numeric, origin_encoded], 1).float()

origin_encoded = one_hot(
    torch.from_numpy(df_test_norm['Origin'].values) % total_origin,
    num_classes=total_origin,
)
x_test_numeric = torch.tensor(df_test_norm[numeric_column_names].values)
x_test = torch.cat([x_test_numeric, origin_encoded], 1).float()

In [18]:
# Build the label tensors: the MPG column of each split as float32.
y_train = torch.tensor(df_train_norm['MPG'].to_numpy(), dtype=torch.float32)
y_test = torch.tensor(df_test_norm['MPG'].to_numpy(), dtype=torch.float32)

### 13.6.2 DNN 회귀 모델 훈련하기

In [19]:
from torch.utils.data import DataLoader, TensorDataset


# Pair every input row with its target so that
# train_ds[i] -> (x_train[i], y_train[i]).
train_ds = TensorDataset(x_train, y_train)

batch_size = 8
torch.manual_seed(1)  # seed the RNG before the shuffling loader is created

# The DataLoader manages mini-batching: it pulls samples from the dataset,
# groups them into batches of `batch_size`, shuffles them, and is iterable.
train_dl = DataLoader(dataset=train_ds, batch_size=batch_size, shuffle=True)

### DataLoader를 쓰는 이유 (직접 구현 vs DataLoader)

- DataLoader 없이 직접 구현하면:

  ```python
  for i in range(0, len(x_train), batch_size):
      xb = x_train[i:i+batch_size]
      yb = y_train[i:i+batch_size]
  ```

  셔플과 배치 경계 관리를 직접 구현해야 하므로 실수할 위험이 커집니다.

- DataLoader를 사용하면:

  ```python
  for xb, yb in train_dl:
      ...
  ```


- 학습 루프에서의 실제 사용:

  ```python
  for epoch in range(epochs):
      for xb, yb in train_dl:
          pred = model(xb)
          loss = criterion(pred, yb)
          loss.backward()
          optimizer.step()
          optimizer.zero_grad()
  ```


In [20]:
# Widths of the hidden layers; the input width is the number of feature columns.
hidden_units = [8, 4]
input_size = x_train.shape[1]

# Stack Linear -> ReLU for every hidden width, feeding each layer's output
# width into the next layer's input width.
all_layers = []
for hidden_unit in hidden_units:
    all_layers.extend([nn.Linear(input_size, hidden_unit), nn.ReLU()])
    input_size = hidden_unit

# A single output unit produces the regression prediction.
all_layers.append(nn.Linear(hidden_units[-1], 1))

model = nn.Sequential(*all_layers)

model

Sequential(
  (0): Linear(in_features=9, out_features=8, bias=True)
  (1): ReLU()
  (2): Linear(in_features=8, out_features=4, bias=True)
  (3): ReLU()
  (4): Linear(in_features=4, out_features=1, bias=True)
)

In [21]:
# Mean-squared-error loss, minimized with stochastic gradient descent.
loss_fn = nn.MSELoss()
optimizer = torch.optim.SGD(params=model.parameters(), lr=1e-3)

In [22]:
torch.manual_seed(1)
num_epochs = 200
log_epochs = 20  # print the running loss every `log_epochs` epochs

for epoch in range(num_epochs):
    batch_losses = []
    for x_batch, y_batch in train_dl:
        # The model returns one column per sample; take it as a 1-D vector so
        # that it lines up with the 1-D targets.
        pred = model(x_batch)[:, 0]
        loss = loss_fn(pred, y_batch)
        loss.backward()
        optimizer.step()
        # Clear the gradients so they do not accumulate into the next batch.
        optimizer.zero_grad()
        batch_losses.append(loss.item())
    loss_hist_train = sum(batch_losses)
    if epoch % log_epochs != 0:
        continue
    # Average of the per-batch losses for this epoch.
    print(f'에포크 {epoch}  손실 {loss_hist_train/len(train_dl):.4f}')

에포크 0  손실 536.1047
에포크 20  손실 8.4361
에포크 40  손실 7.8695
에포크 60  손실 7.1891
에포크 80  손실 6.7062
에포크 100  손실 6.7599
에포크 120  손실 6.3124
에포크 140  손실 6.6864
에포크 160  손실 6.7648
에포크 180  손실 6.2156


In [23]:
# Evaluate on the held-out split without tracking gradients.
with torch.no_grad():
    pred = model(x_test.float())[:, 0]
    loss = loss_fn(pred, y_test)
    mae = nn.L1Loss()(pred, y_test)

# Report mean squared error and mean absolute error on the test split.
print(f'테스트 MSE: {loss.item():.4f}')
print(f'테스트 MAE: {mae.item():.4f}')

테스트 MSE: 9.6130
테스트 MAE: 2.1211


MSE (Mean Squared Error, 평균제곱오차): $\mathrm{MSE} = \frac{1}{N}\sum_{i=1}^{N}\left(y_i - \hat{y}_i\right)^2$

  - 오차를 제곱하므로 큰 오차에 훨씬 큰 벌점을 줍니다. “큰 실수를 매우 싫어한다”

MAE (Mean Absolute Error, 평균절대오차): $\mathrm{MAE} = \frac{1}{N}\sum_{i=1}^{N}\left|y_i - \hat{y}_i\right|$

  - 오차를 크기 그대로 평가하므로 벌점이 오차에 비례합니다. “모든 실수를 똑같이 취급”

## 13.7 프로젝트 2 - MNIST 손글씨 숫자 분류하기

In [24]:
import torchvision
from torchvision import transforms


image_path = './'

# ToTensor converts each image to a float tensor and rescales the pixel values
# from [0, 255] to [0, 1].
transform = transforms.Compose([transforms.ToTensor()])

# Training split; download=True fetches the data under image_path if needed.
mnist_train_dataset = torchvision.datasets.MNIST(
    root=image_path,
    train=True,
    transform=transform,
    download=True,
)
# Test split; download=False expects the files to already exist under image_path.
mnist_test_dataset = torchvision.datasets.MNIST(
    root=image_path,
    train=False,
    transform=transform,
    download=False,
)

batch_size = 64
torch.manual_seed(1)  # seed the RNG before the shuffling loader is created
train_dl = DataLoader(dataset=mnist_train_dataset, batch_size=batch_size, shuffle=True)

100%|██████████| 9.91M/9.91M [00:06<00:00, 1.61MB/s]
100%|██████████| 28.9k/28.9k [00:00<00:00, 145kB/s]
100%|██████████| 1.65M/1.65M [00:01<00:00, 1.01MB/s]
100%|██████████| 4.54k/4.54k [00:00<?, ?B/s]


In [25]:
# Neural network model
hidden_units = [32, 16]
image_size = mnist_train_dataset[0][0].shape
# Total number of elements in one image tensor (product of its dimensions).
input_size = image_size.numel()

# Flatten each image to a 1-D vector, then stack Linear -> ReLU per hidden width.
all_layers = [nn.Flatten()]
for hidden_unit in hidden_units:
    all_layers.extend([nn.Linear(input_size, hidden_unit), nn.ReLU()])
    input_size = hidden_unit

# Ten output units, one per digit class.
all_layers.append(nn.Linear(hidden_units[-1], 10))
model = nn.Sequential(*all_layers)

model

Sequential(
  (0): Flatten(start_dim=1, end_dim=-1)
  (1): Linear(in_features=784, out_features=32, bias=True)
  (2): ReLU()
  (3): Linear(in_features=32, out_features=16, bias=True)
  (4): ReLU()
  (5): Linear(in_features=16, out_features=10, bias=True)
)

In [26]:
# Cross-entropy loss on the raw model outputs, optimized with Adam.
loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(params=model.parameters(), lr=1e-3)

torch.manual_seed(1)
num_epochs = 20
num_train_samples = len(train_dl.dataset)

for epoch in range(num_epochs):
    num_correct = 0
    for x_batch, y_batch in train_dl:
        pred = model(x_batch)
        loss = loss_fn(pred, y_batch)
        loss.backward()
        optimizer.step()
        # Clear the gradients so they do not accumulate into the next batch.
        optimizer.zero_grad()
        # Count the samples whose highest-scoring class matches the label.
        num_correct += (pred.argmax(dim=1) == y_batch).float().sum()
    # Fraction of correctly classified training samples in this epoch.
    accuracy_hist_train = num_correct / num_train_samples
    print(f'에포크 {epoch}  정확도 {accuracy_hist_train:.4f}')

에포크 0  정확도 0.8531
에포크 1  정확도 0.9287
에포크 2  정확도 0.9413
에포크 3  정확도 0.9506
에포크 4  정확도 0.9558
에포크 5  정확도 0.9592
에포크 6  정확도 0.9627
에포크 7  정확도 0.9650
에포크 8  정확도 0.9674
에포크 9  정확도 0.9690
에포크 10  정확도 0.9710
에포크 11  정확도 0.9729
에포크 12  정확도 0.9739
에포크 13  정확도 0.9750
에포크 14  정확도 0.9764
에포크 15  정확도 0.9777
에포크 16  정확도 0.9779
에포크 17  정확도 0.9798
에포크 18  정확도 0.9806
에포크 19  정확도 0.9813


In [27]:
# Evaluate on the whole test set in one forward pass. The raw pixel values are
# divided by 255 to match the [0, 1] range that ToTensor produced in training.
# Gradients are not needed for evaluation, so the forward pass runs under
# torch.no_grad() to avoid building an autograd graph over the full test set.
with torch.no_grad():
    pred = model(mnist_test_dataset.data / 255.)

# A prediction is correct when the highest-scoring class equals the target.
is_correct = (torch.argmax(pred, dim=1) == mnist_test_dataset.targets).float()
print(f'테스트 정확도: {is_correct.mean():.4f}')

테스트 정확도: 0.9647
