# 자동차 연비 데이터셋 으로 예측

In [49]:
import torch
import numpy as np
import matplotlib.pyplot as plt
from tomlkit import comment

import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset
import pandas as pd

import lightning.pytorch as pl

## Step 1 Input tensor, Target Tensor 준비

In [6]:
# dataset 파일읽어 Dataframe 객체 생성
auto_mpg_dataset_url = "파이토치를 이용한 딥러닝 모델 구현/auto-mpg.data"

column_name = ['MPG', 'Cylinders', 'Displacement', 'Horsepower', 'Weight',
               'Acceleration', 'Model Year', 'Origin', 'Car name']

raw_dataset = pd.read_csv("auto-mpg.data", names=column_name, na_values='?', comment = '\t'
                          ,sep = " ", skipinitialspace=True)

In [7]:
raw_dataset

Unnamed: 0,MPG,Cylinders,Displacement,Horsepower,Weight,Acceleration,Model Year,Origin,Car name
0,18.0,8,307.0,130.0,3504.0,12.0,70,1,
1,15.0,8,350.0,165.0,3693.0,11.5,70,1,
2,18.0,8,318.0,150.0,3436.0,11.0,70,1,
3,16.0,8,304.0,150.0,3433.0,12.0,70,1,
4,17.0,8,302.0,140.0,3449.0,10.5,70,1,
...,...,...,...,...,...,...,...,...,...
393,27.0,4,140.0,86.0,2790.0,15.6,82,1,
394,44.0,4,97.0,52.0,2130.0,24.6,82,2,
395,32.0,4,135.0,84.0,2295.0,11.6,82,1,
396,28.0,4,120.0,79.0,2625.0,18.6,82,1,


## Step2 데이터 전처리

In [13]:
# 자동차 이름 특성 제거
dataset = raw_dataset.drop(labels='Car name', axis=1)
dataset

Unnamed: 0,MPG,Cylinders,Displacement,Horsepower,Weight,Acceleration,Model Year,Origin
0,18.0,8,307.0,130.0,3504.0,12.0,70,1
1,15.0,8,350.0,165.0,3693.0,11.5,70,1
2,18.0,8,318.0,150.0,3436.0,11.0,70,1
3,16.0,8,304.0,150.0,3433.0,12.0,70,1
4,17.0,8,302.0,140.0,3449.0,10.5,70,1
...,...,...,...,...,...,...,...,...
393,27.0,4,140.0,86.0,2790.0,15.6,82,1
394,44.0,4,97.0,52.0,2130.0,24.6,82,2
395,32.0,4,135.0,84.0,2295.0,11.6,82,1
396,28.0,4,120.0,79.0,2625.0,18.6,82,1


In [14]:
# dataset NaN값 제거
dataset.isna().sum()

MPG             0
Cylinders       0
Displacement    0
Horsepower      6
Weight          0
Acceleration    0
Model Year      0
Origin          0
dtype: int64

In [15]:
dataset.dropna(inplace=True)

In [16]:
#데이터셋 분할 Train, Test
Train_dataset=dataset.sample(frac=0.8)
test_dataset=dataset.drop(Train_dataset.index)

In [17]:
len(Train_dataset), len(test_dataset)

(314, 78)

In [18]:
# 정답 데이터 MPG-연비 제거
train_stats = Train_dataset.describe()
train_stats.drop(labels="MPG",axis=1, inplace=True)

In [22]:
# raw와 column 위치 바꾸기 Transpose
train_stats = train_stats.transpose()

In [23]:
train_stats

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Cylinders,314.0,5.404459,1.691582,3.0,4.0,4.0,6.0,8.0
Displacement,314.0,191.816879,104.822667,68.0,105.0,142.5,259.5,455.0
Horsepower,314.0,104.092357,38.905486,46.0,75.25,90.5,120.0,230.0
Weight,314.0,2964.372611,847.064319,1613.0,2226.5,2764.5,3554.75,5140.0
Acceleration,314.0,15.583439,2.693463,8.0,14.0,15.5,17.0,24.8
Model Year,314.0,75.898089,3.718849,70.0,73.0,76.0,79.0,82.0
Origin,314.0,1.566879,0.789499,1.0,1.0,1.0,2.0,3.0


In [26]:
# target tensor를 위한 연비값 추출
train_labels = Train_dataset.pop("MPG")
test_labels = test_dataset.pop("MPG")

In [28]:
# 정규화 전처리
train_mean = train_stats['mean']
train_std = train_stats['std']
Train_dataset = (Train_dataset - train_mean) / train_std

In [29]:
test_dataset = (test_dataset - train_mean) / train_std

## Step3 pipeline 생성

In [30]:
# torch Tensor 객체 변환
train_x = torch.Tensor(Train_dataset.values)
test_x = torch.Tensor(test_dataset.values)

train_y = torch.Tensor(train_labels.values).unsqueeze(1)
test_y = torch.Tensor(test_labels.values).unsqueeze(1)

In [40]:
train_x = train_x.to(device)
train_y = train_y.to(device)
test_x = test_x.to(device)
test_y = test_y.to(device)

In [31]:
train_x.shape , train_y.shape

(torch.Size([314, 7]), torch.Size([314, 1]))

In [41]:
# torch.tensor 객체를 이용해 Dataset 객체 생성

train_dataset = TensorDataset(train_x, train_y)
test_dataset = TensorDataset(test_x, test_y)

In [42]:
# Dataset 객체를 이용해 DataLoader 객체를  생성
train_loader=DataLoader(dataset = train_dataset, batch_size = 8, shuffle = True)
test_loader=DataLoader(dataset = test_dataset, batch_size = 8, shuffle = False)

## Step4 DNN 모델 디자인

In [36]:
# Subclassing - sequential 버전
class MPGModel(nn.Module):
    def __init__(self):
        super().__init__()
        self.linear_stack = nn.Sequential(
            nn.Linear(7, 64),
            nn.ReLU(),
            nn.Linear(64, 32),
            nn.ReLU(),
            nn.Linear(32, 16),
            nn.ReLU(),
            nn.Linear(16, 1),
        )
    def forward(self, x):
        x =  self.linear_stack(x)
        return x

In [38]:
model = MPGModel()

In [39]:
from torchsummary import summary
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)

In [44]:
summary(model, input_size=(7,), batch_size=8)

----------------------------------------------------------------
        Layer (type)               Output Shape         Param #
            Linear-1                    [8, 64]             512
              ReLU-2                    [8, 64]               0
            Linear-3                    [8, 32]           2,080
              ReLU-4                    [8, 32]               0
            Linear-5                    [8, 16]             528
              ReLU-6                    [8, 16]               0
            Linear-7                     [8, 1]              17
Total params: 3,137
Trainable params: 3,137
Non-trainable params: 0
----------------------------------------------------------------
Input size (MB): 0.00
Forward/backward pass size (MB): 0.01
Params size (MB): 0.01
Estimated Total Size (MB): 0.03
----------------------------------------------------------------


In [54]:
## torchLightning 버전

class LMPGModel(pl.LightningModule):
    def __init__(self):
        super().__init__()

        self.linear1 = nn.Linear(7, 64)
        self.linear2 = nn.Linear(64, 32)
        self.linear3 = nn.Linear(32, 16)
        self.linear4 = nn.Linear(16, 1)

    def forward(self, x):
        #Layer1
        x = self.linear1(x)
        x = torch.relu(x)

        #Layer2
        x = self.linear2(x)
        x = torch.relu(x)

        #Layer3
        x = self.linear3(x)
        x = torch.relu(x)

        #Layer4
        x = self.linear4(x)

        return x

    def mse_loss(self, logits, labels):
        loss = nn.MSELoss()
        return loss(logits, labels)

    def training_step(self, train_batch, batch_idx):
        x, y = train_batch
        logits = self.forward(x)
        loss = self.mse_loss(logits, y)
        self.log('train_loss', loss)
        return loss

    def test_step(self, test_batch, batch_idx):
        x, y = test_batch
        logits = self.forward(x)
        loss = self.mse_loss(logits, y)
        self.log('test_loss', loss)

    def configure_optimizers(self):
        optimizer = torch.optim.Adam(self.parameters(), lr=0.001)

In [55]:
model = LMPGModel()
trainer = pl.Trainer(default_root_dir= 'C:/exercise/lightning_logs',
                     max_epochs = 100,
                     accelerator="gpu",
                     devices=1)
trainer.fit(model, train_loader)

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
You are using a CUDA device ('NVIDIA GeForce RTX 3050 Ti Laptop GPU') that has Tensor Cores. To properly utilize them, you should set `torch.set_float32_matmul_precision('medium' | 'high')` which will trade-off precision for performance. For more details, read https://pytorch.org/docs/stable/generated/torch.set_float32_matmul_precision.html#torch.set_float32_matmul_precision
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
c:\Users\user\anaconda3\envs\namduhus_GPU\Lib\site-packages\lightning\pytorch\core\optimizer.py:183: `LightningModule.configure_optimizers` returned `None`, this fit will run with no optimizer

  | Name    | Type   | Params | Mode 
-------------------------------------------
0 | linear1 | Linear | 512    | train
1 | linear2 | Linear | 2.1 K  | train
2 | linear3 | Linear | 528    | train
3 | linear4 | Linear | 17     | train
------------------------------------

Training: |          | 0/? [00:00<?, ?it/s]

`Trainer.fit` stopped: `max_epochs=100` reached.


In [56]:
trainer.test(model, test_loader)

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
c:\Users\user\anaconda3\envs\namduhus_GPU\Lib\site-packages\lightning\pytorch\trainer\connectors\data_connector.py:425: The 'test_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=15` in the `DataLoader` to improve performance.


Testing: |          | 0/? [00:00<?, ?it/s]

[{'test_loss': 648.8148803710938}]

## Step5 학습관련 객체 생성

In [45]:
loss = nn.MSELoss()
mae = nn.L1Loss()
optimizer = torch.optim.RMSprop(model.parameters(), lr=0.001)

## Step6 학습 loop

In [None]:
def train(epoch):
    model.train()

    train_loss = 0
    train_mae = 0

    for batch_idx, (x, y) in enumerate(train_loader):
        # 모델의 순전파 수행
        output = model(x)

        # loss 함수에 모델의 예측값과 정답 정보를 전달하여 loss 값 계산
        batch_mse = loss(output, y)
        train_mse = batch_mse.item()

        # loss 미치는 영향도 계산
        batch_mse.backward()

        # 계산된 영향도 업데이트
        optimizer.step()

        # optim 초기화
        optimizer.zero_grad()

        # mae 계산
        batch_mae = mae(output, y)
        train_mae += batch_mae.item()

        if batch_idx % 50 == 0:
            print("Train Epoch: {} [{}/{} ({:.0f}%]\tMAE: {:.6f}\tMSE: {:.6f}".format(
                epoch,
                (batch_idx + 1) * len(x),
                len(train_loader),
                100. * (batch_idx+1) / len(train_loader),
                batch_mae,
                batch_mse
            ))