### Fashion MNIST & IRIS 모델 구현
- Fashion MNIST 모델 
    - 옷 판별 모델 구현
    

- 모델 구현 과정
    - 데이터 전처리 (결측치, 이상치, 중복치 처리 등등)
    - 학습 모델 설계 (모델 인스턴스 생성 + forward 메서드)
    - 데이터셋 모델 설계 (모델 인스턴스 생성 + 배치수 만큼 텐서화 진행)
    - 데이터 분리, 인코딩, 스케일링
    - 최적화, 손실함수 인스턴스 정하기
    - 만든 모델을 바탕으로 학습 진행  
        (데이터 로딩 -> 학습 -> 손실 -> 평가 -> 최적화-> 검증 및 결과 저장)

In [25]:
from urllib.request import urlretrieve  #(데이터 불러오는 모듈)
import torch.nn as nn
import torch
import torch.nn.functional as F
import torch.optim as optim
from torchinfo import summary
from torchmetrics import F1Score
from torchmetrics.classification import MulticlassF1Score, MulticlassConfusionMatrix
from torch.utils.data import Dataset, DataLoader

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder, LabelEncoder

import pandas as pd
from func import Torch_preccesing



[1] 데이터 전처리 <hr>

In [26]:
# 랜덤 고정
torch.manual_seed(1)

# 텐서 저장 및 실행 위치 설정
DEVICE='cuda' if torch.cuda.is_available() else 'cpu'

In [27]:
# 데이터 불러오기
# TEST_URL  = 'https://media.githubusercontent.com/media/fpleoni/fashion_mnist/master/fashion-mnist_test.csv'
# TRAIN_URL = 'https://media.githubusercontent.com/media/fpleoni/fashion_mnist/master/fashion-mnist_train.csv'

TRAIN_FILE = './data/fashion-mnist_train.csv'
TEST_FILE  = './data/fashion-mnist_test.csv'
# # 데이터 파일로 저장
# urlretrieve(TRAIN_URL, TRAIN_FILE)   #url에 있는 파일을 지정 경로에 저장
# urlretrieve(TEST_URL, TEST_FILE)

In [28]:
trainDF= pd.read_csv(TRAIN_FILE)
testDF= pd.read_csv(TEST_FILE)
# 사진 데이터 이므로 결측치, 이상치, 중복치 처리 X


In [29]:
trainDF.columns[:1]

Index(['label'], dtype='object')

In [30]:
# 피쳐/타겟 분리
FashionDF=Torch_preccesing(trainDF)

FashionDF.feature= trainDF[trainDF.columns[1:]]
FashionDF.target= trainDF[trainDF.columns[:1]]

FashionDF.target['label'].unique(), FashionDF.feature.columns

(array([2, 9, 6, 0, 3, 4, 5, 8, 7, 1], dtype=int64),
 Index(['pixel1', 'pixel2', 'pixel3', 'pixel4', 'pixel5', 'pixel6', 'pixel7',
        'pixel8', 'pixel9', 'pixel10',
        ...
        'pixel775', 'pixel776', 'pixel777', 'pixel778', 'pixel779', 'pixel780',
        'pixel781', 'pixel782', 'pixel783', 'pixel784'],
       dtype='object', length=784))

In [31]:
# 피쳐: pixel 1 ~ 784  (784 컬럼)
# 타겟: label (0~9 10가지)

In [32]:
FashionDF.X_test=testDF[testDF.columns[1:]]
FashionDF.Y_test=testDF[testDF.columns[:1]]

In [33]:
# 다중분류 -> 라벨은 onehot_encoding
# 훈련
# target_enc=FashionDF.encoding(OneHotEncoder(), FashionDF.target)
# target_enc.toarray()
# FashionDF.target=pd.DataFrame(target_enc, columns=[f'class_{i}' for i in range(FashionDF.target.shape[0])])
# # 테스트
# test_enc= FashionDF.encoding(OneHotEncoder(), FashionDF.Y_test)
# test_enc= test_enc.toarray()
# FashionDF.Y_test=pd.DataFrame(test_enc, columns=[f'class_{i}' for i in range(FashionDF.Y_test.shape[0])])


In [34]:
# 훈련, 검증 데이터 분리 (테스트는 별도로 존재)

FashionDF.X_train, FashionDF.X_val, FashionDF.Y_train, FashionDF.Y_val= FashionDF.split(val=False,
                                                                                        testsize=.25,
                                                                                        random_state=916,
                                                                                        get_data=True)


train: (45000, 784),(45000, 1)
test: (15000, 784),(15000, 1)


In [35]:
# 스케일링을 해야할까?
# 1. 스케일링 O -> MinMaxScaler()
FashionDF.get_scaled(MinMaxScaler(), val=True)



X_train_scaled: (45000, 784),2
X_test_scaled: (10000, 784),2
X_val_scaled: (15000, 784),2


[2] 모델 데이터 설계
- 클래스 목적: Fashion 데이터 학습 및 추론
- 클래스 이름: FashionMClf
- 부모 클래스: nn.Module()
- 매개변수: 입출력 개수, AF, 마지막 출력 AF, 은닉층의 수

- 구조 설정
    - 입력층: 784 -> 20, AF= Relu
    - 은닉층: 20 -> 30, AF= Relu, 3층
    - 출력층: 30 -> 라벨 컬럼 수(10), softMAX인데, Cross Entropy 사용시 필요 없음

In [36]:
class get_classification_model(nn.Module):
    '''
    은닉층 수= 리스트 수-1
    '''
    def __init__(self, in_in, out_out, hidden: list) -> None:
        super().__init__()
        self.in_layer= nn.Linear(in_in, hidden[0])
        self.h_layers=nn.ModuleList()
        for h in range(len(hidden)-1):
            self.h_layers.append(nn.Linear(hidden[h], hidden[h+1]))
        self.out_layer= nn.Linear(hidden[-1], out_out)
    
    def forward(self, input_data, out_AF=None):
        y= self.in_layer(input_data)
        y= F.relu(y)

        for layer in self.h_layers:
            y=layer(y)
            y=F.relu(y)
        
        if out_AF:
            y= out_AF(self.out_layer(y))
        else:
            y=self.out_layer(y)
        return y
        


In [37]:
# 모델 테스트
a=[100,80,60,40,20]
model= get_classification_model(in_in=784, out_out=10, hidden=a)
print(model)
summary(model, input_size=(60000, 784)) #행과 열

get_classification_model(
  (in_layer): Linear(in_features=784, out_features=100, bias=True)
  (h_layers): ModuleList(
    (0): Linear(in_features=100, out_features=80, bias=True)
    (1): Linear(in_features=80, out_features=60, bias=True)
    (2): Linear(in_features=60, out_features=40, bias=True)
    (3): Linear(in_features=40, out_features=20, bias=True)
  )
  (out_layer): Linear(in_features=20, out_features=10, bias=True)
)


Layer (type:depth-idx)                   Output Shape              Param #
get_classification_model                 [60000, 10]               --
├─Linear: 1-1                            [60000, 100]              78,500
├─ModuleList: 1-2                        --                        --
│    └─Linear: 2-1                       [60000, 80]               8,080
│    └─Linear: 2-2                       [60000, 60]               4,860
│    └─Linear: 2-3                       [60000, 40]               2,440
│    └─Linear: 2-4                       [60000, 20]               820
├─Linear: 1-3                            [60000, 10]               210
Total params: 94,910
Trainable params: 94,910
Non-trainable params: 0
Total mult-adds (G): 5.69
Input size (MB): 188.16
Forward/backward pass size (MB): 148.80
Params size (MB): 0.38
Estimated Total Size (MB): 337.34

[3] 데이터셋 클래스 설계
- 피쳐: 784개
- 타겟: 10개(인코딩)
- 클래스 이름: FashionDataset
- 부모 클래스: untils.data.Dataset
- 속성/필드: feature, target, nrows, n_features
- 필수 메서드 오버라이딩
    - init
    - len
    - get

In [38]:
class CustomDataset(Dataset):
    def __init__(self, featureDF, targetDF) -> None:
        super().__init__()
        self.featureDF= featureDF
        self.targetDF= targetDF
        self.n_rows= featureDF.shape[0]
        self.n_features= featureDF.shape[1]
    
    def __len__(self):
        return self.n_rows
    
    def __getitem__(self, index):
        featureTS= torch.FloatTensor(self.featureDF.iloc[index].values).to(DEVICE)
        targetTS= torch.FloatTensor(self.targetDF.iloc[index].values).to(DEVICE)
        return featureTS, targetTS
    

In [39]:
feature= FashionDF.X_train_scaled
target= FashionDF.Y_train
testDS=CustomDataset(pd.DataFrame(feature), pd.DataFrame(target))
print(testDS.n_rows)
testDL=DataLoader(testDS, batch_size=32)
for f, t in testDL:
    print(f.shape, t.shape)
    break

45000
torch.Size([32, 784]) torch.Size([32, 1])


[4] 학습 준비


In [40]:
# 설정값
EPOCH= 100
BATCH_SIZE=32
LR=0.001

In [41]:
FashionDF.Y_train['label']

552      7
50513    0
27703    9
37981    2
50552    8
        ..
8202     4
52576    3
33372    5
11477    4
36076    2
Name: label, Length: 45000, dtype: int64

In [42]:
# 타겟 라벨 인코딩
# scikit-learn -> 라벨은 무조건 1차원 => DF인 경우 values로 numpy변경 후 ravel()통해 1차원으로 변환
                                                                    # 넘파이 함수
# enc_model=LabelEncoder()
# enc_model.fit(FashionDF.Y_train.values.ravel())
# enc_model.transform(FashionDF.Y_train.values.ravel())
# enc_model.transform(FashionDF.Y_test.values.ravel())
# enc_model.transform(FashionDF.Y_val.values.ravel())

# 데이터셋 인스턴스 생성
trainDS= CustomDataset(pd.DataFrame(FashionDF.X_train_scaled), pd.DataFrame(FashionDF.Y_train))
testDS= CustomDataset(pd.DataFrame(FashionDF.X_test_scaled), pd.DataFrame(FashionDF.Y_test))
valDS= CustomDataset(pd.DataFrame(FashionDF.X_val_scaled), pd.DataFrame(FashionDF.Y_val))
trainDL= DataLoader(trainDS, batch_size=BATCH_SIZE)
testDL= DataLoader(testDS, batch_size=BATCH_SIZE)
valDL= DataLoader(valDS, batch_size=BATCH_SIZE)

In [43]:
# DL 테스트
for f, t in trainDL:
    print(f.shape)
    break

torch.Size([32, 784])


In [44]:
# 최적화 인스턴스 사용할 최적화 모델 이름(모델 파라미터, 단계)
optimizer= optim.Adam(model.parameters(), lr=LR)

# 손실함수 인스턴스 -> 다중분류
Loss= nn.CrossEntropyLoss()

# 학습 모델
h_layer=[200,150,100]
model= get_classification_model(in_in=784, out_out=10, hidden=h_layer)

[5] 학습 진행

In [45]:
# 손실, 평가값 저장
LOSS_HISTORY, SCORE_HISTORY= [[],[]], [[],[]]

model.train()
for epoch in range(EPOCH):
    print(f'{epoch+1}/{EPOCH}')

    loss_total, score_total= 0,0
    loss_val_total, score_val_total=0,0
    for feature, target in trainDL:
        # 학습
        pre_y=model(feature)

        # 손실
        loss= Loss(pre_y, target.reshape(-1).long())
        loss_total+=loss

        # 평가
        target=target.reshape(-1)
        score_total+= MulticlassF1Score(num_classes=10)(pre_y, target)
        # 최적화
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    # 검증

    model.eval()
    with torch.no_grad():
        for feature, target in valDL:
            # 학습
            pre_val= model(feature)

            # 손실
            loss= Loss(pre_val, target.reshape(-1).long())
            loss_val_total+=loss
            # 평가
            score= MulticlassF1Score(num_classes=10)(pre_val, target.reshape(-1))
            score_val_total+=score
    # 테스트

    
    # 저장
    LOSS_HISTORY[0].append(loss_total/BATCH_SIZE)
    SCORE_HISTORY[0].append(score_total/BATCH_SIZE)
    print(f'Train\n Loss: {loss_total/BATCH_SIZE}\n Score: {score_total/BATCH_SIZE}')

    LOSS_HISTORY[1].append(loss_val_total/BATCH_SIZE)
    SCORE_HISTORY[1].append(score_val_total/BATCH_SIZE)
    print(f'Val\n Loss: {loss_val_total/BATCH_SIZE}\n Score: {score_val_total/BATCH_SIZE}')


            
        

1/100
Train
 Loss: 101.30159759521484
 Score: 0.8095508217811584
Val
 Loss: 33.75965118408203
 Score: 0.2703002691268921
2/100
Train
 Loss: 101.30159759521484
 Score: 0.8095508217811584
Val
 Loss: 33.75965118408203
 Score: 0.2703002691268921
3/100
Train
 Loss: 101.30159759521484
 Score: 0.8095508217811584
Val
 Loss: 33.75965118408203
 Score: 0.2703002691268921
4/100
Train
 Loss: 101.30159759521484
 Score: 0.8095508217811584
Val
 Loss: 33.75965118408203
 Score: 0.2703002691268921
5/100
Train
 Loss: 101.30159759521484
 Score: 0.8095508217811584
Val
 Loss: 33.75965118408203
 Score: 0.2703002691268921
6/100
Train
 Loss: 101.30159759521484
 Score: 0.8095508217811584
Val
 Loss: 33.75965118408203
 Score: 0.2703002691268921
7/100
Train
 Loss: 101.30159759521484
 Score: 0.8095508217811584
Val
 Loss: 33.75965118408203
 Score: 0.2703002691268921
8/100
Train
 Loss: 101.30159759521484
 Score: 0.8095508217811584
Val
 Loss: 33.75965118408203
 Score: 0.2703002691268921
9/100
Train
 Loss: 101.301597595