# [모의 경진대회] 뇌파 데이터를 이용한 수면 단계 분류

* 수치/시계열 분류 과제
* 담당: 박성호M

## 데이터 디렉토리 구조

In [93]:
01_DATA/  
  \_train/  
    \_xxx.npy  
    \_yyy.npy
    \_zzz.npy
    \_...  
  \_test/  
    \_aaa.npy  
    \_bbb.npy  
    \_...  
  \_train_labels.csv  
  \_sample_submission.csv

SyntaxError: invalid decimal literal (2388783385.py, line 1)

## 필수 라이브러리 불러오기

In [1]:
import os
import numpy as np
import pandas as pd
import random

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader

from tqdm import tqdm
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score

## 하이퍼파라미터 및 기타 인자 설정

#### 데이터 경로

In [2]:
# 데이터 디렉토리 및 클래스 개수 
DATA_DIR = '/workspace/Competition/PSG/01_DATA/trial3'
label_dir = os.path.join(DATA_DIR, 'train_labels.csv')
train_dir = os.path.join(DATA_DIR, 'train')
test_dir = os.path.join(DATA_DIR, 'test')
norm_dir = os.path.join(DATA_DIR, 'norm.npy')

#### 시드 설정

In [3]:
RANDOM_SEED = 2022

torch.manual_seed(RANDOM_SEED)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False
np.random.seed(RANDOM_SEED)
random.seed(RANDOM_SEED)

#### 디바이스 설정

In [4]:
# 디바이스 설정 (GPU 설정)
os.environ["CUDA_VISIBLE_DEVICES"]="0"
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

#### 하이퍼파라미터 설정

In [5]:
EPOCHS = 20
BATCH_SIZE = 32
LEARNING_RATE = 0.0003
EARLY_STOPPING_PATIENCE = 10

## Dataset 정의

In [6]:
class EEG_Single_Dataset(Dataset):
    def __init__(self, datapath, labeldf, normpath):
        self.df = labeldf
        self.label_encoding = {'W':0, 'N1':1, 'N2':2, 'N3':3, 'R':4}
        self.data_path = datapath
        self.file_ids = self.df['rec_id']
        self.labels = self.df['stage']
        self.normparams = np.load(normpath).astype('float32')
        self.mean = self.normparams[0]
        self.std = self.normparams[1]
    
    def __len__(self):
        return len(self.file_ids)
    
    def __getitem__(self,index):
        npypath = os.path.join(self.data_path, self.file_ids[index]+'.npy')
        x = torch.from_numpy(np.load(npypath).astype('float32'))
        x = (x-self.mean)/self.std
        subx = x[:,-30*128:]
        label = self.labels[index]
        y = self.label_encoding[label]
        
        return subx,y

## 모델 정의

In [7]:
class DOUBLE_CNN(nn.Module):
    def __init__(self):
        super(DOUBLE_CNN, self).__init__()
        
        self.small_cnn = nn.Sequential(
            nn.Conv1d(in_channels=1, out_channels=64, kernel_size=int(128/2), stride = int(128/16)),
            nn.BatchNorm1d(64),
            nn.ReLU(),
            nn.MaxPool1d(kernel_size=8, stride=8),
            nn.Dropout(p=0.3),
            nn.Conv1d(in_channels=64, out_channels=128, kernel_size=4),
            nn.BatchNorm1d(128),
            nn.ReLU(),
            nn.Conv1d(in_channels=128, out_channels=128, kernel_size=4),
            nn.BatchNorm1d(128),
            nn.ReLU(),
            nn.Conv1d(in_channels=128, out_channels=128, kernel_size=4),
            nn.BatchNorm1d(128),
            nn.ReLU(),
            nn.MaxPool1d(kernel_size=4, stride=4),
        )

        self.large_cnn = nn.Sequential(
            nn.Conv1d(in_channels=1, out_channels=64, kernel_size=128*4, stride=int(128/2)),
            nn.BatchNorm1d(64),
            nn.ReLU(),
            nn.MaxPool1d(kernel_size=4, stride=4),
            nn.Dropout(p=0.3),
            nn.Conv1d(in_channels = 64, out_channels=128, kernel_size=3),
            nn.BatchNorm1d(128),
            nn.ReLU(),
            nn.Conv1d(in_channels = 128, out_channels = 128, kernel_size=3),
            nn.BatchNorm1d(128),
            nn.ReLU(),
            nn.MaxPool1d(kernel_size=2, stride=2),
        )
        
        self.fc = nn.Sequential(
            nn.Linear((12+4)*128,1024),
            nn.ReLU(),
            nn.Linear(1024,128),
            nn.ReLU(),
            nn.Linear(128,64),
            nn.ReLU(),
            nn.Linear(64,5)
        )

    def forward(self, x):
        xs = self.small_cnn(x)
        xl = self.large_cnn(x)
        xs = xs.flatten(1,2)
        xl = xl.flatten(1,2)
        xcat = torch.cat((xs,xl),1)
        out = self.fc(xcat)
        return out

## Utils 정의
#### EarlyStopper

In [8]:
class LossEarlyStopper():
    def __init__(self, patience: int)-> None:
        self.patience = patience
        self.patience_counter = 0
        self.min_loss = np.Inf
        self.stop = False
        self.savel_model = False
        
    def check_early_stopping(self, loss: float)-> None:
        if loss > self.min_loss:
            self.patience_counter +=1
            msg = f"Early stopping counter {self.patience_counter}/{self.patience}"
            
            if self.patience_counter == self.patience:
                self.stop=True
            
        else:
            self.patience_counter = 0
            self.save_model = True
            msg = f"Validation loss decreased {self.min_loss} - > {loss}"
            self.min_loss = loss
        print(msg)

#### Trainer

In [9]:
class Trainer():
    def __init__(self, model, optimizer, loss, metrics, device):
        self.model = model
        self.optimizer = optimizer
        self.loss = loss
        self.metric_fn = metrics
        self.device = device
        
    def train_epoch(self, dataloader, epoch_index):
        self.model.train()
        train_total_loss = 0
        target_list = []
        pred_list = []
        
        for batch_index, (x,y) in enumerate(dataloader):
            x,y = x.to(self.device), y.to(self.device)
            y_pred = model(x)
            loss = self.loss(y_pred,y)
            self.optimizer.zero_grad()
            loss.backward()
            self.optimizer.step()
            
            train_total_loss += loss.item()
            pred_list.extend(y_pred.argmax(dim=1).cpu().tolist())
            target_list.extend(y.cpu().tolist())
        self.train_mean_loss = train_total_loss / (batch_index+1)
        train_score, f1 = self.metric_fn(y_pred=pred_list, y_answer=target_list)
        msg = f"Epoch {epoch_index}, Train loss: {self.train_mean_loss}, Acc:{train_score}, F1-Macro: {f1}"
        print(msg)
    
    def validate_epoch(self, dataloader, epoch_index):
        val_total_loss = 0
        target_list = []
        pred_list = []
        
        with torch.no_grad():
            for batch_index, (x, y) in enumerate(dataloader):
                x = x.to(self.device)
                y = y.to(self.device)
                y_pred = self.model(x)
                loss = self.loss(y_pred, y)
                
                val_total_loss += loss.item()
                target_list.extend(y.cpu().tolist())
                pred_list.extend(y_pred.argmax(dim=1).cpu().tolist())
        self.val_mean_loss = val_total_loss / (batch_index+1)
        val_score, f1 = self.metric_fn(y_pred = pred_list, y_answer = target_list)
        msg = f"Epoch {epoch_index}, Val loss: {self.val_mean_loss}, Acc:{val_score}, F1-Macro: {f1}"
        print(msg)

#### 평가지표

In [10]:
def get_metric_fn(y_pred, y_answer):
    assert len(y_pred) == len(y_answer), 'The size of prediction and answer are not the same.'
    accuracy = accuracy_score(y_answer, y_pred)
    f1 = f1_score(y_answer, y_pred, average='macro')
    return accuracy, f1

## 모델 학습

#### Dataset & Dataloader 설정

In [11]:
# Load label dataframe
entiredf = pd.read_csv(label_dir)
traindf, valdf = train_test_split(entiredf, test_size=0.2)
traindf = traindf.reset_index(drop=True)
valdf = valdf.reset_index(drop=True)


train_dataset = EEG_Single_Dataset(datapath=train_dir, labeldf=traindf, normpath=norm_dir)
val_dataset = EEG_Single_Dataset(datapath=train_dir, labeldf=valdf, normpath=norm_dir)

train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False)

print('Train set samples: ', len(train_dataset), 'Val set samples: ', len(val_dataset))

Train set samples:  20024 Val set samples:  5007


#### 모델과 기타 utils 설정

In [12]:
model = DOUBLE_CNN().to(DEVICE)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)
loss_fn = nn.CrossEntropyLoss()
early_stopper = LossEarlyStopper(patience=EARLY_STOPPING_PATIENCE)
metrics = get_metric_fn

trainer = Trainer(model, optimizer, loss_fn, get_metric_fn, DEVICE)

In [13]:
model

DOUBLE_CNN(
  (small_cnn): Sequential(
    (0): Conv1d(1, 64, kernel_size=(64,), stride=(8,))
    (1): BatchNorm1d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (2): ReLU()
    (3): MaxPool1d(kernel_size=8, stride=8, padding=0, dilation=1, ceil_mode=False)
    (4): Dropout(p=0.3, inplace=False)
    (5): Conv1d(64, 128, kernel_size=(4,), stride=(1,))
    (6): BatchNorm1d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (7): ReLU()
    (8): Conv1d(128, 128, kernel_size=(4,), stride=(1,))
    (9): BatchNorm1d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (10): ReLU()
    (11): Conv1d(128, 128, kernel_size=(4,), stride=(1,))
    (12): BatchNorm1d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (13): ReLU()
    (14): MaxPool1d(kernel_size=4, stride=4, padding=0, dilation=1, ceil_mode=False)
  )
  (large_cnn): Sequential(
    (0): Conv1d(1, 64, kernel_size=(512,), stride=(64,))
    (1): Batc

### Epoch 단위 학습 진행

In [None]:
for epoch_index in tqdm(range(EPOCHS)):
    trainer.train_epoch(train_loader, epoch_index)
    trainer.validate_epoch(val_loader, epoch_index)
    
    early_stopper.check_early_stopping(loss = trainer.val_mean_loss)
    
    if early_stopper.stop:
        print('Early Stopped')
        break
    if early_stopper.save_model:
        check_point = {
            'model': model.state_dict(),
            'optimizer': optimizer.state_dict(),
        }
        torch.save(check_point, 'best.pt')

  0%|          | 0/20 [00:00<?, ?it/s]

Epoch 0, Train loss: 1.1453338464418539, Acc:0.5292149420695166, F1-Macro: 0.38243259930365275


  5%|▌         | 1/20 [00:52<16:33, 52.28s/it]

Epoch 0, Val loss: 1.0612052914443288, Acc:0.5674056321150389, F1-Macro: 0.4022797071031422
Validation loss decreased inf - > 1.0612052914443288
Epoch 1, Train loss: 1.0100702574839606, Acc:0.5957850579304834, F1-Macro: 0.47062324335276057


 10%|█         | 2/20 [01:16<10:42, 35.71s/it]

Epoch 1, Val loss: 0.9388385160713438, Acc:0.6363091671659676, F1-Macro: 0.5054368494379257
Validation loss decreased 1.0612052914443288 - > 0.9388385160713438
Epoch 2, Train loss: 0.9381905822708203, Acc:0.6326907710747104, F1-Macro: 0.5046080376086303


 15%|█▌        | 3/20 [01:38<08:25, 29.72s/it]

Epoch 2, Val loss: 0.8895192787905407, Acc:0.6674655482324745, F1-Macro: 0.53397285344861
Validation loss decreased 0.9388385160713438 - > 0.8895192787905407
Epoch 3, Train loss: 0.889917863300814, Acc:0.6584099081102677, F1-Macro: 0.5283238324621993


 20%|██        | 4/20 [02:01<07:08, 26.80s/it]

Epoch 3, Val loss: 0.8773582813086783, Acc:0.6630716996205313, F1-Macro: 0.5176612772994517
Validation loss decreased 0.8895192787905407 - > 0.8773582813086783
Epoch 4, Train loss: 0.8424193879095511, Acc:0.6797343188174191, F1-Macro: 0.5463274996441474


 25%|██▌       | 5/20 [02:25<06:28, 25.91s/it]

Epoch 4, Val loss: 0.836317352238734, Acc:0.6802476532854005, F1-Macro: 0.5440865902112556
Validation loss decreased 0.8773582813086783 - > 0.836317352238734
Epoch 5, Train loss: 0.8185356725423861, Acc:0.6932680783060328, F1-Macro: 0.5576488807188855


 30%|███       | 6/20 [02:48<05:50, 25.03s/it]

Epoch 5, Val loss: 0.8193544215837102, Acc:0.6898342320750949, F1-Macro: 0.5501398235978998
Validation loss decreased 0.836317352238734 - > 0.8193544215837102
Epoch 6, Train loss: 0.7945308552001612, Acc:0.6999101078705553, F1-Macro: 0.5626284348195592


 35%|███▌      | 7/20 [03:11<05:16, 24.32s/it]

Epoch 6, Val loss: 0.7888285132350435, Acc:0.6992210904733374, F1-Macro: 0.5604328875738298
Validation loss decreased 0.8193544215837102 - > 0.7888285132350435
Epoch 7, Train loss: 0.779386970657891, Acc:0.7065521374350779, F1-Macro: 0.5693082546893342


 40%|████      | 8/20 [03:34<04:46, 23.87s/it]

Epoch 7, Val loss: 0.7880912554112209, Acc:0.7006191332135011, F1-Macro: 0.5591151735194156
Validation loss decreased 0.7888285132350435 - > 0.7880912554112209
Epoch 8, Train loss: 0.7578384491582267, Acc:0.7131941669996005, F1-Macro: 0.575227720339972


 45%|████▌     | 9/20 [03:57<04:20, 23.66s/it]

Epoch 8, Val loss: 0.782622388024239, Acc:0.7054124226083484, F1-Macro: 0.5648768654801425
Validation loss decreased 0.7880912554112209 - > 0.782622388024239
Epoch 9, Train loss: 0.740568455177755, Acc:0.7238314023172193, F1-Macro: 0.5849428575873997


 50%|█████     | 10/20 [04:20<03:54, 23.42s/it]

Epoch 9, Val loss: 0.7698110945665153, Acc:0.7128020770920711, F1-Macro: 0.5723437230924877
Validation loss decreased 0.782622388024239 - > 0.7698110945665153
Epoch 10, Train loss: 0.7251864332265366, Acc:0.7264782261286457, F1-Macro: 0.5887867149732016


 55%|█████▌    | 11/20 [04:43<03:29, 23.28s/it]

Epoch 10, Val loss: 0.7728866228632106, Acc:0.7086079488715797, F1-Macro: 0.5695027607183064
Early stopping counter 1/10
Epoch 11, Train loss: 0.7172755345273704, Acc:0.7291749900119856, F1-Macro: 0.5912695259844607


 60%|██████    | 12/20 [05:06<03:05, 23.25s/it]

Epoch 11, Val loss: 0.7664825859343171, Acc:0.713800679049331, F1-Macro: 0.570115512515343
Validation loss decreased 0.7698110945665153 - > 0.7664825859343171
Epoch 12, Train loss: 0.6998926195949792, Acc:0.7355673192169396, F1-Macro: 0.597329608998994


 65%|██████▌   | 13/20 [05:30<02:43, 23.29s/it]

Epoch 12, Val loss: 0.7721389059428196, Acc:0.7122029159177152, F1-Macro: 0.5736816706471793
Early stopping counter 1/10
Epoch 13, Train loss: 0.6871842387766122, Acc:0.738363963244107, F1-Macro: 0.602474755096104


 70%|███████   | 14/20 [05:53<02:19, 23.23s/it]

Epoch 13, Val loss: 0.7845800651866159, Acc:0.7066107449570601, F1-Macro: 0.5756160623180049
Early stopping counter 2/10
Epoch 14, Train loss: 0.6764483743915543, Acc:0.7420595285657211, F1-Macro: 0.605942888497002


 75%|███████▌  | 15/20 [06:16<01:56, 23.32s/it]

Epoch 14, Val loss: 0.7560649206683894, Acc:0.7140003994407829, F1-Macro: 0.5753688519275121
Validation loss decreased 0.7664825859343171 - > 0.7560649206683894
Epoch 15, Train loss: 0.6646220652630535, Acc:0.7460547343188174, F1-Macro: 0.6105909363143546


 80%|████████  | 16/20 [06:40<01:33, 23.26s/it]

Epoch 15, Val loss: 0.7779907923974808, Acc:0.7130017974835231, F1-Macro: 0.5708645965351344
Early stopping counter 1/10
Epoch 16, Train loss: 0.6542893385830016, Acc:0.7512485017978426, F1-Macro: 0.6181822008814359


 85%|████████▌ | 17/20 [07:03<01:09, 23.28s/it]

Epoch 16, Val loss: 0.7536191109828888, Acc:0.7191931296185341, F1-Macro: 0.5839228846198454
Validation loss decreased 0.7560649206683894 - > 0.7536191109828888
Epoch 17, Train loss: 0.6399173000797677, Acc:0.7546444266879744, F1-Macro: 0.6273127578012092


 90%|█████████ | 18/20 [07:26<00:46, 23.36s/it]

Epoch 17, Val loss: 0.7639277095247985, Acc:0.7151987217894947, F1-Macro: 0.5866299300537967
Early stopping counter 1/10
Epoch 18, Train loss: 0.62059688591919, Acc:0.7645825009988014, F1-Macro: 0.6450756993283382


 95%|█████████▌| 19/20 [07:49<00:23, 23.27s/it]

Epoch 18, Val loss: 0.7489563065349676, Acc:0.7269822248851607, F1-Macro: 0.6028545935572144
Validation loss decreased 0.7536191109828888 - > 0.7489563065349676


## 추론

#### 테스트 Dataset 정의

In [199]:
class TestDataset(Dataset):
    def __init__(self, datapath, normpath):
        self.data_path = datapath
        self.npy_list = os.listdir(self.data_path)
        self.normparams = np.load(normpath).astype('float32')
        self.mean = self.normparams[0]
        self.std = self.normparams[1]
    
    def __len__(self):
        return len(self.npy_list)
    
    def __getitem__(self,index):
        filename = self.npy_list[index]
        npypath = os.path.join(self.data_path, filename)
        x = torch.from_numpy(np.load(npypath).astype('float32'))
        x = (x-self.mean)/self.std
        subx = x[:,-30*128:]
        subfilename = filename.split('.')[0]
        return subx,subfilename

#### 테스트 Dataset & Dataloader 설정

In [200]:
test_dataset = TestDataset(test_dir, norm_dir)
test_loader = DataLoader(dataset=test_dataset, batch_size=BATCH_SIZE, shuffle=False)

#### 모델 로드

In [201]:
TRAINED_MODEL_PATH = 'best.pt'
test_model = DOUBLE_CNN()
test_model.load_state_dict(torch.load(TRAINED_MODEL_PATH)['model'])

<All keys matched successfully>

#### 추론 진행

In [213]:
file_list = []
pred_list = []

test_model.to(DEVICE)
test_model.eval()
with torch.no_grad():
    for batch_index, (x,y) in tqdm(enumerate(test_loader)):
        x = x.to(DEVICE)
        pred = test_model(x)
        
        file_list.extend(list(y))
        pred_list.extend(pred.argmax(dim=1).tolist())

196it [00:02, 70.27it/s]


#### 결과 저장

In [222]:
# 예측 결과 데이터프레임으로 저장
results = pd.DataFrame({'rec_id':file_list, 'stage':pred_list})

# 예측 숫자 --> 클래스로 변경
label_decoding = {0:'W', 1:'N1', 2:'N2', 3:'N3', 4:'R'}
results = results.replace(label_decoding)

# sample_submission.csv와 순서 동일하게 변경
sampledf = pd.read_csv(os.path.join(DATA_DIR,'sample_submission.csv'))
sorter = list(sampledf['rec_id'])
results = results.set_index('rec_id')
results = results.loc[sorter].reset_index()

# 결과 저장
results.to_csv('prediction.csv',index=False)