# 라이트닝 참고

https://baeseongsu.github.io/posts/pytorch-lightning-introduction/ - train/val step

In [12]:
import pytorch_lightning as pl
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import random_split, DataLoader , Dataset , TensorDataset
import os
import random

import pandas as pd
from transformers import BertModel
from kobert_tokenizer import KoBERTTokenizer
from transformers import DataCollatorWithPadding

In [2]:
class RM_bert(pl.LightningModule):
    def __init__(self):
        super().__init__()

        # 하이퍼 파라미터
        self.num_labels=1
        self.hidden_size=768
        self.hidden_dropout_prob=0.1

        # 모델 구조
        self.kobert = BertModel.from_pretrained('skt/kobert-base-v1')
        self.dropout = nn.Dropout(self.hidden_dropout_prob)
        self.linear = nn.Linear(self.hidden_size,  self.num_labels)

        # 최적화 파라미터
        #self.ratio_pn = 1 # T/F 비율 얼마나 줄지 ex) T 10개 F 30개면 pos_weight = 3
        self.loss_fn = nn.BCEWithLogitsLoss()   # sigmoid+BCE

    def forward(
            self,input_ids=None,
            attention_mask=None,
            token_type_ids=None,
            position_ids=None,
            head_mask=None,
            inputs_embeds=None,
            labels=None,
            ):
        
        # KoBERT 입력
        output = self.kobert(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            )
        
        # 드롭아웃
        pooled_output = self.dropout(output.pooler_output)

        # 리니어 레이어
        return self.linear(pooled_output)
    
    def configure_optimizers(self):
        return torch.optim.Adam(self.parameters(),lr=0.02)
    

    def training_step(self, train_batch, batch_idx):
        x1,x2,x3,y = train_batch['x1'] , train_batch['x2'] , train_batch['x3'] , train_batch['y']
        logits = self.forward(x1,x2,x3)
        loss = self.loss_fn(logits, y)
        self.log_dict({'train_loss':loss})
        return loss
    
    def validation_step(self, val_batch, batch_idx):
        x1,x2,x3,y = val_batch['x1'] , val_batch['x2'] , val_batch['x3'] , val_batch['y']
        logits = self.forward(x1,x2,x3)
        loss = self.loss_fn(logits, y)
        self.log_dict({'val_loss':loss})


In [3]:
class News_Dataset(Dataset):
    def __init__(self,root_path):
        true_path = os.path.join(root_path,str(1))
        false_path = os.path.join(root_path,str(0))
        true_file_list = [os.path.join(true_path,f) for f in os.listdir(true_path)]
        false_file_list = [os.path.join(false_path,f) for f in os.listdir(false_path)]

        # 최종 파일경로 모음 및 라벨
        self.file_path = true_file_list + false_file_list
        self.label = [1 for _ in range(len(true_file_list))] + [0 for _ in range(len(false_file_list))]

        # 섞어주기
        data = list(zip(self.file_path, self.label))
        random.shuffle(data)
        self.file_path, self.label = zip(*data)

        # 토크나이저
        self.tokenizer = KoBERTTokenizer.from_pretrained('skt/kobert-base-v1')
        
    def __len__(self):
        return len(self.label)

    def __getitem__(self, idx):
        with open(self.file_path[idx], 'r') as file:
            title = file.readline()
            content = file.readline()
        token = self.tokenizer(title[:-2],content)
        y = self.label[idx]
        
        return {
            'x1':torch.tensor(token['input_ids']),
            'x2':torch.tensor(token['token_type_ids']),
            'x3':torch.tensor(token['attention_mask']),
            'y':torch.tensor(y)}
    
def my_collate_fn(samples):
    '''
        [{'x1': tensor([0, 1, 2, 3, 4]), 'x2': tensor([0, 0, 0, 0, 0]), 'x3': tensor([1, 1, 1, 1, 1]), 'y': tensor([0, 1, 0, 1, 0])}, 
        {'x1': tensor([0, 1, 2, 3, 4]), 'x2': tensor([0, 0, 0, 0, 0]), 'x3': tensor([1, 1, 1, 1, 1]), 'y': tensor([0, 1, 0, 1, 0])}]
    
    '''

    collate_x1 = []
    collate_x2 = []
    collate_x3 = []
    collate_y = []
    
    for sample in samples:
        collate_x1.append(sample['x1'])
        collate_x2.append(sample['x2'])
        collate_x3.append(sample['x3'])
        collate_y.append(sample['y'])

    
    return {'x1': torch.stack(collate_x1),
            'x2' : torch.stack(collate_x2),
            'x3' : torch.stack(collate_x3),
            'y': torch.stack(collate_y)}
    

In [4]:
train_dataset = News_Dataset()
valid_dataset = News_Dataset()

In [5]:
batch_size = 2
train_loader =  DataLoader(train_dataset,batch_size=batch_size,shuffle=True,collate_fn=my_collate_fn)
valid_loader = DataLoader(valid_dataset,batch_size=batch_size,shuffle=True,collate_fn=my_collate_fn)

logger = pl.loggers.CSVLogger("logs", name="RM_training1")
trainer = pl.Trainer(max_epochs=3,logger=logger,accelerator="auto")
model = RM_bert()
trainer.fit(model,train_loader,valid_loader)

GPU available: False, used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs

  | Name    | Type              | Params
----------------------------------------------
0 | kobert  | BertModel         | 92.2 M
1 | dropout | Dropout           | 0     
2 | linear  | Linear            | 769   
3 | loss_fn | BCEWithLogitsLoss | 0     
----------------------------------------------
92.2 M    Trainable params
0         Non-trainable params
92.2 M    Total params
368.751   Total estimated model params size (MB)


Sanity Checking: |          | 0/? [00:00<?, ?it/s]

c:\Users\Lenovo\miniconda3\envs\dl_study\lib\site-packages\pytorch_lightning\trainer\connectors\data_connector.py:492: Your `val_dataloader`'s sampler has shuffling enabled, it is strongly recommended that you turn shuffling off for val/test dataloaders.
c:\Users\Lenovo\miniconda3\envs\dl_study\lib\site-packages\pytorch_lightning\trainer\connectors\data_connector.py:441: The 'val_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=15` in the `DataLoader` to improve performance.
c:\Users\Lenovo\miniconda3\envs\dl_study\lib\site-packages\pytorch_lightning\trainer\connectors\data_connector.py:441: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=15` in the `DataLoader` to improve performance.
c:\Users\Lenovo\miniconda3\envs\dl_study\lib\site-packages\pytorch_lightning\loops\fit_loop.py:293: The n

Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

`Trainer.fit` stopped: `max_epochs=3` reached.


# 문제

0. 크롤링 기사 전처리 , 제목 , 본문 어떻게 할건지? , 스페셜 토큰

1. 다이나믹 패딩 collate에 적용

2. 얼마나 저장할지 에퐄마다?

3. 조기종료

4. 스케줄러? 안해도 될듯? 몰루

5. py파일로 변환 , arg 밑 도커 백그라운드

6. train_loss는 왜 저장안되는지

In [19]:
model.eval()
test_loader = DataLoader(News_Dataset(),batch_size=10,shuffle=True,collate_fn=my_collate_fn)
sample = next(iter(test_loader))
x1,x2,x3,y = sample['x1'] , sample['x2'] , sample['x3'] , sample['y']
y_pred = nn.Sigmoid()(model(x1,x2,x3))
print(y_pred)


tensor([[1.0000],
        [1.0000],
        [1.0000],
        [1.0000],
        [1.0000],
        [1.0000],
        [1.0000],
        [1.0000],
        [1.0000],
        [1.0000]], grad_fn=<SigmoidBackward0>)
