In [None]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.21.1-py3-none-any.whl (4.7 MB)
[K     |████████████████████████████████| 4.7 MB 31.4 MB/s 
Collecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 60.0 MB/s 
[?25hCollecting tokenizers!=0.11.3,<0.13,>=0.11.1
  Downloading tokenizers-0.12.1-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.6 MB)
[K     |████████████████████████████████| 6.6 MB 27.8 MB/s 
Collecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.8.1-py3-none-any.whl (101 kB)
[K     |████████████████████████████████| 101 kB 11.5 MB/s 
Installing collected packages: pyyaml, tokenizers, huggingface-hub, transformers
  Attempting uninstall: pyyaml
    Found existing installation: PyYAML 3.13
    Uninsta

In [None]:
import argparse
import datetime
import os
import random
import re
import time
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.preprocessing import LabelEncoder
from torch.cuda.amp import GradScaler, autocast
from torch.utils.data import Dataset, DataLoader
# from torch.utils.data import RandomSampler, SequentialSampler
from tqdm.auto import tqdm
from transformers import AutoModelForSequenceClassification
from transformers import BertTokenizer
from transformers import BertForSequenceClassification, BertConfig
from transformers import get_cosine_schedule_with_warmup

In [None]:
# Mount Google Drive (Colab only) and make MyDrive the working directory, so the
# relative './data' and './model/' paths used by later cells resolve inside Drive.
import os, sys 
from google.colab import drive

drive.mount('/content/gdrive')
%cd /content/gdrive/MyDrive/

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).
/content/gdrive/MyDrive


In [None]:
# !pip install datasets

In [None]:
# from datasets import load_dataset

# dataset = load_dataset("searle-j/kote")
# print(dataset)

In [None]:
# Location of the relabelled KOTE corpus, relative to the current working directory.
DATA_IN_PATH = "./data"
FILE_NAME = "KOTE_relabel.tsv"
DATA_PATH = os.path.join(DATA_IN_PATH, FILE_NAME)

# Tab-separated file; its first column is used as the row index.
kote = pd.read_csv(
    DATA_PATH,
    index_col=0,
    sep="\t",
)

print(kote.shape)
kote.head()

(50000, 4)


Unnamed: 0,text,class,label,datset
0,내가 톰행크스를 좋아하긴 했나보다... 초기 영화 빼고는 다 봤네.,0,행복한,train
1,"정말 상상을 초월하는 무개념 진상들 상대하다 우울증, 공항장애 걸리는 공무원 많아요...",3,슬픈,train
2,"새로운 세상과 조우한 자의 어린아이 같은 반응, 어쩌면 회복된 것은 눈이 아닌 순수...",0,행복한,train
3,미역은 원생생물계 산호초는 동물ㅇㅇ 아 미역이 바다의 새ㄱㅇㄱㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋ,0,행복한,train
4,네 맞습니다 플스는 역시 30프레임이 어울리죠 ㅎ,0,행복한,train


In [None]:
# Rows per split. The column name really is spelled 'datset' in the TSV header.
kote['datset'].value_counts()

train    40000
test      5000
valid     5000
Name: datset, dtype: int64

### 전처리

In [None]:
# Default hyper-parameters. NOTE: every training cell further down reassigns
# MODEL_NAME, MAX_LEN and BATCH_SIZE before building its tokenizer, loaders and model.
MODEL_NAME = 'klue/roberta-large'
MAX_LEN = 320  # maximum number of tokens per encoded example
BATCH_SIZE = 32

In [None]:
# le = LabelEncoder()

class KorSongsDataset(Dataset):
  """Map-style dataset that tokenizes one text per item, lazily on access.

  Args:
    inputs: indexable sequence of raw texts (each item is passed through ``str``).
    targets: indexable sequence of integer class ids, aligned with ``inputs``.
    tokenizer: object exposing a Hugging Face style ``encode_plus`` method.
    max_len: every example is padded / truncated to exactly this many tokens.
  """

  def __init__(self, inputs, targets, tokenizer, max_len):
    # Nothing is tokenized up front; only references are stored.
    self.inputs, self.targets = inputs, targets
    self.tokenizer, self.max_len = tokenizer, max_len

  def __len__(self):
    """Number of examples."""
    return len(self.inputs)

  def __getitem__(self, idx):
    """Return the encoded example at position ``idx``.

    The result is a dict with the raw text (``input_text``), the flattened
    ``input_ids`` and ``attention_mask`` tensors produced by the tokenizer,
    and the label as a ``torch.long`` tensor (``targets``).
    """
    text = str(self.inputs[idx])
    label = self.targets[idx]

    encoded = self.tokenizer.encode_plus(
      text,
      truncation=True,
      padding='max_length',
      max_length=self.max_len,
      add_special_tokens=True,
      return_tensors='pt',
      return_attention_mask=True,
      return_token_type_ids=False,
    )

    sample = {'input_text': text}
    # return_tensors='pt' yields a leading batch axis; flatten it away per example.
    for key in ('input_ids', 'attention_mask'):
      sample[key] = encoded[key].flatten()
    sample['targets'] = torch.tensor(label, dtype=torch.long)
    return sample

In [None]:
# ds = KorSongsDataset(inputs=kote['text'].to_numpy(),targets=kote['label'].to_numpy(),tokenizer=BertTokenizer.from_pretrained(MODEL_NAME, do_lower_case=False),max_len=MAX_LEN)
# ds.__len__()
# ds.__getitem__(0)

In [None]:
def KorSongsDataLoader(df, tokenizer, max_len, batch_size, shuffle=True):
  """Build a ``DataLoader`` over the 'text' / 'class' columns of ``df``.

  Args:
    df: DataFrame with a 'text' column (raw strings) and a 'class' column
      (integer class ids).
    tokenizer: tokenizer handed to ``KorSongsDataset``.
    max_len: token length every example is padded / truncated to.
    batch_size: number of examples per batch.
    shuffle: reshuffle the examples on every pass. Defaults to ``True`` (the
      previous hard-coded behaviour); pass ``False`` for validation, test or
      inference loaders so batches come out in frame order.

  Returns:
    torch.utils.data.DataLoader yielding the dict batches produced by
    ``KorSongsDataset``.
  """
  ds = KorSongsDataset(
          inputs=df['text'].to_numpy(),
          targets=df['class'].to_numpy(),
          tokenizer=tokenizer,
          max_len=max_len)

  return DataLoader(
    ds,
    batch_size=batch_size,
    shuffle=shuffle,
    # Pinned host memory only helps host-to-GPU copies; requesting it without
    # CUDA just triggers a warning.
    pin_memory=torch.cuda.is_available()
  )

In [None]:
def accuracy_measure(preds, labels):
    """Fraction of rows whose arg-max prediction equals the label.

    Args:
        preds: 2-D array of per-class scores, one row per example.
        labels: array of integer class ids; flattened before comparison.

    Returns:
        Number of matches divided by the number of labels.
    """
    predicted = np.argmax(preds, axis=1).ravel()
    expected = labels.ravel()
    matches = predicted == expected
    return matches.sum() / len(expected)

def format_time(elapsed):
    """Render a duration given in seconds as an ``h:mm:ss`` string.

    The value is rounded to the nearest whole second first.
    """
    whole_seconds = int(round(elapsed))
    delta = datetime.timedelta(seconds=whole_seconds)
    return f"{delta}"

In [None]:
# Fine-tune multilingual BERT on KOTE (5 emotion classes) with mixed precision,
# validate after every epoch, then save the model and tokenizer.

# Seed every RNG in play so runs are repeatable.
RANDOM_SEED = 42
random.seed(RANDOM_SEED)
np.random.seed(RANDOM_SEED)
torch.manual_seed(RANDOM_SEED)
torch.cuda.manual_seed_all(RANDOM_SEED)

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(device)

MODEL_NAME = 'bert-base-multilingual-cased'
MAX_LEN = 320  # maximum token length
BATCH_SIZE = 16
LEARNING_RATE = 2e-5
EPOCHS = 1
EPS = 1e-8  # epsilon that guards the optimizer against division by zero

# Load data
DATA_DIR = './data'
FILE_NAME = 'KOTE_relabel.tsv'
DATA_PATH = os.path.join(DATA_DIR, FILE_NAME)
kote = pd.read_csv(DATA_PATH, sep='\t', index_col=0)
# The split column is spelled 'datset' in the file.
train_df = kote[kote['datset']=='train']
val_df = kote[kote['datset']=='valid']
test_df = kote[kote['datset']=='test']

MODEL_DIR = './model/'
MODEL_SAVE_NAME = f'kote-trained-by-{MODEL_NAME}'
MODEL_SAVE_PATH = MODEL_DIR + MODEL_SAVE_NAME

tokenizer = BertTokenizer.from_pretrained(MODEL_NAME, do_lower_case=False)

train_dataloader = KorSongsDataLoader(train_df, tokenizer, MAX_LEN, BATCH_SIZE)
val_dataloader = KorSongsDataLoader(val_df, tokenizer, MAX_LEN, BATCH_SIZE)
test_dataloader = KorSongsDataLoader(test_df, tokenizer, MAX_LEN, BATCH_SIZE)

model = BertForSequenceClassification.from_pretrained(MODEL_NAME,
                                                      num_labels=5,
                                                      output_hidden_states=False,
                                                      output_attentions=False)

if device.type != 'cpu':
    print("Running model in CUDA")
    model.cuda()

optimizer = optim.AdamW(model.parameters(),lr=LEARNING_RATE, eps=EPS)
total_steps = len(train_dataloader) * EPOCHS

# Cosine decay over the whole run, no warm-up.
scheduler = get_cosine_schedule_with_warmup(optimizer=optimizer,
                                            num_warmup_steps=0,
                                            num_training_steps=total_steps)

training_stats = []
scaler = GradScaler()
total_t0 = time.time()

for epoch in range(EPOCHS):
    print('')
    print(f'======== Epoch {epoch+1:}/{EPOCHS:} ========')
    print('Training...')

    t0 = time.time()  # epoch start time
    total_train_loss = 0  # running sum of batch losses

    model.train()
    for step, batch in enumerate(train_dataloader):
        # Progress report every 250 batches (never for the very first one).
        if step % 250 == 0 and step != 0:
            elapsed = format_time(time.time() - t0)
            print(f'  Batch {step:>5,} of {len(train_dataloader):>5,}.    Elapsed: {elapsed}.')

        b_input_ids = batch['input_ids'].to(device)
        b_input_mask = batch['attention_mask'].to(device)
        b_labels = batch['targets'].to(device)

        optimizer.zero_grad()  # reset gradients from the previous batch

        # Forward pass under autocast (mixed precision).
        with autocast():
            loss, logits = model(b_input_ids,
                                 token_type_ids=None,
                                 attention_mask=b_input_mask,
                                 return_dict=False,
                                 labels=b_labels)
        total_train_loss += loss.item()

        # Backward pass. With a GradScaler the order must be:
        #   scaled backward -> unscale -> clip -> scaler.step -> scaler.update.
        # There is exactly ONE optimizer step per batch, and it goes through the
        # scaler, which skips the update when the gradients contain inf/NaN.
        # A direct optimizer.step() would apply those non-finite gradients and
        # turn the weights (and hence the loss) into NaN.
        scaler.scale(loss).backward()
        scaler.unscale_(optimizer)  # clip on true gradient magnitudes
        nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        scaler.step(optimizer)
        scaler.update()
        scheduler.step()  # advance the learning-rate schedule

    avg_train_loss = total_train_loss / len(train_dataloader)  # mean batch loss
    training_time = format_time(time.time() - t0)
    print('')
    print(f'  Average training loss: {avg_train_loss:.2f}')
    print(f'  Training epcoh took: {training_time:}')

    # ----------------------------------------------------------------
    # Validation: runs once per epoch, inside the epoch loop.

    print('')
    print('Running Validation...')

    t0 = time.time()

    model.eval()  # evaluation mode

    total_eval_accuracy = 0
    total_eval_loss = 0

    for batch in val_dataloader:
        b_input_ids = batch['input_ids'].to(device)
        b_input_mask = batch['attention_mask'].to(device)
        b_labels = batch['targets'].to(device)

        with torch.no_grad():
            loss, logits = model(b_input_ids,
                                 token_type_ids=None,
                                 attention_mask=b_input_mask,
                                 return_dict=False,
                                 labels=b_labels)
            total_eval_loss += loss.item()

            logits = logits.detach().cpu().numpy()
            label_ids = b_labels.to('cpu').numpy()

            total_eval_accuracy += accuracy_measure(logits, label_ids)

    # Mean of per-batch values over the validation loader.
    avg_val_accuracy = total_eval_accuracy / len(val_dataloader)
    avg_val_loss = total_eval_loss / len(val_dataloader)

    validation_time = format_time(time.time() - t0)

    print("  Validation Loss: {0:.2f}".format(avg_val_loss))
    print("  Validation took: {:}".format(validation_time))

    # One stats record per epoch.
    training_stats.append(
        {
            'epoch': epoch+1,
            'Training Loss': avg_train_loss,
            'Valid. Loss': avg_val_loss,
            'Valid. Accur.': avg_val_accuracy,
            'Training Time': training_time,
            'Validation Time': validation_time
        }
    )

print("")
print("Training complete!")
print("Total training took {:} (h:mm:ss)".format(format_time(time.time()-total_t0)))


# Save the model
os.makedirs(MODEL_SAVE_PATH, exist_ok=True)

print("saving model to {}".format(MODEL_SAVE_PATH))

model_to_save = model.module if hasattr(model, 'module') else model # Take care of distributed/parallel training
model_to_save.save_pretrained(MODEL_SAVE_PATH)
tokenizer.save_pretrained(MODEL_SAVE_PATH)

# ----------------------------------------------------------------

# model.eval()


cuda:0


Downloading tokenizer_config.json:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

Downloading config.json:   0%|          | 0.00/625 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/681M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-multilingual-cased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model ch

Running model in CUDA

Training...
  Batch   250 of 2,500.    Elapsed: 0:01:26.
  Batch   500 of 2,500.    Elapsed: 0:02:55.
  Batch   750 of 2,500.    Elapsed: 0:04:21.
  Batch 1,000 of 2,500.    Elapsed: 0:05:48.
  Batch 1,250 of 2,500.    Elapsed: 0:07:14.
  Batch 1,500 of 2,500.    Elapsed: 0:08:40.
  Batch 1,750 of 2,500.    Elapsed: 0:10:06.
  Batch 2,000 of 2,500.    Elapsed: 0:11:33.
  Batch 2,250 of 2,500.    Elapsed: 0:12:59.

  Average training loss: nan
  Training epcoh took: 0:14:25

Running Validation...
  Validation Loss: nan
  Validation took: 0:01:16

Training complete!
Total training took 0:15:41 (h:mm:ss)
saving model to ./model/kote-trained-by-bert-base-multilingual-cased


('./model/kote-trained-by-bert-base-multilingual-cased/tokenizer_config.json',
 './model/kote-trained-by-bert-base-multilingual-cased/special_tokens_map.json',
 './model/kote-trained-by-bert-base-multilingual-cased/vocab.txt',
 './model/kote-trained-by-bert-base-multilingual-cased/added_tokens.json')

In [None]:
# Fine-tune KLUE RoBERTa on KOTE (5 emotion classes) with mixed precision,
# validate after every epoch, then save the model and tokenizer.

# Seed every RNG in play so runs are repeatable.
RANDOM_SEED = 42
random.seed(RANDOM_SEED)
np.random.seed(RANDOM_SEED)
torch.manual_seed(RANDOM_SEED)
torch.cuda.manual_seed_all(RANDOM_SEED)

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(device)

MODEL_NAME = 'klue/roberta-base'
MAX_LEN = 320  # maximum token length
BATCH_SIZE = 16
LEARNING_RATE = 2e-5
EPOCHS = 3
EPS = 1e-8  # epsilon that guards the optimizer against division by zero

# Load data
DATA_DIR = './data'
FILE_NAME = 'KOTE_relabel.tsv'
DATA_PATH = os.path.join(DATA_DIR, FILE_NAME)
kote = pd.read_csv(DATA_PATH, sep='\t', index_col=0)
# The split column is spelled 'datset' in the file.
train_df = kote[kote['datset']=='train']
val_df = kote[kote['datset']=='valid']
test_df = kote[kote['datset']=='test']

MODEL_DIR = './model/'
MODEL_SAVE_NAME = f'kote-trained-by-{MODEL_NAME}'
MODEL_SAVE_PATH = MODEL_DIR + MODEL_SAVE_NAME

# Initializing model based tokenizer
# NOTE(review): the KLUE RoBERTa model card is understood to recommend a BERT-style
# tokenizer for this checkpoint, so BertTokenizer is kept here — confirm on the card.
tokenizer = BertTokenizer.from_pretrained(MODEL_NAME, do_lower_case=False)

train_dataloader = KorSongsDataLoader(train_df, tokenizer, MAX_LEN, BATCH_SIZE)
val_dataloader = KorSongsDataLoader(val_df, tokenizer, MAX_LEN, BATCH_SIZE)
test_dataloader = KorSongsDataLoader(test_df, tokenizer, MAX_LEN, BATCH_SIZE)

# The model class must match the checkpoint architecture. Loading a RoBERTa
# checkpoint into BertForSequenceClassification leaves the pretrained 'roberta.*'
# weights unused, so training starts from a randomly initialised encoder.
# The Auto class picks the architecture from the checkpoint's config.
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME,
                                                           num_labels=5,
                                                           output_hidden_states=False,
                                                           output_attentions=False)

if device.type != 'cpu':
    print("Running model in CUDA")
    model.cuda()

optimizer = optim.AdamW(model.parameters(),lr=LEARNING_RATE, eps=EPS)
total_steps = len(train_dataloader) * EPOCHS

# Cosine decay over the whole run, no warm-up.
scheduler = get_cosine_schedule_with_warmup(optimizer=optimizer,
                                            num_warmup_steps=0,
                                            num_training_steps=total_steps)

training_stats = []
scaler = GradScaler()
total_t0 = time.time()

for epoch in range(EPOCHS):
    print('')
    print(f'======== Epoch {epoch+1:}/{EPOCHS:} ========')
    print('Training...')

    t0 = time.time()  # epoch start time
    total_train_loss = 0  # running sum of batch losses

    model.train()
    for step, batch in enumerate(train_dataloader):
        # Progress report every 250 batches (never for the very first one).
        if step % 250 == 0 and step != 0:
            elapsed = format_time(time.time() - t0)
            print(f'  Batch {step:>5,} of {len(train_dataloader):>5,}.    Elapsed: {elapsed}.')

        b_input_ids = batch['input_ids'].to(device)
        b_input_mask = batch['attention_mask'].to(device)
        b_labels = batch['targets'].to(device)

        optimizer.zero_grad()  # reset gradients from the previous batch

        # Forward pass under autocast (mixed precision).
        with autocast():
            loss, logits = model(b_input_ids,
                                 token_type_ids=None,
                                 attention_mask=b_input_mask,
                                 return_dict=False,
                                 labels=b_labels)
        total_train_loss += loss.item()

        # Backward pass. With a GradScaler the order must be:
        #   scaled backward -> unscale -> clip -> scaler.step -> scaler.update.
        # There is exactly ONE optimizer step per batch, and it goes through the
        # scaler, which skips the update when the gradients contain inf/NaN.
        # A direct optimizer.step() would apply those non-finite gradients and
        # turn the weights (and hence the loss) into NaN.
        scaler.scale(loss).backward()
        scaler.unscale_(optimizer)  # clip on true gradient magnitudes
        nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        scaler.step(optimizer)
        scaler.update()
        scheduler.step()  # advance the learning-rate schedule

    avg_train_loss = total_train_loss / len(train_dataloader)  # mean batch loss
    training_time = format_time(time.time() - t0)
    print('')
    print(f'  Average training loss: {avg_train_loss:.2f}')
    print(f'  Training epcoh took: {training_time:}')

    # ----------------------------------------------------------------
    # Validation: runs once per epoch, inside the epoch loop.

    print('')
    print('Running Validation...')

    t0 = time.time()

    model.eval()  # evaluation mode

    total_eval_accuracy = 0
    total_eval_loss = 0

    for batch in val_dataloader:
        b_input_ids = batch['input_ids'].to(device)
        b_input_mask = batch['attention_mask'].to(device)
        b_labels = batch['targets'].to(device)

        with torch.no_grad():
            loss, logits = model(b_input_ids,
                                 token_type_ids=None,
                                 attention_mask=b_input_mask,
                                 return_dict=False,
                                 labels=b_labels)
            total_eval_loss += loss.item()

            logits = logits.detach().cpu().numpy()
            label_ids = b_labels.to('cpu').numpy()

            total_eval_accuracy += accuracy_measure(logits, label_ids)

    # Mean of per-batch values over the validation loader.
    avg_val_accuracy = total_eval_accuracy / len(val_dataloader)
    avg_val_loss = total_eval_loss / len(val_dataloader)

    validation_time = format_time(time.time() - t0)

    print(f"  Validation Loss: {avg_val_loss:.2f}")
    print(f"  Validation took: {validation_time:}")

    # One stats record per epoch.
    training_stats.append(
        {
            'epoch': epoch+1,
            'Training Loss': avg_train_loss,
            'Valid. Loss': avg_val_loss,
            'Valid. Accur.': avg_val_accuracy,
            'Training Time': training_time,
            'Validation Time': validation_time
        }
    )

print("")
print("Training complete!")
print("Total training took {:} (h:mm:ss)".format(format_time(time.time()-total_t0)))


# Save the model
os.makedirs(MODEL_SAVE_PATH, exist_ok=True)

print(f'saving model to {MODEL_SAVE_PATH}')

model_to_save = model.module if hasattr(model, 'module') else model # Take care of distributed/parallel training
model_to_save.save_pretrained(MODEL_SAVE_PATH)
tokenizer.save_pretrained(MODEL_SAVE_PATH)

# ----------------------------------------------------------------

# model.eval()


cuda:0


You are using a model of type roberta to instantiate a model of type bert. This is not supported for all configurations of models and can yield errors.
Some weights of the model checkpoint at klue/roberta-base were not used when initializing BertForSequenceClassification: ['roberta.encoder.layer.2.intermediate.dense.bias', 'roberta.encoder.layer.5.output.dense.bias', 'roberta.encoder.layer.0.attention.self.query.weight', 'roberta.encoder.layer.7.output.dense.bias', 'roberta.encoder.layer.2.attention.output.LayerNorm.bias', 'roberta.encoder.layer.3.intermediate.dense.bias', 'roberta.encoder.layer.3.attention.self.query.bias', 'roberta.encoder.layer.3.intermediate.dense.weight', 'roberta.encoder.layer.6.output.LayerNorm.weight', 'roberta.encoder.layer.9.output.LayerNorm.weight', 'roberta.encoder.layer.9.attention.output.LayerNorm.bias', 'roberta.encoder.layer.7.intermediate.dense.bias', 'roberta.encoder.layer.3.attention.self.key.bias', 'roberta.encoder.layer.4.attention.output.LayerNorm

Running model in CUDA

Training...
  Batch   250 of 2,500.    Elapsed: 0:01:32.
  Batch   500 of 2,500.    Elapsed: 0:03:03.
  Batch   750 of 2,500.    Elapsed: 0:04:35.
  Batch 1,000 of 2,500.    Elapsed: 0:06:06.
  Batch 1,250 of 2,500.    Elapsed: 0:07:38.
  Batch 1,500 of 2,500.    Elapsed: 0:09:10.
  Batch 1,750 of 2,500.    Elapsed: 0:10:41.
  Batch 2,000 of 2,500.    Elapsed: 0:12:13.
  Batch 2,250 of 2,500.    Elapsed: 0:13:44.

  Average training loss: 0.87
  Training epcoh took: 0:15:16

Running Validation...

Training...
  Batch   250 of 2,500.    Elapsed: 0:01:32.
  Batch   500 of 2,500.    Elapsed: 0:03:03.
  Batch   750 of 2,500.    Elapsed: 0:04:35.
  Batch 1,000 of 2,500.    Elapsed: 0:06:06.
  Batch 1,250 of 2,500.    Elapsed: 0:07:37.
  Batch 1,500 of 2,500.    Elapsed: 0:09:09.
  Batch 1,750 of 2,500.    Elapsed: 0:10:27.
  Batch 2,000 of 2,500.    Elapsed: 0:11:44.
  Batch 2,250 of 2,500.    Elapsed: 0:13:02.

  Average training loss: nan
  Training epcoh took: 0:14

In [None]:
# Free memory between training runs: collect unreachable Python objects, then
# return PyTorch's cached, currently unused CUDA blocks to the driver.
# NOTE(review): `model` and `optimizer` from the previous cell are still referenced
# as notebook globals, so their GPU memory is presumably not released here —
# confirm, and `del` them first if reclaiming that memory is the goal.
import gc
gc.collect()
torch.cuda.empty_cache()

In [None]:
# Fine-tune KLUE BERT on KOTE (5 emotion classes) with mixed precision,
# validate after every epoch, then save the model and tokenizer.

# Seed every RNG in play so runs are repeatable.
RANDOM_SEED = 42
random.seed(RANDOM_SEED)
np.random.seed(RANDOM_SEED)
torch.manual_seed(RANDOM_SEED)
torch.cuda.manual_seed_all(RANDOM_SEED)

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(device)

MODEL_NAME = 'klue/bert-base'
MAX_LEN = 320  # maximum token length
BATCH_SIZE = 16
LEARNING_RATE = 2e-5
EPOCHS = 1
EPS = 1e-8  # epsilon that guards the optimizer against division by zero

# Load data
DATA_DIR = './data'
FILE_NAME = 'KOTE_relabel.tsv'
DATA_PATH = os.path.join(DATA_DIR, FILE_NAME)
kote = pd.read_csv(DATA_PATH, sep='\t', index_col=0)
# The split column is spelled 'datset' in the file.
train_df = kote[kote['datset']=='train']
val_df = kote[kote['datset']=='valid']
test_df = kote[kote['datset']=='test']

MODEL_DIR = './model/'
MODEL_SAVE_NAME = f'kote-trained-by-{MODEL_NAME}'
MODEL_SAVE_PATH = MODEL_DIR + MODEL_SAVE_NAME

# Initializing model based tokenizer
tokenizer = BertTokenizer.from_pretrained(MODEL_NAME, do_lower_case=False)

train_dataloader = KorSongsDataLoader(train_df, tokenizer, MAX_LEN, BATCH_SIZE)
val_dataloader = KorSongsDataLoader(val_df, tokenizer, MAX_LEN, BATCH_SIZE)
test_dataloader = KorSongsDataLoader(test_df, tokenizer, MAX_LEN, BATCH_SIZE)

model = BertForSequenceClassification.from_pretrained(MODEL_NAME,
                                                      num_labels=5,
                                                      output_hidden_states=False,
                                                      output_attentions=False)

if device.type != 'cpu':
    print("Running model in CUDA")
    model.cuda()

optimizer = optim.AdamW(model.parameters(),lr=LEARNING_RATE, eps=EPS)
total_steps = len(train_dataloader) * EPOCHS

# Create the learning rate scheduler (cosine decay over the whole run, no warm-up)
scheduler = get_cosine_schedule_with_warmup(optimizer=optimizer,
                                            num_warmup_steps=0,
                                            num_training_steps=total_steps)

training_stats = []
scaler = GradScaler()
total_t0 = time.time()

for epoch in range(EPOCHS):
    print('')
    print(f'======== Epoch {epoch+1:}/{EPOCHS:} ========')
    print('Training...')

    t0 = time.time()  # epoch start time
    total_train_loss = 0  # running sum of batch losses

    model.train()
    for step, batch in enumerate(train_dataloader):
        # Progress report every 250 batches (never for the very first one).
        if step % 250 == 0 and step != 0:
            elapsed = format_time(time.time() - t0)
            print(f'  Batch {step:>5,} of {len(train_dataloader):>5,}.    Elapsed: {elapsed}.')

        b_input_ids = batch['input_ids'].to(device)
        b_input_mask = batch['attention_mask'].to(device)
        b_labels = batch['targets'].to(device)

        optimizer.zero_grad()  # reset gradients from the previous batch

        # Forward pass under autocast (mixed precision).
        with autocast():
            loss, logits = model(b_input_ids,
                                 token_type_ids=None,
                                 attention_mask=b_input_mask,
                                 return_dict=False,
                                 labels=b_labels)
        total_train_loss += loss.item()

        # Backward pass. With a GradScaler the order must be:
        #   scaled backward -> unscale -> clip -> scaler.step -> scaler.update.
        # There is exactly ONE optimizer step per batch, and it goes through the
        # scaler, which skips the update when the gradients contain inf/NaN.
        # A direct optimizer.step() would apply those non-finite gradients and
        # turn the weights (and hence the loss) into NaN.
        scaler.scale(loss).backward()
        scaler.unscale_(optimizer)  # clip on true gradient magnitudes
        nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        scaler.step(optimizer)
        scaler.update()
        scheduler.step()  # advance the learning-rate schedule

    avg_train_loss = total_train_loss / len(train_dataloader)  # mean batch loss
    training_time = format_time(time.time() - t0)
    print('')
    print(f'  Average training loss: {avg_train_loss:.2f}')
    print(f'  Training epcoh took: {training_time:}')

    # ----------------------------------------------------------------
    # Validation: runs once per epoch, inside the epoch loop.

    print('')
    print('Running Validation...')

    t0 = time.time()

    model.eval()  # evaluation mode

    total_eval_accuracy = 0
    total_eval_loss = 0

    for batch in val_dataloader:
        b_input_ids = batch['input_ids'].to(device)
        b_input_mask = batch['attention_mask'].to(device)
        b_labels = batch['targets'].to(device)

        with torch.no_grad():
            loss, logits = model(b_input_ids,
                                 token_type_ids=None,
                                 attention_mask=b_input_mask,
                                 return_dict=False,
                                 labels=b_labels)
            total_eval_loss += loss.item()

            logits = logits.detach().cpu().numpy()
            label_ids = b_labels.to('cpu').numpy()

            total_eval_accuracy += accuracy_measure(logits, label_ids)

    # Mean of per-batch values over the validation loader.
    avg_val_accuracy = total_eval_accuracy / len(val_dataloader)
    avg_val_loss = total_eval_loss / len(val_dataloader)

    validation_time = format_time(time.time() - t0)

    print(f"  Validation Loss: {avg_val_loss:.2f}")
    print(f"  Validation took: {validation_time:}")

    # One stats record per epoch.
    training_stats.append(
        {
            'epoch': epoch+1,
            'Training Loss': avg_train_loss,
            'Valid. Loss': avg_val_loss,
            'Valid. Accur.': avg_val_accuracy,
            'Training Time': training_time,
            'Validation Time': validation_time
        }
    )

print("")
print("Training complete!")
print("Total training took {:} (h:mm:ss)".format(format_time(time.time()-total_t0)))


# Save the model
os.makedirs(MODEL_SAVE_PATH, exist_ok=True)

print("saving model to {}".format(MODEL_SAVE_PATH))

model_to_save = model.module if hasattr(model, 'module') else model # Take care of distributed/parallel training
model_to_save.save_pretrained(MODEL_SAVE_PATH)
tokenizer.save_pretrained(MODEL_SAVE_PATH)

# ----------------------------------------------------------------

# model.eval()


In [None]:
# Collect unreachable Python objects, then return PyTorch's cached, currently
# unused CUDA blocks to the driver.
# NOTE(review): the trained `model` stays referenced (it is used for prediction
# below), so its GPU memory is presumably not released here — confirm.
import gc
gc.collect()
torch.cuda.empty_cache()

In [None]:
sample_lyrics = '''
당신은 날 설레게 만들어
조용한 내 마음 자꾸만 춤추게 해
얼마나 얼마나 날 떨리게 하는지
당신이 이 밤을 항상 잠 못 들게 해
매일 같은 밤 너를 생각하면서
유치한 노랠 들으며
심장이 춤을 추면서
오오 난 너를 기다리면서
유치한 노랠 부르며
심장이 춤을 추면서 워
나를 떨리게 하나요
그대 왜 나를 설레게 하나요 자꾸만
오늘도 잠 못 이루는 이 밤
아름다운 그대
나를 아프게 하나요
웃는 그대 왜 자꾸 설레게 하나요
하염없이
오늘 밤 잠이 오질 않네요
보고 싶은 그대여
당신이 날 힘들게 만들어
갑자기 내 마음 자꾸만 멍들게 해
얼마나 얼마나 잠 못 들게 하는지
고요한 내 마음 항상 시끄럽게 해
매일 같은 밤 너를 생각하면서
유치한 노랠 들으며
심장이 춤을 추면서
워 난 너를 기다리면서
유치한 노랠 부르며
심장이 춤을 추면서 워
나를 떨리게 하나요
그대 왜 나를 설레게 하나요 자꾸만
오늘도 잠 못 이루는 이 밤
아름다운 그대
나를 아프게 하나요
웃는 그대 왜 자꾸 설레게 하나요
하염없이
오늘 밤 잠이 오질 않네요
보고 싶은 그대여
매일 밤 너를 생각하면서
유치한 노랠 들으며
심장이 춤을 추면서
워오 난 너를 기다리면서
유치한 노랠 부르며
심장이 춤을 추면서 워
나를 떨리게 하나요
그대 왜 나를 설레게 하나요 자꾸만
오늘도 잠 못 이루는 이 밤
아름다운 그대여
나를 아프게 하나요
웃는 그대 왜 자꾸 설레게 하나요
하염없이
오늘 밤 잠이 오질 않네요
보고 싶은 그대여
'''

# Wrap the sample lyrics in a one-row frame with a 'text' column and a
# hand-assigned 'label' (행복한), then display it.
sample_df = pd.DataFrame({'text':[sample_lyrics], 'label':['행복한']})
sample_df

Unnamed: 0,text,label
0,\n당신은 날 설레게 만들어\n조용한 내 마음 자꾸만 춤추게 해\n얼마나 얼마나 날...,행복한


In [None]:
# Emotion label -> integer class id; ids follow the order of the labels listed here.
emotion_class = {
    label: class_id
    for class_id, label in enumerate(['행복한', '화나는', '평온한', '슬픈', '없음'])
}

# Add the numeric 'class' column derived from the text label, then display the frame.
sample_df['class'] = sample_df['label'].map(emotion_class)
sample_df

Unnamed: 0,text,label,class
0,\n당신은 날 설레게 만들어\n조용한 내 마음 자꾸만 춤추게 해\n얼마나 얼마나 날...,행복한,0


In [None]:
def lyrics_predict(lyrics, max_len=100):
    """Predict the emotion class id for one piece of lyrics.

    The text passed in is what gets encoded and scored. Scoring a fixed global
    sample frame instead would return the same class for every input.

    Relies on the notebook globals ``tokenizer``, ``model`` and ``device`` set
    up by the most recently run training cell.

    Args:
        lyrics: the lyrics text (converted with ``str``).
        max_len: token length the text is padded / truncated to. Defaults to
            100, the value this function has always used.

    Returns:
        int: index of the highest-scoring class.
    """
    encoding = tokenizer.encode_plus(str(lyrics),
                                     add_special_tokens=True,
                                     max_length=max_len,
                                     return_attention_mask=True,
                                     return_tensors='pt',
                                     return_token_type_ids=False,
                                     padding='max_length',
                                     truncation=True)
    # return_tensors='pt' already yields a batch of one: shape (1, max_len).
    input_ids = encoding['input_ids'].to(device)
    attention_mask = encoding['attention_mask'].to(device)

    model.eval()
    with torch.no_grad():
        # No labels are passed, so no loss is computed; with return_dict=False
        # the logits are the first element of the returned tuple.
        outputs = model(input_ids,
                        token_type_ids=None,
                        attention_mask=attention_mask,
                        return_dict=False)

    logits = outputs[0]
    return int(torch.argmax(logits, dim=1).item())

In [None]:
# Smoke test on the sample lyrics, which were hand-labelled 행복한 (class 0).
lyrics_predict(sample_lyrics)

0

In [None]:
# Hand-labelled Korean song lyrics used as an out-of-domain check.
DATA_IN_PATH = "./data"
FILE_NAME = "kor_songs_validation.tsv"
DATA_PATH = os.path.join(DATA_IN_PATH, FILE_NAME)

# Read the TSV (first column becomes the index) and append the numeric 'class'
# column derived from the 'emotion' label.
songs = pd.read_csv(DATA_PATH, sep="\t", index_col=0).assign(
    **{'class': lambda frame: frame['emotion'].map(emotion_class)}
)
print(f"전체 데이터의 개수: {len(songs)}")
songs.head()

전체 데이터의 개수: 113


Unnamed: 0_level_0,lyrics,emotion,class
genre,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
댄스,20 22 PSY Coming back (이리 오너라) Long time no se...,행복한,0
록/메탈,Look at you 넌 못 감당해 날 Ya took off hook 기분은 Cok...,화나는,1
발라드,너와 함께 하고 싶은 일들을 상상하는 게 요즘 내 일상이 되고 너의 즐거워하는 모습...,행복한,0
발라드,너를 위해 해 줄 것이 하나 없어서 보낼 수밖에 없었고 네가 없이 사는 법을 알지 ...,슬픈,3
발라드,하얀 머리 뽑아 달라며 한 개 백 원이라던 그 시절 다 지나가고 이젠 흰 눈만 남았...,슬픈,3


In [None]:
# Run lyrics_predict on every song's lyrics and store the class id in 'predict'
# (this adds a column to `songs` in place), then display the frame.
songs['predict'] = songs['lyrics'].map(lyrics_predict)
songs

Unnamed: 0_level_0,lyrics,emotion,class,predict
genre,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
댄스,20 22 PSY Coming back (이리 오너라) Long time no se...,행복한,0,0
록/메탈,Look at you 넌 못 감당해 날 Ya took off hook 기분은 Cok...,화나는,1,0
발라드,너와 함께 하고 싶은 일들을 상상하는 게 요즘 내 일상이 되고 너의 즐거워하는 모습...,행복한,0,0
발라드,너를 위해 해 줄 것이 하나 없어서 보낼 수밖에 없었고 네가 없이 사는 법을 알지 ...,슬픈,3,0
발라드,하얀 머리 뽑아 달라며 한 개 백 원이라던 그 시절 다 지나가고 이젠 흰 눈만 남았...,슬픈,3,0
...,...,...,...,...
발라드,너와 함께 하고 싶은 일들을 상상하는 게 요즘 내 일상이 되고 너의 즐거워하는 모습...,행복한,0,0
발라드,둥근 해가 뜨면 제일 먼저 기분 좋은 상상을 하지 하나 둘 셋 자리에 일어나 하마처...,행복한,0,0
인디음악,사랑노래가 너무 많아서 어떤 노래로 고백을 할까 처음본 순간 falling love...,행복한,0,0
국내뮤지컬,잔인한 곳 무자비한 곳 목구멍 풀칠해 버텨내 살아내는 것도 벅차 세상은 잔인한 곳 ...,화나는,1,0


In [None]:
# Accuracy: fraction of songs whose predicted class equals the labelled class.
(songs['class'] == songs['predict']).mean()

0.22123893805309736

In [None]:
# Distribution of predicted classes; a single class covering every row points to
# a degenerate predictor rather than a real signal.
songs['predict'].value_counts()

0    113
Name: predict, dtype: int64