In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [7]:
import pandas as pd
import numpy as np
import torch
from tqdm.auto import tqdm
import random
import os
import re

def reset_seeds(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True

DATA_PATH = "/content/drive/MyDrive/final_project/data/"
SEED = 42

device = 'cuda' if torch.cuda.is_available() else 'cpu'
device

'cpu'

In [3]:
!pip install iterative-stratification

Collecting iterative-stratification
  Downloading iterative_stratification-0.1.7-py3-none-any.whl (8.5 kB)
Installing collected packages: iterative-stratification
Successfully installed iterative-stratification-0.1.7


In [4]:
!pip install transformers

Collecting transformers
  Downloading transformers-4.32.0-py3-none-any.whl (7.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.5/7.5 MB[0m [31m13.6 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.15.1 (from transformers)
  Downloading huggingface_hub-0.16.4-py3-none-any.whl (268 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m268.8/268.8 kB[0m [31m25.4 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers)
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m36.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting safetensors>=0.3.1 (from transformers)
  Downloading safetensors-0.3.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m28.8 MB/s[0m eta [36m0:00:0

In [8]:
df = pd.read_csv(f"{DATA_PATH}train_df.csv")
df.head()

Unnamed: 0,content,username,comments,date,like,tags,id,분야
0,"@brmud_official 제 뽀얀(?) 피부의 비결,,,올리브영 1등 마스크팩...","['_sweet_guy._', 'piano_salmon', 'avant__dh', ...","['진흙을 발라놔도 화사하신 미모❤😳', '크 꿀피부비결이 요거였군요😍', '피부 ...",2023-08-14,787.0,"['#협찬', '#비알머드', '#BRMUD', '#리커버머드마스크', '#모공팩'...",__bbshy,뷰티
1,@lolarose 5월 12일 한국에 새롭게 출시한 롤라로즈 !단풍잎 모양이 넘 ...,['nan'],['nan'],2023-08-12,1018.0,"['#협찬', '#롤라로즈', '#롤라로즈코리아', '#Lolarose', '#여자...",__bbshy,뷰티
2,@hassoul_official 편한데 스타일리한 해즈소울~~~제가 사용한 심리스...,"['merryzzy', 'bini._.9', 'ricklee504', 'it._.z...","['이고 나두 입는데 편해 ❤️ 모델이네 꺄아', '편하게 좋을거같아용🖤🤭', 'L...",2023-08-11,871.0,"['#광고', '#해즈소울', '#심리스브라', '#심리스드로즈', '#브라추천',...",__bbshy,뷰티
3,리버클래시 파격 프로모션 주목‼️리버클래시는 자유로운 뜻의 'Liberl'과 고급...,"['hym.constant', 'hamdarong', 's_oeeuuu']","['어익후 내리다가 너무잘생겨서 깜짝놀랬네!', '차은우 대죤잘❤️', '예쁘고 감...",2023-08-10,772.0,"['#협찬', '#리버클래시', '#LIBERCLASSY', '#차은우', '#차은...",__bbshy,뷰티
4,@clarinskorea 여름철 안티에이징 제품으로 선택한 클라랑스 에센스&아이크...,"['kobyunghee0515', 'iamtravelerjin', '3_able',...",['우와 (👏👏👏) 넘 좋네요(👍👍👍) + 인님의 미모😍 (최고) 태풍🌀 피해 없...,2023-08-10,714.0,"['#제품제공', '#프레셔스라로씨옹', '#프레셔스라크렘므이으', '#클라랑스',...",__bbshy,뷰티


In [9]:
df['tags'][0]

"['#협찬', '#비알머드', '#BRMUD', '#리커버머드마스크', '#모공팩', '#보령머드팩', '#머드팩', '#올리브영', '#올영세일', '#올리브영마스크팩', '#올리브영팩', '#올리브영추천템', '#OLIVEYOUNG', '#올리브영모공팩', '#mudmask', '#ビーアールマッド']"

In [10]:
df['tags'] = df['tags'].str.replace('[', '', regex=True).str.replace("\'", '', regex=True).str.replace('#', '', regex=True).str.replace(',', '', regex=True).str.replace(']', '', regex=True)

In [11]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25430 entries, 0 to 25429
Data columns (total 8 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   content   25430 non-null  object 
 1   username  25430 non-null  object 
 2   comments  25419 non-null  object 
 3   date      25430 non-null  object 
 4   like      25430 non-null  float64
 5   tags      25430 non-null  object 
 6   id        25430 non-null  object 
 7   분야        25205 non-null  object 
dtypes: float64(1), object(7)
memory usage: 1.6+ MB


In [12]:
model_name = "ainize/klue-bert-base-mrc"

In [13]:
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name)

Downloading (…)okenizer_config.json:   0%|          | 0.00/499 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/609 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/248k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/495k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

In [14]:
from transformers import AutoModel
model = AutoModel.from_pretrained(model_name)
model

Downloading pytorch_model.bin:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertModel were not initialized from the model checkpoint at ainize/klue-bert-base-mrc and are newly initialized: ['bert.pooler.dense.weight', 'bert.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(32000, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0-11): 12 x BertLayer(
        (attention): BertAttention(
          (self): BertSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False)
  

In [15]:
train = df['content']

In [16]:
train = tokenizer(train.tolist(), max_length=512, padding='max_length', truncation=True, return_offsets_mapping=True)

In [17]:
offset_mapping = np.array(train['offset_mapping'])

In [18]:
input_ids = np.array(train['input_ids'])
token_type_ids = np.array(train['token_type_ids'])
attention_mask = np.array(train['attention_mask'])

input_ids.shape, token_type_ids.shape, attention_mask.shape

((25430, 512), (25430, 512), (25430, 512))

In [19]:
df['content'][0][1:2]

'@'

In [20]:
token_lst = []
for word in ['올리브', '올리브영']:
    token = tokenizer.tokenize(word)
    token_ids = tokenizer.convert_tokens_to_ids(token)
    token_lst.append(token_ids)

In [21]:
token_lst

[[11962], [11962, 2122]]

In [22]:
df['content'][0]

' @brmud_official 제 뽀얀(?) 피부의 비결,,,올리브영 1등 마스크팩비알머드까다롭기로 유명한 올리브영에서 리뷰 1,800여개가 넘는, 평점은 무려 4.8점의 확신의 꿀템이죠😁저는 1일 1팩보다는 일주일에 한두번씩 머드팩을 해주곤 하는데요. 7일동안 묵어있던 피지와 각질이 싹 씻겨나가면서 다음날 화장도 잘 먹고 피부결이 화사해보인달까요? 중요한 약속 전날에 해줘도 굿굿💕             '

In [23]:
df['content'][0][34: 34+len('올리브')]

'올리브'

In [24]:
df['content'][0].find('올리브')

34

In [25]:
df['content'][0][34 : 37]

'올리브'

In [26]:
df['content'][0][34: 34+len('올리브영')]

'올리브영'

In [27]:
df['content'][0].find('올리브영')

34

In [28]:
df['content'][0][34 : 38]

'올리브영'

In [29]:
df['tags'][0].split(' ')

['협찬',
 '비알머드',
 'BRMUD',
 '리커버머드마스크',
 '모공팩',
 '보령머드팩',
 '머드팩',
 '올리브영',
 '올영세일',
 '올리브영마스크팩',
 '올리브영팩',
 '올리브영추천템',
 'OLIVEYOUNG',
 '올리브영모공팩',
 'mudmask',
 'ビーアールマッド']

In [30]:
token_lst = []
for word in df['tags'][0].split(' '):
    token = tokenizer.tokenize(word)
    token_ids = tokenizer.convert_tokens_to_ids(token)
    token_lst.append(token_ids)

In [31]:
token_lst

[[18497],
 [1187, 2457, 2282, 2343],
 [38, 2107, 2110, 2309, 2134],
 [31499, 2264, 2282, 2343, 6931, 2292],
 [14431, 3227],
 [16495, 2282, 2343, 3227],
 [28929, 3227],
 [11962, 2122],
 [1446, 2122, 2103, 2210],
 [11962, 2122, 6931, 2292, 3227],
 [11962, 2122, 3227],
 [11962, 2122, 2159, 2337, 2643],
 [51, 2237, 2184, 24423, 2214, 28721, 9029],
 [11962, 2122, 2391, 2086, 3227],
 [80, 8844, 15869, 2041, 2038],
 [1]]

In [32]:
input_ids[0]

array([    2,    36,    69,  2008,  2037,  8844,    66, 23017,  1545,
        1218,  3272,    12,    35,    13,  4335,  2079,  8459,    16,
          16,    16, 11962,  2122,    21,  2491,  8979,  3227,  2151,
        2457,  2282,  2343,  2299,  2062,  3283,  2015,  2200,  4455,
        2470, 11962,  2122, 27135, 11622,    21,    16,  6971,  2173,
        2019,  2116,   749,  2259,    16, 20609,  2073,  6294,    24,
          18,    28,  2532,  2079,  6483,  2079,     1,    21,  2210,
          21,  3227,  2178,  4000, 28179,  2170,  7978,  2517,  3292,
       28929,  3227,  2069,  1897,  2223,  2344,  1889, 13964,  2182,
          18,    27,  2210,  4157,  1089,  2051,  2689,  2414, 12407,
        2522, 13928,  2052,  1337, 30881, 16570, 31369,  3729,  2401,
        4669,  2119,  1521,  1059,  2088,  4335,  2489,  2052, 13379,
        2097,  2178,  2179,  2448,  6301,    35,  3748,  2470,  4680,
        5978,  2170, 23942,  2119,   623,  2673,  2152,     3,     0,
           0,     0,

In [33]:
import sys

In [34]:
np.set_printoptions(threshold=sys.maxsize)

In [36]:
df['tags'][0]

'협찬 비알머드 BRMUD 리커버머드마스크 모공팩 보령머드팩 머드팩 올리브영 올영세일 올리브영마스크팩 올리브영팩 올리브영추천템 OLIVEYOUNG 올리브영모공팩 mudmask ビーアールマッド'

In [35]:
offset_mapping[0]

array([[  0,   0],
       [  1,   2],
       [  2,   3],
       [  3,   4],
       [  4,   5],
       [  5,   7],
       [  7,   8],
       [  8,  16],
       [ 17,  18],
       [ 19,  20],
       [ 20,  21],
       [ 21,  22],
       [ 22,  23],
       [ 23,  24],
       [ 25,  27],
       [ 27,  28],
       [ 29,  31],
       [ 31,  32],
       [ 32,  33],
       [ 33,  34],
       [ 34,  37],
       [ 37,  38],
       [ 39,  40],
       [ 40,  41],
       [ 42,  45],
       [ 45,  46],
       [ 46,  47],
       [ 47,  48],
       [ 48,  49],
       [ 49,  50],
       [ 50,  51],
       [ 51,  52],
       [ 52,  53],
       [ 53,  54],
       [ 54,  55],
       [ 56,  58],
       [ 58,  59],
       [ 60,  63],
       [ 63,  64],
       [ 64,  66],
       [ 67,  69],
       [ 70,  71],
       [ 71,  72],
       [ 72,  75],
       [ 75,  76],
       [ 76,  77],
       [ 77,  78],
       [ 79,  80],
       [ 80,  81],
       [ 81,  82],
       [ 83,  85],
       [ 85,  86],
       [ 87,

In [None]:
input = input_ids.copy()

In [None]:
# UNK 토큰으로 1인 값 0으로 치환
for i in range(input.shape[0]):
    input[i][input[i] == 1] = 0

In [None]:
def tostr_tag_token(tag):
    tag_list = [tag.split(' ') for tag in tag] # 문자열인 테그를 공백을 기준으로 나눔
    tag_token_str_list = [] # 테그 문자열을 담을 리스트 초기화
    for tags in tag_list:
        tag_token_list = [] # 각 text 마다 토큰화한 테그를 담을 리스트 초기화
        for tag in tags:
            tag_token = tokenizer.convert_tokens_to_ids(tokenizer.tokenize(tag))
            tag_token = ' '.join(map(str,tag_token)) # 토큰화한 테그를 문자열로 변환
            tag_token_list.append(tag_token)
        tag_token_list = list(filter(None, tag_token_list)) # 빈 문자열 제거
        tag_token_list = sorted(tag_token_list, key=lambda x: len(x), reverse=True) # 토큰화한 테그를 내림차순으로 정렬
        tag_token_str_list.append(tag_token_list)
    return tag_token_str_list

In [None]:
def target_data(train, tag, max_length=512):
    target = np.empty((0, max_length), int) # 빈 target 배열 생성
    for text_token, tag_token in tqdm(zip(train, tag), total=train.shape[0]):
        text_token = ' '.join(map(str, text_token)) # text를 각각 가져와 문자열로 변환
        # 각 문장에 해당하는 테그를 가져와 순서대로 해당하는 열을 1로 변환
        for token in tag_token:
            text_token = re.sub(f"{token}\s", '1 '*len(token.split(' ')), text_token)
        text_token = list(map(int, text_token.split(' '))) # 변환된 문자열을 공백을 기준으로 나눔
        text_token = np.array(text_token)
        text_token = (text_token == 1) + 0 # 1을 제외한 숫자를 0으로 치환
        target = np.append(target, text_token.reshape(1, -1), axis=0)
    return target

In [None]:
tag = tostr_tag_token(df['tags'])
target = target_data(input, tag)
target.shape

  0%|          | 0/25430 [00:00<?, ?it/s]

(25430, 512)

In [None]:
notag_list = []
for i in range(target.shape[0]):
    if not target[i].sum():
        notag_list.append(i)

In [None]:
len(notag_list)

6779

In [None]:
input_ids = np.delete(input_ids, notag_list, axis=0)
token_type_ids = np.delete(token_type_ids, notag_list, axis=0)
attention_mask  = np.delete(attention_mask , notag_list, axis=0)
target = np.delete(target, notag_list, axis=0)

input_ids.shape, token_type_ids.shape, attention_mask.shape, target.shape

((18651, 512), (18651, 512), (18651, 512), (18651, 512))

##### 데이터셋 나누기

In [None]:
def permutation_train_test_split(input_ids, token_type_ids, attention_mask, target, test_size=0.2, shuffle=True, random_state=SEED):
    test_num = int(input_ids.shape[0] * test_size)
    train_num = input_ids.shape[0] - test_num

    if shuffle:
        np.random.seed(random_state)
        shuffled = np.random.permutation(input_ids.shape[0])

        input_ids = input_ids[shuffled,:]
        token_type_ids = token_type_ids[shuffled,:]
        attention_mask = attention_mask[shuffled,:]
        target = target[shuffled,:]

        train_input_ids = input_ids[:train_num]
        train_token_type_ids = token_type_ids[:train_num]
        train_attention_mask = attention_mask[:train_num]

        test_input_ids = input_ids[train_num:]
        test_token_type_ids = token_type_ids[train_num:]
        test_attention_mask = attention_mask[train_num:]

        train_target = target[:train_num]

        test_target = target[train_num:]

    else:
        train_input_ids = input_ids[:train_num]
        train_token_type_ids = token_type_ids[:train_num]
        train_attention_mask = attention_mask[:train_num]

        test_input_ids = input_ids[train_num:]
        test_token_type_ids = token_type_ids[train_num:]
        test_attention_mask = attention_mask[train_num:]

        train_target = target[:train_num]

        test_target = target[train_num:]

    return train_input_ids, train_token_type_ids, train_attention_mask, test_input_ids, test_token_type_ids, test_attention_mask, train_target, test_target

In [None]:
train_input_ids, train_token_type_ids, train_attention_mask, test_input_ids, test_token_type_ids, test_attention_mask, train_target, test_target = permutation_train_test_split(input_ids, token_type_ids, attention_mask, target)

#### 학습데이터셋

In [None]:
train_input_ids.shape, train_token_type_ids.shape, train_attention_mask.shape, train_target.shape

((14921, 512), (14921, 512), (14921, 512), (14921, 512))

#### 테스트데이터셋

In [None]:
test_input_ids.shape, test_token_type_ids.shape, test_attention_mask.shape, test_target.shape

((3730, 512), (3730, 512), (3730, 512), (3730, 512))



---



In [None]:
class BlogDataset(torch.utils.data.Dataset):
    def __init__(self, input_ids, attention_mask, token_type_ids, y=None):
        self.input_ids = input_ids
        self.attention_mask = attention_mask
        self.token_type_ids = token_type_ids
        self.y = y

    def __len__(self):
        return self.input_ids.shape[0]

    def __getitem__(self, idx):
        item = {}
        item['input_ids'] = torch.tensor(self.input_ids[idx])
        item['attention_mask'] = torch.tensor(self.attention_mask[idx])
        item['token_type_ids'] = torch.tensor(self.token_type_ids[idx])
        if self.y is not None:
            item['y'] = torch.Tensor(self.y[idx])
        return item

In [None]:
dt = BlogDataset(train_input_ids, train_attention_mask, train_token_type_ids, train_target)
dl = torch.utils.data.DataLoader(dt, batch_size=2)
batch = next(iter(dl))
batch

{'input_ids': tensor([[    2,  4842, 25019,  ...,     0,     0,     0],
         [    2, 25687,  2645,  ...,     0,     0,     0]]),
 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0]]),
 'token_type_ids': tensor([[0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0]]),
 'y': tensor([[0., 0., 0.,  ..., 0., 0., 0.],
         [0., 1., 1.,  ..., 0., 0., 0.]])}

In [None]:
class Net(torch.nn.Module):
    def __init__(self, model_name):
        super().__init__()
        self.model = AutoModel.from_pretrained(model_name)
        self.output_layer = torch.nn.Linear(self.model.config.hidden_size, 1)

    def forward(self, input_ids, attention_mask, token_type_ids):
        x = self.model(input_ids, attention_mask, token_type_ids)
        return self.output_layer(x[0])

In [None]:
model = Net(model_name)
pred = model(batch['input_ids'], batch['attention_mask'], batch['token_type_ids'])
pred.shape

Some weights of BertModel were not initialized from the model checkpoint at ainize/klue-bert-base-mrc and are newly initialized: ['bert.pooler.dense.weight', 'bert.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


torch.Size([2, 512, 1])

In [None]:
def train_loop(dataloader, model, loss_fn, optimizer, device):
    epoch_loss = 0
    model.train()
    for batch in tqdm(dataloader):
        pred = model(batch['input_ids'].to(device), batch['attention_mask'].to(device), batch['token_type_ids'].to(device))
        pred = pred.flatten(1)
        # pedding 한 부분 제외
        mask = batch['attention_mask'].eq(1).to(device)
        target = torch.masked_select(batch['y'].to(device), mask)
        result = torch.masked_select(pred, mask)
        loss = loss_fn(result.view(-1, 1), target.view(-1, 1))

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        epoch_loss += loss.item()

    epoch_loss /= len(dataloader)
    return epoch_loss

In [None]:
@torch.no_grad()
def test_loop(dataloader, model, loss_fn, device):
    model.eval()
    sig = torch.nn.Sigmoid()
    pred_list = []
    epoch_loss = 0
    for batch in dataloader:
        pred = model(batch['input_ids'].to(device), batch['attention_mask'].to(device), batch['token_type_ids'].to(device))
        pred = pred.flatten(1)

        # pedding 한 부분 제외
        if batch.get('y') is not None:
            mask = batch['attention_mask'].eq(1).to(device)
            target = torch.masked_select(batch['y'].to(device), mask)
            result = torch.masked_select(pred, mask)
            loss = loss_fn(result.view(-1, 1), target.view(-1, 1))
            epoch_loss += loss.item()

        pred = sig(pred)
        pred = pred.to('cpu').numpy()
        pred_list.append(pred)

    pred = np.concatenate(pred_list)
    epoch_loss /= len(dataloader)
    return epoch_loss, pred

In [None]:
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
import gc

batch_size = 16
loss_fn = torch.nn.BCEWithLogitsLoss()
device = "cuda" if torch.cuda.is_available() else "cpu"
epochs = 100
n_splits = 5
# cv = KFold(n_splits=n_splits, random_state=SEED, shuffle=True)

In [None]:
from iterstrat.ml_stratifiers import MultilabelStratifiedKFold
cv = MultilabelStratifiedKFold(n_splits=n_splits, random_state=SEED, shuffle=True)

In [None]:
is_holdout = False
reset_seeds(SEED)
best_f1_score_list = []
best_FP_avg_list = []
for i, (tri, vai) in enumerate(cv.split(train_input_ids, train_target)):
    if i <= 1:
        continue
    model = Net(model_name).to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=3e-5)

    # 학습용
    train_dt = BlogDataset(train_input_ids[tri], train_attention_mask[tri], train_token_type_ids[tri], train_target[tri])
    train_dl = torch.utils.data.DataLoader(train_dt, batch_size=batch_size, shuffle=True)

    # 검증용
    valid_dt = BlogDataset(train_input_ids[vai], train_attention_mask[vai], train_token_type_ids[vai], train_target[vai])
    valid_dl = torch.utils.data.DataLoader(valid_dt, batch_size=batch_size, shuffle=False)

    best_f1_score = 0
    best_fp_avg = 0
    patience = 0

    for epoch in tqdm(range(epochs)):
        train_loss = train_loop(train_dl, model, loss_fn, optimizer, device)
        valid_loss, pred = test_loop(valid_dl, model, loss_fn, device)
        pred = (pred > 0.5).astype(int)
        # 정답데이터에서 1인 값만 mask
        mask = train_target[vai] == 1
        f1 = f1_score(train_target[vai][mask], pred[mask], average='micro')
        # FP
        fp_score = ((pred == 1) & (train_target[vai] == 0)).sum()
        # 평군 FP
        fp_avg = fp_score/train_target[vai].shape[0]

        patience += 1
        print(train_loss, valid_loss, f1, fp_score, fp_avg, sep="\t")
        if f1 > best_f1_score:
            patience = 0
            best_f1_score = f1
            best_fp_avg = fp_avg  # F1 score가 가장 높을 때의 FP 점수 기록
            torch.save(model.state_dict(), f"model_{i}.pth")

        if patience == 3:
            break

    print(f"{i} 번째 폴드 best F1_score: {best_f1_score}")
    print(f"{i} 번째 폴드 best FP: {best_fp_avg}")
    best_f1_score_list.append(best_f1_score)
    best_FP_avg_list.append(best_fp_avg)

    del train_dl, train_dt, valid_dl, valid_dt
    gc.collect()

    del optimizer, model
    torch.cuda.empty_cache()

    if is_holdout:
        break
print(np.mean(best_f1_score_list))
print(np.mean(best_FP_avg_list))

Some weights of BertModel were not initialized from the model checkpoint at ainize/klue-bert-base-mrc and are newly initialized: ['bert.pooler.dense.weight', 'bert.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/748 [00:00<?, ?it/s]

0.07879336114554482	0.06791906948051145	0.6006150985230483	8269	2.7916948008102636


  0%|          | 0/748 [00:00<?, ?it/s]

0.055315659904384355	0.07100754109041024	0.7328612809784358	18647	6.295408507765024


  0%|          | 0/748 [00:00<?, ?it/s]

0.039898979714150254	0.07739858537591914	0.6931302077745592	13932	4.703578663065496


  0%|          | 0/748 [00:00<?, ?it/s]

KeyboardInterrupt: ignored

#### 추론하기

In [None]:
test_dt = BlogDataset(test_input_ids, test_attention_mask, test_token_type_ids)
test_dl = torch.utils.data.DataLoader(test_dt, batch_size=batch_size, shuffle = False)

pred_list = []
for i in range(n_splits):
    model = Net(model_name).to(device)
    state_dict = torch.load(f"model_{i}.pth") # 가중치 불러오기
    model.load_state_dict(state_dict) # 모델에 가중치 세팅

    _, pred = test_loop(test_dl, model, loss_fn, device)
    pred_list.append(pred)

pred = np.mean(pred_list, axis=0)
pred = (pred > 0.5).astype(int)

Some weights of BertModel were not initialized from the model checkpoint at ainize/klue-bert-base-mrc and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


FileNotFoundError: ignored

#### f1_score, FP 계산

In [None]:
mask = test_target == 1
f1 = f1_score(test_target[mask], pred[mask], average='micro')
fp_score = ((pred == 1) & (test_target == 0)).sum()
fp_avg = fp_score / test_target.shape[0]
print(f"F1_SCORE : {f1}")
print(f"FP_AVG : {fp_avg}")

#### 추론한 토큰 역 토큰화

In [None]:
def convert_to_tokens(tokenizer, test_input_ids, pred):
    tag_tokens = []
    for i in range(len(pred)): # pred 값이 1인 부분만 추출
        tag_token = tokenizer.convert_ids_to_tokens(test_input_ids[i][pred[i] == 1])
        tag_tokens.append(tag_token)
    return tag_tokens

In [None]:
# 후에 토큰들 연결 시키는 작업 필요
tag_tokens = convert_to_tokens(tokenizer, test_input_ids, pred)

In [None]:
tag_tokens[1]