```
Copyright (C) 2023 Donggeon Lee
 
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU Affero General Public License as
published by the Free Software Foundation, either version 3 of the
License, or (at your option) any later version.
 
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Affero General Public License for more details.
 
You should have received a copy of the GNU Affero General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
```

In [1]:
import os

os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"] = str(0)

import numpy as np
import torch
from torch.utils.data import DataLoader, Dataset
import random
import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer, BertForPreTraining, Trainer, TrainingArguments, DataCollatorForLanguageModeling
from tqdm import tqdm

def set_seed(seed=42):
    os.environ['PYTHONHASHSEED'] = str(seed)
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True

SEED = 42
MAX_SEQ_LEN = 512
set_seed(SEED)


class BertDataset(Dataset):
    def __init__(self, corpus, tokenizer, max_length):
        self.corpus = corpus
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.corpus)

    def __getitem__(self, idx):
        report = self.corpus[idx]
        sentences = report.split('\n')
        tokenized_sentences = [self.tokenizer.encode(sentence, add_special_tokens=False) for sentence in sentences]

        # Choose two sentences for NSP
        if len(tokenized_sentences) >= 2:
            # 50% of the time, the second sentence is a random sentence
            if random.random() > 0.5:
                second_sentence = random.choice(tokenized_sentences)
                tokenized_sentences[1] = second_sentence
                next_sentence_label = 0
            else:
                next_sentence_label = 1
        else:
            next_sentence_label = 0

        # Combine sentences and add [CLS] and [SEP] tokens
        tokens = [self.tokenizer.cls_token_id]
        for sentence in tokenized_sentences:
            tokens += sentence + [self.tokenizer.sep_token_id]
        tokens = tokens[:self.max_length]

        # Create segment ids
        if self.tokenizer.sep_token_id in tokens:
            token_type_ids = [0 if i <= tokens.index(self.tokenizer.sep_token_id) else 1 for i in range(len(tokens))]
        else:
            token_type_ids = [0] * len(tokens)


        # MLM
        probability_matrix = torch.full((len(tokens),), 0.15)
        special_tokens_mask = [self.tokenizer.get_special_tokens_mask([val], already_has_special_tokens=True)[0] for val in tokens]
        probability_matrix.masked_fill_(torch.tensor(special_tokens_mask, dtype=torch.bool), value=0.0)
        masked_indices = torch.bernoulli(probability_matrix).bool()
        labels = torch.where(masked_indices, torch.tensor(tokens), torch.tensor(-100))



        # Padding
        padding_length = self.max_length - len(tokens)
        tokens = tokens + ([self.tokenizer.pad_token_id] * padding_length)
        labels = labels.tolist() + ([-100] * padding_length)
        token_type_ids = token_type_ids + ([0] * padding_length)

        attention_mask = [1 if token != self.tokenizer.pad_token_id else 0 for token in tokens]

        return {"input_ids": tokens, "attention_mask": attention_mask, "token_type_ids": token_type_ids, "next_sentence_label": next_sentence_label, "labels": labels}


class CustomTrainer(Trainer):
    def compute_loss(self, model, inputs, return_outputs=False):
        labels = inputs.pop("labels")
        next_sentence_label = inputs.pop("next_sentence_label")
        outputs = model(**inputs)
        prediction_logits = outputs.prediction_logits
        next_sentence_logits = outputs.seq_relationship_logits

        nsp_loss = torch.nn.CrossEntropyLoss()(next_sentence_logits.view(-1, 2), next_sentence_label.view(-1))
        mlm_loss = torch.nn.CrossEntropyLoss()(prediction_logits.view(-1, self.model.config.vocab_size), labels.view(-1))

        loss = nsp_loss + mlm_loss

        return (loss, outputs) if return_outputs else loss

2023-11-21 11:19:17.990208: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-11-21 11:19:18.186614: E tensorflow/stream_executor/cuda/cuda_blas.cc:2981] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2023-11-21 11:21:28.557506: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /usr/local/nvidia/lib:/usr/local/nvidia/lib64
2023-11-21 11:21:28.557791: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinf

In [2]:
# BERT 모델 및 토크나이저 불러오기
model = BertForPreTraining.from_pretrained('klue/bert-base')
tokenizer = BertTokenizer.from_pretrained('klue/bert-base')

# csv 파일 로드
df = pd.read_csv('data/train/BERT_trainset.csv')

# 결측치 제거
df = df.dropna(axis=0)

# 'text' 열의 문장들을 리스트로 변환
corpus = df['text'].tolist()

# Split the data
train_corpus, val_corpus = train_test_split(corpus, test_size=0.1, random_state=SEED)

# Create datasets
train_dataset = BertDataset(train_corpus, tokenizer, max_length=MAX_SEQ_LEN)
val_dataset = BertDataset(val_corpus, tokenizer, max_length=MAX_SEQ_LEN)

# Training arguments
training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    save_total_limit=3
)

# Create trainer
trainer = CustomTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset
)

# Train the model
trainer.train()

Token indices sequence length is longer than the specified maximum sequence length for this model (622 > 512). Running this sequence through the model will result in indexing errors


Step,Training Loss
500,0.7722
1000,0.4136
1500,0.3875
2000,0.3681
2500,0.3379
3000,0.3529
3500,0.338
4000,0.3267
4500,0.3338
5000,0.3334


TrainOutput(global_step=6624, training_loss=0.37455422809158545, metrics={'train_runtime': 2517.2658, 'train_samples_per_second': 21.044, 'train_steps_per_second': 2.631, 'total_flos': 1.4039613220737024e+16, 'train_loss': 0.37455422809158545, 'epoch': 3.0})

In [3]:
model

BertForPreTraining(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(32000, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwis

In [4]:
model.save_pretrained("model")

In [5]:
from huggingface_hub import notebook_login

In [6]:
# notebook_login()

In [7]:
model.push_to_hub("KoAirBERT")

pytorch_model.bin:   0%|          | 0.00/445M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/oneonlee/KoAirBERT/commit/811ee837de8be44405a7907b58b0613d00f84fc1', commit_message='Upload BertForPreTraining', commit_description='', oid='811ee837de8be44405a7907b58b0613d00f84fc1', pr_url=None, pr_revision=None, pr_num=None)

In [8]:
num_parameters = model.num_parameters()
print("Number of parameters: ", num_parameters)

Number of parameters:  111243010


In [9]:
exit()