#### <b>필요한 라이브러리 설치 및 불러오기</b>

* 정상적인 코드 실행을 위하여 <b>[런타임]</b> - <b>[런타임 유형 변경]</b> - 하드웨어 가속기로 <b>[GPU]</b>를 선택해 주세요.
* <b>Transformers</b>: Hugging Face의 트랜스포머 아키텍처 라이브러리
  * 한국어 모델을 포함해 다양한 큰 규모의(large-scale) 트랜스포머 모델(BERT, ELECTRA 등)을 제공한다.

In [1]:
!pip install transformers
!pip install datasets

Collecting transformers
  Downloading transformers-4.30.2-py3-none-any.whl (7.2 MB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 7.2/7.2 MB 73.4 MB/s eta 0:00:00
Collecting huggingface-hub<1.0,>=0.14.1 (from transformers)
  Downloading huggingface_hub-0.16.4-py3-none-any.whl (268 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 268.8/268.8 kB 22.1 MB/s eta 0:00:00
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers)
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 7.8/7.8 MB 82.4 MB/s eta 0:00:00
Collecting safetensors>=0.3.1 (from transformers)
  Downloading safetensors-0.3.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 1.3/1.3 MB 59.4 MB/s eta 0:00:0

In [2]:
import argparse
import copy
import json
import logging
import os

import numpy as np
import pandas as pd

from datasets import load_dataset
from sklearn.model_selection import train_test_split
from tqdm import tqdm

import torch

import transformers
from transformers import AutoConfig, AutoModelForSequenceClassification, AutoTokenizer
from transformers import AdamW, get_linear_schedule_with_warmup

<b>텍스트 특징 추출기(Feature Extractor) 모델이란?</b>

* 하나의 텍스트(text) 입력이 주어졌을 때, 해당 텍스트에 대하여 특징(feature)을 추출한다.
  * 대표적인 Large Language 모델인 <b>BERT (Transformer 기반)</b>를 불러와 사용할 수 있다.

<b>헤드(Head) 모델이란?</b>

* 실제로 내가 원하는 기능을 수행하는 모델을 정의할 수 있다.
* <b>헤드(head)</b>: 잘 학습된 모델 뒤에 붙여서 특정한 기능을 위한 작은 크기의 네트워크 모델
  * BERT 아키텍처를 통해 추출된 특징(feature)을 이용해 실제로 분류 기능을 수행할 수 있다.

<b>미세 조정(Fine-tuning)</b>
* 사전에 학습된 모델을 fine-tuning하기 위해서는 두 가지 요소가 필요하다.
  * <b>Optimizer:</b> Adam 혹은 AdamW와 같은 optimizer를 사용해 학습하는 것이 일반적이다.
  * <b>Scheduler</b>: 학습 시점 전반에 걸쳐서 어떠한 학습률(learning rate) 값으로 학습할지 설정한다.

<b>Hugging Face Auto Class</b>

* <b>Auto 키워드</b>: 사전 학습된(pre-trained) 모델을 이용해 원하는 작업을 수행한다.
  * <b>AutoTokenizer</b>: 입력을 토큰(token)으로 바꾸는 기능을 수행한다.
    * 사전 학습된 tokenizer를 불러올 수 있다.
      * 예시) nlptown/bert-base-multilingual-uncased-sentiment
  * <b>AutoModelForSequenceClassification</b>: 문장 분류(classification)를 위한 자동화된 모델을 제공한다.
    * 사전 학습된 BERT 모델을 불러올 수 있다.
      * 예시) nlptown/bert-base-multilingual-uncased-sentiment
    * <b>from_pretrained()</b> 함수를 이용해 특정한 경로에서 모델을 불러올 수 있다.
      * PyTorch 혹은 TensorFlow 상관없이 사용할 수 있다.

<pre>
from transformers import AutoModel, AutoTokenizer

model = AutoModel.from_pretrained("monologg/kobigbird-bert-base")  # BigBirdModel
tokenizer = AutoTokenizer.from_pretrained("monologg/kobigbird-bert-base")  # BertTokenizer
</pre>

In [3]:
# 기초적인(base) 모델 정의
class BaseModel(torch.nn.Module):
    """Common wrapper around a Hugging Face model and its tokenizer.

    Subclasses implement ``from_pretrained`` (which must set ``self.model`` and
    ``self.tokenizer``) and ``eval_step``.
    """

    def __init__(self, config):
        super().__init__()
        self.config = config
        # Subclass hook: loads self.model / self.tokenizer.
        self.from_pretrained()

    def save_pretrained(self, save_dir):
        """Save the wrapped model and tokenizer into ``save_dir``."""
        self.model.save_pretrained(save_dir)
        # Drop these init kwargs (if present) before the tokenizer is saved.
        for key in ["special_tokens_map_file", "tokenizer_file"]:
            self.tokenizer.init_kwargs.pop(key, None)
        self.tokenizer.save_pretrained(save_dir)

    def from_pretrained(self):
        """Load ``self.model`` and ``self.tokenizer``; must be overridden."""
        raise NotImplementedError

    def forward(self, inputs):
        """Run the wrapped model on a dict of keyword inputs."""
        return self.model(**inputs)

    def eval_step(self, inputs=None, outputs=None):
        """Turn one evaluated batch into per-sample results; must be overridden.

        The signature accepts ``(inputs, outputs)``, which is how the
        evaluation loop in this file calls it; a legacy single-argument call
        still reaches the ``NotImplementedError`` below.
        """
        raise NotImplementedError

    def get_optimizer(self):
        """Build AdamW with weight decay disabled for biases and LayerNorm weights.

        Returns:
            An ``AdamW`` optimizer with two parameter groups: group 0 uses
            ``config.weight_decay``, group 1 (names containing ``"bias"`` or
            ``"LayerNorm.weight"``) uses ``0.0``.
        """
        no_decay = ("bias", "LayerNorm.weight")
        decay_params = []
        no_decay_params = []
        # Single pass over the parameters instead of scanning them twice.
        for name, param in self.model.named_parameters():
            if any(marker in name for marker in no_decay):
                no_decay_params.append(param)
            else:
                decay_params.append(param)

        optimizer_grouped_parameters = [
            {"params": decay_params, "weight_decay": self.config.weight_decay},
            # The whole point of this group is to be exempt from weight decay.
            {"params": no_decay_params, "weight_decay": 0.0},
        ]
        return AdamW(
            optimizer_grouped_parameters,
            lr=self.config.learning_rate,
            eps=self.config.adam_epsilon,
        )

    def get_scheduler(self, batch_num, optimizer):
        """Build a linear warmup/decay scheduler, or ``None`` without warmup.

        Args:
            batch_num: number of batches in one training epoch.
            optimizer: optimizer whose learning rate is scheduled.

        Returns:
            ``None`` when ``config.warmup_proportion`` is ``0.0``; otherwise the
            scheduler from ``get_linear_schedule_with_warmup``.
        """
        if self.config.warmup_proportion == 0.0:
            return None

        accumulation_steps = self.config.gradient_accumulation_steps
        # Round up: a trailing, incomplete accumulation group still produces
        # one optimizer update per epoch.
        updates_per_epoch = (batch_num + accumulation_steps - 1) // accumulation_steps
        t_total = updates_per_epoch * self.config.num_train_epochs

        return get_linear_schedule_with_warmup(
            optimizer,
            num_warmup_steps=int(t_total * self.config.warmup_proportion),
            num_training_steps=t_total,
        )

    def tensor_to_array(self, tensor):
        """Detach ``tensor``, move it to the CPU and return a NumPy array."""
        return tensor.detach().cpu().numpy()

    def tensor_to_list(self, tensor):
        """Return ``tensor`` as a (nested) Python list."""
        return self.tensor_to_array(tensor).tolist()


# 실질적인 분류 기능까지 포함한 전체 모델
class ClsModel(BaseModel):
    """Sequence-classification model: pre-trained encoder plus classification head."""

    def __init__(self, config):
        super().__init__(config)
        config.label2id = self.config.label2id

    def from_pretrained(self):
        """Derive the label map from the training file, then load model and tokenizer."""
        cfg = self.config

        # Build label2id from the training data.
        train_path = os.path.join(cfg.data_dir, str(cfg.train_file))
        cfg.label2id = process_map[cfg.dataset](cfg, train_path, True, get_label_map=True)

        # The dataset decides the number of labels; report any mismatch.
        detected = len(cfg.label2id)
        if detected != cfg.num_labels:
            print(
                f"given args num_labels({cfg.num_labels}) is not same with num_labels({detected}) from dataset."
            )
            print(f"switch num_labels {cfg.num_labels} -> {detected}")
            cfg.num_labels = detected

        hf_config = AutoConfig.from_pretrained(cfg.model_name_or_path, num_labels=cfg.num_labels)
        hf_config.label2id = cfg.label2id
        hf_config.id2label = {int(idx): name for name, idx in hf_config.label2id.items()}

        self.model = AutoModelForSequenceClassification.from_pretrained(
            cfg.model_name_or_path,
            config=hf_config,
            cache_dir=cfg.cache_dir,
        )
        self.tokenizer = AutoTokenizer.from_pretrained(cfg.model_name_or_path, cache_dir=cfg.cache_dir)

    def forward(self, inputs):
        """Forward a dict of keyword inputs through the classification model."""
        return self.model(**inputs)

    def eval_step(self, inputs, outputs):
        """Pair each argmax prediction with its gold label for one batch."""
        cpu_logits = outputs.logits.detach().cpu()
        predicted = self.tensor_to_list(torch.argmax(cpu_logits, dim=-1))
        gold = self.tensor_to_list(inputs["labels"])
        return [{"prediction": p, "label": g} for p, g in zip(predicted, gold)]

#### <b>트랜스포머 모델에 관한 기본적인 라이브러리 및 설정</b>

In [4]:
def set_seed(seed):
    """Seed Python, NumPy and PyTorch RNGs and request deterministic cuDNN.

    Args:
        seed: integer seed applied to every random number generator.
    """
    import random

    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # Benchmark mode auto-tunes the convolution algorithm and can pick a
    # different one per run, which works against the determinism requested above.
    torch.backends.cudnn.benchmark = False


def cal_running_avg_loss(loss, running_avg_loss, decay=0.99):
    """Update an exponential moving average of the loss.

    A running average of exactly 0 is treated as "not started yet", in which
    case the current loss is returned unchanged.
    """
    return loss if running_avg_loss == 0 else running_avg_loss * decay + (1 - decay) * loss

#### <b>가짜 뉴스(Fake News) 데이터 세트 다운로드</b>

* 가짜 뉴스 데이터 세트를 다운로드한다.

In [5]:
!git clone https://github.com/2alive3s/Fake_news
%cd Fake_news
!unzip data/mission1_train.zip
!unzip data/mission2_train.zip
!mkdir -p cache/fake_news_data

Cloning into 'Fake_news'...
remote: Enumerating objects: 83, done.
remote: Total 83 (delta 0), reused 0 (delta 0), pack-reused 83
Unpacking objects: 100% (83/83), 138.74 MiB | 8.71 MiB/s, done.
/content/Fake_news
Archive:  data/mission1_train.zip
  inflating: mission1_train.csv      
Archive:  data/mission2_train.zip
  inflating: mission2_train.csv      


In [17]:
!cp mission1_train.csv cache/fake_news_data/mission1_train.csv
!cp mission2_train.csv cache/fake_news_data/mission2_train.csv

#### <b>가짜 뉴스(Fake News) 분류 모델을 위한 전처리 부분</b>

* 내가 원하는 데이터 세트에 대하여, 데이터를 불러오는 함수를 작성할 필요가 있다.

In [18]:
# def sample_writer(data, config, tokenizer, is_train):
def sample_writer(data):
    """Tokenize one ``{"text", "label"}`` sample into model-ready features.

    Reads the module-level ``tokenizer`` and ``config`` globals.
    """
    encoded = tokenizer(
        data["text"],
        max_length=config.max_seq_length,
        padding="max_length",
        truncation=True,
        add_special_tokens=True,
    )
    return {
        "input_ids": encoded["input_ids"],
        "attention_mask": encoded["attention_mask"],
        "labels": data["label"],
    }


def make_label_map(labels):
    """Map each distinct label to its index in sorted order."""
    return {label: index for index, label in enumerate(sorted(set(labels)))}


def postprocess():
    """Decorator factory for dataset readers.

    The decorated reader must return ``(texts, labels)``. The wrapper converts
    labels to ids, optionally returns just the label map, and otherwise dumps
    the processed samples (and, for training data, the label map) to CSV files
    next to ``data_file`` before returning the samples.
    """
    import functools

    def decorator(fn):
        @functools.wraps(fn)  # keep the reader's name/docstring on the wrapper
        def wrapped(config, data_file, is_train, **kwargs):
            # When get_label_map is passed, only the label map is returned.
            get_label_map = kwargs.get("get_label_map", False)
            texts, labels = fn(config, data_file, is_train)

            # Reuse a label map already stored on the config; build one otherwise.
            try:
                label2id = config.label2id
            except AttributeError:
                label2id = make_label_map(labels)

            # Done before the early return so unknown labels surface as KeyError.
            labels = [label2id[label] for label in labels]

            if get_label_map:
                return label2id

            data = [{"text": text, "label": label} for text, label in zip(texts, labels)]
            pd.DataFrame(data).to_csv(
                "{}_{}_{}.csv".format(data_file, config.dataset, "train" if is_train else "valid"),
                index=False,
                encoding="utf-8-sig",
            )
            if is_train:
                pd.DataFrame(list(label2id.items()), columns=["label", "id"]).to_csv(
                    "{}_{}_label2id.csv".format(data_file, config.dataset), index=False, encoding="utf-8-sig"
                )

            return data

        return wrapped

    return decorator


def train_split(config, texts, labels, is_train):
    """Return the train (80%) or validation (20%) part of a stratified split."""
    train_texts, valid_texts, train_labels, valid_labels = train_test_split(
        texts,
        labels,
        test_size=0.2,
        random_state=config.seed,
        stratify=labels,
    )
    if not is_train:
        return valid_texts, valid_labels
    return train_texts, train_labels


@postprocess()
def process_fake_news_cls(config, data_file, is_train):
    """Read the fake-news CSV and return ``(texts, labels)`` for one split.

    Each text is the ``title`` and ``content`` columns joined with a space.
    Labels are read as strings from the ``Label`` column, or from ``label``
    when ``Label`` is absent.

    Raises:
        KeyError: if a required column is missing from the CSV.
    """
    df = pd.read_csv(data_file)

    # Choose the label column explicitly rather than catching every exception,
    # so unrelated failures are not masked by the fallback.
    label_column = "Label" if "Label" in df.columns else "label"
    labels = df[label_column].astype(str).values.tolist()

    titles = df["title"].astype(str).values.tolist()
    contents = df["content"].astype(str).values.tolist()
    texts = [title + " " + content for title, content in zip(titles, contents)]

    texts, labels = train_split(config, texts, labels, is_train)
    return texts, labels


# Registry mapping a dataset name (config.dataset) to its preprocessing function.
process_map = {
    "fake_news": process_fake_news_cls
}


def collate_fn(features):
    """Stack per-sample features into a batch of ``torch.long`` tensors."""

    def _column(key, np_dtype):
        # Gather one field across the batch and convert it via NumPy.
        values = [sample[key] for sample in features]
        return torch.tensor(np.array(values).astype(np_dtype), dtype=torch.long)

    return {
        "input_ids": _column("input_ids", np.int64),
        "attention_mask": _column("attention_mask", np.int8),
        "labels": _column("labels", np.int64),
    }

#### <b>가짜 뉴스(Fake News) 분류 모델 학습을 위한 데이터 로더(Data Loader) 작성하기</b>

* PyTorch 모델을 이용해 학습하기 위하여 데이터 로더를 준비한다.

In [19]:
import multiprocessing


def init_sample_writer(_config, _tokenizer, _is_train, _writer):
    """Pool initializer: publish the shared objects as module-level globals."""
    global config, tokenizer, is_train, writer
    config, tokenizer, is_train, writer = _config, _tokenizer, _is_train, _writer


def write_samples(config, tokenizer, is_train, processor, writer_file, data, workers=4):
    """Tokenize ``data`` in a process pool and write one JSON line per feature.

    Returns the number of lines written. ``processor`` is accepted but not
    used by this function.
    """
    written = 0
    pool = multiprocessing.Pool(
        processes=workers,
        initializer=init_sample_writer,
        initargs=(config, tokenizer, is_train, sample_writer),
    )
    with pool:
        encoded_stream = pool.imap(sample_writer, data)
        progress = tqdm(encoded_stream, total=len(data), dynamic_ncols=True, desc="writing samples...")
        for encoded in progress:
            # A worker may return a single record or a list of records.
            records = encoded if isinstance(encoded, list) else [encoded]
            for record in records:
                writer_file.write(json.dumps(record) + "\n")
            written += len(records)
    return written


class IterableDatasetPad(torch.utils.data.IterableDataset):
    """Iterate a sized dataset, padding it to a multiple of ``batch_size * num_devices``.

    Padding samples are copies of the first element of the wrapped dataset.
    No padding is added when the length is already aligned.
    """

    def __init__(
        self,
        dataset: torch.utils.data.IterableDataset,
        batch_size: int = 1,
        num_devices: int = 1,
        seed: int = 0,
    ):
        self.dataset = dataset
        self.batch_size = batch_size
        self.seed = seed
        # Read by __iter__ when reseeding the wrapped dataset's generator.
        self.epoch = 0
        self.num_examples = 0

        chunk_size = self.batch_size * num_devices
        length = len(dataset)
        remainder = length % chunk_size
        # Pad only up to the next multiple; an aligned length needs nothing.
        self.length = length + (chunk_size - remainder if remainder else 0)

    def __len__(self):
        """Return the padded number of samples."""
        return self.length

    def __iter__(self):
        """Yield every sample of the wrapped dataset, then the padding samples."""
        self.num_examples = 0
        if (
            not hasattr(self.dataset, "set_epoch")
            and hasattr(self.dataset, "generator")
            and isinstance(self.dataset.generator, torch.Generator)
        ):
            self.dataset.generator.manual_seed(self.seed + self.epoch)

        first_element = None
        current_batch = []
        for element in self.dataset:
            # Remember the first sample right away so padding also works when
            # the dataset holds fewer samples than one batch.
            if first_element is None:
                first_element = element.copy()
            self.num_examples += 1
            current_batch.append(element)
            # Wait to have a full batch before yielding elements.
            if len(current_batch) == self.batch_size:
                yield from current_batch
                current_batch = []

        # Pad the last batch(es) with copies of the first element.
        while self.num_examples < self.length:
            add_num = self.batch_size - len(current_batch)
            self.num_examples += add_num
            current_batch += [first_element] * add_num
            yield from current_batch
            current_batch = []

In [20]:
import torch.utils.data as torch_data


def get_data(config, tokenizer, is_train=True, overwrite=False):
    """Build a DataLoader for the training or evaluation split.

    Tokenized features are cached in a JSON-lines text file whose name is
    derived from the data path, dataset name, model name, maximum sequence
    length and split. The cache is rebuilt when it is missing or when
    ``overwrite`` is true.

    Args:
        config: run configuration (data_dir, train_file/predict_file, dataset,
            model_name_or_path, max_seq_length, batch sizes, threads,
            world_size, seed).
        tokenizer: tokenizer handed to the worker processes.
        is_train: select the training split (shuffled) or the evaluation
            split (padded).
        overwrite: force regeneration of the cached feature file.

    Returns:
        A ``torch.utils.data.DataLoader`` over the tokenized features.

    Raises:
        ValueError: if ``config.dataset`` has no entry in ``process_map``.
    """
    data_file = config.train_file if is_train else config.predict_file

    data_path = config.data_dir
    if data_file is not None:
        data_path = os.path.join(data_path, data_file)
    else:
        data_path += "/"

    # Look the processor up by dataset name, the same way ClsModel does, so an
    # unknown dataset is reported instead of silently using the wrong reader.
    processor = process_map.get(config.dataset)
    if processor is None:
        raise ValueError(f"Invalid task dataset {config.dataset}!")

    comps = [
        data_path,
        config.dataset,
        config.model_name_or_path.replace("/", "_"),
        config.max_seq_length,
        "train" if is_train else "dev",
        "dataset.txt",
    ]
    dataset_file = "_".join(str(comp) for comp in comps)
    print("dataset_file:", dataset_file)

    if not os.path.exists(dataset_file) or overwrite:
        try:
            with open(dataset_file, "w", encoding="utf-8") as writer_file:
                if data_file is None or not os.path.isdir(data_path):
                    data = processor(config, data_path, is_train)
                    cnt = write_samples(
                        config, tokenizer, is_train, processor, writer_file, data, workers=config.threads
                    )
                else:
                    cnt = 0
                    for filename in sorted(f for f in os.listdir(data_path) if f.endswith(".json")):
                        data = processor(config, os.path.join(data_path, filename), is_train)
                        cnt += write_samples(
                            config, tokenizer, is_train, processor, writer_file, data, workers=config.threads
                        )
        except BaseException:
            # A failed or interrupted run must not leave a partial cache file
            # behind: the next call with overwrite=False would reuse it.
            if os.path.exists(dataset_file):
                os.remove(dataset_file)
            raise
        print(f"{cnt} features processed from {data_path}")

    # dataset = load_dataset("text", data_files=dataset_file)["train"]
    dataset = load_dataset("text", data_files=dataset_file, download_mode="force_redownload")["train"]
    dataset = dataset.map(lambda x: json.loads(x["text"]), batched=False)

    batch_size = config.train_batch_size if is_train else config.eval_batch_size

    if not is_train:
        # For validation data, pad the dataset so that no sample is skipped in
        # multi-device settings.
        dataset = IterableDatasetPad(
            dataset=dataset,
            batch_size=batch_size,
            num_devices=config.world_size,
            seed=config.seed,
        )

    dataloader = torch_data.DataLoader(
        dataset,
        sampler=torch_data.RandomSampler(dataset) if is_train else None,
        drop_last=False,
        batch_size=batch_size,
        collate_fn=collate_fn,
    )

    return dataloader

#### <b>모델 학습 관련 라이브러리 준비하기</b>

In [21]:
from functools import partial
import sklearn.metrics as sklearn_metrics

# Metrics computed by eval_cls when fewer than three distinct labels are present.
binary_metrics = {
    "accuracy": sklearn_metrics.accuracy_score,
    "precision": sklearn_metrics.precision_score,
    "recall": sklearn_metrics.recall_score,
    "f1": sklearn_metrics.f1_score,
    "matthews_corrcoef": sklearn_metrics.matthews_corrcoef,
    "roc_auc": sklearn_metrics.roc_auc_score,
}


# Metrics computed by eval_cls otherwise (three or more distinct labels).
metrics = {
    "accuracy": sklearn_metrics.accuracy_score,
    "f1-macro": partial(sklearn_metrics.f1_score, average="macro"),
}


def eval_cls(results, **kwargs):
    """Score a list of ``{"prediction", "label"}`` records.

    Returns a dict with every metric as a percentage rounded to two decimals
    under ``"results"`` and the F1 score used for model selection under
    ``"best_score"``. Extra keyword arguments are accepted and ignored.
    """
    predictions = np.array([record["prediction"] for record in results])
    labels = np.array([record["label"] for record in results])

    is_binary = len(set(labels.tolist())) < 3
    if is_binary:
        metric_table, selection_key = binary_metrics, "f1"
    else:
        metric_table, selection_key = metrics, "f1-macro"

    scores = {}
    for metric, scorer in metric_table.items():
        scores[metric] = round(scorer(labels, predictions) * 100, 2)

    return {"results": scores, "best_score": scores[selection_key]}

In [22]:
def _run_epoch(model, loader, device=None, context=None, **kwargs):
    """Run one training or evaluation pass over ``loader``.

    Args:
        model: callable model wrapper; for evaluation, it (or its ``.module``)
            must provide ``eval_step(inputs, outputs)``.
        loader: iterable of input dicts that supports ``len()``.
        device: device the tensors are moved to when ``config.use_tpu`` is false.
        context: only used when ``config.use_tpu`` is true, to obtain the
            optimizer and scheduler via ``getattr_or``.
            NOTE(review): presumably an XLA multiprocessing context - confirm.
        **kwargs: requires ``config``, ``is_train`` and ``epoch``; for non-TPU
            training also ``optimizer`` and ``scheduler``.

    Returns:
        ``{"loss": running average loss, "result": list of eval_step records}``;
        the result list stays empty during training.
    """
    config = kwargs["config"]
    is_train = kwargs["is_train"]
    accumulation_steps = config.gradient_accumulation_steps

    avg_loss = 0
    results = []
    batch_num = len(loader)

    if is_train:
        model.train()
        if config.use_tpu:
            optimizer = context.getattr_or(
                "optimizer",
                lambda: model.get_optimizer(),
            )
            scheduler = context.getattr_or(
                "scheduler",
                lambda: model.get_scheduler(batch_num, optimizer),
            )
        else:
            optimizer = kwargs["optimizer"]
            scheduler = kwargs["scheduler"]
    else:
        model.eval()

    is_master = True

    pbar = tqdm(enumerate(loader), total=batch_num, disable=not is_master, dynamic_ncols=True)
    for i, inputs in pbar:

        if not config.use_tpu:
            for k, v in inputs.items():
                if isinstance(v, torch.Tensor):
                    inputs[k] = v.to(device)

        outputs = model(inputs)
        # .mean() collapses a per-replica loss vector to a scalar.
        loss = outputs.loss.mean()
        avg_loss = cal_running_avg_loss(loss.item(), avg_loss)

        if is_train:
            # Scale so the accumulated gradient averages over the micro-batches.
            (loss / accumulation_steps).backward()

            # Update once a full group of micro-batches has been accumulated
            # (i is zero-based, hence i + 1), and also on the final batch so a
            # trailing partial group is not dropped.
            if (i + 1) % accumulation_steps == 0 or i == batch_num - 1:

                if config.max_grad_norm > 0:
                    torch.nn.utils.clip_grad_norm_(model.parameters(), config.max_grad_norm)

                optimizer.step()
                optimizer.zero_grad()

                if scheduler is not None:
                    scheduler.step()
        else:
            # Unwrap a DataParallel-style wrapper to reach eval_step.
            result = (model.module if hasattr(model, "module") else model).eval_step(inputs, outputs)
            results.extend(result)

        if is_master:
            pbar.set_description(
                f"epoch: {kwargs['epoch'] + 1}, {('train' if is_train else 'valid')} loss: {min(100, round(avg_loss, 4))}"
            )

    return {
        "loss": avg_loss,
        "result": results,
    }


def run_epoch(**kwargs):
    """Dispatch one epoch to ``_run_epoch`` and merge per-worker results.

    ``model`` is removed from ``kwargs``. When ``config.use_tpu`` is set the
    model is called with ``_run_epoch`` as its first argument; otherwise
    ``_run_epoch`` is invoked directly. A list of results is reduced to a
    single dict with the mean loss and the concatenated records.
    """
    model = kwargs.pop("model")
    outcome = model(_run_epoch, **kwargs) if kwargs["config"].use_tpu else _run_epoch(model, **kwargs)

    if not isinstance(outcome, list):
        return outcome

    mean_loss = sum([part["loss"] for part in outcome]) / len(outcome)
    merged = []
    for part in outcome:
        merged.extend(part["result"])
    return {"loss": mean_loss, "result": merged}

#### <b>하이퍼 파라미터 및 인자 값 설정하기</b>

In [23]:
from types import SimpleNamespace

config = SimpleNamespace(
    # Task and dataset identifiers.
    task="cls",
    dataset="fake_news",
    # Working directories.
    cache_dir="cache",
    output_dir="output",
    use_tpu=False,
    model_name_or_path="monologg/kobigbird-bert-base",  # model name or local path
    data_dir="cache/fake_news_data",  # directory holding the input data
    train_file="mission2_train.csv",
    predict_file="mission2_train.csv",
    max_seq_length=1024,  # maximum total input length after tokenization
    train_batch_size=4,  # batch size used for training
    eval_batch_size=2,  # batch size used for evaluation
    learning_rate=3e-5,  # initial learning rate for Adam
    num_train_epochs=10,  # total number of training epochs
    num_labels=2,
    gradient_accumulation_steps=2,  # micro-batches accumulated per optimizer update
    threads=4,
    seed=42,  # random seed for initialization
    do_train=True,  # whether to run training
    do_eval_during_train=True,
    do_eval=True,  # whether to run prediction
    do_lower_case=False,
    weight_decay=0.0,  # weight decay, if any
    adam_epsilon=1e-8,  # epsilon for the Adam optimizer
    max_grad_norm=1.0,  # maximum gradient norm
    warmup_proportion=0.0,  # proportion of steps used for linear warmup
)

In [24]:
# Create the cache directory; exist_ok avoids the check-then-create race.
os.makedirs(config.cache_dir, exist_ok=True)

# Checkpoints for this task/dataset pair are written below this directory.
output_dir = os.path.join(config.output_dir, config.task, config.dataset)
print("Output directory:", output_dir)
os.makedirs(output_dir, exist_ok=True)

Output directory: output/cls/fake_news


#### <b>딥러닝 모델 초기화하기</b>

In [25]:
# Seed the RNGs before the model is built.
set_seed(config.seed)

# Initialize the deep learning model
model = ClsModel(config)

print(f"configuration: {str(config)}")

if torch.cuda.is_available(): # if a GPU is available
    gpu_count = torch.cuda.device_count()
    print(f"{gpu_count} GPU device detected")
    devices = ["cuda:{}".format(i) for i in range(gpu_count)]
    # model_dp wraps the same module object, so moving `model` below also moves it.
    model_dp = torch.nn.DataParallel(model, device_ids=devices)
    model.to(devices[0])
else: # no GPU available, so run on the CPU
    devices = ["cpu"]
    model_dp = model

Downloading (…)lve/main/config.json:   0%|          | 0.00/870 [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/458M [00:00<?, ?B/s]

Some weights of the model checkpoint at monologg/kobigbird-bert-base were not used when initializing BigBirdForSequenceClassification: ['cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.bias']
- This IS expected if you are initializing BigBirdForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BigBirdForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BigBirdForSequenceClassification were not initialized from the model checkpoint at monologg/k

Downloading (…)okenizer_config.json:   0%|          | 0.00/373 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/241k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/492k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/169 [00:00<?, ?B/s]

configuration: namespace(task='cls', dataset='fake_news', cache_dir='cache', output_dir='output', use_tpu=False, model_name_or_path='monologg/kobigbird-bert-base', data_dir='cache/fake_news_data', train_file='mission2_train.csv', predict_file='mission2_train.csv', max_seq_length=1024, train_batch_size=4, eval_batch_size=2, learning_rate=3e-05, num_train_epochs=10, num_labels=2, gradient_accumulation_steps=2, threads=4, seed=42, do_train=True, do_eval_during_train=True, do_eval=True, do_lower_case=False, weight_decay=0.0, adam_epsilon=1e-08, max_grad_norm=1.0, warmup_proportion=0.0, label2id={'0': 0, '1': 1})
1 GPU device detected


#### <b>데이터 및 학습 세팅 초기화하기</b>

In [26]:
# world_size is read by get_data when it pads the evaluation dataset.
config.world_size = len(devices)
if config.do_train:
    train_loader = get_data(config, tokenizer=model.tokenizer, overwrite=True) # when running this code for the first time
    # train_loader = get_data(config, tokenizer=model.tokenizer, overwrite=False) # when the data has already been processed once
valid_loader = get_data(config, tokenizer=model.tokenizer, is_train=False)

optimizer = None
scheduler = None
if config.do_train: # training mode
    optimizer = model.get_optimizer()
    scheduler = model.get_scheduler(len(train_loader), optimizer)

# Keyword arguments shared by every run_epoch call.
params = {
    "config": config,
    "model": model_dp,
    "optimizer": optimizer,
    "scheduler": scheduler,
}
if not config.use_tpu:
    params["device"] = devices[0]

dataset_file: cache/fake_news_data/mission2_train.csv_fake_news_monologg_kobigbird-bert-base_1024_train_dataset.txt


writing samples...: 100%|██████████| 54376/54376 [02:55<00:00, 310.12it/s]


54376 features processed from cache/fake_news_data/mission2_train.csv
Downloading and preparing dataset text/default to /root/.cache/huggingface/datasets/text/default-e2d3cd41a174687d/0.0.0/cb1e9bd71a82ad27976be3b12b407850fe2837d80c22c5e03a28949843a8ace2...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Dataset text downloaded and prepared to /root/.cache/huggingface/datasets/text/default-e2d3cd41a174687d/0.0.0/cb1e9bd71a82ad27976be3b12b407850fe2837d80c22c5e03a28949843a8ace2. Subsequent calls will reuse this data.


  0%|          | 0/1 [00:00<?, ?it/s]

Map:   0%|          | 0/54376 [00:00<?, ? examples/s]

dataset_file: cache/fake_news_data/mission2_train.csv_fake_news_monologg_kobigbird-bert-base_1024_dev_dataset.txt


writing samples...: 100%|██████████| 13595/13595 [00:40<00:00, 337.47it/s]


13595 features processed from cache/fake_news_data/mission2_train.csv
Downloading and preparing dataset text/default to /root/.cache/huggingface/datasets/text/default-e59dae4626d8be0a/0.0.0/cb1e9bd71a82ad27976be3b12b407850fe2837d80c22c5e03a28949843a8ace2...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Dataset text downloaded and prepared to /root/.cache/huggingface/datasets/text/default-e59dae4626d8be0a/0.0.0/cb1e9bd71a82ad27976be3b12b407850fe2837d80c22c5e03a28949843a8ace2. Subsequent calls will reuse this data.


  0%|          | 0/1 [00:00<?, ?it/s]

Map:   0%|          | 0/13595 [00:00<?, ? examples/s]



#### <b>모델 학습하기</b>

In [None]:
def do_eval(epoch):
    """Evaluate on the validation loader, print every metric, return the selection score."""
    with torch.no_grad():
        records = run_epoch(loader=valid_loader, epoch=epoch, is_train=False, **params)["result"]

    report = eval_cls(
        config=config,
        model=model,
        loader=valid_loader,
        tokenizer=model.tokenizer,
        results=records,
    )

    print("Eval results.")
    for metric_name, value in report["results"].items():
        print(f"{metric_name} : {value}")

    return report["best_score"]


# Train for num_train_epochs, evaluating after each epoch and checkpointing on
# every epoch whose score is at least the best seen so far.
if config.do_train:
    best_score = 0
    for epoch in range(config.num_train_epochs):
        run_epoch(loader=train_loader, epoch=epoch, is_train=True, **params)

        # Without per-epoch evaluation the score stays 0, so the >= test below
        # is true every epoch and a checkpoint is written each time.
        score = 0
        if config.do_eval_during_train:
            score = do_eval(epoch)

        if score >= best_score:
            best_score = score
            output_dir = os.path.join(config.output_dir, config.task, config.dataset, f"{epoch}-{best_score}-ckpt")
            # Unwrap the underlying model (.module, else ._models[0], else the
            # object itself), then save a CPU copy so the live model stays put.
            copy.deepcopy(
                model_dp.module
                if hasattr(model_dp, "module")
                else model_dp._models[0]
                if hasattr(model_dp, "_models")
                else model_dp
            ).cpu().save_pretrained(output_dir)
            # Store the run configuration next to the checkpoint.
            with open(os.path.join(output_dir, "finetune_config.json"), "w") as save_config:
                json.dump(vars(config), save_config, sort_keys=True, indent=4)
            print(f"Checkpoint {output_dir} saved.")

#### <b>학습된 모델 평가하기</b>

In [None]:
# Final evaluation pass; epoch=-1 makes the progress bar read "epoch: 0".
if config.do_eval:
    do_eval(-1)