In [30]:
import json
import logging
import os
import glob
import re
import shutil

# import numpy as np
import torch
# from torch.utils.data import DataLoader, RandomSampler, SequentialSampler
# from torch.nn import CrossEntropyLoss
# from fastprogress.fastprogress import master_bar, progress_bar
from attrdict import AttrDict
from tokenizers import BertWordPieceTokenizer
from huggingface_hub import notebook_login, create_repo, delete_repo, Repository, upload_file, delete_file

# from transformers import (
#     AdamW,
#     get_linear_schedule_with_warmup
# )

from src import (
    CONFIG_CLASSES,
    TOKENIZER_CLASSES,
    MODEL_FOR_TOKEN_CLASSIFICATION,
    init_logger,
    set_seed,
    # compute_metrics,
    # show_ner_report
)

from processor import ner_load_and_cache_examples as load_and_cache_examples
from processor import ner_tasks_num_labels as tasks_num_labels
from processor import ner_processors as processors

logger = logging.getLogger(__name__)

from run_ner import train_v2, evaluate

In [2]:
### Log in to the Hugging Face Hub
# Access tokens can be created at: https://huggingface.co/settings/tokens
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [6]:
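# # Load my Hub repository (disabled; fill in local_dir with the local clone path before enabling)

# REPO_NAME = "cujabes/koelectra-small-v3-discriminator"
# repo = Repository(local_dir="<path-to-local-clone>", clone_from=REPO_NAME)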


In [19]:
config_path = "../../../../data/KoELECTRA_config/naver-ner/koelectra-small-v3.json"

# Set up logging before the first logger.info call. init_logger() is assumed to
# install the handlers/level for INFO output (TODO confirm in src); without it,
# the parameter dump below is likely dropped on a fresh kernel.
init_logger()

# Read the run configuration from the JSON file and expose it through
# attribute access (args.task, args.data_dir, ...).
with open(config_path, encoding="utf-8") as f:
    args = AttrDict(json.load(f))
logger.info("Training/evaluation parameters {}".format(args))

# args.output_dir = os.path.join(args.ckpt_dir, args.output_dir)

# Seed the random number generators from the loaded configuration.
set_seed(args)

02/19/2024 08:05:33 - INFO - __main__ -   Training/evaluation parameters AttrDict({'task': 'naver-ner', 'data_dir': '../../../../data/KoELECTRA', 'ckpt_dir': 'ckpt', 'train_file': 'train.tsv', 'dev_file': '', 'test_file': 'test.tsv', 'evaluate_test_during_training': True, 'eval_all_checkpoints': True, 'save_optimizer': False, 'do_lower_case': False, 'do_train': True, 'do_eval': True, 'max_seq_len': 128, 'num_train_epochs': 30, 'weight_decay': 0.0, 'gradient_accumulation_steps': 1, 'adam_epsilon': 1e-08, 'warmup_proportion': 0, 'max_steps': -1, 'max_grad_norm': 1.0, 'no_cuda': False, 'model_type': 'koelectra-small-v3', 'model_name_or_path': 'cujabes/koelectra-small-v3-discriminator', 'output_dir': '../../../../models/KoELECTRA', 'seed': 42, 'train_batch_size': 32, 'eval_batch_size': 128, 'logging_steps': 1000, 'save_steps': 1000, 'learning_rate': 5e-05, 'vocab_size': 80000, 'limit_alphabet': 6000, 'un_used_num': 10000})


In [12]:
### Train the WordPiece tokenizer and regenerate vocab.txt

vocab_path = "../../../../models/KoELECTRA/repo/vocab.txt"
train_file_path = os.path.join(args.data_dir, args.task, args.train_file)
output_dir = os.path.join(args.output_dir, "%s-%s"%(args.model_type, args.task), "checkpoint-best")
# Backup location for the vocab file that is about to be replaced.
move_vocab_path = os.path.join(args.output_dir, "vocab", "old_vocab_20240219.txt")

# Casing must match the tokenizer the model is later loaded with
# (do_lower_case=args.do_lower_case). For a cased model both `lowercase` and
# `strip_accents` must be False: accent stripping applies NFD normalization,
# which would decompose Hangul syllables and produce a vocab that does not
# match the cased tokenizer.
tokenizer = BertWordPieceTokenizer(
    vocab=vocab_path,
    clean_text=True,
    handle_chinese_chars=True,
    strip_accents=args.do_lower_case,
    lowercase=args.do_lower_case,
    wordpieces_prefix="##"
)
print("WP 토크나이저 생성중")
tokenizer.train(
    files=[train_file_path],
    limit_alphabet=args.limit_alphabet,
    vocab_size=args.vocab_size
)

print("vocab 파일 이동 %s -> %s"%(vocab_path, move_vocab_path))
# Move the original vocab.txt out of the way; create the backup directory
# first so the move does not fail on a fresh checkout.
os.makedirs(os.path.dirname(move_vocab_path), exist_ok=True)
shutil.move(vocab_path, move_vocab_path)

print("new vocab 파일 저장 %s"%vocab_path)
# Tokens ordered by their id so that line number == token id.
vocab_list = [token for token, _ in sorted(tokenizer.get_vocab().items(), key=lambda item: item[1])]
# Plain UTF-8 (no BOM): "utf-8-sig" would prepend U+FEFF to the first token
# for any reader that opens the file as ordinary UTF-8.
with open(vocab_path, "w", encoding="utf-8") as f:
    for vocab in vocab_list:
        f.write(vocab+"\n")

    # Append placeholder tokens after the learned vocabulary.
    for i in range(args.un_used_num):
        f.write("[unused%s]\n"%i)

WP 토크나이저 생성중





vocab 파일 이동 ../../../../models/KoELECTRA/repo/vocab.txt -> ../../../../models/KoELECTRA/vocab/vocab_80000_6000_20240219.txt
new vocab 파일 저장 ../../../../models/KoELECTRA/repo/vocab.txt


In [17]:
### Update the vocab file on the Hub

REPO_NAME = "cujabes/koelectra-small-v3-discriminator"

# Upload the new vocab file. upload_file replaces an existing file at
# path_in_repo in a single commit, so no separate delete_file call is made:
# deleting first would raise if the file is absent and would leave the repo
# without vocab.txt if the upload then failed.
upload_file(
    path_or_fileobj=vocab_path,
    path_in_repo="vocab.txt",
    repo_id=REPO_NAME,
)

CommitInfo(commit_url='https://huggingface.co/cujabes/koelectra-small-v3-discriminator/commit/60d2d671aebc0220fca2e7725edbf396ea7f163e', commit_message='Upload vocab.txt with huggingface_hub', commit_description='', oid='60d2d671aebc0220fca2e7725edbf396ea7f163e', pr_url=None, pr_revision=None, pr_num=None)

In [27]:
# Upload the model config file. upload_file replaces an existing config.json
# in a single commit; a separate delete_file step would leave the repo without
# config.json (and from_pretrained failing) if the upload did not complete.
# NOTE(review): this is an absolute, machine-specific path while the rest of
# the notebook uses paths relative to the working directory — confirm and
# consider deriving it from the same repo directory as vocab_path.
upload_file(
    path_or_fileobj="/root/dothis-ai/models/KoELECTRA/repo/config.json",
    path_in_repo="config.json",
    repo_id=REPO_NAME,
)

CommitInfo(commit_url='https://huggingface.co/cujabes/koelectra-small-v3-discriminator/commit/68e631438ce3af95a0108f9aeefbaeb368052229', commit_message='Upload config.json with huggingface_hub', commit_description='', oid='68e631438ce3af95a0108f9aeefbaeb368052229', pr_url=None, pr_revision=None, pr_num=None)

In [29]:
from transformers import ElectraConfig, ElectraForTokenClassification

cache_dir = "../../../../models/huggingface"
model_name = "cujabes/koelectra-small-v3-discriminator"

# Task processor supplies the label set for the configured task.
processor = processors[args.task](args)
labels = processor.get_labels()

# Label mappings in both directions; ids are string keys in id2label.
label_by_id = dict((str(idx), name) for idx, name in enumerate(labels))
id_by_label = dict((name, idx) for idx, name in enumerate(labels))

# Load the model configuration from the Hub repository.
config = ElectraConfig.from_pretrained(
    model_name,
    num_labels=tasks_num_labels[args.task],
    id2label=label_by_id,
    label2id=id_by_label,
    cache_dir=cache_dir,
)


OSError: cujabes/koelectra-small-v3-discriminator does not appear to have a file named config.json. Checkout 'https://huggingface.co/cujabes/koelectra-small-v3-discriminator/main' for available files.

In [22]:
cache_dir = "../../../../models/huggingface"

# Task processor supplies the label set for the configured task.
processor = processors[args.task](args)
labels = processor.get_labels()

# Label mappings in both directions; ids are string keys in id2label.
label_by_id = dict((str(idx), name) for idx, name in enumerate(labels))
id_by_label = dict((name, idx) for idx, name in enumerate(labels))

# Config, tokenizer and model are all resolved from the class tables by
# args.model_type and loaded from args.model_name_or_path.
config_cls = CONFIG_CLASSES[args.model_type]
config = config_cls.from_pretrained(
    args.model_name_or_path,
    num_labels=tasks_num_labels[args.task],
    id2label=label_by_id,
    label2id=id_by_label,
    cache_dir=cache_dir,
)

tokenizer_cls = TOKENIZER_CLASSES[args.model_type]
tokenizer = tokenizer_cls.from_pretrained(
    args.model_name_or_path,
    do_lower_case=args.do_lower_case,
    cache_dir=cache_dir,
)

model_cls = MODEL_FOR_TOKEN_CLASSIFICATION[args.model_type]
model = model_cls.from_pretrained(
    args.model_name_or_path,
    config=config,
    cache_dir=cache_dir,
)

OSError: cujabes/koelectra-small-v3-discriminator does not appear to have a file named config.json. Checkout 'https://huggingface.co/cujabes/koelectra-small-v3-discriminator/main' for available files.

In [6]:
# Pick the compute device: CUDA when it is available and not disabled via
# args.no_cuda, otherwise CPU. The choice is stored on args for later cells.
if torch.cuda.is_available() and not args.no_cuda:
    args.device = "cuda"
else:
    args.device = "cpu"

model.to(args.device)
print(args.device)

cuda


In [8]:
# Load dataset
train_dataset = load_and_cache_examples(args, tokenizer, mode="train") if args.train_file else None
dev_dataset = load_and_cache_examples(args, tokenizer, mode="dev") if args.dev_file else None
test_dataset = load_and_cache_examples(args, tokenizer, mode="test") if args.test_file else None


02/14/2024 04:55:38 - INFO - processor.ner -   Loading features from cached file ../../../../data/KoELECTRA/cached_naver-ner_koelectra-small-v3-discriminator_128_train
02/14/2024 04:55:42 - INFO - processor.ner -   Loading features from cached file ../../../../data/KoELECTRA/cached_naver-ner_koelectra-small-v3-discriminator_128_test


In [None]:
# Without a dev dataset, force evaluation on the test set during training.
# Identity comparison (`is None`) is the correct None check and does not
# depend on the dataset object's __eq__ implementation.
if dev_dataset is None:
    args.evaluate_test_during_training = True

# Run training only when enabled in the config; log the final step count and
# average loss returned by train_v2.
if args.do_train:
    global_step, tr_loss = train_v2(args, model, train_dataset, dev_dataset, test_dataset)
    logger.info(" global_step = {}, average loss = {}".format(global_step, tr_loss))


In [9]:

def _last_number_in(path_with_step):
    """Sort key for checkpoint paths: the last integer in the path, or -1 if it has none."""
    numbers = re.findall(r"\d+", path_with_step)
    return int(numbers[-1]) if numbers else -1


def _result_sort_key(key_with_step):
    """Sort key for result names of the form "<metric>_<step>".

    Orders by metric prefix, then by step. Numeric steps compare numerically;
    non-numeric steps (e.g. "best") sort after them instead of raising, which
    the previous int()-only parsing did for keys such as "f1_best".
    """
    metric_prefix = "".join(re.findall(r'[^_]+_', key_with_step))
    step_text = key_with_step.rsplit("_", 1)[-1]
    if step_text.isdecimal():
        return (metric_prefix, 0, int(step_text), "")
    return (metric_prefix, 1, 0, step_text)


results = {}
if args.do_eval:
    # A checkpoint directory is any directory under output_dir that contains
    # a training_args.bin file.
    marker_files = glob.glob(args.output_dir + "/**/" + "training_args.bin", recursive=True)
    checkpoints = [os.path.dirname(c) for c in sorted(marker_files, key=_last_number_in)]

    print(checkpoints)
    if not args.eval_all_checkpoints:
        # Only the last checkpoint in sort order.
        checkpoints = checkpoints[-1:]
    else:
        logging.getLogger("transformers.configuration_utils").setLevel(logging.WARN)  # Reduce logging
        logging.getLogger("transformers.modeling_utils").setLevel(logging.WARN)  # Reduce logging
    logger.info("Evaluate the following checkpoints: %s", checkpoints)
    for checkpoint in checkpoints:
        # Directory names look like "checkpoint-<step>"; the step may be
        # non-numeric (e.g. "best").
        global_step = checkpoint.split("-")[-1]
        model = MODEL_FOR_TOKEN_CLASSIFICATION[args.model_type].from_pretrained(checkpoint)
        model.to(args.device)
        result = evaluate(args, model, test_dataset, mode="test", global_step=global_step)
        # Suffix every metric with the step so results of all checkpoints can
        # share one dict.
        result = dict((k + "_{}".format(global_step), v) for k, v in result.items())
        results.update(result)

    output_eval_file = os.path.join(args.output_dir, "eval_results.txt")
    with open(output_eval_file, "w") as f_w:
        if len(checkpoints) > 1:
            ordered_keys = sorted(results.keys(), key=_result_sort_key)
        else:
            ordered_keys = sorted(results.keys())
        for key in ordered_keys:
            f_w.write("{} = {}\n".format(key, str(results[key])))


02/14/2024 04:55:44 - INFO - __main__ -   Evaluate the following checkpoints: ['../../../../models/KoELECTRA/koelectra-small-v3-naver-ner-ckpt/checkpoint-best']
02/14/2024 04:55:44 - INFO - run_ner -   ***** Running evaluation on test dataset (best step) *****
02/14/2024 04:55:44 - INFO - run_ner -     Num examples = 9000
02/14/2024 04:55:44 - INFO - run_ner -     Eval Batch size = 128


['../../../../models/KoELECTRA/koelectra-small-v3-naver-ner-ckpt/checkpoint-best']
 |████████████████████████████████████████| 100.00% [71/71 00:02<00:00]

02/14/2024 04:55:47 - INFO - run_ner -   ***** Eval results on test dataset *****
02/14/2024 04:55:47 - INFO - run_ner -     f1 = 0.8594204130194611
02/14/2024 04:55:47 - INFO - run_ner -     loss = 0.2844202115502156
02/14/2024 04:55:47 - INFO - run_ner -     precision = 0.8544969074255585
02/14/2024 04:55:47 - INFO - run_ner -     recall = 0.8644009846827133
02/14/2024 04:55:48 - INFO - run_ner -   
              precision    recall  f1-score   support

         AFW       0.58      0.61      0.60       394
         ANM       0.77      0.78      0.78       701
         CVL       0.84      0.84      0.84      5758
         DAT       0.92      0.93      0.92      2521
         EVT       0.77      0.78      0.78      1094
         FLD       0.60      0.68      0.63       228
         LOC       0.85      0.86      0.85      2126
         MAT       0.17      0.17      0.17        12
         NUM       0.92      0.93      0.92      5590
         ORG       0.88      0.87      0.87      4086


In [4]:
# Copy the model files of the best checkpoint into the local repo directory.

# Source files (glob pattern matching everything in the checkpoint directory)
source_path = "../../../../models/KoELECTRA/koelectra-small-v3-naver-ner-ckpt/checkpoint-best/*"
# Destination directory; each file keeps its base name
target_path = "../../../../models/KoELECTRA/repo/"

# `glob` is imported as a module in this notebook, so the function must be
# called as glob.glob(); calling the module itself raises TypeError.
for source_file in glob.glob(source_path):
    # shutil.copyfile only handles regular files; skip sub-directories.
    if not os.path.isfile(source_file):
        continue
    filename = os.path.basename(source_file)
    target_file = os.path.join(target_path, filename)
    print(source_file, target_file)
    # Copy the file contents
    shutil.copyfile(source_file, target_file)







In [None]:
# Upload to the remote repository.
# `repo` was never created in this notebook (its only assignment is commented
# out), so it is built here before use.
# NOTE(review): assumes target_path is a local git clone of REPO_NAME —
# confirm before running.
repo = Repository(local_dir=target_path, clone_from=REPO_NAME)

commit_message = 'Initial commit'
repo.git_add()
repo.git_commit(commit_message)
repo.git_push()

