<a href="https://colab.research.google.com/github/nikickk/crawling/blob/yein/%08english_finetuning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# ! pip install transformers datasets evaluate
from transformers import Trainer, TrainingArguments, AutoModelForSequenceClassification, AutoTokenizer
from datasets import load_dataset
import evaluate
import torch

In [3]:
# Clear the GPU cache: run Python garbage collection first, then ask PyTorch
# to release the CUDA memory it is holding in its cache.
import gc
gc.collect()
torch.cuda.empty_cache()

In [2]:
# Load the IMDB dataset.
dataset = load_dataset(path="imdb")
# Keep the DatasetDict as the final expression so the notebook renders its splits.
dataset

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    unsupervised: Dataset({
        features: ['text', 'label'],
        num_rows: 50000
    })
})

In [20]:
# Load the model and its tokenizer.
# NOTE(review): judging by its name, this checkpoint is a KoELECTRA (Korean)
# sentiment model, while the dataset loaded earlier is IMDB — confirm that
# fine-tuning this checkpoint on that data is intended.
model_name = "Copycats/koelectra-base-v3-generalized-sentiment-analysis"
tokenizer = AutoTokenizer.from_pretrained(model_name)
# num_labels=2 requests a two-class classification head.
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)

In [5]:
# Tokenize the data
def tokenize_function(examples):
    """Tokenize the ``'text'`` entry of a batch of examples.

    Args:
        examples: Mapping with a ``'text'`` entry holding the raw texts of the
            batch (this is what ``dataset.map(..., batched=True)`` passes in).

    Returns:
        Whatever the module-level ``tokenizer`` returns for those texts when
        asked to pad to ``max_length`` and truncate at 445 tokens.
    """
    # padding="max_length" together with truncation=True and max_length=445
    # asks for fixed-length sequences of 445 tokens; 445 is below the 512
    # positions shown in the printed model's position-embedding table.
    return tokenizer(examples['text'], padding="max_length", truncation=True, max_length=445)

# Run tokenize_function over the DatasetDict in batches. The printed DatasetDict
# lists three splits (train, test, unsupervised), so all of them are tokenized
# here, although the Trainer cell only reads 'train' and 'test'.
tokenized_datasets = dataset.map(tokenize_function, batched=True)

Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

In [8]:
# Define the accuracy metric (loaded through the `evaluate` library).
accuracy_metric = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    """Score one evaluation pass with the module-level accuracy metric.

    Args:
        eval_pred: A pair that unpacks into (logits, labels).

    Returns:
        The result of ``accuracy_metric.compute`` for the arg-max class of
        each row of logits compared against the labels.
    """
    class_scores, gold_labels = eval_pred
    # The predicted class is the index of the largest score along the last axis.
    return accuracy_metric.compute(
        predictions=class_scores.argmax(axis=-1),
        references=gold_labels,
    )

In [21]:
# Configure TrainingArguments.
training_args = TrainingArguments(
    output_dir='./results',
    # `eval_strategy` is the current name of this option; the older
    # `evaluation_strategy` spelling is deprecated and rejected by recent
    # transformers releases. "epoch" evaluates at the end of every epoch.
    eval_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=1,
    weight_decay=0.01,
    report_to="wandb",  # log to Weights & Biases
    push_to_hub=True,
)



In [22]:
# Build the Trainer from the training configuration, the model, the metric
# callback and the tokenized test/train splits.
trainer = Trainer(
    args=training_args,
    model=model,
    compute_metrics=compute_metrics,
    eval_dataset=tokenized_datasets['test'],
    train_dataset=tokenized_datasets['train'],
)

In [28]:
# Interactive W&B directory setup run through the shell. In the recorded run the
# re-configure prompt was answered "n" and the command printed "Aborted!".
! wandb init

[1mThis directory has been configured previously, should we re-configure it?[0m [y/N]: n
Aborted!


In [10]:
import wandb

# Start a W&B run in the "tave" project; TrainingArguments sets report_to="wandb".
wandb.init(project="tave")

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33myenqkr[0m ([33myenqkr-study[0m). Use [1m`wandb login --relogin`[0m to force relogin


In [31]:
# Run fine-tuning; the returned TrainOutput is displayed as the cell result.
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy
1,0.3559,0.349893,0.84872


TrainOutput(global_step=1563, training_loss=0.37213796175067726, metrics={'train_runtime': 2856.6773, 'train_samples_per_second': 8.751, 'train_steps_per_second': 0.547, 'total_flos': 7030553510282400.0, 'train_loss': 0.37213796175067726, 'epoch': 1.0})

In [48]:
# NOTE(review): the directory is spelled 'mode_result' here and in the later
# reload cell, while trainer.save_model() writes to './model_result' — confirm
# which spelling is intended.
model.save_pretrained('./mode_result')  # save the model after training

In [16]:
# NOTE(review): the recorded output below this cell shows files being uploaded,
# presumably because push_to_hub=True is set in TrainingArguments — verify.
trainer.save_model('./model_result')  # save the trained model

model.safetensors:   0%|          | 0.00/452M [00:00<?, ?B/s]

Upload 4 LFS files:   0%|          | 0/4 [00:00<?, ?it/s]

events.out.tfevents.1733415872.e7892369f5e9.319.0:   0%|          | 0.00/41.7k [00:00<?, ?B/s]

training_args.bin:   0%|          | 0.00/5.30k [00:00<?, ?B/s]

events.out.tfevents.1733416388.e7892369f5e9.5696.0:   0%|          | 0.00/44.2k [00:00<?, ?B/s]

In [15]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer

# Load the model back from the directory written by model.save_pretrained().
# The same relative path as that save call is used, so the reload works from
# whatever working directory the notebook runs in, not only under /content.
model = AutoModelForSequenceClassification.from_pretrained('./mode_result')
# No visible cell saves a tokenizer into that directory, so loading one from it
# stays disabled.
# tokenizer = AutoTokenizer.from_pretrained('./mode_result')


In [17]:
# Display the module structure of the loaded model.
model

ElectraForSequenceClassification(
  (electra): ElectraModel(
    (embeddings): ElectraEmbeddings(
      (word_embeddings): Embedding(35000, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): ElectraEncoder(
      (layer): ModuleList(
        (0-11): 12 x ElectraLayer(
          (attention): ElectraAttention(
            (self): ElectraSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): ElectraSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): L

In [113]:
from accelerate import Accelerator

# Initialize an Accelerator.
# NOTE(review): `accelerator` is not referenced by any other cell visible here —
# confirm it is still needed.
accelerator = Accelerator()

In [23]:
# Run evaluation with the Trainer (its eval_dataset is the tokenized 'test'
# split) and print the resulting metrics dict.
evaluation_results = trainer.evaluate()
print(evaluation_results)

{'eval_loss': 0.34989261627197266, 'eval_model_preparation_time': 0.0032, 'eval_accuracy': 0.84872, 'eval_runtime': 735.0279, 'eval_samples_per_second': 34.012, 'eval_steps_per_second': 2.126}
