In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
input_paths = (
    os.path.join(root, name)
    for root, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/119-csv/VL.csv
/kaggle/input/119-csv/TL.csv


In [2]:
import os
import pandas as pd
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments, DataCollatorWithPadding
from datasets import Dataset
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

# Run configuration, kept inline as a plain dict.
# Entries are grouped by purpose; the code looks values up by key only,
# so the grouping has no effect on behaviour.
config = {
    # --- task and data locations ---
    "task": "disaster",
    "data_dir": "/kaggle/input/119-csv",
    "ckpt_dir": "./ckpt_dir",
    "train_file": "/TL.csv",
    "dev_file": False,
    "test_file": "/VL.csv",

    # --- run-mode switches ---
    "do_train": True,
    "do_eval": True,
    "evaluate_test_during_training": True,
    "eval_all_checkpoints": True,
    "save_optimizer": True,
    "no_cuda": False,
    "seed": 42,

    # --- model and tokenizer ---
    "model_type": "Auto",
    "model_name_or_path": "beomi/KcELECTRA-base-v2022",
    "tokenizer_path": "beomi/KcELECTRA-base-v2022",
    "do_lower_case": False,
    "max_seq_len": 512,

    # --- optimisation ---
    "num_train_epochs": 30,
    "max_steps": -1,
    "learning_rate": 5e-5,
    "weight_decay": 0.0,
    "adam_epsilon": 1e-8,
    "warmup_proportion": 0,
    "max_grad_norm": 1.0,
    "gradient_accumulation_steps": 1,
    "train_batch_size": 16,  # batch size reduced
    "eval_batch_size": 32,  # batch size reduced
    "fp16": True,  # enable mixed-precision training

    # --- logging, checkpoints and outputs ---
    "log_dir": "/kaggle/working/log_path",
    "log_file": "train_log.log",
    "logging_steps": 1000,
    "save_steps": 1000,
    "output_dir": "/kaggle/working/finetuned_models/disasterLarge",
}

# Load the training and validation CSVs from the locations named in `config`
# instead of repeating the paths as literals.
# The file entries start with "/", so strip it before joining: os.path.join
# would otherwise treat them as absolute paths and drop the data directory.
train_path = os.path.join(config['data_dir'], config['train_file'].lstrip('/'))
val_path = os.path.join(config['data_dir'], config['test_file'].lstrip('/'))
train_data = pd.read_csv(train_path)
val_data = pd.read_csv(val_path)

# Collect the distinct class names of the 'disasterLarge' column
# (in order of first appearance in the training data) and number them.
unique_labels = train_data['disasterLarge'].unique()
label_map = {label: i for i, label in enumerate(unique_labels)}

# Convert the class names to integer ids.
train_data['label'] = train_data['disasterLarge'].map(label_map)
val_data['label'] = val_data['disasterLarge'].map(label_map)

# A validation class that never occurs in the training data has no id and maps
# to NaN, which silently turns the whole label column into floats.
# Fail here with a clear message rather than later inside the model.
unseen_labels = val_data.loc[val_data['label'].isna(), 'disasterLarge'].unique()
if len(unseen_labels) > 0:
    raise ValueError(
        f"Validation data contains labels not present in the training data: {list(unseen_labels)}"
    )

# Wrap the frames as Hugging Face Dataset objects.
train_dataset = Dataset.from_pandas(train_data)
val_dataset = Dataset.from_pandas(val_data)

# Load the tokenizer and the sequence-classification model.
tokenizer = AutoTokenizer.from_pretrained(config['tokenizer_path'])

# Store the class-name <-> id mapping in the model config so it is saved with
# the checkpoint. Without it the mapping lives only in this kernel session and
# the saved model can only report generic "LABEL_<i>" names.
id2label = {idx: str(label) for label, idx in label_map.items()}
label2id = {name: idx for idx, name in id2label.items()}

model = AutoModelForSequenceClassification.from_pretrained(
    config['model_name_or_path'],
    num_labels=len(unique_labels),
    id2label=id2label,
    label2id=label2id,
)

# Tokenization function applied to the datasets.
def tokenize_function(examples):
    """Tokenize the 'text' field of a batch of examples.

    Sequences are truncated to ``config['max_seq_len']`` tokens but are not
    padded here. Padding is left to the data collator, which pads each batch
    only to the length of its longest member instead of padding every example
    to the maximum length up front.

    Args:
        examples: Mapping with a 'text' entry holding the batch of input texts.

    Returns:
        Whatever the tokenizer returns for the batch.
    """
    return tokenizer(
        examples['text'],
        truncation=True,
        max_length=config['max_seq_len'],
    )

# Columns that are dropped from both splits after tokenization.
unused_columns = ['disasterMedium', 'urgencyLevel', 'sentiment', 'symptom', 'triage']


def _tokenize_and_trim(split):
    """Tokenize one split in batched mode, then drop the unused columns."""
    tokenized = split.map(tokenize_function, batched=True)
    return tokenized.remove_columns(unused_columns)


# Apply the same preparation to the training and validation splits.
train_dataset = _tokenize_and_trim(train_dataset)
val_dataset = _tokenize_and_trim(val_dataset)


2024-08-06 07:30:06.628346: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-08-06 07:30:06.628476: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-08-06 07:30:06.762858: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


tokenizer_config.json:   0%|          | 0.00/288 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/504 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/450k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/511M [00:00<?, ?B/s]

  return self.fget.__get__(instance, owner)()


Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at beomi/KcELECTRA-base-v2022 and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/127178 [00:00<?, ? examples/s]

Map:   0%|          | 0/15897 [00:00<?, ? examples/s]

In [3]:
# Build the collator that pads the examples of each batch with this tokenizer.
data_collator = DataCollatorWithPadding(tokenizer)

# Evaluation metric function.
def compute_metrics(p):
    """Compute accuracy and weighted precision/recall/F1 for one evaluation pass.

    Args:
        p: Prediction object exposing ``predictions`` (class scores, or a tuple
            whose first element holds them) and ``label_ids`` (true class ids).

    Returns:
        dict with the keys 'accuracy', 'f1', 'precision' and 'recall'.
    """
    scores = p.predictions
    # Some models return a tuple of outputs; the class scores come first.
    if isinstance(scores, tuple):
        scores = scores[0]
    preds = scores.argmax(-1)
    labels = p.label_ids
    # zero_division=0 scores classes that are never predicted as 0 without
    # emitting an undefined-metric warning on every evaluation.
    precision, recall, f1, _ = precision_recall_fscore_support(
        labels, preds, average='weighted', zero_division=0
    )
    acc = accuracy_score(labels, preds)
    return {
        'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall
    }

# Trainer settings.
# Best-checkpoint selection needs evaluation results, and the library requires
# the save and evaluation strategies to match when it is enabled. Tie it to
# `do_eval` so that switching evaluation off does not raise at construction.
do_eval = config['do_eval']

# NOTE(review): the logged run degrades sharply at epoch 4 (validation loss
# 0.38 -> 0.79); consider a lower learning rate or a non-zero warmup_proportion.
training_args = TrainingArguments(
    output_dir=config['output_dir'],
    evaluation_strategy="epoch" if do_eval else "no",
    save_strategy="epoch",  # save a checkpoint at the end of every epoch
    learning_rate=config['learning_rate'],
    per_device_train_batch_size=config['train_batch_size'],
    per_device_eval_batch_size=config['eval_batch_size'],
    num_train_epochs=config['num_train_epochs'],
    max_steps=config['max_steps'],
    weight_decay=config['weight_decay'],
    # These config entries were previously never passed to the trainer.
    adam_epsilon=config['adam_epsilon'],
    max_grad_norm=config['max_grad_norm'],
    warmup_ratio=float(config['warmup_proportion']),
    gradient_accumulation_steps=config['gradient_accumulation_steps'],
    seed=config['seed'],
    logging_dir=config['log_dir'],
    logging_steps=config['logging_steps'],
    save_steps=config['save_steps'],
    load_best_model_at_end=do_eval,
    save_total_limit=3,
    metric_for_best_model="f1" if do_eval else None,
    report_to=[],  # no external logging services
    fp16=config['fp16'],  # enable mixed-precision training
    gradient_checkpointing=True  # enable gradient checkpointing
)

# Gather everything the Trainer needs, then build it in one call.
trainer_kwargs = dict(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,  # metrics reported at each evaluation
)
trainer = Trainer(**trainer_kwargs)

should_train = config['do_train']
should_evaluate = config['do_eval']
final_model_dir = config['output_dir']

# Train the model when training is switched on.
if should_train:
    trainer.train()

# Evaluate the model and print the metrics when evaluation is switched on.
if should_evaluate:
    evaluation_results = trainer.evaluate()
    print(evaluation_results)

# Save the model and the tokenizer to the output directory.
trainer.save_model(final_model_dir)
tokenizer.save_pretrained(final_model_dir)







Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.3252,0.325531,0.914072,0.905833,0.910381,0.914072
2,0.3015,0.322631,0.908977,0.899342,0.911787,0.908977
3,0.3018,0.378573,0.900421,0.885106,0.879673,0.900421
4,0.8037,0.791266,0.75618,0.651196,0.571809,0.75618








  _warn_prf(average, modifier, msg_start, len(result))


