In [None]:
MODELS = [
      ('xlm-mlm-enfr-1024'   ,"XLMModel"),
      ('distilbert-base-cased', "DistilBertModel"),
      ('bert-base-uncased'     ,"BertModel"),
      ('roberta-base'        ,"RobertaModel"),
      ("cardiffnlp/twitter-roberta-base-sentiment","RobertaSentTW"),
      ('xlnet-base-cased'     ,"XLNetModel"),
      ('transfo-xl-wt103'    ,"TransfoXLModel"),
      ('bert-base-cased'       ,"BertModelUncased"),
      ('xlm-roberta-base'     ,"XLMRobertaModel"),
      ('openai-gpt'           ,"OpenAIGPTModel"),
      ('gpt2'                 ,"GPT2Model")
]

## GPT-2

In [41]:
import pandas as pd
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset, DataLoader
import torch.nn.functional as F
import os
import torch

from transformers import (set_seed,
                          TrainingArguments,
                          Trainer,
                          GPT2Config,
                          GPT2Tokenizer,
                          AdamW, 
                          get_linear_schedule_with_warmup,
                          GPT2ForSequenceClassification)

os.environ["CUDA_VISIBLE_DEVICES"] = "1"  # 사용할 GPU 번호 (1번 GPU)


# 데이터셋 정의
class TextDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_length):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = str(self.texts[idx])
        label = self.labels[idx]
        encoding = self.tokenizer.encode_plus(
            text,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )
        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(label, dtype=torch.long)
        }

# CSV 파일에서 데이터 로드
data = pd.read_csv('./csv.csv')
texts = data['text'].tolist()
labels = data['label'].tolist()

# 모델 및 토크나이저 로드
model_name = "gpt2"
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
tokenizer.padding_side = "left" 
tokenizer.pad_token = tokenizer.eos_token

model = GPT2ForSequenceClassification.from_pretrained(model_name, num_labels=2)

# resize model embedding to match new tokenizer
model.resize_token_embeddings(len(tokenizer))

# fix model padding token id
model.config.pad_token_id = model.config.eos_token_id

device_name = "cuda" if torch.cuda.is_available() else "cpu"
device = torch.device(device_name)
print(device_name)
model.to(device)

# 데이터 분할
train_texts, val_texts, train_labels, val_labels = train_test_split(texts, labels, test_size=0.2, random_state=42)

# 데이터셋 및 데이터 로더 생성
train_dataset = TextDataset(train_texts, train_labels, tokenizer, max_length=128)
val_dataset = TextDataset(val_texts, val_labels, tokenizer, max_length=128)

train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=16)

def train_model(model, train_loader, val_loader, epochs, optimizer):
    model.train()
    for epoch in range(epochs):
        train_loss = 0.0
        for batch in train_loader:
            optimizer.zero_grad()
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)
            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss
            loss.backward()
            optimizer.step()
            train_loss += loss.item()
        train_loss /= len(train_loader)

        val_loss, acc = evaluate_model(model, val_loader)
        print(f'Epoch {epoch+1}/{epochs}, Train Loss: {train_loss:.4f}, Val Loss: {val_loss:.4f}, Val Accuracy: {acc:.4f}')

    return model

def evaluate_model(model, val_loader):
    model.eval()
    val_loss = 0.0
    total_correct = 0
    total_samples = 0
    with torch.no_grad():
        for batch in val_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)
            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss
            val_loss += loss.item()
            logits = outputs.logits
            _, predicted = torch.max(logits, 1)
            total_correct += (predicted == labels).sum().item()
            total_samples += labels.size(0)
    val_loss /= len(val_loader)
    acc = total_correct / total_samples
    return val_loss, acc

optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)
epochs = 5
model = train_model(model, train_loader, val_loader, epochs, optimizer)  

loading file https://huggingface.co/gpt2/resolve/main/vocab.json from cache at /home/mskim/.cache/huggingface/transformers/684fe667923972fb57f6b4dcb61a3c92763ad89882f3da5da9866baf14f2d60f.c7ed1f96aac49e745788faa77ba0a26a392643a50bb388b9c04ff469e555241f
loading file https://huggingface.co/gpt2/resolve/main/merges.txt from cache at /home/mskim/.cache/huggingface/transformers/c0c761a63004025aeadd530c4c27b860ec4ecbe8a00531233de21d865a402598.5d12962c5ee615a4c803841266e9c3be9a691a924f72d395d3a6c6c81157788b
loading file https://huggingface.co/gpt2/resolve/main/added_tokens.json from cache at None
loading file https://huggingface.co/gpt2/resolve/main/special_tokens_map.json from cache at None
loading file https://huggingface.co/gpt2/resolve/main/tokenizer_config.json from cache at /home/mskim/.cache/huggingface/transformers/b105cf342574b32b2f8d5ea86c4845f46d8162160345fd0c85bd9ca3bc5cc48e.67d01b18f2079bd75eac0b2f2e7235768c7f26bd728e7a855a1c5acae01a91a8
loading configuration file https://hugging

cuda
Epoch 1/3, Train Loss: 0.5150, Val Loss: 0.2741, Val Accuracy: 0.8900
Epoch 2/3, Train Loss: 0.2389, Val Loss: 0.2659, Val Accuracy: 0.9062
Epoch 3/3, Train Loss: 0.1744, Val Loss: 0.2560, Val Accuracy: 0.9000


## BERT

In [42]:
import os
import torch

os.environ["CUDA_VISIBLE_DEVICES"] = "1"  # 사용할 GPU 번호 (1번 GPU)

from datasets import load_dataset, Features, Value, ClassLabel
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments

# CSV 파일 경로 지정
csv_file_path = "./csv.csv"

# 데이터셋 로드 (열의 데이터 타입 지정)
dataset = load_dataset("csv", data_files=csv_file_path, column_names=["text", "label"], skiprows=1, features=Features({"text": Value("string"), "label": ClassLabel(num_classes=2)}))

# 모델 및 토크나이저 로드
model_name = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)

device_name = "cuda" if torch.cuda.is_available() else "cpu"
device = torch.device(device_name)
print(device_name)
model.to(device)

# 데이터셋 전처리
def preprocess_function(examples):
    return tokenizer(examples["text"], truncation=True, padding='max_length', max_length=128)

dataset = dataset.map(preprocess_function, batched=True)

# 데이터셋 분할
dataset = dataset["train"].train_test_split(test_size=0.2, seed=42)

# 모델 학습을 위한 TrainingArguments 설정
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
)

from datasets import load_metric
import numpy as np

def compute_metrics(eval_pred):
    metric = load_metric("accuracy")
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)


trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset["train"],
    eval_dataset=dataset["test"],
    compute_metrics=compute_metrics,
)

# 모델 학습
trainer.train()

# 모델 평가
eval_results = trainer.evaluate()
print(eval_results)

# 모델 저장
model.save_pretrained("./trained_model")
tokenizer.save_pretrained("./trained_model")

Using custom data configuration default-39ef09e856fe440d


Downloading and preparing dataset csv/default to /home/mskim/.cache/huggingface/datasets/csv/default-39ef09e856fe440d/0.0.0/652c3096f041ee27b04d2232d41f10547a8fecda3e284a79a0ec4053c916ef7a...


Downloading data files: 100%|██████████| 1/1 [00:00<00:00, 4877.10it/s]
Extracting data files: 100%|██████████| 1/1 [00:00<00:00, 447.73it/s]
                            

Dataset csv downloaded and prepared to /home/mskim/.cache/huggingface/datasets/csv/default-39ef09e856fe440d/0.0.0/652c3096f041ee27b04d2232d41f10547a8fecda3e284a79a0ec4053c916ef7a. Subsequent calls will reuse this data.


100%|██████████| 1/1 [00:00<00:00, 520.71it/s]
loading configuration file https://huggingface.co/bert-base-uncased/resolve/main/config.json from cache at /home/mskim/.cache/huggingface/transformers/3c61d016573b14f7f008c02c4e51a366c67ab274726fe2910691e2a761acf43e.37395cee442ab11005bcd270f3c34464dc1704b715b5d7d52b1a461abe3b9e4e
Model config BertConfig {
  "_name_or_path": "bert-base-uncased",
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.18.0",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30522
}

loading fil

cuda


100%|██████████| 4/4 [00:00<00:00, 14.59ba/s]
PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
The following columns in the training set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 3200
  Num Epochs = 3
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 600


Epoch,Training Loss,Validation Loss


KeyboardInterrupt: 