In [1]:
! pip install transformers torch datasets scikit-learn
! pip install transformers[torch]
! pip install accelerate -U

Collecting datasets
  Downloading datasets-2.19.1-py3-none-any.whl (542 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m542.0/542.0 kB[0m [31m3.6 MB/s[0m eta [36m0:00:00[0m
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (14.1 MB)
Collecting nvidia-cudnn-cu12==8.9.2.26 (from torch)
  Using cached nvidia_cudnn_cu12-8.9.2.26-py3-none-manylinux1_x86_64.whl (731.7 MB)
Collecting nvidia-cublas-cu12==12.1.3.1 (from torch)
  Using cached nvidia_cublas_cu12-12.1.3.1-py3-none-manylinux1_x86_64.whl (410.6 MB)
Collecting nvidia-cufft-cu12==11.0.2.54 (from torch)
  Using cached nvidia_cufft_cu12-11.0

In [3]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
import json
import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments, ProgressCallback
from torch.utils.data import Dataset
import torch
from sklearn.metrics import mean_squared_error


# 데이터셋 로드
with open('/content/drive/My Drive/train1.json', 'r', encoding='utf-8') as f:
    data = json.load(f)

# 데이터프레임으로 변환
df = pd.DataFrame(data)
df['ReviewScore'] = df['ReviewScore'].astype(float)

# 리뷰 점수 정규화
max_score = df['ReviewScore'].max()
df['ReviewScore'] = df['ReviewScore'] / max_score

# 데이터셋 분할
train_texts, val_texts, train_labels, val_labels = train_test_split(
    df['RawText'], df['ReviewScore'], test_size=0.2, random_state=42
)



# 토크나이저 로드
tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')

# 토큰화 함수 정의
def tokenize_function(texts):
    return tokenizer(texts, padding='max_length', truncation=True, max_length=128, return_tensors='pt')

# 토큰화 적용
train_encodings = tokenize_function(train_texts.tolist())
val_encodings = tokenize_function(val_texts.tolist())




# Dataset 클래스 정의
class ReviewDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx], dtype=torch.float)
        return item

    def __len__(self):
        return len(self.labels)

train_dataset = ReviewDataset(train_encodings, train_labels.tolist())
val_dataset = ReviewDataset(val_encodings, val_labels.tolist())



# MSE 계산 함수 정의
def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.squeeze()
    mse = mean_squared_error(labels, preds)
    return {"mse": mse}



# 모델 및 학습 코드
model = BertForSequenceClassification.from_pretrained('bert-base-multilingual-cased', num_labels=1)

training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=10,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    disable_tqdm=False,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
    callbacks=[ProgressCallback]
)


# 학습
trainer.train()

# 평가
trainer.evaluate(eval_dataset=val_dataset)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/996k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.96M [00:00<?, ?B/s]



config.json:   0%|          | 0.00/625 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/714M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  0%|          | 0/100 [00:00<?, ?it/s]

  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Epoch,Training Loss,Validation Loss,Mse
1,0.6748,0.563595,0.563595
2,0.5992,0.466826,0.466826
3,0.4687,0.318081,0.318081
4,0.2341,0.132098,0.132098
5,0.0684,0.082076,0.082076
6,0.0431,0.062492,0.062492
7,0.0345,0.065483,0.065483
8,0.0268,0.059747,0.059747
9,0.0204,0.055573,0.055573
10,0.0223,0.06163,0.06163


{'loss': 0.6748, 'grad_norm': 15.106888771057129, 'learning_rate': 4.0000000000000003e-07, 'epoch': 1.0}


  0%|          | 0/3 [00:00<?, ?it/s]

{'eval_loss': 0.5635949373245239, 'eval_mse': 0.5635949373245239, 'eval_runtime': 0.1489, 'eval_samples_per_second': 134.311, 'eval_steps_per_second': 20.147, 'epoch': 1.0}


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


{'loss': 0.5992, 'grad_norm': 14.791065216064453, 'learning_rate': 8.000000000000001e-07, 'epoch': 2.0}


  0%|          | 0/3 [00:00<?, ?it/s]

{'eval_loss': 0.4668259620666504, 'eval_mse': 0.46682602167129517, 'eval_runtime': 0.1326, 'eval_samples_per_second': 150.803, 'eval_steps_per_second': 22.621, 'epoch': 2.0}


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


{'loss': 0.4687, 'grad_norm': 16.070890426635742, 'learning_rate': 1.2000000000000002e-06, 'epoch': 3.0}


  0%|          | 0/3 [00:00<?, ?it/s]

{'eval_loss': 0.3180805742740631, 'eval_mse': 0.3180805742740631, 'eval_runtime': 0.1386, 'eval_samples_per_second': 144.304, 'eval_steps_per_second': 21.646, 'epoch': 3.0}


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


{'loss': 0.2341, 'grad_norm': 11.225671768188477, 'learning_rate': 1.6000000000000001e-06, 'epoch': 4.0}


  0%|          | 0/3 [00:00<?, ?it/s]

{'eval_loss': 0.1320982277393341, 'eval_mse': 0.1320982426404953, 'eval_runtime': 0.1364, 'eval_samples_per_second': 146.629, 'eval_steps_per_second': 21.994, 'epoch': 4.0}


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


{'loss': 0.0684, 'grad_norm': 2.334803819656372, 'learning_rate': 2.0000000000000003e-06, 'epoch': 5.0}


  0%|          | 0/3 [00:00<?, ?it/s]

{'eval_loss': 0.08207636326551437, 'eval_mse': 0.08207637071609497, 'eval_runtime': 0.1374, 'eval_samples_per_second': 145.557, 'eval_steps_per_second': 21.834, 'epoch': 5.0}


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


{'loss': 0.0431, 'grad_norm': 6.860165119171143, 'learning_rate': 2.4000000000000003e-06, 'epoch': 6.0}


  0%|          | 0/3 [00:00<?, ?it/s]

{'eval_loss': 0.062491677701473236, 'eval_mse': 0.06249167397618294, 'eval_runtime': 0.1418, 'eval_samples_per_second': 141.085, 'eval_steps_per_second': 21.163, 'epoch': 6.0}


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


{'loss': 0.0345, 'grad_norm': 1.786163091659546, 'learning_rate': 2.8000000000000003e-06, 'epoch': 7.0}


  0%|          | 0/3 [00:00<?, ?it/s]

{'eval_loss': 0.06548310071229935, 'eval_mse': 0.06548310071229935, 'eval_runtime': 0.1412, 'eval_samples_per_second': 141.617, 'eval_steps_per_second': 21.243, 'epoch': 7.0}


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


{'loss': 0.0268, 'grad_norm': 2.2610323429107666, 'learning_rate': 3.2000000000000003e-06, 'epoch': 8.0}


  0%|          | 0/3 [00:00<?, ?it/s]

{'eval_loss': 0.05974704027175903, 'eval_mse': 0.05974704027175903, 'eval_runtime': 0.1408, 'eval_samples_per_second': 142.019, 'eval_steps_per_second': 21.303, 'epoch': 8.0}


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


{'loss': 0.0204, 'grad_norm': 3.555490493774414, 'learning_rate': 3.6000000000000003e-06, 'epoch': 9.0}


  0%|          | 0/3 [00:00<?, ?it/s]

{'eval_loss': 0.05557287484407425, 'eval_mse': 0.055572886019945145, 'eval_runtime': 0.1329, 'eval_samples_per_second': 150.498, 'eval_steps_per_second': 22.575, 'epoch': 9.0}


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


{'loss': 0.0223, 'grad_norm': 2.466287851333618, 'learning_rate': 4.000000000000001e-06, 'epoch': 10.0}


  0%|          | 0/3 [00:00<?, ?it/s]

{'eval_loss': 0.06162990257143974, 'eval_mse': 0.06162991374731064, 'eval_runtime': 0.1429, 'eval_samples_per_second': 139.951, 'eval_steps_per_second': 20.993, 'epoch': 10.0}
{'train_runtime': 25.0806, 'train_samples_per_second': 31.897, 'train_steps_per_second': 3.987, 'train_loss': 0.21922363728284835, 'epoch': 10.0}


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


  0%|          | 0/3 [00:00<?, ?it/s]

{'eval_loss': 0.06162990257143974,
 'eval_mse': 0.06162991374731064,
 'eval_runtime': 0.1467,
 'eval_samples_per_second': 136.34,
 'eval_steps_per_second': 20.451,
 'epoch': 10.0}