In [1]:
!pip install transformers
!pip install torch
!pip install pandas
!pip install scikit-learn
!pip install tqdm

Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (14.1 MB)
Collecting nvidia-cudnn-cu12==8.9.2.26 (from torch)
  Using cached nvidia_cudnn_cu12-8.9.2.26-py3-none-manylinux1_x86_64.whl (731.7 MB)
Collecting nvidia-cublas-cu12==12.1.3.1 (from torch)
  Using cached nvidia_cublas_cu12-12.1.3.1-py3-none-manylinux1_x86_64.whl (410.6 MB)
Collecting nvidia-cufft-cu12==11.0.2.54 (from torch)
  Using cached nvidia_cufft_cu12-11.0.2.54-py3-none-manylinux1_x86_64.whl (121.6 MB)
Collecting nvidia-curand-cu12==10.3.2.106 (from torch)
  Using cached nvidia_curand_cu12-10.3.2.106-py3-none-manylinux1_x86_64.whl (56.5 MB)
Collectin

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
import os
os.environ['CUDA_LAUNCH_BLOCKING'] = "1"

In [9]:
import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer
import torch
from tqdm.notebook import tqdm
import numpy as np
import torch.nn as nn

from transformers import BertTokenizer, BertForSequenceClassification, AdamW, get_linear_schedule_with_warmup


class CustomBertForSequenceClassification(BertPreTrainedModel):
    def __init__(self, config):
        super().__init__(config)
        self.num_labels = config.num_labels
        self.bert = BertModel(config)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        self.classifier = nn.Sequential(
            nn.Linear(config.hidden_size, config.hidden_size // 2),
            nn.ReLU(),
            nn.Dropout(config.hidden_dropout_prob),
            nn.Linear(config.hidden_size // 2, config.num_labels)
        )
        self.init_weights()

    def forward(self, input_ids=None, attention_mask=None, token_type_ids=None, labels=None):
        outputs = self.bert(input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids)
        pooled_output = outputs[1]

        pooled_output = self.dropout(pooled_output)
        logits = self.classifier(pooled_output)

        loss = None
        if labels is not None:
            loss_fct = nn.CrossEntropyLoss()
            loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))

        output = (logits,) + outputs[2:]
        return ((loss,) + output) if loss is not None else output

# BERT 모델과 토크나이저 로드
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = CustomBertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=5)

# 데이터 전처리 함수
def preprocess_data(data, tokenizer, max_len=512):
    input_ids = []
    attention_masks = []

    for text in data:
        encoded_dict = tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=max_len,
            pad_to_max_length=True,
            return_attention_mask=True,
            return_tensors='pt',
            truncation=True
        )

        input_ids.append(encoded_dict['input_ids'])
        attention_masks.append(encoded_dict['attention_mask'])

    input_ids = torch.cat(input_ids, dim=0)
    attention_masks = torch.cat(attention_masks, dim=0)

    return input_ids, attention_masks

# 데이터 전처리
train_inputs, train_masks = preprocess_data(train_df['text'].values, tokenizer)
val_inputs, val_masks = preprocess_data(val_df['text'].values, tokenizer)
train_labels = torch.tensor(train_df['stars'].values, dtype=torch.long)
val_labels = torch.tensor(val_df['stars'].values, dtype=torch.long)

# 데이터로더 생성
batch_size = 32  # from 16 to 32

train_data = TensorDataset(train_inputs, train_masks, train_labels)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

val_data = TensorDataset(val_inputs, val_masks, val_labels)
val_sampler = SequentialSampler(val_data)
val_dataloader = DataLoader(val_data, sampler=val_sampler, batch_size=batch_size)

# GPU 설정
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

# 옵티마이저와 스케줄러 설정
optimizer = AdamW(model.parameters(), lr=1e-5, eps=1e-8)  # from 2e-5 to 1e-5
total_steps = len(train_dataloader) * 10  #  from 4 to 10
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)

# 정확도 계산 함수
def accuracy(preds, labels):
    pred_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    return np.sum(pred_flat == labels_flat) / len(labels_flat)

# 모델 학습
epochs = 10  # from 4 to 10

scaler = torch.cuda.amp.GradScaler()  #  for mixed precision training

for epoch in range(epochs):
    print(f"Epoch {epoch + 1}/{epochs}")
    model.train()

    total_loss = 0

    for batch in tqdm(train_dataloader):
        batch = tuple(t.to(device) for t in batch)
        b_input_ids, b_input_mask, b_labels = batch

        model.zero_grad()

        with torch.cuda.amp.autocast():  # Enable autocast for mixed precision
            outputs = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask, labels=b_labels)
            loss = outputs[0]
            total_loss += loss.item()

        scaler.scale(loss).backward()  # Scale loss for mixed precision

        scaler.unscale_(optimizer)  # Unscale gradients before clipping
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        scaler.step(optimizer)  # Step optimizer with scaled gradients
        scaler.update()  # Update the scale factor

        scheduler.step()

    avg_train_loss = total_loss / len(train_dataloader)
    print(f"Training loss: {avg_train_loss}")

    # Validation
    model.eval()

    eval_loss = 0
    eval_accuracy = 0

    for batch in val_dataloader:
        batch = tuple(t.to(device) for t in batch)
        b_input_ids, b_input_mask, b_labels = batch

        with torch.no_grad():
            with torch.cuda.amp.autocast():  # Enable autocast for mixed precision
                outputs = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask)
            logits = outputs[0]
            loss = torch.nn.CrossEntropyLoss()(logits, b_labels)

        eval_loss += loss.item()

        logits = logits.detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()

        eval_accuracy += accuracy(logits, label_ids)

    avg_val_accuracy = eval_accuracy / len(val_dataloader)
    avg_val_loss = eval_loss / len(val_dataloader)

    print(f"Validation loss: {avg_val_loss}")
    print(f"Validation accuracy: {avg_val_accuracy}")

# 테스트 데이터 로드
test_df = pd.read_json('/content/drive/My Drive/split_yelp_reviews/yelp_split_2.json', lines=True)

# 필요한 컬럼만 선택
test_df = test_df[['text', 'stars']]

test_df = test_df.sample(10000, random_state=42)

# 라벨 인코딩을 0부터 4까지로 변환
test_df['stars'] = test_df['stars'] - 1

# 라벨 범위 확인
assert test_df['stars'].min() >= 0 and test_df['stars'].max() <= 4, "Test labels are out of range"

# 테스트 데이터 전처리
test_inputs, test_masks = preprocess_data(test_df['text'].values, tokenizer)
test_labels = torch.tensor(test_df['stars'].values, dtype=torch.long)

test_data = TensorDataset(test_inputs, test_masks, test_labels)
test_sampler = SequentialSampler(test_data)
test_dataloader = DataLoader(test_data, sampler=test_sampler, batch_size=batch_size)

model.eval()

test_loss = 0
test_accuracy = 0
predictions = []

for batch in test_dataloader:
    batch = tuple(t.to(device) for t in batch)
    b_input_ids, b_input_mask, b_labels = batch

    with torch.no_grad():
        with torch.cuda.amp.autocast():  # Enable autocast for mixed precision
            outputs = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask)
        logits = outputs[0]
        loss = torch.nn.CrossEntropyLoss()(logits, b_labels)

    test_loss += loss.item()

    logits = logits.detach().cpu().numpy()
    label_ids = b_labels.to('cpu').numpy()

    predictions.extend(np.argmax(logits, axis=1).flatten())
    test_accuracy += accuracy(logits, label_ids)

avg_test_accuracy = test_accuracy / len(test_dataloader)
avg_test_loss = test_loss / len(test_dataloader)

print(f"Test loss: {avg_test_loss}")
print(f"Test accuracy: {avg_test_accuracy}")


Some weights of CustomBertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.0.bias', 'classifier.0.weight', 'classifier.3.bias', 'classifier.3.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/10


  0%|          | 0/250 [00:00<?, ?it/s]

Training loss: 1.082295440673828
Validation loss: 0.8316979786706349
Validation accuracy: 0.6736111111111112
Epoch 2/10


  0%|          | 0/250 [00:00<?, ?it/s]

Training loss: 0.745140510559082
Validation loss: 0.7686050657242064
Validation accuracy: 0.6959325396825397
Epoch 3/10


  0%|          | 0/250 [00:00<?, ?it/s]

Training loss: 0.6262107200622559
Validation loss: 0.7401878720238095
Validation accuracy: 0.6979166666666666
Epoch 4/10


  0%|          | 0/250 [00:00<?, ?it/s]

Training loss: 0.5427282981872559
Validation loss: 0.7554175967261905
Validation accuracy: 0.6989087301587301
Epoch 5/10


  0%|          | 0/250 [00:00<?, ?it/s]

Training loss: 0.4770048885345459
Validation loss: 0.743927486359127
Validation accuracy: 0.6934523809523809
Epoch 6/10


  0%|          | 0/250 [00:00<?, ?it/s]

Training loss: 0.40473936462402343
Validation loss: 0.7814902653769841
Validation accuracy: 0.6954365079365079
Epoch 7/10


  0%|          | 0/250 [00:00<?, ?it/s]

Training loss: 0.3490335464477539
Validation loss: 0.8147515190972222
Validation accuracy: 0.689484126984127
Epoch 8/10


  0%|          | 0/250 [00:00<?, ?it/s]

Training loss: 0.3114304389953613
Validation loss: 0.8460751488095238
Validation accuracy: 0.7023809523809523
Epoch 9/10


  0%|          | 0/250 [00:00<?, ?it/s]

Training loss: 0.2776127576828003
Validation loss: 0.8712410094246031
Validation accuracy: 0.6989087301587301
Epoch 10/10


  0%|          | 0/250 [00:00<?, ?it/s]

Training loss: 0.2579738845825195
Validation loss: 0.8623240637400794
Validation accuracy: 0.7008928571428571




Test loss: 0.9192729133386581
Test accuracy: 0.6878993610223643
