In [1]:
!pip install transformers
!pip install torch
!pip install pandas
!pip install scikit-learn
!pip install tqdm

Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (14.1 MB)
Collecting nvidia-cudnn-cu12==8.9.2.26 (from torch)
  Using cached nvidia_cudnn_cu12-8.9.2.26-py3-none-manylinux1_x86_64.whl (731.7 MB)
Collecting nvidia-cublas-cu12==12.1.3.1 (from torch)
  Using cached nvidia_cublas_cu12-12.1.3.1-py3-none-manylinux1_x86_64.whl (410.6 MB)
Collecting nvidia-cufft-cu12==11.0.2.54 (from torch)
  Using cached nvidia_cufft_cu12-11.0.2.54-py3-none-manylinux1_x86_64.whl (121.6 MB)
Collecting nvidia-curand-cu12==10.3.2.106 (from torch)
  Using cached nvidia_curand_cu12-10.3.2.106-py3-none-manylinux1_x86_64.whl (56.5 MB)
Collectin

In [2]:
# Mount Google Drive at /content/drive; the later cells read and write their
# data files under '/content/drive/My Drive/'.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [1]:
import json

def split_json(file_path, output_path, max_file_size=500):
    """Split a JSON Lines file into consecutive chunk files of bounded size.

    Each non-blank line of ``file_path`` is parsed, re-serialised with
    ``json.dumps`` and streamed straight into
    ``{output_path}/yelp_split_<n>.json`` (``n`` starts at 1), so only one
    record is held in memory at a time. A new chunk is started whenever
    appending the next record would push the current chunk past the limit.

    Args:
        file_path: Path of the input file, one JSON document per line (UTF-8).
        output_path: Existing directory that receives the chunk files.
        max_file_size: Upper bound for a chunk in MB (1 MB = 1024 * 1024
            bytes). A single record larger than the limit is still written,
            alone in its own chunk, instead of producing an empty file.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        OSError: If the input cannot be read or a chunk cannot be written.
    """
    max_file_size_bytes = max_file_size * 1024 * 1024  # Convert MB to bytes
    chunk_index = 0
    chunk_size = 0
    chunk_path = None
    out_f = None

    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            for line in f:
                # Blank lines (e.g. a trailing newline) carry no record.
                if not line.strip():
                    continue

                data_str = json.dumps(json.loads(line)) + "\n"
                data_size = len(data_str.encode('utf-8'))

                # Roll over only when the current chunk already holds data,
                # so an oversized record never leaves an empty file behind.
                if out_f is not None and chunk_size + data_size > max_file_size_bytes:
                    out_f.close()
                    out_f = None
                    print(f"Saved {chunk_path} with size {chunk_size / (1024 * 1024):.2f} MB")

                if out_f is None:
                    chunk_index += 1
                    chunk_size = 0
                    chunk_path = f"{output_path}/yelp_split_{chunk_index}.json"
                    out_f = open(chunk_path, 'w', encoding='utf-8')

                out_f.write(data_str)
                chunk_size += data_size

        # Finish the last (possibly partial) chunk.
        if out_f is not None:
            out_f.close()
            out_f = None
            print(f"Saved {chunk_path} with size {chunk_size / (1024 * 1024):.2f} MB")
    finally:
        # Only reached with an open handle when an error interrupted the loop.
        if out_f is not None:
            out_f.close()

# Path configuration
input_file_path = '/content/drive/My Drive/yelp_academic_dataset_review.json'
output_directory = '/content/drive/My Drive/split_yelp_reviews'

# Create the output directory; exist_ok makes the cell safe to re-run without
# a separate (race-prone) existence check.
import os
os.makedirs(output_directory, exist_ok=True)

# Split the JSON file into chunks of at most 500 MB
split_json(input_file_path, output_directory, max_file_size=500)


Saved /content/drive/My Drive/split_yelp_reviews/yelp_split_1.json with size 500.00 MB
Saved /content/drive/My Drive/split_yelp_reviews/yelp_split_2.json with size 500.00 MB
Saved /content/drive/My Drive/split_yelp_reviews/yelp_split_3.json with size 500.00 MB
Saved /content/drive/My Drive/split_yelp_reviews/yelp_split_4.json with size 500.00 MB
Saved /content/drive/My Drive/split_yelp_reviews/yelp_split_5.json with size 500.00 MB
Saved /content/drive/My Drive/split_yelp_reviews/yelp_split_6.json with size 500.00 MB
Saved /content/drive/My Drive/split_yelp_reviews/yelp_split_7.json with size 500.00 MB
Saved /content/drive/My Drive/split_yelp_reviews/yelp_split_8.json with size 500.00 MB
Saved /content/drive/My Drive/split_yelp_reviews/yelp_split_9.json with size 500.00 MB
Saved /content/drive/My Drive/split_yelp_reviews/yelp_split_10.json with size 500.00 MB
Saved /content/drive/My Drive/split_yelp_reviews/yelp_split_11.json with size 207.93 MB


In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer
import torch

# Load the first Yelp chunk, keep only the review text and star rating,
# and draw a reproducible 10,000-row sample.
df = (
    pd.read_json('/content/drive/My Drive/split_yelp_reviews/yelp_split_1.json', lines=True)
    .loc[:, ['text', 'stars']]
    .sample(10000, random_state=42)
)

# Hold out 20% of the sample for validation.
train_df, val_df = train_test_split(df, test_size=0.2, random_state=42)

# Shift the star ratings down by one so the labels run from 0 to 4.
train_df = train_df.assign(stars=train_df['stars'] - 1)
val_df = val_df.assign(stars=val_df['stars'] - 1)

# Load the BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Data preprocessing function
def preprocess_data(data, tokenizer, max_length=128):
    """Tokenize texts into fixed-length input-ID and attention-mask tensors.

    Args:
        data: Iterable of text strings to encode.
        tokenizer: Callable tokenizer (e.g. a Hugging Face tokenizer). It is
            called once per text and must return a mapping with
            ``'input_ids'`` and ``'attention_mask'`` tensors whose first
            dimension is 1.
        max_length: Length every sequence is padded or truncated to.

    Returns:
        Tuple ``(input_ids, attention_masks)``: the per-text tensors
        concatenated along dimension 0. For empty ``data`` both are empty
        ``(0, max_length)`` long tensors.
    """
    input_ids = []
    attention_masks = []

    for review in data:
        # padding='max_length' replaces the deprecated pad_to_max_length flag,
        # and truncation is requested explicitly instead of relying on the
        # tokenizer's fallback (which emits a warning).
        encoded = tokenizer(
            review,
            add_special_tokens=True,
            max_length=max_length,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )
        input_ids.append(encoded['input_ids'])
        attention_masks.append(encoded['attention_mask'])

    # torch.cat rejects an empty list, so return correctly shaped empties.
    if not input_ids:
        empty = torch.empty((0, max_length), dtype=torch.long)
        return empty, empty.clone()

    return torch.cat(input_ids, dim=0), torch.cat(attention_masks, dim=0)

# Tokenize the review text of the training and validation splits
train_inputs, train_masks = preprocess_data(train_df['text'].values, tokenizer)
val_inputs, val_masks = preprocess_data(val_df['text'].values, tokenizer)

# Extract the labels as int64 tensors
train_labels, val_labels = (
    torch.tensor(frame['stars'].values, dtype=torch.long)
    for frame in (train_df, val_df)
)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]



config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


In [4]:
import os
# Debugging aid: CUDA_LAUNCH_BLOCKING=1 asks CUDA to run kernel launches
# synchronously so that errors are reported at the call that caused them.
# NOTE(review): this slows GPU work and presumably has to be set before CUDA
# is first initialised to take effect - confirm, and drop it once debugging
# is finished.
os.environ['CUDA_LAUNCH_BLOCKING'] = "1"

In [6]:
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler

batch_size = 16

# Bundle inputs, attention masks and labels per split
train_data = TensorDataset(train_inputs, train_masks, train_labels)
val_data = TensorDataset(val_inputs, val_masks, val_labels)

# Training batches are drawn in random order, validation batches in order
train_sampler = RandomSampler(train_data)
val_sampler = SequentialSampler(val_data)

train_dataloader = DataLoader(dataset=train_data, batch_size=batch_size, sampler=train_sampler)
val_dataloader = DataLoader(dataset=val_data, batch_size=batch_size, sampler=val_sampler)


In [9]:
# AdamW comes from torch.optim: the copy that transformers used to export was
# deprecated and is not importable from current transformers releases.
from torch.optim import AdamW
from transformers import BertForSequenceClassification, get_linear_schedule_with_warmup

# Train on the GPU when one is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Pretrained BERT encoder with a freshly initialised 5-way classification head
model = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=5,
    output_attentions=False,
    output_hidden_states=False,
)

model.to(device)

# weight_decay=0.0 keeps the default of the former transformers.AdamW
# (torch.optim.AdamW would otherwise default to 0.01).
optimizer = AdamW(model.parameters(), lr=2e-5, eps=1e-8, weight_decay=0.0)

epochs = 10
# One scheduler step is taken per training batch
total_steps = len(train_dataloader) * epochs

# Linear decay of the learning rate to zero over training, without warm-up
scheduler = get_linear_schedule_with_warmup(optimizer,
                                            num_warmup_steps=0,
                                            num_training_steps=total_steps)


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [10]:
import numpy as np
from tqdm.notebook import tqdm

def accuracy(preds, labels):
    """Return the fraction of rows whose arg-max over axis 1 equals the label.

    Args:
        preds: Array of per-class scores, one row per sample.
        labels: NumPy array of integer class labels.
    """
    predicted_classes = np.argmax(preds, axis=1).ravel()
    true_classes = labels.ravel()
    hits = predicted_classes == true_classes
    return np.mean(hits)

# Fine-tune for `epochs` passes over the training data, validating after each.
# NOTE(review): in the recorded run the validation loss is lowest at epoch 2
# and rises afterwards while the training loss keeps falling - consider fewer
# epochs or keeping the best checkpoint.
for epoch in range(epochs):
    model.train()
    total_loss = 0

    for batch in tqdm(train_dataloader):
        b_input_ids, b_input_mask, b_labels = (t.to(device) for t in batch)

        model.zero_grad()

        # Passing labels makes the model return the classification loss.
        outputs = model(b_input_ids,
                        token_type_ids=None,
                        attention_mask=b_input_mask,
                        labels=b_labels)
        loss = outputs.loss
        total_loss += loss.item()

        loss.backward()
        optimizer.step()
        scheduler.step()

    avg_train_loss = total_loss / len(train_dataloader)

    print(f"Epoch {epoch+1}/{epochs}")
    print(f"Average training loss: {avg_train_loss}")

    model.eval()

    # Accumulate per-sample totals so that a smaller final batch does not get
    # the same weight as a full one when averaging.
    eval_loss = 0.0
    eval_correct = 0
    eval_samples = 0

    for batch in val_dataloader:
        b_input_ids, b_input_mask, b_labels = (t.to(device) for t in batch)

        with torch.no_grad():
            outputs = model(b_input_ids,
                            token_type_ids=None,
                            attention_mask=b_input_mask)
            logits = outputs.logits
            eval_loss += torch.nn.functional.cross_entropy(
                logits, b_labels, reduction='sum'
            ).item()

        eval_correct += (logits.argmax(dim=1) == b_labels).sum().item()
        eval_samples += b_labels.size(0)

    avg_val_accuracy = eval_correct / eval_samples
    avg_val_loss = eval_loss / eval_samples

    print(f"Validation loss: {avg_val_loss}")
    print(f"Validation accuracy: {avg_val_accuracy}")


  0%|          | 0/500 [00:00<?, ?it/s]

Epoch 1/10
Average training loss: 0.9077893990278244
Validation loss: 0.7525830104351043
Validation accuracy: 0.6895


  0%|          | 0/500 [00:00<?, ?it/s]

Epoch 2/10
Average training loss: 0.6568946364223958
Validation loss: 0.7455077424049378
Validation accuracy: 0.6895


  0%|          | 0/500 [00:00<?, ?it/s]

Epoch 3/10
Average training loss: 0.48634009532630446
Validation loss: 0.8628284083604812
Validation accuracy: 0.6875


  0%|          | 0/500 [00:00<?, ?it/s]

Epoch 4/10
Average training loss: 0.3251379416808486
Validation loss: 0.9297824131250382
Validation accuracy: 0.688


  0%|          | 0/500 [00:00<?, ?it/s]

Epoch 5/10
Average training loss: 0.19869103645160793
Validation loss: 1.1266707321703433
Validation accuracy: 0.687


  0%|          | 0/500 [00:00<?, ?it/s]

Epoch 6/10
Average training loss: 0.11409741891548038
Validation loss: 1.3124085838794708
Validation accuracy: 0.681


  0%|          | 0/500 [00:00<?, ?it/s]

Epoch 7/10
Average training loss: 0.07739778366219252
Validation loss: 1.366257292985916
Validation accuracy: 0.6795


  0%|          | 0/500 [00:00<?, ?it/s]

Epoch 8/10
Average training loss: 0.04459299302008003
Validation loss: 1.4552232463359833
Validation accuracy: 0.689


  0%|          | 0/500 [00:00<?, ?it/s]

Epoch 9/10
Average training loss: 0.0315625091153197
Validation loss: 1.5241228246688843
Validation accuracy: 0.6735


  0%|          | 0/500 [00:00<?, ?it/s]

Epoch 10/10
Average training loss: 0.02275758000370115
Validation loss: 1.5446725959777832
Validation accuracy: 0.6805


In [14]:
# Load the test chunk, keep only the needed columns, draw a reproducible
# 10,000-row sample and shift the labels so they run from 0 to 4.
test_df = (
    pd.read_json('/content/drive/My Drive/split_yelp_reviews/yelp_split_2.json', lines=True)
    .loc[:, ['text', 'stars']]
    .sample(10000, random_state=42)
    .assign(stars=lambda frame: frame['stars'] - 1)
)

# Check the label range
assert test_df['stars'].min() >= 0 and test_df['stars'].max() <= 4, "Test labels are out of range"

# Preprocess the test data
test_inputs, test_masks = preprocess_data(test_df['text'].values, tokenizer)
test_labels = torch.tensor(test_df['stars'].values, dtype=torch.long)

# Batches are served in order
test_data = TensorDataset(test_inputs, test_masks, test_labels)
test_sampler = SequentialSampler(test_data)
test_dataloader = DataLoader(dataset=test_data, batch_size=batch_size, sampler=test_sampler)




In [15]:
# Evaluate the fine-tuned model on the held-out test sample.
model.eval()

# Accumulate per-sample totals so that a smaller final batch does not get the
# same weight as a full one when averaging.
test_loss = 0.0
test_correct = 0
test_samples = 0
predictions = []  # predicted class index for every test row, in loader order

for batch in test_dataloader:
    b_input_ids, b_input_mask, b_labels = (t.to(device) for t in batch)

    with torch.no_grad():
        outputs = model(b_input_ids,
                        token_type_ids=None,
                        attention_mask=b_input_mask)
        logits = outputs.logits
        test_loss += torch.nn.functional.cross_entropy(
            logits, b_labels, reduction='sum'
        ).item()

    batch_preds = logits.argmax(dim=1)
    predictions.extend(batch_preds.cpu().numpy())
    test_correct += (batch_preds == b_labels).sum().item()
    test_samples += b_labels.size(0)

avg_test_accuracy = test_correct / test_samples
avg_test_loss = test_loss / test_samples

print(f"Test loss: {avg_test_loss}")
print(f"Test accuracy: {avg_test_accuracy}")


Test loss: 1.6129379770636558
Test accuracy: 0.6708
