 Google Drive 마운트

In [None]:
# Mount Google Drive so the notebook can reach the data file stored there
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


In [None]:
# 1. 필요한 라이브러리 임포트
import pandas as pd
import torch
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
import shutil

# Device selection: use the GPU when CUDA is available, otherwise the CPU
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print(f"Using device: {device}")


Using device: cuda


In [None]:
# 2. Load and preprocess the data
# The CSV file lives on Google Drive, so the drive is (re)mounted before reading it.
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

path = '/content/drive/MyDrive/Colab Notebooks/na_rm data.csv'
data = pd.read_csv(path)

# Inspect the structure of the loaded data
print(data.columns)
print(data.head())

# Drop rows in which either the Lexile score or the text is missing
data = data.dropna(subset=['lexile', 'text'])

# Separate the texts from their Lexile scores
texts = data['text'].tolist()
lexile_scores = data['lexile'].tolist()

# Bucket the Lexile scores into 8 categories.
# pd.cut builds right-closed intervals, so without include_lowest=True the first
# bin would be (0, 630] and a score of exactly 0 would turn into NaN, which then
# breaks the integer conversion below. include_lowest=True makes the first bin
# cover 0 as well.
bins = [0, 630, 760, 850, 930, 990, 1120, 1300, float('inf')]
labels = list(range(8))  # 0, 1, 2, 3, 4, 5, 6, 7
lexile_categories = pd.cut(lexile_scores, bins=bins, labels=labels, include_lowest=True)

# Scores outside the bins (e.g. negative values) still map to NaN. Report them
# explicitly instead of failing later with an opaque NaN-to-int conversion error.
out_of_range = pd.isna(lexile_categories)
if out_of_range.any():
    bad_scores = sorted({score for score, bad in zip(lexile_scores, out_of_range) if bad})
    raise ValueError(
        f"{int(out_of_range.sum())} Lexile score(s) fall outside the bins {bins}; "
        f"examples: {bad_scores[:5]}"
    )

# Convert the categorical bins to plain Python integers
lexile_categories = lexile_categories.astype(int).tolist()

Mounted at /content/drive
Index(['lexile', 'text'], dtype='object')
   lexile                                               text
0    1100  International scam artists use clever schemes ...
1    1140  A critical election loomed. The country was de...
2     810  It was a beautiful night in late August. We we...
3     700  As Angela stared out the school bus window, he...
4    1110  In the 1950s and 1960s, racial conflicts were ...


In [None]:
# 3. 데이터셋 클래스 정의
class LexileDataset(Dataset):
    """Map-style dataset that tokenizes one text per lookup and pairs it with its label.

    Args:
        texts: Indexable collection of input texts; each entry is passed through ``str``.
        labels: Indexable collection of class labels; each entry is passed through ``int``.
        tokenizer: Callable invoked as ``tokenizer(text, padding=..., truncation=...,
            max_length=..., return_tensors='pt')`` that returns a mapping of tensors.
        max_length: Length forwarded to the tokenizer for padding/truncation.
    """

    def __init__(self, texts, labels, tokenizer, max_length=256):
        self.texts, self.labels = texts, labels
        self.tokenizer, self.max_length = tokenizer, max_length

    def __len__(self):
        """Return the number of texts held by the dataset."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Tokenize the text at ``idx`` and return its tensors plus a ``labels`` entry."""
        passage = str(self.texts[idx])
        target = int(self.labels[idx])

        # Tokenize a single text; the tokenizer output carries a leading dimension
        tokenized = self.tokenizer(
            passage,
            padding='max_length',
            truncation=True,
            max_length=self.max_length,
            return_tensors='pt',
        )

        # Remove that leading dimension from every returned tensor
        sample = {}
        for name, tensor in tokenized.items():
            sample[name] = tensor.squeeze(0)
        sample['labels'] = torch.tensor(target, dtype=torch.long)

        return sample


In [None]:
# 4. Load the tokenizer and build the datasets
# Pretrained BERT tokenizer matching the 'bert-base-uncased' checkpoint
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Hold out 20% of the samples for validation; the fixed random_state keeps the split repeatable
split_parts = train_test_split(texts, lexile_categories, test_size=0.2, random_state=42)
train_texts, val_texts, train_labels, val_labels = split_parts

# Wrap each split in the custom dataset
train_dataset = LexileDataset(texts=train_texts, labels=train_labels, tokenizer=tokenizer)
val_dataset = LexileDataset(texts=val_texts, labels=val_labels, tokenizer=tokenizer)


In [None]:
# 5. Model and training configuration
# BERT with a sequence-classification head sized for the 8 Lexile categories
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=8)
model.to(device)

# Training options collected in one place and unpacked into TrainingArguments
training_options = {
    'output_dir': './results',          # output directory
    'evaluation_strategy': "epoch",     # evaluate at the end of every epoch
    'learning_rate': 2e-5,
    'per_device_train_batch_size': 16,
    'per_device_eval_batch_size': 16,
    'num_train_epochs': 3,
    'weight_decay': 0.01,
    'logging_dir': './logs',            # log directory
    'logging_steps': 10,                # log every 10 steps
    'report_to': "tensorboard",         # send logs to TensorBoard
}
training_args = TrainingArguments(**training_options)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
)


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# 6. Train the model with the Trainer configured in the previous cell
trainer.train()


Epoch,Training Loss,Validation Loss
1,1.867,1.781707
2,1.6302,1.519786
3,1.4254,1.472857


TrainOutput(global_step=246, training_loss=1.6949927613018005, metrics={'train_runtime': 315.8836, 'train_samples_per_second': 12.327, 'train_steps_per_second': 0.779, 'total_flos': 512304821968896.0, 'train_loss': 1.6949927613018005, 'epoch': 3.0})

In [None]:
# 7. Evaluate the model on the evaluation dataset given to the Trainer
trainer.evaluate()

{'eval_loss': 1.4728572368621826,
 'eval_runtime': 13.4613,
 'eval_samples_per_second': 24.143,
 'eval_steps_per_second': 1.56,
 'epoch': 3.0}

In [None]:
# 8. Predict the Lexile band of a new text
# NOTE: this cell relies on `tokenizer`, `model` and `device` created by the earlier
# cells. After a kernel restart, re-run those cells first; otherwise it fails with a
# NameError such as "name 'tokenizer' is not defined".
test_text = '''
As soon as the light in the bedroom went out there was a stirring and a
fluttering all through the farm buildings. Word had gone round during the
day that old Major, the prize Middle White boar, had had a strange dream
on the previous night and wished to communicate it to the other animals.
It had been agreed that they should all meet in the big barn as soon as
Mr. Jones was safely out of the way. Old Major (so he was always called,
though the name under which he had been exhibited was Willingdon Beauty)
was so highly regarded on the farm that everyone was quite ready to lose
an hour's sleep in order to hear what he had to say.
'''
encoded_input = tokenizer(test_text, return_tensors='pt', padding='max_length', truncation=True, max_length=256)
encoded_input = {key: val.to(device) for key, val in encoded_input.items()}  # move inputs to the model's device

# Inference only: put the model in eval mode so dropout is disabled (the prediction
# is then deterministic even if this cell runs right after training) and skip
# gradient tracking, which is not needed here and only costs memory.
model.eval()
with torch.no_grad():
    output = model(**encoded_input)
predicted_label = torch.argmax(output.logits, dim=1)

# Human-readable Lexile range for each predicted category index
categories = {
    0: "0-630 (0%-10%)",
    1: "631-760 (10%-20%)",
    2: "761-850 (20%-30%)",
    3: "851-930 (30%-40%)",
    4: "931-990 (40%-50%)",
    5: "991-1120 (50%-70%)",
    6: "1121-1300 (70%-90%)",
    7: "1301-2650 (90%-100%)"
}
print(f"Predicted Lexile Level: {categories[predicted_label.item()]}")


NameError: name 'tokenizer' is not defined

In [None]:
# 9. Save the trained model and download it
# Write the model first, then the tokenizer, into the same directory
save_dir = './saved_model'
for artifact in (model, tokenizer):
    artifact.save_pretrained(save_dir)

# Zip the saved directory and download the archive through Google Colab
from google.colab import files
shutil.make_archive('saved_model', 'zip', save_dir)
files.download('saved_model.zip')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>