In [17]:
from transformers import AutoTokenizer, BertForSequenceClassification
import torch

# Hugging Face Hub checkpoint id used for both the tokenizer and the model
model_name = 'skt/kobert-base-v1'

# Use AutoTokenizer to automatically select the correct tokenizer class
# NOTE(review): confirm that AutoTokenizer resolves to the tokenizer this
# checkpoint was trained with (e.g. tokenize a known sentence and inspect it).
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Load the BERT encoder with a 3-way sequence-classification head.
# NOTE(review): the warning recorded in this cell's output reports that
# 'classifier.weight' and 'classifier.bias' are newly initialized, i.e. the
# classification head is untrained here. Predictions are not meaningful until
# the model is fine-tuned or a fine-tuned checkpoint is loaded instead.
model = BertForSequenceClassification.from_pretrained(model_name, num_labels=3)

# Run on the GPU when CUDA is available, otherwise on the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
model.eval()  # switch to inference mode (disables dropout)

print("Tokenizer and model loaded successfully")


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at skt/kobert-base-v1 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Tokenizer and model loaded successfully


In [20]:
# Path to the input Excel file (absolute, machine-specific Windows path)
file_path = r"C:\Users\user\Desktop\도서관_공모전\최종\2_Selenium_책소개\book_introductions.xlsx"

# Load the Excel file into a DataFrame
# NOTE(review): `pd` is not imported in the cells visible here; it must be
# provided by an earlier cell (`import pandas as pd`) for a clean re-run.
df = pd.read_excel(file_path)

# Convert a text into the input format expected by the KoBERT model
def preprocess(text, max_length=512):
    """Tokenize a single text into fixed-length model inputs.

    Args:
        text: Text to encode. Non-string values are converted with ``str()``.
        max_length: Length every sequence is padded or truncated to.
            Defaults to 512, the value this notebook uses as KoBERT's
            maximum input length.

    Returns:
        A tuple ``(input_ids, attention_mask)`` taken from the tokenizer
        output, each with its leading batch dimension squeezed away.
    """
    if not isinstance(text, str):
        text = str(text)  # coerce non-string input to a string
    # Call the tokenizer object directly: `encode_plus` is deprecated in
    # favour of `__call__`, which accepts the same arguments for one text.
    encoding = tokenizer(
        text,
        max_length=max_length,
        add_special_tokens=True,
        padding='max_length',
        truncation=True,  # cut texts longer than max_length
        return_attention_mask=True,
        return_tensors='pt',
    )
    return encoding['input_ids'].squeeze(0), encoding['attention_mask'].squeeze(0)

# Sentiment prediction for a single text
def predict(text):
    """Classify one text and return its label with the class probabilities.

    The text is encoded with ``preprocess``, moved to ``device`` and run
    through ``model`` as a batch of one with gradients disabled.

    Args:
        text: The text to classify.

    Returns:
        A tuple ``(label, probabilities)``. ``label`` is '긍정' (positive),
        '부정' (negative) or '중립' (neutral) for class index 0, 1 or 2.
        ``probabilities`` is the softmax of the logits for the single sample
        as a NumPy array. If any exception is raised, the error is printed
        and ``('오류', [0, 0, 0])`` is returned instead; note that the
        fallback probabilities are a plain list, not an array.
    """
    try:
        ids, mask = preprocess(text)
        # Move to the model's device, then add the batch dimension
        batch_ids = ids.to(device).unsqueeze(0)
        batch_mask = mask.to(device).unsqueeze(0)

        with torch.no_grad():
            logits = model(input_ids=batch_ids, attention_mask=batch_mask).logits
            probabilities = torch.softmax(logits, dim=-1).cpu().numpy()[0]
            prediction = torch.argmax(logits, dim=-1).cpu().numpy()[0]

        return {0: '긍정', 1: '부정', 2: '중립'}[prediction], probabilities
    except Exception as e:
        print(f"Error processing text: {text}. Error: {e}")
        # Best-effort fallback so one bad row does not abort a long run
        return '오류', [0, 0, 0]

# Run sentiment analysis over the whole frame with a tqdm progress bar
def analyze_sentiments(df):
    """Predict the sentiment of every entry in the '책 소개' column.

    Missing values in the column are replaced with empty strings before
    prediction, and progress is reported through ``tqdm``.

    Args:
        df: Frame containing a '책 소개' (book introduction) column.

    Returns:
        A list with one dict per row, in row order, holding the keys
        'sentiment' and 'probabilities' as returned by ``predict``.
    """
    def _as_record(entry):
        sentiment, probabilities = predict(entry)
        return {'sentiment': sentiment, 'probabilities': probabilities}

    texts = df['책 소개'].fillna('')  # NaN -> empty string
    return [_as_record(entry) for entry in tqdm(texts, desc="감정 분석 진행 중")]

# Run the sentiment analysis over every row.
# The progress bar recorded for this cell shows 5000 rows taking 4:38:25
# (about 3.34 s per row), so re-running this cell is expensive.
results = analyze_sentiments(df)
df_results = pd.DataFrame(results)

# Attach the results to the original data (DataFrame.join aligns on the index)
df = df.join(df_results)

# Save the combined result to a new Excel file
output_path = r"C:\Users\user\Desktop\도서관_공모전\최종\2_Selenium_책소개\book_introductions_with_predictions.xlsx"
df.to_excel(output_path, index=False)

감정 분석 진행 중: 100%|█████████████████████████████████████████████████████████| 5000/5000 [4:38:25<00:00,  3.34s/it]
