In [None]:
# Check whether a CUDA-capable GPU is visible to PyTorch
import torch

if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

print(f"Using device: {device}")


Using device: cuda


In [None]:
pip install tqdm



In [None]:
import os

# Set CUDA_LAUNCH_BLOCKING=1 for this process.
# NOTE(review): presumably a debugging aid to make CUDA errors surface at the
# failing call — confirm it is still needed. This cell runs after the cell
# that already queried CUDA, so verify the setting actually takes effect.
os.environ.update({"CUDA_LAUNCH_BLOCKING": "1"})

In [None]:
from transformers import pipeline
import json
from collections import Counter
from tqdm import tqdm

# Step 1: load a checkpoint that was actually fine-tuned for sentiment analysis.
# The bare "distilbert-base-uncased" checkpoint ships without a trained
# classification head (transformers warns that the classifier weights are
# "newly initialized"), so its labels and scores are effectively random.
sentiment_model = pipeline(
    "sentiment-analysis",
    model="distilbert-base-uncased-finetuned-sst-2-english",
)

# Maximum input length in *tokens*; enforced by the tokenizer via truncation
MAX_TOKEN_LENGTH = 512

# Number of comments sent through the model per forward pass
BATCH_SIZE = 32

# Fallback label mapping (LABEL_0 -> NEGATIVE, LABEL_1 -> POSITIVE) for
# checkpoints that emit generic label names; other labels pass through as-is.
label_mapping = {"LABEL_0": "NEGATIVE", "LABEL_1": "POSITIVE"}


def analyze_comments(comments):
    """Run sentiment analysis on one match's comments.

    Args:
        comments: Non-empty list of comment strings.

    Returns:
        A dict with two keys:
        - "sentiment_ratios": percentage of POSITIVE and NEGATIVE comments.
        - "comments": one entry per input comment holding the original text,
          the (mapped) sentiment label and the model's confidence score.
    """
    # Let the tokenizer truncate to the model's token limit. Slicing the raw
    # string by characters discards text needlessly and still does not
    # guarantee that the tokenized input fits.
    predictions = sentiment_model(
        comments,
        batch_size=BATCH_SIZE,
        truncation=True,
        max_length=MAX_TOKEN_LENGTH,
    )

    # Apply the label mapping without mutating the pipeline's output
    labels = [
        label_mapping.get(prediction["label"], prediction["label"])
        for prediction in predictions
    ]

    # Count comments per sentiment
    sentiment_counts = Counter(labels)
    total_comments = len(labels)

    # Sentiment ratios as percentages
    sentiment_ratios = {
        "POSITIVE": sentiment_counts.get("POSITIVE", 0) / total_comments * 100,
        "NEGATIVE": sentiment_counts.get("NEGATIVE", 0) / total_comments * 100,
    }

    return {
        "sentiment_ratios": sentiment_ratios,
        "comments": [
            {
                "comment": comment,
                "sentiment": label,              # mapped sentiment label
                "score": prediction["score"],    # model confidence score
            }
            for comment, label, prediction in zip(comments, labels, predictions)
        ],
    }


# Step 2: load the JSON file
file_path = "/content/cleaned_english_comments_no_emojis.json"  # input file path
with open(file_path, "r", encoding="utf-8") as file:
    data = json.load(file)

# Step 3: attach sentiment analysis results to every match
print("Sentiment analysis in progress...")
for match in tqdm(data, desc="Processing matches"):
    comments = match.get("comments", [])
    # Matches without comments are left untouched (no "comments_analysis" key)
    if comments:
        match["comments_analysis"] = analyze_comments(comments)

# Step 4: save the augmented data to a new JSON file
output_path = "/content/sentiment_analysis_with_original_structure.json"
with open(output_path, "w", encoding="utf-8") as outfile:
    json.dump(data, outfile, indent=4, ensure_ascii=False)

print(f"Sentiment analysis completed and saved to {output_path}")


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Device set to use cuda:0


Sentiment analysis in progress...


Processing matches: 100%|██████████| 130/130 [02:52<00:00,  1.32s/it]


Sentiment analysis completed and saved to /content/sentiment_analysis_with_original_structure.json


In [None]:

# Also save a copy of the augmented data to Google Drive
output_path = "/content/drive/MyDrive/Colab Notebooks/빅데이터/sentiment_analysis_with_original_structure.json"

# Serialize first, then write the text out in a single call
serialized = json.dumps(data, indent=4, ensure_ascii=False)
with open(output_path, "w", encoding="utf-8") as drive_file:
    drive_file.write(serialized)

print(f"Sentiment analysis completed and saved to {output_path}")

Sentiment analysis completed and saved to /content/drive/MyDrive/Colab Notebooks/빅데이터/sentiment_analysis_with_original_structure.json
