In [1]:
# 1. 필요 라이브러리 임포트
import os
import sys
sys.path.append(os.path.abspath("../src"))
import pandas as pd
import time
import altair as alt
import re
import json

# --- Groq API configuration ---
import openai

# Never embed the key in the notebook: anything saved here is shared with every
# copy of the file. Take it from the environment, or prompt for it without echo.
# NOTE(review): a key that was previously committed in this cell should be
# treated as leaked and revoked in the Groq console.
if not os.environ.get("GROQ_API_KEY"):
    from getpass import getpass
    os.environ["GROQ_API_KEY"] = getpass("GROQ_API_KEY: ").strip()
if not os.environ["GROQ_API_KEY"]:
    raise RuntimeError("GROQ_API_KEY is not set; export it before running this notebook.")

# Point the OpenAI-compatible client at Groq's endpoint.
openai.api_key = os.environ["GROQ_API_KEY"]
openai.api_base = "https://api.groq.com/openai/v1"

In [2]:
# Lookup table: emotion label -> valence score.
# Scores are listed in descending order; "neutral" sits at 0.50 and the
# values appear to run from most positive (1.00) downward.
# Insertion order matters: the key list is embedded verbatim in the LLM prompt.
emotion_valence = {
    "joy": 1.00,
    "love": 0.95,
    "affection": 0.90,
    "gratitude": 0.88,
    "excitement": 0.85,
    "amusement": 0.82,
    "relief": 0.80,
    "pride": 0.78,
    "confidence": 0.75,
    "ambition": 0.73,
    "protectiveness": 0.70,
    "determination": 0.68,
    "anticipation": 0.65,
    "respect": 0.63,
    "curiosity": 0.60,
    "surprise": 0.58,
    "incredulity": 0.55,
    "authority": 0.52,
    "calm": 0.50,
    "neutral": 0.50,
    "neutrality": 0.50,
    "seriousness": 0.48,
    "caution": 0.45,
    "concern": 0.42,
    "nostalgia": 0.40,
    "awe": 0.38,
}

In [3]:
def parse_srt_file(filepath):
    """Read an SRT subtitle file and return its cues as plain dicts.

    Args:
        filepath: Path to a UTF-8 encoded ``.srt`` file (with or without a BOM).

    Returns:
        A list with one dict per cue, in file order, with the keys:
        ``id`` (1-based position in the file, not the index written in the
        file), ``start`` and ``end`` (seconds as floats) and ``text`` (cue
        content with surrounding whitespace stripped).

    Raises:
        OSError: if the file cannot be opened.
        UnicodeDecodeError: if the file is not valid UTF-8.
        Whatever ``srt.parse`` raises for malformed subtitle text.
    """
    import srt  # third-party parser, imported lazily as in the original cell

    # "utf-8-sig" decodes plain UTF-8 exactly like "utf-8" but also drops a
    # leading byte-order mark, so the parser never sees "\ufeff" before the
    # first cue index.
    with open(filepath, 'r', encoding='utf-8-sig') as f:
        srt_text = f.read()

    return [
        {
            "id": position,
            "start": cue.start.total_seconds(),
            "end": cue.end.total_seconds(),
            "text": cue.content.strip(),
        }
        for position, cue in enumerate(srt.parse(srt_text), start=1)
    ]

In [4]:
def _fallback_analysis():
    """Return a fresh placeholder result for replies that cannot be used."""
    return {"emotions": [], "situation": "unknown", "situation_type": "unknown"}


def _normalize_analysis(parsed):
    """Reduce a decoded reply to the three expected keys with predictable types."""
    result = _fallback_analysis()
    emotions = parsed.get("emotions")
    if isinstance(emotions, str):
        # The model sometimes returns a single label instead of a list.
        emotions = [emotions]
    if isinstance(emotions, list):
        result["emotions"] = [label for label in emotions if isinstance(label, str)]
    for key in ("situation", "situation_type"):
        if isinstance(parsed.get(key), str):
            result[key] = parsed[key]
    return result


def analyze_subtitle(text, emotion_valence, model="llama3-70b-8192", temperature=0.7):
    """Ask the chat model to tag one subtitle line with emotions and a situation.

    Args:
        text: The subtitle text to analyse.
        emotion_valence: Mapping whose keys are the only emotion labels the
            model is told it may choose from.
        model: Chat model name passed to the completion endpoint.
        temperature: Sampling temperature passed to the endpoint. The default
            keeps the original value; lower values give more repeatable labels.

    Returns:
        A dict with exactly three keys: ``emotions`` (list of str),
        ``situation`` (str) and ``situation_type`` (str). When the reply holds
        no JSON object or the JSON cannot be decoded, the placeholder
        ``{"emotions": [], "situation": "unknown", "situation_type": "unknown"}``
        is returned. Any other keys in the reply are discarded so that a
        caller merging the result into a subtitle record cannot have fields
        such as ``id`` or ``text`` overwritten by the model.

    Raises:
        Whatever ``openai.ChatCompletion.create`` raises (network, auth,
        rate-limit errors); these are not caught here.
    """
    emotion_labels = list(emotion_valence.keys())
    prompt = f"""
    You are an assistant analyzing movie dialogue.
    You MUST select zero or more emotions from the following list ONLY:
    {emotion_labels}
    Return only a valid JSON in this format:
    {{
    "emotions": [ ... ], 
    "situation": "short summary", 
    "situation_type": "category"
    }}
    Text: "{text}"
    """
    response = openai.ChatCompletion.create(
        model=model,
        messages=[{"role": "user", "content": prompt}],
        temperature=temperature,
    )
    # The content can be None (e.g. an empty completion); treat it as no reply.
    raw_reply = response.choices[0].message.content or ""

    # Greedy match from the first "{" to the last "}" so any chatter around the
    # JSON object is ignored.
    match = re.search(r"\{.*\}", raw_reply, re.DOTALL)
    if match is None:
        return _fallback_analysis()

    try:
        parsed = json.loads(match.group())
    except (ValueError, RecursionError) as e:
        # json.JSONDecodeError is a ValueError; malformed replies are logged
        # and replaced by the placeholder rather than aborting the whole run.
        print(f"[!] JSON 파싱 실패: {e} | 원본 응답: {raw_reply}")
        return _fallback_analysis()

    return _normalize_analysis(parsed)


In [7]:
SAMPLE_START = 4206  # example: 1 h 10 min 6 s
SAMPLE_END = 4530    # example: 1 h 15 min 30 s
REQUEST_INTERVAL_S = 2.1  # pause between requests to stay under the Groq API rate limit

# 1. Load the subtitles and keep the cues that start inside the sample window
#    (both bounds inclusive).
srt_path = "../data/raw/Ran.1985.eng.srt"
subs = parse_srt_file(srt_path)

subs = [sub for sub in subs if SAMPLE_START <= sub["start"] <= SAMPLE_END]

# 2. Emotion / situation tagging: one API call per cue, merged into the cue dict.
labeled = []
for sub in subs:
    res = analyze_subtitle(sub["text"], emotion_valence)
    sub.update(res)
    labeled.append(sub)
    time.sleep(REQUEST_INTERVAL_S)

subs_df = pd.DataFrame(labeled)

# 3. Persist the enriched cues. Create the output directory first so a fresh
#    checkout does not lose the API results to a missing-directory error.
os.makedirs("../data/processed", exist_ok=True)
subs_df.to_json("../data/processed/ran_llm_enriched.json", force_ascii=False, indent=2)
subs_df.to_csv("../data/processed/ran_llm_enriched.csv", index=False)
print("자막 감정 분석 저장 완료!")

[!] JSON 파싱 실패: Invalid control character at: line 3 column 59 (char 99) | 원본 응답: {
"emotions": ["concern", "authority"], 
"situation": "Someone is telling someone else to go back, 
"situation_type": "command"
}
자막 감정 분석 저장 완료!


In [8]:
def calc_valence(emotions):
    """Average the valence scores of a collection of emotion labels.

    Labels are looked up in the module-level ``emotion_valence`` table after
    stripping whitespace and lower-casing. Labels that are not in the table,
    and entries that are not strings, count as the neutral score 0.5.

    Args:
        emotions: An iterable of emotion labels. A single string is treated
            as one label. ``None``, NaN (as pandas produces for missing cells)
            and any other non-iterable value are treated as "no emotions".

    Returns:
        The mean valence as a float, or 0.5 when there are no emotions.
    """
    neutral = 0.5

    if isinstance(emotions, str):
        # Avoid iterating a lone label character by character.
        emotions = [emotions]
    try:
        labels = list(emotions)
    except TypeError:
        # None / NaN / other scalars: the cell held no label list.
        return neutral
    if not labels:
        return neutral

    scores = [
        emotion_valence.get(label.strip().lower(), neutral) if isinstance(label, str) else neutral
        for label in labels
    ]
    return sum(scores) / len(scores)

# Score every row: one averaged valence per subtitle's emotion list.
subs_df["valence"] = subs_df["emotions"].map(calc_valence)

In [9]:
# Scatter of per-subtitle valence over time; colour repeats the valence value
# and the tooltip exposes the tagged fields for each point.
chart = (
    alt.Chart(subs_df)
    .mark_circle(size=70, opacity=0.8)
    .encode(
        tooltip=['id','text','emotions','valence','situation','situation_type'],
        color=alt.Color('valence:Q', scale=alt.Scale(scheme='turbo')),
        y=alt.Y('valence', title='Emotion Valence', scale=alt.Scale(domain=[0,1])),
        x=alt.X('start', title='Time (s)'),
    )
    .properties(
        title="Subtitle Emotion Timeline (No Scene)",
        width=850,
        height=350,
    )
)

# Enable pan/zoom and render.
chart.interactive().show()