<a href="https://colab.research.google.com/github/minjung21/Final-Team9/blob/main/%ED%8C%8C%EC%9D%B4%EB%84%90_%ED%94%84%EB%A1%9C%EC%A0%9D%ED%8A%B81.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [31]:
import pandas as pd

In [32]:
# Load the user-uploaded file (assumed to contain a mix of different films).
df = pd.read_csv('/content/TMDB_processed.csv')

In [33]:
import pandas as pd

# Names of the major studios; a film whose production-company text contains
# one of these is not treated as an indie.
MAJOR_STUDIOS = [
    # Warner Bros. spellings
    "Warner Bros.",
    "Warner Bros",
    "Warner Bros. Pictures",
    # Universal / Disney
    "Universal Pictures",
    "Walt Disney Pictures",
    "Walt Disney Studios",
    # Paramount / Sony / Columbia
    "Paramount Pictures",
    "Sony Pictures",
    "Columbia Pictures",
    # Fox / New Line
    "20th Century Fox",
    "20th Century Studios",
    "New Line Cinema",
    # Other labels
    "Marvel Studios",
    "Lucasfilm",
    "Pixar",
    "DreamWorks",
]

# Guard against NaN: replace missing values with an empty string.
df['production_companies'] = df['production_companies'].fillna('')

# Numeric scale classifier
def classify_film_numeric(row, major_studios=None):
    """Return a numeric production-scale code for a single film row.

    Codes:
        0 -- indie: budget under 20,000,000 and no major studio named in
             the production-company text.
        2 -- blockbuster: budget of at least 100,000,000.
        1 -- studio film: everything else (including rows whose budget is
             NaN, because both comparisons are then False).

    Args:
        row: Mapping-like row (for example a DataFrame row passed by
            ``df.apply(..., axis=1)``) providing ``'production_companies'``
            and ``'budget'``.
        major_studios: Optional iterable of studio names to match against.
            Defaults to the module-level ``MAJOR_STUDIOS``.

    Returns:
        int: 0, 1 or 2 as described above.
    """
    # The original body referenced an undefined lowercase ``major_studios``
    # name, raising NameError for every low-budget row.
    studios = MAJOR_STUDIOS if major_studios is None else major_studios

    prod_companies = str(row['production_companies'])
    budget = row['budget']

    # Indie film: small budget and no major studio involved.
    # Matching is a plain, case-sensitive substring test on the raw text.
    if budget < 20_000_000 and not any(studio in prod_companies for studio in studios):
        return 0

    # Blockbuster
    if budget >= 100_000_000:
        return 2

    # Studio film (the remainder)
    return 1

# Add the new column: axis=1 hands each row to classify_film_numeric.
df['film_code'] = df.apply(classify_film_numeric, axis=1)

# Sanity check: preview the inputs next to the assigned code.
print(df[['budget','production_companies','film_code']].head())


In [34]:
# Assumes df already carries the film_code column.
output_path = "TMDB_with_film_code1.csv"  # output file name

# Save as CSV without writing the index (index=False).
df.to_csv(output_path, index=False, encoding='utf-8-sig')

print(f"저장 완료: {output_path}")


In [42]:
import pandas as pd
import numpy as np
import ast
from typing import Iterable, Union

# Studio names treated as "major"; is_major() compares them against a
# film's production companies.
MAJOR_STUDIOS = [
    "Warner Bros.",
    "Warner Bros",
    "Warner Bros. Pictures",
    "Universal Pictures",
    "Walt Disney Pictures",
    "Walt Disney Studios",
    "Paramount Pictures",
    "Sony Pictures",
    "Columbia Pictures",
    "20th Century Fox",
    "20th Century Studios",
    "New Line Cinema",
    "Marvel Studios",
    "Lucasfilm",
    "Pixar",
    "DreamWorks",
]

# Budget cut-offs: strictly below the first is an indie candidate,
# at or above the second is a blockbuster.
BUDGET_INDI_THRESHOLD = 20_000_000
BUDGET_BLOCKB_THRESHOLD = 100_000_000

# --- From here on, every rule reads the value already present in r["roi"] ---
# Maps a scale label to an ordered list of (tier label, predicate) pairs.
# success_for_scale() walks a list top-down and returns the first tier whose
# predicate is truthy, so stricter tiers must stay above looser ones; the
# final `lambda r: True` entry makes "Flop" the catch-all for each scale.
# Comparisons against NaN are False, so rows with missing revenue/roi fall
# through to the later tiers.
# NOTE(review): thresholds such as roi >= 1.0 suggest roi is a
# revenue-to-budget style ratio -- confirm against how the column is built.
SUCCESS_RULES = {
    "Blockbuster": [
        ("Global Blockbuster", lambda r: r["revenue"] >= 1_000_000_000 and r["roi"] >= 1.2),
        ("Mega Hit",          lambda r: r["revenue"] >=   500_000_000 and r["roi"] >= 1.1),
        ("Hit",               lambda r: r["revenue"] >=   200_000_000 and r["roi"] >= 1.0),
        # "Moderate" for blockbusters depends on revenue only, not roi.
        ("Moderate",          lambda r: r["revenue"] >=    50_000_000),
        ("Flop",              lambda r: True),
    ],
    "Studio": [
        # Either very high revenue alone, or high revenue combined with a strong roi.
        ("Mega Hit", lambda r: r["revenue"] >= 500_000_000 or (r["revenue"] >= 200_000_000 and r["roi"] >= 1.2)),
        ("Hit",      lambda r: r["revenue"] >= 100_000_000 and r["roi"] >= 1.0),
        ("Moderate", lambda r: r["revenue"] >=  20_000_000 and r["roi"] >= 0.9),
        ("Flop",     lambda r: True),
    ],
    "Indie": [
        # Indie rules test roi first, then a minimum revenue floor.
        ("Super Hit", lambda r: r["roi"] >= 3.0 and r["revenue"] >= 5_000_000),
        ("Hit",       lambda r: r["roi"] >= 2.0 and r["revenue"] >= 2_000_000),
        ("Moderate",  lambda r: r["roi"] >= 1.2 and r["revenue"] >= 1_000_000),
        ("Flop",      lambda r: True),
    ],
}

def to_company_names(x):
    """Normalize a production-companies cell into a list of name strings.

    Accepted inputs:
        * ``None`` / float NaN / empty string -> ``[]``
        * a list, tuple or set whose items are ``{"name": ...}`` dicts or
          plain values -> one stripped string per item
        * a single ``{"name": ...}`` dict -> one-element list
        * a string holding a Python literal of any of the above
          (``"[{'name': 'Pixar'}]"``) -> parsed, then handled as above
        * any other string -> split on commas, empty pieces dropped

    Args:
        x: Raw cell value.

    Returns:
        list[str]: Company names (possibly empty).
    """
    if x is None or (isinstance(x, float) and np.isnan(x)):
        return []

    # A lone dict is a single company record. Without this branch str(dict)
    # re-entered the literal-parsing path and recursed until RecursionError.
    if isinstance(x, dict):
        x = [x] if x else []

    if isinstance(x, (list, tuple, set, frozenset)):
        names = []
        for item in x:
            # Resolve each item on its own so mixed lists do not fail.
            if isinstance(item, dict) and "name" in item:
                names.append(str(item["name"]).strip())
            else:
                names.append(str(item).strip())
        return names

    s = str(x).strip()
    if not s:
        return []

    looks_like_literal = (s.startswith("[") and s.endswith("]")) or (
        s.startswith("{") and s.endswith("}")
    )
    if looks_like_literal:
        try:
            parsed = ast.literal_eval(s)
        except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
            # Not a valid literal: fall back to comma splitting below.
            parsed = None
        # Only recurse on containers, which the branches above consume
        # without converting back to a string.
        if isinstance(parsed, (list, tuple, set, frozenset, dict)):
            return to_company_names(parsed)

    return [t.strip() for t in s.split(",") if t.strip()]

def is_major(companies):
    """Return True when any company name contains a major-studio name.

    Both sides are lower-cased, stripped of "." characters and trimmed
    before a substring test, so "Warner Bros. Pictures" matches the
    "Warner Bros" entry of MAJOR_STUDIOS.

    Args:
        companies: Iterable of company-name strings.

    Returns:
        bool: True on the first match, False if nothing matches.
    """
    # Normalize every company once up front.
    normalized = [name.lower().replace(".", "").strip() for name in companies]
    studio_keys = (studio.lower().replace(".", "").strip() for studio in MAJOR_STUDIOS)
    return any(key in name for key in studio_keys for name in normalized)

def classify_scale(budget: float, companies: list) -> int:
    """Map a film's budget and production companies to a scale code.

    Args:
        budget: Production budget; may be NaN/None when unknown.
        companies: List of company names, passed to is_major().

    Returns:
        2 for a blockbuster (budget at or above BUDGET_BLOCKB_THRESHOLD),
        0 for an indie (budget below BUDGET_INDI_THRESHOLD and no major
        studio involved), otherwise 1 (studio), which is also the result
        for a missing budget.
    """
    # Unknown budget: neither threshold test can apply.
    if not pd.notna(budget):
        return 1

    if budget >= BUDGET_BLOCKB_THRESHOLD:
        return 2  # Blockbuster

    small_budget = budget < BUDGET_INDI_THRESHOLD
    if small_budget and not is_major(companies):
        return 0  # Indie

    return 1  # Studio

def scale_label(code: int) -> str:
    """Translate a scale code (0/1/2) into its label; anything else is "Unknown"."""
    labels = {0: "Indie", 1: "Studio", 2: "Blockbuster"}
    try:
        return labels[code]
    except KeyError:
        return "Unknown"

def success_for_scale(scale: str, row: pd.Series) -> str:
    """Pick the success tier for one film according to SUCCESS_RULES.

    Args:
        scale: Scale label used as the key into SUCCESS_RULES.
        row: Row-like object handed to each rule predicate.

    Returns:
        The label of the first rule whose predicate is truthy. "Unknown"
        is returned when the scale has no rules or no rule matches.
    """
    fallback = [("Unknown", lambda r: True)]
    for tier, predicate in SUCCESS_RULES.get(scale, fallback):
        try:
            # Truthiness is evaluated inside the try on purpose: a
            # predicate that fails (or yields an ambiguous value) is
            # skipped rather than aborting the classification.
            matched = bool(predicate(row))
        except Exception:
            continue
        if matched:
            return tier
    return "Unknown"

def classify_movies_using_existing_roi(
    df: pd.DataFrame,
    prod_col: str = "production_companies",
    budget_col: str = "budget",
    revenue_col: str = "revenue",
    roi_col: str = "roi",
) -> pd.DataFrame:
    """Label each film with a production scale and a success tier.

    Works on a copy of ``df`` and uses the ROI values already present in
    ``roi_col`` (no ROI is computed here).

    Args:
        df: Input frame.
        prod_col: Column holding the production companies.
        budget_col: Column holding the budget.
        revenue_col: Column holding the revenue.
        roi_col: Column holding the precomputed ROI.

    Returns:
        A copy of ``df`` with the budget/revenue/ROI columns coerced to
        numeric (unparseable values become NaN; non-finite or negative ROI
        becomes NaN) plus four new columns: ``scale_code``,
        ``scale_label``, ``success_tier`` and ``final_label``
        ("<scale>–<tier>").
    """
    out = df.copy()

    # Basic clean-up.
    out[prod_col] = out[prod_col].fillna("")
    for col in (budget_col, revenue_col, roi_col):
        out[col] = pd.to_numeric(out[col], errors="coerce")

    # Guard against abnormal ROI values: negative / infinite -> NaN.
    out.loc[~np.isfinite(out[roi_col]) | (out[roi_col] < 0), roi_col] = np.nan

    # Scale classification.
    out["_companies_list"] = out[prod_col].apply(to_company_names)
    out["scale_code"] = out.apply(
        lambda r: classify_scale(r[budget_col], r["_companies_list"]), axis=1
    )
    out["scale_label"] = out["scale_code"].map(scale_label)

    # Success tier. The rule predicates read the fixed keys "revenue" and
    # "roi", so alias the caller-selected columns under those names;
    # otherwise custom revenue_col/roi_col values were silently ignored
    # and no real rule could match. With the defaults this is a no-op.
    rule_view = out.assign(revenue=out[revenue_col], roi=out[roi_col])
    out["success_tier"] = rule_view.apply(
        lambda r: success_for_scale(r["scale_label"], r), axis=1
    )

    # Final combined label.
    out["final_label"] = out["scale_label"] + "–" + out["success_tier"]

    return out.drop(columns=["_companies_list"])

In [45]:
# Assumes df already carries the film_code column.
output_path = "TMDB.csv"  # output file name

# Save as CSV without writing the index (index=False).
df.to_csv(output_path, index=False, encoding='utf-8-sig')

print(f"저장 완료: {output_path}")

저장 완료: TMDB.csv


In [46]:
import pandas as pd
import numpy as np
import ast

# -----------------------------
# Settings
# -----------------------------
# Studio names treated as "major" when checking production companies.
MAJOR_STUDIOS = [
    "Warner Bros.",
    "Warner Bros",
    "Warner Bros. Pictures",
    "Universal Pictures",
    "Walt Disney Pictures",
    "Walt Disney Studios",
    "Paramount Pictures",
    "Sony Pictures",
    "Columbia Pictures",
    "20th Century Fox",
    "20th Century Studios",
    "New Line Cinema",
    "Marvel Studios",
    "Lucasfilm",
    "Pixar",
    "DreamWorks",
]

# Budget cut-offs: strictly below the first is an indie candidate,
# at or above the second is a blockbuster.
BUDGET_INDI_THRESHOLD = 20_000_000
BUDGET_BLOCKB_THRESHOLD = 100_000_000

# Exception films: the title (key) is forced to the given scale_code (value),
# bypassing the budget/studio rules. Add more entries as needed.
EXCEPTION_MOVIES = {
    # Forced to Studio (code 1).
    "Parasite": 1,
}

# -----------------------------
# Helper functions
# -----------------------------
def to_company_names(x):
    """Turn a production_companies cell into a list of company names.

    Handles ``None``/NaN/empty values (-> ``[]``), lists, tuples or sets of
    ``{"name": ...}`` dicts or plain values, a single ``{"name": ...}``
    dict, strings containing a Python literal of any of those, and plain
    comma-separated strings.

    Args:
        x: Raw cell value.

    Returns:
        list[str]: Stripped company names (possibly empty).
    """
    if x is None or (isinstance(x, float) and np.isnan(x)):
        return []

    # Treat a lone dict as one company record. Previously it was converted
    # back to a string and re-parsed endlessly until RecursionError.
    if isinstance(x, dict):
        x = [x] if x else []

    if isinstance(x, (list, tuple, set, frozenset)):
        # Each item is resolved independently so mixed lists work too.
        return [
            str(item["name"]).strip()
            if isinstance(item, dict) and "name" in item
            else str(item).strip()
            for item in x
        ]

    s = str(x).strip()
    if not s:
        return []

    bracketed = s.startswith("[") and s.endswith("]")
    braced = s.startswith("{") and s.endswith("}")
    if bracketed or braced:
        try:
            parsed = ast.literal_eval(s)
        except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
            # Not a valid literal: use the comma-split fallback below.
            parsed = None
        # Recurse only on containers, which never go back through str().
        if isinstance(parsed, (list, tuple, set, frozenset, dict)):
            return to_company_names(parsed)

    return [t.strip() for t in s.split(",") if t.strip()]

def is_major(companies):
    """Check whether any company name contains a MAJOR_STUDIOS entry.

    Names on both sides are lower-cased, have "." removed and are trimmed
    before a substring comparison.

    Args:
        companies: Iterable of company-name strings.

    Returns:
        bool: True if at least one company matches a major studio.
    """
    cleaned_companies = [c.lower().replace(".", "").strip() for c in companies]
    for studio in MAJOR_STUDIOS:
        studio_key = studio.lower().replace(".", "").strip()
        if any(studio_key in company for company in cleaned_companies):
            return True
    return False

def classify_scale(title, budget, companies):
    """Return the scale code for a film from its title, budget and companies.

    Titles listed in EXCEPTION_MOVIES get their forced code first.
    Otherwise: 2 (blockbuster) when the budget reaches
    BUDGET_BLOCKB_THRESHOLD, 0 (indie) when it is below
    BUDGET_INDI_THRESHOLD and no major studio is involved, and
    1 (studio) in every other case, including a missing budget.
    """
    # Manual overrides take precedence over every rule below.
    if title in EXCEPTION_MOVIES:
        return EXCEPTION_MOVIES[title]

    # Without a budget neither threshold applies: default to Studio.
    if not pd.notna(budget):
        return 1

    if budget >= BUDGET_BLOCKB_THRESHOLD:
        return 2

    is_small = budget < BUDGET_INDI_THRESHOLD
    if is_small and not is_major(companies):
        return 0

    return 1

def scale_label(code):
    """Return the label for a scale code (0/1/2), or "Unknown" otherwise."""
    names_by_code = {0: "Indie", 1: "Studio", 2: "Blockbuster"}
    if code in names_by_code:
        return names_by_code[code]
    return "Unknown"

# -----------------------------
# Application function
# -----------------------------
def classify_movies(df, prod_col="production_companies", budget_col="budget", title_col="title"):
    """Add ``scale_code`` and ``scale_label`` columns to a copy of ``df``.

    Args:
        df: Input DataFrame; it is not modified.
        prod_col: Column holding the production companies.
        budget_col: Column holding the budget (coerced to numeric;
            unparseable values become NaN).
        title_col: Column holding the film title, used for the
            EXCEPTION_MOVIES override. If the column is absent an empty
            title is used, so no override applies.

    Returns:
        pandas.DataFrame: The copy with the two new columns.
    """
    out = df.copy()

    # Column clean-up.
    out[prod_col] = out[prod_col].fillna("")
    out[budget_col] = pd.to_numeric(out[budget_col], errors="coerce")

    # Temporary column with the parsed company names.
    out["_companies_list"] = out[prod_col].apply(to_company_names)

    # Scale classification. The title column was previously hard-coded
    # while the other columns were configurable.
    out["scale_code"] = out.apply(
        lambda r: classify_scale(r.get(title_col, ""), r[budget_col], r["_companies_list"]),
        axis=1,
    )
    out["scale_label"] = out["scale_code"].map(scale_label)

    return out.drop(columns=["_companies_list"])


In [47]:

df_classified = classify_movies(df)

# Sanity check: preview the classification result.
print(df_classified[["title", "budget", "scale_label"]].head())

# Save as CSV. encoding="utf-8-sig" matches the other exports in this
# notebook, so non-ASCII titles are written with the same BOM-prefixed UTF-8.
df_classified.to_csv("TMDB_classified_v2.csv", index=False, encoding="utf-8-sig")


             title     budget  scale_label
0        Inception  160000000  Blockbuster
1     Interstellar  165000000  Blockbuster
2  The Dark Knight  185000000  Blockbuster
3           Avatar  237000000  Blockbuster
4     The Avengers  220000000  Blockbuster
