<a href="https://colab.research.google.com/github/HeyJae-zero/Final-Team9/blob/main/%EB%B3%84%EC%A0%90%EC%9D%B8%EA%B5%AC_SR_%EC%83%81%EA%B4%80%EA%B4%80%EA%B3%84_%EB%B6%84%EC%84%9D.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [8]:
import warnings
warnings.filterwarnings("ignore")

import io
import numpy as np
import pandas as pd

# statsmodels 보장 설치 (Colab에 기본 포함되어 있지만, 없으면 설치)
try:
    import statsmodels.api as sm
except Exception:
    import sys, subprocess
    subprocess.check_call([sys.executable, "-m", "pip", "install", "-q", "statsmodels"])
    import statsmodels.api as sm

# 0) File upload (Colab only)
from google.colab import files

print("[INFO] 업로드 창이 열립니다. 'TMDB_processed.csv'를 선택하세요.")
uploaded = files.upload()  # opens the file picker and waits for the user's selection
if not uploaded:
    raise RuntimeError("업로드된 파일이 없습니다. 다시 실행 후 CSV를 선택해 주세요.")

# Only the first uploaded file is analysed.
fname, csv_bytes = next(iter(uploaded.items()))

# 1) Load the CSV: try UTF-8 (with optional BOM) first, then fall back to CP949.
try:
    df = pd.read_csv(io.BytesIO(csv_bytes), encoding="utf-8-sig")
except UnicodeDecodeError:
    df = pd.read_csv(io.BytesIO(csv_bytes), encoding="cp949")

print(f"[INFO] Loaded: {fname} | shape={df.shape}")

# 2) Utility: automatic column detection
def pick_column(df: pd.DataFrame, candidates):
    """Return the label of the first column in ``df`` matching a candidate.

    Candidates are tried in the order given. Each candidate is first compared
    to the column labels exactly and then, if that fails, case-insensitively.

    Args:
        df: Frame whose columns are searched.
        candidates: Iterable of column names to look for, in priority order.

    Returns:
        The actual column label found in ``df``, or ``None`` when no
        candidate matches.
    """
    # Only string labels can be compared case-insensitively. Skipping the
    # others keeps frames with integer (or other non-string) labels from
    # raising AttributeError on ``.lower()``.
    lower_map = {c.lower(): c for c in df.columns if isinstance(c, str)}
    for cand in candidates:
        if cand in df.columns:
            return cand
        if isinstance(cand, str) and cand.lower() in lower_map:
            return lower_map[cand.lower()]
    return None

# Resolve the three columns the analysis needs, accepting common aliases.
budget_col = pick_column(
    df,
    ["budget", "budget_adj", "budget_usd", "budget_clean"],
)
revenue_col = pick_column(
    df,
    ["revenue", "revenue_adj", "revenue_usd", "gross", "worldwide_gross"],
)
votecnt_col = pick_column(
    df,
    ["vote_count", "voteCount", "n_ratings", "ratings_count", "tmdb_vote_count"],
)

# Abort early, reporting which of the required columns could not be found.
if any(found is None for found in (budget_col, revenue_col, votecnt_col)):
    raise ValueError(
        f"필수 컬럼을 찾지 못했습니다. "
        f"budget:{budget_col}, revenue:{revenue_col}, vote_count:{votecnt_col} "
        f"→ CSV의 실제 컬럼명을 확인하세요."
    )

# 3) Numeric coercion and preprocessing
# Values that cannot be parsed as numbers become NaN.
for col in (budget_col, revenue_col, votecnt_col):
    df[col] = pd.to_numeric(df[col], errors="coerce")

# Keep rows with strictly positive budget and revenue, then drop any row that
# still has a missing value in one of the three analysis columns.
has_positive_money = df[budget_col].gt(0) & df[revenue_col].gt(0)
df = (
    df.loc[has_positive_money]
    .copy()
    .dropna(subset=[budget_col, revenue_col, votecnt_col])
)

# Success ratio = revenue / budget
df["success_ratio"] = df[revenue_col].div(df[budget_col])

# 4) OLS on the untransformed data: success_ratio regressed on vote count
y_raw = df["success_ratio"]
X_raw = sm.add_constant(df[votecnt_col])  # adds the intercept column
model_raw = sm.OLS(endog=y_raw, exog=X_raw).fit()

# Header banner followed by the full regression table.
print("=" * 100, "OLS (Raw) : success_ratio ~ vote_count", "=" * 100, sep="\n")
print(model_raw.summary())

# 5) OLS on the log-log scale
# Logs need strictly positive inputs, so restrict to rows where both the
# success ratio and the vote count are > 0 before transforming.
dfl = df.loc[(df["success_ratio"] > 0) & (df[votecnt_col] > 0)].assign(
    log_success_ratio=lambda d: np.log(d["success_ratio"]),
    log_vote_count=lambda d: np.log(d[votecnt_col]),
)

y_log = dfl["log_success_ratio"]
X_log = sm.add_constant(dfl["log_vote_count"])  # adds the intercept column
model_log = sm.OLS(endog=y_log, exog=X_log).fit()

# Blank line, header banner, then the full regression table.
print(
    "",
    "=" * 100,
    "OLS (Log-Log) : log(success_ratio) ~ log(vote_count)",
    "=" * 100,
    sep="\n",
)
print(model_log.summary())

# Re-express both fitted models with HC3 robust covariance and print their
# summaries alongside the non-robust ones above.
model_raw_robust = model_raw.get_robustcov_results(cov_type="HC3")
model_log_robust = model_log.get_robustcov_results(cov_type="HC3")

for _title, _robust_fit in (
    ("OLS (Raw) with HC3 robust SE", model_raw_robust),
    ("OLS (Log-Log) with HC3 robust SE", model_log_robust),
):
    print("\n" + "=" * 100)
    print(_title)
    print("=" * 100)
    print(_robust_fit.summary())

# 6) Save a compact summary of both models (and offer it for download)
summary_out = "tmdb_processed_success_ratio_regression_summary.csv"

# One row per model; coefficients and p-values come from the non-robust fits.
# The two rows use different slope/p-value column names, so each row is blank
# in the columns that belong to the other model.
out = [
    {
        "model": "raw",
        "coef_const": model_raw.params.get("const", np.nan),
        "coef_vote_count": model_raw.params.get(votecnt_col, np.nan),
        "p_vote_count": model_raw.pvalues.get(votecnt_col, np.nan),
        "r2": model_raw.rsquared,
    },
    {
        "model": "loglog",
        "coef_const": model_log.params.get("const", np.nan),
        "coef_log_vote_count": model_log.params.get("log_vote_count", np.nan),
        "p_log_vote_count": model_log.pvalues.get("log_vote_count", np.nan),
        "r2": model_log.rsquared,
    },
]
pd.DataFrame(out).to_csv(summary_out, index=False)
print(f"\n[INFO] Saved summary → /content/{summary_out}")

# Trigger a browser download in Colab; this is best-effort, so on failure only
# a hint to use the Files panel is printed.
try:
    files.download(summary_out)
except Exception:
    print("[WARN] 자동 다운로드 실패 시, 왼쪽 Files 패널에서 직접 내려받으세요.")


[INFO] 업로드 창이 열립니다. 'TMDB_processed.csv'를 선택하세요.


Saving TMDB_processed.csv to TMDB_processed (2).csv
[INFO] Loaded: TMDB_processed (2).csv | shape=(9107, 25)
OLS (Raw) : success_ratio ~ vote_count
                            OLS Regression Results                            
Dep. Variable:          success_ratio   R-squared:                       0.000
Model:                            OLS   Adj. R-squared:                 -0.000
Method:                 Least Squares   F-statistic:                    0.5109
Date:                Mon, 22 Sep 2025   Prob (F-statistic):              0.475
Time:                        05:30:42   Log-Likelihood:            -1.0770e+05
No. Observations:                9107   AIC:                         2.154e+05
Df Residuals:                    9105   BIC:                         2.154e+05
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>