## 데이터 정제

1차 정제: 형변환, budget / revenue가 0인 행 삭제, genres explode.

In [37]:
import pandas as pd
import ast

# === 1) Load the CSV ===
df = pd.read_csv("./data_processed/tmdb_kr_theatrical_2005_2025.csv")

print("원본 shape:", df.shape)
display(df.head())
df.info()

# === 2) Drop rows whose budget or revenue equals 0 ===
# .copy() makes the filtered frame independent of the loaded one, so the
# column assignments below do not raise SettingWithCopyWarning.
df = df[(df["budget"] != 0) & (df["revenue"] != 0)].copy()

# === 3) Compute ROI = (revenue - budget) / budget ===
# Safe from division by zero because zero-budget rows were removed in step 2.
df["roi"] = (df["revenue"] - df["budget"]) / df["budget"]

# === 4) Convert release_date to datetime ===
# errors="coerce" turns unparseable values into NaT; they are dropped in step 6.
df["release_date"] = pd.to_datetime(df["release_date"], errors="coerce")

# === 5) Convert release_year / release_month to nullable integers ===
# "Int64" (capital I) keeps missing values as <NA> instead of forcing float.
df["release_year"]  = df["release_year"].astype("Int64")
df["release_month"] = df["release_month"].astype("Int64")

# === 6) Drop rows missing any required column ===
df = df.dropna(subset=["budget", "revenue", "release_date", "genres"])

# === 7) Parse the genres string into a list ===
def parse_genres(x):
    """Convert a raw ``genres`` cell into a list of genre names.

    Accepted inputs:
      * a ``list`` -- returned unchanged;
      * a missing value (``None`` / NaN) -- returns ``[]``;
      * a list literal such as ``"['드라마', '스릴러']"`` or
        ``"[{'name': '드라마'}]"`` -- parsed with ``ast.literal_eval``; dict
        items contribute their ``"name"`` value;
      * a bracketed but unquoted string such as ``"[드라마, 스릴러]"`` -- the
        brackets (and stray quotes) are stripped before splitting on commas;
      * a comma-separated string such as ``"드라마, 로맨스"``;
      * a single genre string.

    Returns:
        list: genre names with surrounding whitespace removed; empty and
        ``None`` entries are skipped.
    """
    if isinstance(x, list):
        return x
    if pd.isna(x):
        return []

    s = str(x).strip()
    bracketed = s.startswith("[") and s.endswith("]")

    if bracketed:
        # Only the literal_eval call is guarded, and only against the errors
        # it is documented to raise on malformed input.
        try:
            val = ast.literal_eval(s)
        except (ValueError, TypeError, SyntaxError, MemoryError, RecursionError):
            val = None

        if isinstance(val, (list, tuple)):
            out = []
            for item in val:
                # Items may be dicts of the form {"name": "..."}.
                if isinstance(item, dict):
                    item = item.get("name")
                if item is None:
                    continue
                name = str(item).strip()
                if name:
                    out.append(name)
            return out

        # Not a valid literal: drop the brackets so they do not end up inside
        # the first/last genre name when splitting below.
        s = s[1:-1].strip()

    # Comma-separated ("드라마, 로맨스") or a single genre string.
    tokens = s.split(",") if "," in s else [s]
    if bracketed:
        # Remove quote characters left over from a malformed list literal.
        tokens = [t.strip().strip("'\"") for t in tokens]
    return [t.strip() for t in tokens if t.strip()]

# Turn every raw genres cell into a Python list of genre names.
df["genres"] = df["genres"].apply(parse_genres)

# === 8) Keep only rows that have at least one genre ===
has_genre = df["genres"].apply(lambda g: isinstance(g, list) and len(g) > 0)
df = df[has_genre]

# === 9) Explode genres: one row per (movie, genre) pair ===
df_exploded = df.explode("genres", ignore_index=True)

# Guard against missing or blank genre values after the explode.
genre_col = df_exploded["genres"]
is_blank = genre_col.isna() | (genre_col.str.strip() == "")
df_exploded = df_exploded[~is_blank]

# === 10) Save both the per-movie and the per-genre tables ===
clean_path = "./data_processed/tmdb_kr_theatrical_clean.csv"
exploded_path = "./data_processed/tmdb_kr_theatrical_clean_exploded.csv"
df.to_csv(clean_path, index=False, encoding="utf-8-sig")
df_exploded.to_csv(exploded_path, index=False, encoding="utf-8-sig")

# === 11) Sanity check ===
print("explode 후 shape:", df_exploded.shape)
preview_cols = ["title","genres","release_year","revenue","roi"]
display(df_exploded[preview_cols].head(10))
df_exploded.info()

원본 shape: (4662, 16)


Unnamed: 0,movie_id,title,original_title,original_language,release_date,runtime,budget,revenue,vote_average,vote_count,popularity,genres,production_companies,production_countries,release_year,release_month
0,670,올드보이,올드보이,ko,2003-11-21,120,3000000,17500000,8.252,9150,11.6819,"['드라마', '스릴러', '미스터리', '액션']","['Show East', 'Egg Film', 'Cineclick Asia']",['KR'],2003.0,11.0
1,1255,괴물,괴물,ko,2006-07-27,119,11000000,88489643,6.98,2931,4.4383,"['공포', '드라마', 'SF']","['Chungeorahm Film', 'Showbox']",['KR'],2006.0,7.0
2,1963,활,활,ko,2005-05-12,88,950000,2032404,6.921,208,1.2098,['드라마'],"['Kim Ki Duk Film', 'Happinet Pictures', 'Cine...","['JP', 'KR']",2005.0,5.0
3,2015,밀양,밀양,ko,2007-05-23,142,2900000,11581469,7.193,215,1.6151,['드라마'],"['Cinema Service', 'Pinehouse Film', 'CJ Enter...",['KR'],2007.0,5.0
4,4550,친절한 금자씨,친절한 금자씨,ko,2005-07-29,112,0,23803308,7.5,1752,3.348,"['드라마', '스릴러']","['TSJ Entertainment', 'Ilshin Capital Investme...",['KR'],2005.0,7.0


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4662 entries, 0 to 4661
Data columns (total 16 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   movie_id              4662 non-null   int64  
 1   title                 4662 non-null   object 
 2   original_title        4662 non-null   object 
 3   original_language     4662 non-null   object 
 4   release_date          4660 non-null   object 
 5   runtime               4662 non-null   int64  
 6   budget                4662 non-null   int64  
 7   revenue               4662 non-null   int64  
 8   vote_average          4662 non-null   float64
 9   vote_count            4662 non-null   int64  
 10  popularity            4662 non-null   float64
 11  genres                4662 non-null   object 
 12  production_companies  4662 non-null   object 
 13  production_countries  4662 non-null   object 
 14  release_year          4660 non-null   float64
 15  release_month        

Unnamed: 0,title,genres,release_year,revenue,roi
0,올드보이,드라마,2003,17500000,4.833333
1,올드보이,스릴러,2003,17500000,4.833333
2,올드보이,미스터리,2003,17500000,4.833333
3,올드보이,액션,2003,17500000,4.833333
4,괴물,공포,2006,88489643,7.044513
5,괴물,드라마,2006,88489643,7.044513
6,괴물,SF,2006,88489643,7.044513
7,활,드라마,2005,2032404,1.139373
8,밀양,드라마,2007,11581469,2.99361
9,디 워,판타지,2007,75108998,1.347156


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 251 entries, 0 to 250
Data columns (total 17 columns):
 #   Column                Non-Null Count  Dtype         
---  ------                --------------  -----         
 0   movie_id              251 non-null    int64         
 1   title                 251 non-null    object        
 2   original_title        251 non-null    object        
 3   original_language     251 non-null    object        
 4   release_date          251 non-null    datetime64[ns]
 5   runtime               251 non-null    int64         
 6   budget                251 non-null    int64         
 7   revenue               251 non-null    int64         
 8   vote_average          251 non-null    float64       
 9   vote_count            251 non-null    int64         
 10  popularity            251 non-null    float64       
 11  genres                251 non-null    object        
 12  production_companies  251 non-null    object        
 13  production_countries

In [35]:
# Show the rows whose vote_count is below 10.
low_vote_mask = df['vote_count'].lt(10)
df[low_vote_mask]

Unnamed: 0,movie_id,title,original_title,original_language,release_date,runtime,budget,revenue,vote_average,vote_count,popularity,genres,production_companies,production_countries,release_year,release_month,roi
158,47649,빼꼼의 머그잔 여행,빼꼼의 머그잔 여행,ko,2007-03-22,76,5900000,618400,3.8,5,0.3635,"[애니메이션, 가족, 코미디, 모험]","['BRB Internacional', 'RG Animation Studios', ...","['KR', 'ES']",2007,3,-0.895186
730,230883,손님은 왕이다,손님은 왕이다,ko,2006-02-23,104,10000,80000,5.5,8,1.5087,[스릴러],[],[],2006,2,7.0
1290,387845,무수단,무수단,ko,2016-03-03,87,500,500,6.1,9,1.8997,"[미스터리, 스릴러, 액션]","['Opus Pictures', 'Golden Tide Pictures']",['KR'],2016,3,0.0
1636,441189,터닝메카드W: 블랙미러의 부활,터닝메카드W: 블랙미러의 부활,ko,2017-01-18,72,1000000,2775228,0.0,0,1.5401,"[애니메이션, TV 영화]",['(주)희원엔터테인먼트'],['KR'],2017,1,1.775228
2033,524435,신 전래동화,신 전래동화,ko,2018-04-25,83,100000,100,4.0,2,0.6899,[코미디],['Oh’YES'],['KR'],2018,4,-0.999
3145,790034,멋진 신세계,멋진 신세계,ko,2012-04-05,41,1600000,636990,0.0,0,1.6347,"[SF, 공포, 코미디]","['Zio Entertainment', 'TimeStory Group']",['KR'],2012,4,-0.601881
3147,790037,해피 버스데이,해피 버스데이,ko,2012-04-05,41,1600000,636990,0.0,0,1.1531,"[SF, 코미디]","['Zio Entertainment', 'TimeStory Group']",['KR'],2012,4,-0.601881
4260,1212102,하와이 연가,하와이 연가,ko,2024-10-30,62,800000,250000,5.0,1,0.2412,"[음악, 다큐멘터리, 역사, 애니메이션, 드라마]",['Now Production Films'],"['KR', 'US']",2024,10,-0.6875
