## 데이터 정제

1 차 정제. 형변환, buget / revenue 0 인 행들 삭제. genre explode

In [19]:
import pandas as pd

# 1) CSV 로드
df = pd.read_csv("./data_processed/tmdb_kr_theatrical_2005_2025.csv")

# 2) 데이터 기본 정보 확인
print("원본 shape:", df.shape)
display(df.head())
df.info()

# 3) budget/revenue == 0 → 행 제거
df = df[(df["budget"] != 0) & (df["revenue"] != 0)]

# 4) ROI 계산
df["roi"] = (df["revenue"] - df["budget"]) / df["budget"]

# 5) release_date 날짜형 변환
df["release_date"] = pd.to_datetime(df["release_date"], errors="coerce")

# 6) release_year / release_month → int 변환
df["release_year"]  = df["release_year"].astype("Int64")
df["release_month"] = df["release_month"].astype("Int64")

# 7) 필수 컬럼만 dropna
df = df.dropna(subset=["budget", "revenue", "release_date", "genres"])

# 8) 장르 explode
df_exploded = df.explode("genres").reset_index(drop=True)

# 9) 장르 없는 행 제거
df_exploded = df_exploded.dropna(subset=["genres"])

# 10) 저장
df.to_csv("./data_processed/tmdb_kr_theatrical_clean.csv", index=False, encoding="utf-8")
df_exploded.to_csv("./data_processed/tmdb_kr_theatrical_clean_exploded.csv", index=False, encoding="utf-8")

# 11) 미리보기
print("전처리 후 shape:", df_exploded.shape)
display(df_exploded.head(10))
df_exploded.info()

원본 shape: (4670, 16)


Unnamed: 0,movie_id,title,original_title,original_language,release_date,runtime,budget,revenue,vote_average,vote_count,popularity,genres,production_companies,production_countries,release_year,release_month
0,670,올드보이,올드보이,ko,2003-11-21,120,3000000,17500000,8.252,9150,11.6819,"['드라마', '스릴러', '미스터리', '액션']","['Show East', 'Egg Film', 'Cineclick Asia']",['KR'],2003.0,11.0
1,1255,괴물,괴물,ko,2006-07-27,119,11000000,88489643,6.98,2931,4.4383,"['공포', '드라마', 'SF']","['Chungeorahm Film', 'Showbox']",['KR'],2006.0,7.0
2,1963,활,활,ko,2005-05-12,88,950000,2032404,6.921,208,1.2098,['드라마'],"['Kim Ki Duk Film', 'Happinet Pictures', 'Cine...","['JP', 'KR']",2005.0,5.0
3,2015,밀양,밀양,ko,2007-05-23,142,2900000,11581469,7.193,215,1.6151,['드라마'],"['Cinema Service', 'Pinehouse Film', 'CJ Enter...",['KR'],2007.0,5.0
4,4550,친절한 금자씨,친절한 금자씨,ko,2005-07-29,112,0,23803308,7.5,1752,3.348,"['드라마', '스릴러']","['TSJ Entertainment', 'Ilshin Capital Investme...",['KR'],2005.0,7.0


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4670 entries, 0 to 4669
Data columns (total 16 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   movie_id              4670 non-null   int64  
 1   title                 4670 non-null   object 
 2   original_title        4670 non-null   object 
 3   original_language     4670 non-null   object 
 4   release_date          4668 non-null   object 
 5   runtime               4670 non-null   int64  
 6   budget                4670 non-null   int64  
 7   revenue               4670 non-null   int64  
 8   vote_average          4670 non-null   float64
 9   vote_count            4670 non-null   int64  
 10  popularity            4670 non-null   float64
 11  genres                4670 non-null   object 
 12  production_companies  4670 non-null   object 
 13  production_countries  4670 non-null   object 
 14  release_year          4668 non-null   float64
 15  release_month        

Unnamed: 0,movie_id,title,original_title,original_language,release_date,runtime,budget,revenue,vote_average,vote_count,popularity,genres,production_companies,production_countries,release_year,release_month,roi
0,670,올드보이,올드보이,ko,2003-11-21,120,3000000,17500000,8.252,9150,11.6819,"['드라마', '스릴러', '미스터리', '액션']","['Show East', 'Egg Film', 'Cineclick Asia']",['KR'],2003,11,4.833333
1,1255,괴물,괴물,ko,2006-07-27,119,11000000,88489643,6.98,2931,4.4383,"['공포', '드라마', 'SF']","['Chungeorahm Film', 'Showbox']",['KR'],2006,7,7.044513
2,1963,활,활,ko,2005-05-12,88,950000,2032404,6.921,208,1.2098,['드라마'],"['Kim Ki Duk Film', 'Happinet Pictures', 'Cine...","['JP', 'KR']",2005,5,1.139373
3,2015,밀양,밀양,ko,2007-05-23,142,2900000,11581469,7.193,215,1.6151,['드라마'],"['Cinema Service', 'Pinehouse Film', 'CJ Enter...",['KR'],2007,5,2.99361
4,10253,디 워,디 워,ko,2007-08-01,92,32000000,75108998,4.525,428,2.35,"['판타지', '드라마', '공포', '액션', '스릴러', 'SF']","['Younggu-Art Movies', 'Showbox']",['KR'],2007,8,1.347156
5,14968,웰컴 투 동막골,웰컴 투 동막골,ko,2005-08-04,133,8000000,33579813,7.445,190,1.2664,"['코미디', '드라마', '전쟁']","['Film It Suda', 'Showbox']",['KR'],2005,8,3.197477
6,17903,쌍화점,쌍화점,ko,2008-12-30,132,10000000,18980744,6.899,148,4.6242,['드라마'],"['Opus Pictures', 'Showbox', 'United Pictures'...",['KR'],2008,12,0.898074
7,18514,두번째 사랑,두번째 사랑,ko,2007-06-20,90,3500000,689473,6.6,44,1.646,"['드라마', '로맨스']","['NOWFILM', 'Vox3 Films']","['KR', 'US']",2007,6,-0.803008
8,19627,무림여대생,무림여대생,ko,2008-06-26,118,7000000,162256,6.519,27,1.6496,"['코미디', '로맨스']","['Bear Entertainment', 'Prime Entertainment']",['KR'],2008,6,-0.976821
9,22536,박쥐,박쥐,ko,2009-04-30,133,5000000,13085023,7.097,979,3.2794,"['드라마', '공포', '스릴러']",['Moho Film'],['KR'],2009,4,1.617005


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 94 entries, 0 to 93
Data columns (total 17 columns):
 #   Column                Non-Null Count  Dtype         
---  ------                --------------  -----         
 0   movie_id              94 non-null     int64         
 1   title                 94 non-null     object        
 2   original_title        94 non-null     object        
 3   original_language     94 non-null     object        
 4   release_date          94 non-null     datetime64[ns]
 5   runtime               94 non-null     int64         
 6   budget                94 non-null     int64         
 7   revenue               94 non-null     int64         
 8   vote_average          94 non-null     float64       
 9   vote_count            94 non-null     int64         
 10  popularity            94 non-null     float64       
 11  genres                94 non-null     object        
 12  production_companies  94 non-null     object        
 13  production_countries  