<a href="https://colab.research.google.com/github/piaoruilin/20251R0136COSE47101/blob/main/data_preprocessing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install python-dotenv

Collecting python-dotenv
  Downloading python_dotenv-1.1.0-py3-none-any.whl.metadata (24 kB)
Downloading python_dotenv-1.1.0-py3-none-any.whl (20 kB)
Installing collected packages: python-dotenv
Successfully installed python-dotenv-1.1.0


# 1. 데이터에서 원하는 열만 추출

In [2]:
import pandas as pd

file_path = '/content/2025-05-20_global_alltime.xlsx' # tudum top-10 엑셀 파일 불러오기

columns_to_extract = [
    'week',
    'category',
    'show_title',
    'weekly_hours_viewed',
    'runtime',
    'weekly_views',
    'cumulative_weeks_in_top_10'
]

try:
    df = pd.read_excel(file_path, usecols=columns_to_extract)

    print(f"파일 '{file_path}'에서 다음 열들을 성공적으로 추출했습니다:")
    print(columns_to_extract)

    print("\n추출된 DataFrame (df) 정보:")
    df.info()

    print("\nDataFrame (df)의 처음 5행:")
    print(df.head())

except FileNotFoundError:
    print(f"오류: 파일 '{file_path}'을(를) 찾을 수 없습니다.")
    print("파일 경로가 올바른지, 파일 이름에 오타가 없는지 확인해주세요.")
    print("Google Drive를 사용하는 경우, Drive가 올바르게 마운트되었는지 확인해주세요.")
    print("Colab 왼쪽에 있는 '파일' 탭에서 파일 위치를 직접 확인할 수 있습니다.")
except ValueError as e:

    print(f"\n오류 발생: {e}")
    print("엑셀 파일에 요청한 열들이 모두 존재하는지, 열 이름이 정확한지 확인해주세요.")
    print(f"요청한 열: {columns_to_extract}")

except Exception as e:
    print(f"\n엑셀 파일을 읽는 중 예상치 못한 오류가 발생했습니다: {e}")

파일 '/content/2025-05-20_global_alltime.xlsx'에서 다음 열들을 성공적으로 추출했습니다:
['week', 'category', 'show_title', 'weekly_hours_viewed', 'runtime', 'weekly_views', 'cumulative_weeks_in_top_10']

추출된 DataFrame (df) 정보:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8080 entries, 0 to 8079
Data columns (total 7 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   week                        8080 non-null   object 
 1   category                    8080 non-null   object 
 2   show_title                  8080 non-null   object 
 3   weekly_hours_viewed         8080 non-null   int64  
 4   runtime                     4000 non-null   float64
 5   weekly_views                4000 non-null   float64
 6   cumulative_weeks_in_top_10  8080 non-null   int64  
dtypes: float64(2), int64(2), object(3)
memory usage: 442.0+ KB

DataFrame (df)의 처음 5행:
         week         category                  show_title  \
0  2025-05-11  Films (En

# 2. API를 통해 영화/프로그램 이름과 TMDB ID를 매칭

In [None]:
import requests
from tqdm import tqdm
import os
from dotenv import load_dotenv

# .env 파일에서 api키 받아오기 (tmdb api 키 필요)
load_dotenv()
API_KEY = os.getenv('API_KEY')

print(f"API Key loaded: {API_KEY[:5]}..." if API_KEY else "No API key found")

df['id'] = pd.NA

headers = {
    "accept": "application/json",
    "Authorization": f"Bearer {API_KEY}"
}

for index, row in tqdm(df.iterrows(), total=df.shape[0], desc="Fetching TMDB IDs"):
    title_from_df = str(row.show_title)
    category = row.category
    url = None
    api_title_key = None

    encoded_title = requests.utils.quote(title_from_df)

    if category == "Films (English)" or category == "Films (Non-English)":
        url = f"https://api.themoviedb.org/3/search/movie?query={encoded_title}&include_adult=true&language=en-US&page=1"
        api_title_key = 'title'
    elif category == "TV (English)" or category == "TV (Non-English)":
        url = f"https://api.themoviedb.org/3/search/tv?query={encoded_title}&include_adult=true&language=en-US&page=1"
        api_title_key = 'name'
    else:
        tqdm.write(f"카테고리 오류. DF Title: '{title_from_df}', Category: {row.category}")
        continue

    try:
        response = requests.get(url, headers=headers, timeout=10)
        response.raise_for_status()

        response_data = response.json()
        match_found_in_results = False

        if response_data.get('results'):
            for result_item in response_data['results']:
                if api_title_key in result_item and 'id' in result_item:
                    api_title = str(result_item[api_title_key])
                    api_id = result_item['id']

                    if title_from_df.lower() == api_title.lower():
                        df.loc[index, 'id'] = api_id
                        match_found_in_results = True
                        break
                else:
                    tqdm.write(f"  Malformed result for '{title_from_df}': Lacks '{api_title_key}' or 'id'. Item: {result_item}")


            if not match_found_in_results and response_data['results']:
                first_api_title_candidate = str(response_data['results'][0].get(api_title_key, "N/A"))
                tqdm.write(f"  Title mismatch for '{title_from_df}'. Searched API, but no exact match. First API candidate: '{first_api_title_candidate}' (ID not added).")
            elif not response_data['results']:
                 tqdm.write(f"  No items in 'results' list from API for '{title_from_df}'")


        else:
            tqdm.write(f"  No 'results' key or empty results in API response for '{title_from_df}'")

        # time.sleep(0.1) # 100ms delay

    except requests.exceptions.HTTPError as e:
        tqdm.write(f"  HTTP error for '{title_from_df}': {e.response.status_code} {e.response.reason}. URL: {url}")
    except requests.exceptions.RequestException as e:
        tqdm.write(f"  API request failed for '{title_from_df}': {e}. URL: {url}")
    except ValueError as e:
        tqdm.write(f"  Failed to decode JSON for '{title_from_df}': {e}. URL: {url}")
    except Exception as e:
        tqdm.write(f"  An unexpected error occurred for '{title_from_df}': {e}. URL: {url}")


print("\nFinal DataFrame with 'id' column:")
print(df)
print("\nID column data type:")
print(df['id'].dtype)

print("\nValue counts for 'id' column (including NAs):")
print(df['id'].value_counts(dropna=False))

In [None]:
# id가 매칭되지 않은 작품 확인
print("\nValue counts for 'id' column (including NAs):")
print(df['id'].value_counts(dropna=False))
df[df['id'].isna()].sort_values(by='cumulative_weeks_in_top_10', ascending=False)


Value counts for 'id' column (including NAs):
id
<NA>       354
79696       51
66732       50
91239       49
93405       39
          ... 
100074       1
41160        1
1469239      1
1144430      1
24249        1
Name: count, Length: 2403, dtype: Int64


Unnamed: 0,week,category,show_title,weekly_hours_viewed,runtime,weekly_views,cumulative_weeks_in_top_10,id
6575,2022-03-20,Films (Non-English),Chernobyl 1986,4490000,,,5,
5959,2022-07-10,TV (Non-English),"Malverde, el santo patrón",7580000,,,5,
2796,2024-01-14,TV (Non-English),Single’s Inferno,24500000,13.95,1800000,5,
6614,2022-03-13,Films (Non-English),Chernobyl 1986,5330000,,,4,
5996,2022-07-03,TV (Non-English),"Malverde, el santo patrón",9630000,,,4,
...,...,...,...,...,...,...,...,...
780,2024-12-29,TV (English),Christmas Gameday: Ravens vs. Texans,42800000,0.00,14400000,1,
781,2024-12-29,TV (English),Christmas Gameday: Chiefs vs. Steelers,41200000,0.00,14300000,1,
8058,2021-07-04,Films (Non-English),Rurouni Kenshin: Origins,2150000,,,1,
98,2025-04-27,Films (Non-English),Sin instrucciones,2100000,1.65,1300000,1,


In [None]:
# 매칭되지 않은 id 수동 설정
df.loc[df['show_title'] == 'Fear Street Part 1: 1994', 'id'] = 591273

In [None]:
# csv 파일 저장
df.to_csv('top-10_with_id.csv', index=False)

In [None]:
# csv 파일 불러오기
csv_path = "top-10_with_id.csv"

df = pd.read_csv(csv_path)
df['weekly_views'] = df['weekly_views'].astype('Int64')
df['id'] = df['id'].astype('Int64')

df = df.dropna(subset=['id'])

df

Unnamed: 0,week,category,show_title,weekly_hours_viewed,runtime,weekly_views,cumulative_weeks_in_top_10,id
0,2025-05-11,Films (English),Nonnas,29000000,1.9000,15300000,1,1151039
1,2025-05-11,Films (English),Inside Man: Most Wanted,21800000,1.7667,12300000,1,619278
2,2025-05-11,Films (English),A Deadly American Marriage,16900000,1.7167,9800000,1,1466938
3,2025-05-11,Films (English),Havoc,16300000,1.7833,9100000,3,668489
4,2025-05-11,Films (English),Twilight,8700000,2.0333,4300000,4,8966
...,...,...,...,...,...,...,...,...
8075,2021-07-04,TV (Non-English),Elite,10530000,,,1,76669
8076,2021-07-04,TV (Non-English),Elite,10200000,,,1,76669
8077,2021-07-04,TV (Non-English),Elite,10140000,,,1,76669
8078,2021-07-04,TV (Non-English),Katla,9190000,,,1,104157


# 3. 작품 id를 이용해 tmdb에서 장르와 키워드 불러오기

## 3.1. 장르 불러오기

In [None]:
import pandas as pd
import requests
from tqdm import tqdm
import time
from dotenv import load_dotenv

# .env 파일에서 api키 받아오기
load_dotenv()
API_KEY = os.getenv('API_KEY')

print(f"API Key loaded: {API_KEY[:5]}..." if API_KEY else "No API key found")


if 'df' not in locals() or not isinstance(df, pd.DataFrame):
    print("!!! 'df' DataFrame이 정의되지 않았습니다. 스크립트 상단에서 로드해주세요. !!!")
    exit()


if 'genres' not in df.columns:
    print("Initializing 'genres' column.")
    df['genres'] = pd.Series([None for _ in range(len(df))], index=df.index, dtype=object)
    for i in df.index:
        df.at[i, 'genres'] = []
else:
    print("Re-initializing existing 'genres' column.")

    for i in df.index:
        df.at[i, 'genres'] = []


print("\n--- DEBUGGING INFORMATION (AFTER 'genres' INIT with .at) ---")
if 'genres' in df.columns:
    print(f"df['genres'].dtype: {df['genres'].dtype}")
    print(f"First 3 values of df['genres']:\n{df['genres'].head(3).tolist()}")
    try:
        if len(df) > 0:
            temp_list = ["Test Assignment"]
            df.at[df.index[0], 'genres'] = temp_list
            print(f"Test assignment to df.at[df.index[0], 'genres'] successful. Value: {df.at[df.index[0], 'genres']}")
            df.at[df.index[0], 'genres'] = []
            print(f"Test reset to df.at[df.index[0], 'genres'] with [] successful. Value: {df.at[df.index[0], 'genres']}")
        else:
            print("DataFrame is empty, skipping test assignment.")
    except Exception as e:
        print(f"!!! ERROR during test assignment to df.at['genres']: {e}")
else:
    print("'genres' column not found after initialization attempt.")
print("-----------------------------------------------------")


headers = {
    "accept": "application/json",
    "Authorization": f"Bearer {API_KEY}"
}

print("\nStarting API calls...\n")

for index, row in tqdm(df.iterrows(), total=df.shape[0], desc="Fetching TMDB Genres"):
    tmdb_id_from_df = None
    try:
        tmdb_id_from_df = int(row['id'])
    except (ValueError, TypeError, KeyError) as e:
        tqdm.write(f"Skipping row {index} (Original ID: {row.get('id', 'N/A')}): Invalid or missing TMDB ID. Error: {e}")
        try:
            df.at[index, 'genres'] = ["Error: Invalid TMDB ID in source"]
        except Exception as assign_e:
            print(f"!!!!! CRITICAL ERROR during assignment in (ValueError/TypeError/KeyError) except block for index {index}: {assign_e}")
        continue

    category = row.get('category')
    url = None

    if category == "Films (English)" or category == "Films (Non-English)":
        url = f"https://api.themoviedb.org/3/movie/{tmdb_id_from_df}?language=en-US"
    elif category == "TV (English)" or category == "TV (Non-English)":
        url = f"https://api.themoviedb.org/3/tv/{tmdb_id_from_df}?language=en-US"
    else:
        tqdm.write(f"카테고리 오류. TMDB ID: {tmdb_id_from_df}, Category: {category}")
        try:
            df.at[index, 'genres'] = [f"Error: Invalid Category '{category}'"]
        except Exception as assign_e:
            print(f"!!!!! CRITICAL ERROR during assignment in category 'else' block for index {index}: {assign_e}")
        continue

    if url:
        response_text_snippet = "Response not captured yet"
        try:
            response = requests.get(url, headers=headers)
            response_text_snippet = response.text[:200] if response and hasattr(response, 'text') else "Response has no text or is None"
            response.raise_for_status()
            data = response.json()

            genres_data_list = data.get('genres', [])

            if genres_data_list:
                genre_names = [genre_info['name'] for genre_info in genres_data_list if 'name' in genre_info]
                df.at[index, 'genres'] = genre_names
            else:
                df.at[index, 'genres'] = []

            # time.sleep(0.05)

        except requests.exceptions.HTTPError as e:
            error_msg_list = []
            if e.response.status_code == 404:
                tqdm.write(f"API 요청 실패 (404 Not Found). TMDB ID: {tmdb_id_from_df}. URL: {url}")
                error_msg_list = ["Error: Item Not Found (404)"]
            else:
                tqdm.write(f"API 요청 실패. TMDB ID: {tmdb_id_from_df}. Status: {e.response.status_code}. URL: {url}. Error: {e}")
                error_msg_list = [f"Error: API HTTP {e.response.status_code}"]
            try:
                df.at[index, 'genres'] = error_msg_list
            except Exception as assign_e:
                print(f"!!!!! CRITICAL ERROR during assignment in HTTPError except block for index {index}: {assign_e}")

        except requests.exceptions.RequestException as e:
            tqdm.write(f"API 연결 오류. TMDB ID: {tmdb_id_from_df}. URL: {url}. Error: {e}")
            try:
                df.at[index, 'genres'] = ["Error: API Connection Failed"]
            except Exception as assign_e:
                print(f"!!!!! CRITICAL ERROR during assignment in RequestException except block for index {index}: {assign_e}")

        except ValueError as e:
            tqdm.write(f"JSON 파싱 실패. TMDB ID: {tmdb_id_from_df}. URL: {url}. Error: {e}. Response text: {response_text_snippet}")
            try:
                df.at[index, 'genres'] = ["Error: JSON Parse Failed"]
            except Exception as assign_e:
                print(f"!!!!! CRITICAL ERROR during assignment in ValueError except block for index {index}: {assign_e}")
                print(f"Attempted to assign: {['Error: JSON Parse Failed']}")


        except Exception as e:
            tqdm.write(f"처리 중 알 수 없는 오류 발생. TMDB ID: {tmdb_id_from_df}. URL: {url}. Error: {e}")
            try:
                df.at[index, 'genres'] = ["Error: Unknown Processing Error"]
            except Exception as assign_e:
                print(f"!!!!! CRITICAL ERROR during assignment in Generic Exception except block for index {index}: {assign_e}")


print("\n--- FINAL DataFrame ---")
print(df[['id', 'category', 'genres']].head(10))
print("\n--- FINAL df.info() ---")
df.info()

API Key loaded: eyJhb...
Initializing 'genres' column.

--- DEBUGGING INFORMATION (AFTER 'genres' INIT with .at) ---
df['genres'].dtype: object
First 3 values of df['genres']:
[[], [], []]
Test assignment to df.at[df.index[0], 'genres'] successful. Value: ['Test Assignment']
Test reset to df.at[df.index[0], 'genres'] with [] successful. Value: []
-----------------------------------------------------

Starting API calls...



Fetching TMDB Genres: 100%|██████████| 7726/7726 [07:54<00:00, 16.28it/s]


--- FINAL DataFrame ---
        id         category                     genres
0  1151039  Films (English)                   [Comedy]
1   619278  Films (English)  [Action, Crime, Thriller]
2  1466938  Films (English)       [Documentary, Crime]
3   668489  Films (English)  [Action, Crime, Thriller]
4     8966  Films (English)  [Fantasy, Drama, Romance]
5   682507  Films (English)  [Drama, Mystery, Romance]
6   110538  Films (English)          [Comedy, Romance]
7    61222  Films (English)                    [Drama]
8  1462776  Films (English)              [Documentary]
9   917496  Films (English)  [Horror, Comedy, Fantasy]

--- FINAL df.info() ---
<class 'pandas.core.frame.DataFrame'>
Index: 7726 entries, 0 to 8079
Data columns (total 9 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   week                        7726 non-null   object 
 1   category                    7726 non-null   object 
 2   show_title 




# 3.2. 키워드 불러오기

In [None]:
import pandas as pd
import requests
from tqdm import tqdm
import time
from dotenv import load_dotenv

# .env 파일에서 api키 받아오기
load_dotenv()
API_KEY = os.getenv('API_KEY')

print(f"API Key loaded: {API_KEY[:5]}..." if API_KEY else "No API key found")


if 'df' not in locals() or not isinstance(df, pd.DataFrame):
    print("!!! 'df' DataFrame이 정의되지 않았습니다. 스크립트 상단에서 로드해주세요. !!!")
    exit()

if 'keywords' not in df.columns:
    print("Initializing 'keywords' column.")
    df['keywords'] = pd.Series([None for _ in range(len(df))], index=df.index, dtype=object)
    for i in df.index:
        df.at[i, 'keywords'] = []
else:
    print("Re-initializing existing 'keywords' column.")
    for i in df.index:
        df.at[i, 'keywords'] = []


print("\n--- DEBUGGING INFORMATION (AFTER 'keywords' INIT with .at) ---")
if 'keywords' in df.columns:
    print(f"df['keywords'].dtype: {df['keywords'].dtype}")
    print(f"First 3 values of df['keywords']:\n{df['keywords'].head(3).tolist()}")
    try:
        if len(df) > 0:
            temp_list = ["Test Assignment"]
            df.at[df.index[0], 'keywords'] = temp_list
            print(f"Test assignment to df.at[df.index[0], 'keywords'] successful. Value: {df.at[df.index[0], 'keywords']}")
            df.at[df.index[0], 'keywords'] = []
            print(f"Test reset to df.at[df.index[0], 'keywords'] with [] successful. Value: {df.at[df.index[0], 'keywords']}")
        else:
            print("DataFrame is empty, skipping test assignment.")
    except Exception as e:
        print(f"!!! ERROR during test assignment to df.at['keywords']: {e}")
else:
    print("'keywords' column not found after initialization attempt.")
print("-----------------------------------------------------")


headers = {
    "accept": "application/json",
    "Authorization": f"Bearer {API_KEY}"
}

print("\nStarting API calls...\n")

for index, row in tqdm(df.iterrows(), total=df.shape[0], desc="Fetching TMDB Keywords"):
    tmdb_id_from_df = None
    try:
        tmdb_id_from_df = int(row['id'])
    except (ValueError, TypeError, KeyError) as e:
        tqdm.write(f"Skipping row {index} (Original ID: {row.get('id', 'N/A')}): Invalid or missing TMDB ID. Error: {e}")
        try:
            df.at[index, 'keywords'] = ["Error: Invalid TMDB ID in source"]
        except Exception as assign_e:
            print(f"!!!!! CRITICAL ERROR during assignment in (ValueError/TypeError/KeyError) except block for index {index}: {assign_e}")
        continue

    category = row.get('category')
    url = None

    if category == "Films (English)" or category == "Films (Non-English)":
        url = f"https://api.themoviedb.org/3/movie/{tmdb_id_from_df}/keywords"
    elif category == "TV (English)" or category == "TV (Non-English)":
        url = f"https://api.themoviedb.org/3/tv/{tmdb_id_from_df}/keywords"
    else:
        tqdm.write(f"카테고리 오류. TMDB ID: {tmdb_id_from_df}, Category: {category}")
        try:
            df.at[index, 'keywords'] = [f"Error: Invalid Category '{category}'"]
        except Exception as assign_e:
            print(f"!!!!! CRITICAL ERROR during assignment in category 'else' block for index {index}: {assign_e}")
        continue

    if url:
        response_text_snippet = "Response not captured yet"
        try:
            response = requests.get(url, headers=headers)
            response_text_snippet = response.text[:200] if response and hasattr(response, 'text') else "Response has no text or is None"
            response.raise_for_status()
            data = response.json()

            keywords_data_list = []
            if category == "Films (English)" or category == "Films (Non-English)":
                keywords_data_list = data.get('keywords', [])
            elif category == "TV (English)" or category == "TV (Non-English)":
                keywords_data_list = data.get('results', [])

            if keywords_data_list:
                keyword_names = [kw_info['name'] for kw_info in keywords_data_list if 'name' in kw_info]
                df.at[index, 'keywords'] = keyword_names
            else:
                df.at[index, 'keywords'] = []

            # time.sleep(0.05)

        except requests.exceptions.HTTPError as e:
            error_msg_list = []
            if e.response.status_code == 404:
                tqdm.write(f"API 요청 실패 (404 Not Found). TMDB ID: {tmdb_id_from_df}. URL: {url}")
                error_msg_list = ["Error: Item Not Found (404)"]
            else:
                tqdm.write(f"API 요청 실패. TMDB ID: {tmdb_id_from_df}. Status: {e.response.status_code}. URL: {url}. Error: {e}")
                error_msg_list = [f"Error: API HTTP {e.response.status_code}"]
            try:
                df.at[index, 'keywords'] = error_msg_list
            except Exception as assign_e:
                print(f"!!!!! CRITICAL ERROR during assignment in HTTPError except block for index {index}: {assign_e}")

        except requests.exceptions.RequestException as e:
            tqdm.write(f"API 연결 오류. TMDB ID: {tmdb_id_from_df}. URL: {url}. Error: {e}")
            try:
                df.at[index, 'keywords'] = ["Error: API Connection Failed"]
            except Exception as assign_e:
                print(f"!!!!! CRITICAL ERROR during assignment in RequestException except block for index {index}: {assign_e}")

        except ValueError as e:
            tqdm.write(f"JSON 파싱 실패. TMDB ID: {tmdb_id_from_df}. URL: {url}. Error: {e}. Response text: {response_text_snippet}")
            try:
                df.at[index, 'keywords'] = ["Error: JSON Parse Failed"]
            except Exception as assign_e:
                print(f"!!!!! CRITICAL ERROR during assignment in ValueError except block for index {index}: {assign_e}")
                print(f"Attempted to assign: {['Error: JSON Parse Failed']}")


        except Exception as e:
            tqdm.write(f"처리 중 알 수 없는 오류 발생. TMDB ID: {tmdb_id_from_df}. URL: {url}. Error: {e}")
            try:
                df.at[index, 'keywords'] = ["Error: Unknown Processing Error"]
            except Exception as assign_e:
                print(f"!!!!! CRITICAL ERROR during assignment in Generic Exception except block for index {index}: {assign_e}")


print("\n--- FINAL DataFrame ---")
print(df[['id', 'category', 'keywords']].head(10))
print("\n--- FINAL df.info() ---")
df.info()

API Key loaded: eyJhb...
Initializing 'keywords' column.

--- DEBUGGING INFORMATION (AFTER 'keywords' INIT with .at) ---
df['keywords'].dtype: object
First 3 values of df['keywords']:
[[], [], []]
Test assignment to df.at[df.index[0], 'keywords'] successful. Value: ['Test Assignment']
Test reset to df.at[df.index[0], 'keywords'] with [] successful. Value: []
-----------------------------------------------------

Starting API calls...



Fetching TMDB Keywords: 100%|██████████| 7726/7726 [07:43<00:00, 16.67it/s]


--- FINAL DataFrame ---
        id         category                                           keywords
0  1151039  Films (English)  [restaurant, revitalization, based on true sto...
1   619278  Films (English)  [neo-nazism, hostage-taking, bank robbery, hos...
2  1466938  Films (English)                                       [true crime]
3   668489  Films (English)  [winter, detective, rescue mission, shootout, ...
4     8966  Films (English)  [high school, soulmates, based on novel or boo...
5   682507  Films (English)  [based on novel or book, artist, bullying, alc...
6   110538  Films (English)                                                 []
7    61222  Films (English)  [highway, sunscreen, sound proofing, ventilati...
8  1462776  Films (English)  [britain, bombing, united kingdom, london blit...
9   917496  Films (English)  [afterlife, haunted house, sequel, paranormal,...

--- FINAL df.info() ---
<class 'pandas.core.frame.DataFrame'>
Index: 7726 entries, 0 to 8079
Data columns




In [None]:
# NA 값은 0으로 처리 (0으로 처리하지 않으면 불러올 때 float로 불러옴)
df = df.fillna(0)
df

Unnamed: 0,week,category,show_title,weekly_hours_viewed,runtime,weekly_views,cumulative_weeks_in_top_10,id,genres,keywords
0,2025-05-11,Films (English),Nonnas,29000000,1.9000,15300000,1,1151039,[Comedy],"[restaurant, revitalization, based on true sto..."
1,2025-05-11,Films (English),Inside Man: Most Wanted,21800000,1.7667,12300000,1,619278,"[Action, Crime, Thriller]","[neo-nazism, hostage-taking, bank robbery, hos..."
2,2025-05-11,Films (English),A Deadly American Marriage,16900000,1.7167,9800000,1,1466938,"[Documentary, Crime]",[true crime]
3,2025-05-11,Films (English),Havoc,16300000,1.7833,9100000,3,668489,"[Action, Crime, Thriller]","[winter, detective, rescue mission, shootout, ..."
4,2025-05-11,Films (English),Twilight,8700000,2.0333,4300000,4,8966,"[Fantasy, Drama, Romance]","[high school, soulmates, based on novel or boo..."
...,...,...,...,...,...,...,...,...,...,...
8075,2021-07-04,TV (Non-English),Elite,10530000,0.0000,0,1,76669,"[Crime, Mystery, Drama]","[love triangle, jealousy, infidelity, seductio..."
8076,2021-07-04,TV (Non-English),Elite,10200000,0.0000,0,1,76669,"[Crime, Mystery, Drama]","[love triangle, jealousy, infidelity, seductio..."
8077,2021-07-04,TV (Non-English),Elite,10140000,0.0000,0,1,76669,"[Crime, Mystery, Drama]","[love triangle, jealousy, infidelity, seductio..."
8078,2021-07-04,TV (Non-English),Katla,9190000,0.0000,0,1,104157,"[Drama, Mystery, Sci-Fi & Fantasy]","[small town, volcano, outbreak, volcanic erupt..."


In [None]:
# csv 파일 저장하기
df.to_csv('top-10_with_genres_keywords.csv', index=False)

In [None]:
# 저장한 csv 파일 확인
df = pd.read_csv('top-10_with_genres_keywords.csv')
df

Unnamed: 0,week,category,show_title,weekly_hours_viewed,runtime,weekly_views,cumulative_weeks_in_top_10,id,genres,keywords
0,2025-05-11,Films (English),Nonnas,29000000,1.9000,15300000,1,1151039,['Comedy'],"['restaurant', 'revitalization', 'based on tru..."
1,2025-05-11,Films (English),Inside Man: Most Wanted,21800000,1.7667,12300000,1,619278,"['Action', 'Crime', 'Thriller']","['neo-nazism', 'hostage-taking', 'bank robbery..."
2,2025-05-11,Films (English),A Deadly American Marriage,16900000,1.7167,9800000,1,1466938,"['Documentary', 'Crime']",['true crime']
3,2025-05-11,Films (English),Havoc,16300000,1.7833,9100000,3,668489,"['Action', 'Crime', 'Thriller']","['winter', 'detective', 'rescue mission', 'sho..."
4,2025-05-11,Films (English),Twilight,8700000,2.0333,4300000,4,8966,"['Fantasy', 'Drama', 'Romance']","['high school', 'soulmates', 'based on novel o..."
...,...,...,...,...,...,...,...,...,...,...
7721,2021-07-04,TV (Non-English),Elite,10530000,0.0000,0,1,76669,"['Crime', 'Mystery', 'Drama']","['love triangle', 'jealousy', 'infidelity', 's..."
7722,2021-07-04,TV (Non-English),Elite,10200000,0.0000,0,1,76669,"['Crime', 'Mystery', 'Drama']","['love triangle', 'jealousy', 'infidelity', 's..."
7723,2021-07-04,TV (Non-English),Elite,10140000,0.0000,0,1,76669,"['Crime', 'Mystery', 'Drama']","['love triangle', 'jealousy', 'infidelity', 's..."
7724,2021-07-04,TV (Non-English),Katla,9190000,0.0000,0,1,104157,"['Drama', 'Mystery', 'Sci-Fi & Fantasy']","['small town', 'volcano', 'outbreak', 'volcani..."
