# [영화진흥위원회 API](https://www.kobis.or.kr/kobisopenapi/homepg/apiservice/searchServiceInfo.do) / [다음 영화 API](https://movie.daum.net) / [TMDB 영화 API](https://developer.themoviedb.org/docs)

In [3]:
import requests
from urllib import parse
from bs4 import BeautifulSoup
from tqdm import tqdm
import time
from datetime import datetime, timedelta

def null_coalesce(value, default=''):
    """Return ``value`` unless it is ``None``; in that case return ``default``."""
    if value is None:
        return default
    return value

def dict_coalesce(dictionary, key, default=''):
    """Look up ``key`` in ``dictionary``; fall back to ``default`` on ``KeyError``."""
    try:
        found = dictionary[key]
    except KeyError:
        found = default
    return found

def dict_combiner(old_dict, new_dict):
    """Return a new dict holding ``old_dict``'s items overlaid with ``new_dict``'s.

    Neither argument is modified; on duplicate keys the value from
    ``new_dict`` wins.
    """
    merged = dict(old_dict)
    merged.update(new_dict)
    return merged

In [None]:
# API credentials used by the cells below (left blank in this file).

KOBIS_KEY = ""  # passed as the `key` query parameter of the KOBIS requests

TMDB_API_KEY = ""
TMDB_API_TOKEN = ""  # sent as the Bearer token by get_tmdb_movie_api

NAVER_CLIENT_ID = ""
NAVER_CLIENT_SECRET = ""

In [563]:
## KOBIS 영화 API - 영화목록 조회

def get_kobis_movie_api(KEY):
    """Download the complete KOBIS movie list.

    A first request reads ``totCnt`` (the total number of movies); the list is
    then fetched page by page, 100 items per page.

    Args:
        KEY: KOBIS API key, sent as the ``key`` query parameter.

    Returns:
        A list with the ``movieList`` entries of every page, in page order.

    Raises:
        KeyError: if a response does not contain ``movieListResult`` (for
            example when the API answers with an error payload instead).
    """
    base_url = "http://kobis.or.kr/kobisopenapi/webservice/rest/movie/searchMovieList.json"
    response = requests.get(f"{base_url}?key={KEY}")
    totCnt = response.json()['movieListResult']['totCnt']
    itemPerPage = 100
    # Ceiling division gives the exact page count, so no trailing empty page
    # is requested when totCnt is a multiple of itemPerPage.
    page_count = (totCnt + itemPerPage - 1) // itemPerPage
    movie_list = []
    for curPage in tqdm(range(1, page_count + 1), desc="영화목록"):
        response = requests.get(f"{base_url}?key={KEY}&curPage={curPage}&itemPerPage={itemPerPage}")
        movie_list += response.json()['movieListResult']['movieList']
    return movie_list

# Fetch the KOBIS list once; the movie titles are reused as search queries
# by the Daum lookup further down.
movie_list = get_kobis_movie_api(KOBIS_KEY)
query_list = [m['movieNm'] for m in movie_list]

영화목록: 100%|██████████████████████████████████████████████████████████████████████| 975/975 [15:39<00:00,  1.04it/s]


In [None]:
## 다음 영화검색

# Base URL of the Daum movie API; read as a module-level global by
# get_daum_movie_search and get_daum_movie_info.
url = "https://movie.daum.net/api"

def get_daum_movie_api(url, referer):
    """GET a Daum movie endpoint with browser-like headers.

    Args:
        url: Full URL to request.
        referer: Path appended to ``https://movie.daum.net/`` to form the
            ``referer`` header.

    Returns:
        The decoded JSON body for a 2xx response, otherwise ``None``.
    """
    browser_headers = {
        "accept": "application/json, text/plain",
        "accept-language": "ko,en;q=0.9,en-US;q=0.8",
        "referer": f"https://movie.daum.net/{referer}",
        "sec-ch-ua": "Chromium",
        "sec-ch-ua-mobile": "?0",
        "sec-fetch-dest": "empty",
        "sec-fetch-mode": "cors",
        "sec-fetch-site": "same-origin",
        "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.131 Safari/537.36 Edg/92.0.902.67",
    }
    response = requests.get(url, headers=browser_headers)
    response.close()
    if not 200 <= response.status_code < 300:
        return None
    return response.json()
    
def get_daum_movie_search(query):
    """Collect every Daum movie search hit for ``query``.

    Pages of 20 results are requested until the API reports ``is_end``.

    Args:
        query: Search text; it is URL-quoted before being sent.

    Returns:
        A list of the ``documents`` entries of all pages fetched. If a request
        fails (non-2xx), the hits gathered so far are returned.
    """
    result = []
    page = 1
    while True:
        response = get_daum_movie_api(f'{url}/search?t=movie&q={parse.quote(query)}&page={page}&size=20', "")
        if response is None:
            # get_daum_movie_api returns None on a non-2xx status; subscripting
            # it would raise TypeError, so stop with what we have.
            break
        body = response['result']['search_result']
        result += body['documents']
        # An empty page also ends the loop so a response that never sets
        # `is_end` cannot spin forever.
        if body['meta']['is_end'] or not body['documents']:
            break
        page += 1
    return result

def get_daum_movie_info(movieId):
    """Fetch the Daum detail record of one movie.

    The ``main`` and ``crew`` responses are merged into one dict, then the
    paged photo and video lists are added under ``photoList`` / ``videoList``.

    Args:
        movieId: Daum movie identifier.

    Returns:
        A dict with the merged detail fields plus ``photoList`` and
        ``videoList`` (each a list, possibly empty). Sections whose request
        fails are skipped instead of raising.
    """
    referer = f'api/movie/{movieId}/main'
    result = {}
    for section in ('main', 'crew'):
        body = get_daum_movie_api(f'{url}/movie/{movieId}/{section}', referer)
        if body:  # None on a non-2xx response; nothing to merge then
            result.update(body)

    paged_endpoints = {
        'photoList': f'movie/{movieId}/photoList',
        'videoList': f'video/list/movie/{movieId}',
    }
    for key, endpoint in paged_endpoints.items():
        contents = []
        page = 1
        while True:
            body = get_daum_movie_api(f'{url}/{endpoint}?page={page}&size=20', referer)
            if body is None:
                break  # request failed; keep the pages collected so far
            contents += body['contents']
            # Stop on the last page, or on an empty page as a safety net.
            if body['page']['last'] or not body['contents']:
                break
            page += 1
        result[key] = contents
    return result

# Filled with Daum movie IDs first, then each ID is overwritten in place with
# the detail dict fetched for it.
daum_movie_list = []

for query in tqdm(query_list, desc="다음 영화목록"):
    daum_movie_list += [m['document']['movieId'] for m in get_daum_movie_search(query)]  # if m['document']['movieId'] not in daum_movie_list
daum_movie_list = list(set(daum_movie_list))  # drop IDs returned by more than one query
for i, movieId in enumerate(tqdm(daum_movie_list, desc="다음 영화상세정보")):
    # if i < 10: continue
    daum_movie_list[i] = get_daum_movie_info(movieId)

print('\n완료😊')

In [None]:
# Display one fetched Daum record for inspection.
daum_movie_list[100]

In [574]:
# Distinct genre names found in the first 1000 Daum records.
set(sum([m['movieCommon']['genres'] for m in daum_movie_list[:1000]], [])) # genre list

{'SF',
 '가족',
 '공연',
 '공포',
 '다큐멘터리',
 '드라마',
 '로맨스/멜로',
 '무협',
 '뮤지컬',
 '미스터리',
 '범죄',
 '서부',
 '성인',
 '스릴러',
 '시대극',
 '실험',
 '애니메이션',
 '액션',
 '어드벤처',
 '전쟁',
 '코미디',
 '판타지'}

In [None]:
## TMDB 영화 API - https://developer.themoviedb.org/reference/search-movie, https://developer.themoviedb.org/docs/image-basics

def get_first_to_last_day_of_month(year=0, month=0, day=1):
    """Return the first and last day of a month as ``YYYY-MM-DD`` strings.

    Args:
        year, month, day: Date inside the wanted month. When ``year`` is
            falsy the current date is used and ``month``/``day`` are ignored.

    Returns:
        A ``(first_day, last_day)`` tuple of formatted date strings.
    """
    if year:
        anchor = datetime(year, month, day)
    else:
        anchor = datetime.now()
    month_start = anchor.replace(day=1)
    # The 1st plus 32 days always lands in the following month; snapping that
    # to its own 1st and stepping back one day yields this month's last day.
    next_month_start = (month_start + timedelta(days=32)).replace(day=1)
    month_end = next_month_start - timedelta(days=1)
    date_format = "%Y-%m-%d"
    return month_start.strftime(date_format), month_end.strftime(date_format)

def get_tmdb_movie_img(path):
    """Build the full-size TMDB image URL for an image ``path``."""
    image_base = "https://image.tmdb.org/t/p/original"
    return f"{image_base}{path}"

def get_tmdb_movie_api(API_TOKEN, url):
    """GET a TMDB v3 endpoint using Bearer-token authentication.

    Args:
        API_TOKEN: Token placed in the ``Authorization: Bearer`` header.
        url: Path and query string appended to ``https://api.themoviedb.org/3/``.

    Returns:
        The decoded JSON body for a 2xx response, otherwise ``None``.
    """
    endpoint = f"https://api.themoviedb.org/3/{url}"
    request_headers = {
        "accept": "application/json",
        "Authorization": f"Bearer {API_TOKEN}",
        "User-Agent": "ReadMe-API-Explorer",
    }
    response = requests.get(endpoint, headers=request_headers)
    response.close()
    succeeded = 200 <= response.status_code < 300
    if succeeded:
        return response.json()
    return None
    
def get_tmdb_movie_list(API_TOKEN, url, query=""):  # https://developer.themoviedb.org/reference/discover-movie
    """Fetch every page of a paged TMDB listing.

    Args:
        API_TOKEN: TMDB Bearer token.
        url: Endpoint path with query string; ``&page=N`` is appended.
        query: Label shown in the progress line only.

    Returns:
        The concatenated ``results`` of all pages. If a response reports a
        ``total_pages`` that is falsy or above 500, that ``total_pages`` value
        is returned instead of a list, so callers must check the type. If a
        request fails or lacks ``results``, the pages gathered so far are
        returned.
    """
    collected = []
    page = 1
    while True:
        body = get_tmdb_movie_api(API_TOKEN, f'{url}&page={page}')
        if not (body and "results" in body):
            print(body, end="\r")
            break
        total_pages = body['total_pages']
        if not total_pages or total_pages >= 501:
            return total_pages
        collected += body['results']
        # Fixed-width label keeps the carriage-return progress line tidy.
        label = query[:20] if len(query) >= 18 else query.ljust(20)
        print(f"🎬TMDB 영화목록 Progress: {page}/{total_pages} {label}", end="\r")
        if page == total_pages:
            break
        page += 1
    return collected

def get_tmdb_movie_info(API_TOKEN, movieID):  # https://developer.themoviedb.org/reference/movie-details
    """Fetch TMDB details for one movie, with credits, images and videos appended.

    Returns whatever ``get_tmdb_movie_api`` returns: the JSON dict on success,
    ``None`` otherwise.
    """
    return get_tmdb_movie_api(
        API_TOKEN,
        f"movie/{movieID}?append_to_response=credits%2Cimages%2Cvideos&language=ko-KP",
    )

def get_tmdb_movie_body(movieID):
    """Download and parse the public TMDB web page of a movie.

    Args:
        movieID: TMDB movie identifier.

    Returns:
        A ``BeautifulSoup`` document for a 2xx response, otherwise ``None``.
    """
    url = f"https://www.themoviedb.org/movie/{movieID}"
    # No explicit "accept-encoding": requests advertises only the encodings it
    # can actually decode. Forcing "br" here would yield undecodable bytes in
    # `response.text` when no Brotli library is installed.
    # The previously commented-out cookie (it contained a session token) has
    # been removed; credentials do not belong in source.
    headers = {
        "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
        "accept-language": "ko,en;q=0.9,en-US;q=0.8",
        "upgrade-insecure-requests": "1",
        "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.131 Safari/537.36 Edg/92.0.902.67",
    }
    response = requests.get(url, headers=headers)
    if not 200 <= response.status_code < 300:
        return None
    return BeautifulSoup(response.text, 'html.parser')


# Filled with TMDB discover results first, reduced to unique IDs, then each ID
# is overwritten in place with its detail record.
tmdb_movie_list = []

# for i, query in enumerate(query_list):  # GET MOVIES LIST FIRST VERSION(query)
#     search_url = f'search/movie?language=ko-KP&query={query}'  # GET MOVIES LIST API_TOKEN VERSION
#     tmdb_movie_list += [m['id'] for m in get_tmdb_movie_list(TMDB_API_TOKEN, search_url, f"- {i}/{len(query_list)} {query}")]

today = datetime.now()
# Deliberately not named `url`: the Daum helpers read a module-level `url` as
# their API base, and rebinding it here would break them after this cell runs.
discover_url = "discover/movie?include_adult=true&include_video=true&language=ko-KP&sort_by=popularity.desc"
for year in range(1875, today.year + 1):  # GET MOVIES LIST SECOND VERSION(year)
    data = get_tmdb_movie_list(TMDB_API_TOKEN, f"{discover_url}&year={year}", f"{year}년")
    if not data:
        continue
    if isinstance(data, list):
        tmdb_movie_list += data
        continue
    # A non-list result means get_tmdb_movie_list gave up because the year has
    # more than 500 pages; query it month by month instead.
    for month in range(1, 13):
        gte, lte = get_first_to_last_day_of_month(year, month)
        data = get_tmdb_movie_list(TMDB_API_TOKEN, f"{discover_url}&release_date.gte={gte}&release_date.lte={lte}", f"{year}년 {month}월")
        if isinstance(data, list):
            tmdb_movie_list += data
        if year == today.year and month == today.month:
            break  # do not query months after the current one
tmdb_movie_list = list(set([m['id'] for m in tmdb_movie_list]))

for i, movieID in enumerate(tqdm(tmdb_movie_list, desc="TMDB 영화상세정보")):
#     tmdb_movie_list[i] = get_tmdb_movie_body(movieID)  # scrape the HTML page instead
    tmdb_movie_list[i] = get_tmdb_movie_info(TMDB_API_TOKEN, movieID)  # fetch from the movie-details API

# tmdb_movie_list = [get_tmdb_movie_info(TMDB_API_TOKEN, movieID) for movieID in tmdb_movie_list]

print('\n완료😊')

In [None]:
import pandas as pd
import numpy as np

# Load the KOBIS records into a DataFrame and summarise column coverage.
df = pd.DataFrame(movie_list)
df.replace({None: np.nan}, inplace=True)  # represent missing values as NaN

df.info()
df.describe(include='all')
# df

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 96272 entries, 0 to 96271
Data columns (total 18 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   movieCd      96272 non-null  object
 1   movieNm      96272 non-null  object
 2   movieNmEn    76451 non-null  object
 3   prdtYear     92506 non-null  object
 4   openDt       29582 non-null  object
 5   typeNm       90155 non-null  object
 6   prdtStatNm   95829 non-null  object
 7   nationAlt    86876 non-null  object
 8   genreAlt     84476 non-null  object
 9   repNationNm  86878 non-null  object
 10  repGenreNm   83355 non-null  object
 11  directors    64745 non-null  object
 12  companys     15204 non-null  object
 13  director     64745 non-null  object
 14  peopleNm     64745 non-null  object
 15  company      15204 non-null  object
 16  companyCd    15204 non-null  object
 17  companyNm    15204 non-null  object
dtypes: object(18)
memory usage: 13.2+ MB


Unnamed: 0,movieCd,movieNm,movieNmEn,prdtYear,openDt,typeNm,prdtStatNm,nationAlt,genreAlt,repNationNm,repGenreNm,directors,companys,director,peopleNm,company,companyCd,companyNm
count,96272,96272,76451,92506,29582,90155,95829,86876,84476,86878,83355,64745,15204,64745,64745,15204,15204,15204
unique,96272,89022,68903,122,6887,5,7,2140,2090,85,21,27039,4338,24926,24926,3497,3497,3490
top,20232828,하루,Package Screening,2019,20080904,장편,기타,한국,드라마,한국,드라마,코지마 유키오,20161021(주)영화사가을,코지마 유키오,코지마 유키오,20161021(주)영화사가을,20161021,(주)영화사가을
freq,1,22,608,6080,36,57353,67045,31829,16076,32281,21625,386,838,386,386,838,838,838


In [None]:
## 네이버 영화검색

def get_naver_movie_poster(original_url):
    """Reduce a Naver thumbnail URL to one that carries only its ``src`` parameter.

    Returns ``original_url`` unchanged when it has no non-empty ``src`` query
    parameter.
    """
    query_fields = parse.parse_qs(parse.urlparse(original_url).query)
    src_values = query_fields.get('src', [''])
    src_param = src_values[0]
    if not src_param:
        return original_url
    encoded = parse.urlencode({'src': src_param})
    return f"https://search.pstatic.net/common?{encoded}"

def get_naver_movie_body(query):
    """Fetch the Naver search result page for ``query``.

    Returns a parsed ``BeautifulSoup`` document for a 2xx response, otherwise
    ``None``.
    """
    search_url = f"https://search.naver.com/search.naver?query={parse.quote(query)}"
    response = requests.get(search_url)
    if not 200 <= response.status_code < 300:
        return None
    return BeautifulSoup(response.text, 'html.parser')

def get_naver_movie_box(query):
    """Scrape the label/value pairs of the info box on a Naver search page.

    Args:
        query: Search text passed to ``get_naver_movie_body``.

    Returns:
        A dict mapping each ``dt`` text to its ``dd`` text for every
        ``.info_group`` inside ``.cm_content_area``. A missing ``dt`` or
        ``dd`` is represented by ``'N/A'``. An empty dict is returned when the
        page could not be fetched.
    """
    body = get_naver_movie_body(query)
    if body is None:
        # The fetch returns None on a non-2xx status; there is nothing to parse.
        return {}
    box = {}
    for info in body.select(".cm_content_area .info_group"):
        label = info.select_one("dt")
        value = info.select_one("dd")
        key = label.text.strip() if label else 'N/A'
        box[key] = value.text.strip() if value else 'N/A'
    return box

def get_naver_movie_info(query):
    """Scrape rating, audience, poster, synopsis and release facts from Naver search.

    Two searches are made: ``"<query> 영화"`` for rating, audience, poster and
    synopsis, then ``"<query> 영화 정보"`` for distributor, running time and
    rating grade.

    Args:
        query: Movie title (the suffixes are appended here).

    Returns:
        A dict with the keys ``distributor``, ``showTm``, ``watchGradeNm``,
        ``star``, ``audience``, ``img`` and ``synopsis``. Values that cannot
        be found are empty strings.
    """
    query = f"{query} 영화"
    body_info = get_naver_movie_box(query)
    movie_info = {
        "star": dict_coalesce(body_info, '평점'),
        "audience": dict_coalesce(body_info, '관객수'),
    }

    page = get_naver_movie_body(query)
    areas = page.select(".cm_content_area") if page is not None else []

    # Poster: first image of the first content area, if there is one.
    poster = areas[0].select_one("img") if areas else None
    poster_src = poster.get('src') if poster is not None else None
    movie_info['img'] = get_naver_movie_poster(poster_src) if poster_src else ""

    # NOTE(review): the original indexed `body[1]` on a single Tag, which is
    # an attribute lookup and raises. The second content area is assumed to be
    # the intended synopsis container; confirm against the live page.
    synopsis = areas[1].select_one("p.text") if len(areas) > 1 else None
    movie_info['synopsis'] = synopsis.text.strip() if synopsis is not None else ""

    query = f"{query} 정보"
    body_info = get_naver_movie_box(query)
    release_info = {
        "distributor": dict_coalesce(body_info, '배급'),
        "showTm": dict_coalesce(body_info, '러닝타임'),
        "watchGradeNm": dict_coalesce(body_info, '등급'),
    }
    return {**release_info, **movie_info}

def get_naver_movie_api(CLIENT_ID, CLIENT_SECRET, query): # service discontinued: https://developers.naver.com/notice/article/9553
    """Query the Naver movie search open API.

    Args:
        CLIENT_ID: Value for the ``X-Naver-Client-Id`` header.
        CLIENT_SECRET: Value for the ``X-Naver-Client-Secret`` header.
        query: Search text; URL-quoted here so characters such as ``&`` or
            ``#`` cannot break the query string.

    Returns:
        The decoded JSON body for a 2xx response, otherwise ``None``.
    """
    url = f'https://openapi.naver.com/v1/search/movie.json?query={parse.quote(query)}'
    headers = {
        "host": "openapi.naver.com",
        "accept": "*/*",
        "X-Naver-Client-Id": CLIENT_ID,
        "X-Naver-Client-Secret": CLIENT_SECRET,
    }
    response = requests.get(url, headers=headers)
    return response.json() if 200 <= response.status_code < 300 else None

# for i, movie in enumerate(tqdm(movie_list, desc="네이버 영화상세정보")):
#     query = f"{movie['movieNm']} 영화"
#     movie_info = get_naver_movie_info(query)
#     movie_list[i] = {**movie, **movie_info}
    # time.sleep(5)

In [None]:
## 파일저장(.json)

def save_movie_list_json(movie_list, save_name="movie_list"):
    """Write ``movie_list`` to ``./<save_name>.json`` as indented UTF-8 JSON.

    Args:
        movie_list: Either an object exposing ``to_json`` (such as a pandas
            DataFrame) or any JSON-serialisable value such as a list of dicts.
        save_name: File name without the ``.json`` extension.
    """
    import json  # local import: the file's import cell does not bring in json

    # The original referenced an undefined name `path` here instead of
    # `save_name`, and called `.to_json` on plain lists.
    target = f'./{save_name}.json'
    if hasattr(movie_list, 'to_json'):
        movie_list.to_json(target, force_ascii=False, orient='records', indent=4)
        return
    with open(target, 'w', encoding='utf-8') as fp:
        json.dump(movie_list, fp, ensure_ascii=False, indent=4)

# Write both scraped collections to JSON files in the current working directory.
save_movie_list_json(daum_movie_list, "daum_movie_list")
save_movie_list_json(tmdb_movie_list, "tmdb_movie_list")

---

In [None]:
## KOBIS 영화 API - 영화상세정보 조회

# Field names of a company record; not referenced by the code visible in this file.
response_keys = ['companyCd', 'companyNm', 'companyNmEn', 'companyPartNames', 'ceoNm', 'filmoNames']

def get_kobis_movie_api_xml(KEY):
    """Download the complete KOBIS movie list through the XML endpoint.

    Args:
        KEY: KOBIS API key, sent as the ``key`` query parameter.

    Returns:
        A list with one dict per ``<movie>`` element, mapping every descendant
        tag name to its text (``None`` for empty text). Nested tags are
        flattened into the same dict. ``None`` is returned, after printing the
        status code, when the first request does not answer 200.
    """
    base_url = "http://kobis.or.kr/kobisopenapi/webservice/rest/movie/searchMovieList.xml"
    movie_list = []
    response = requests.get(f"{base_url}?key={KEY}")
    if response.status_code != 200:
        print(response.status_code)
        return None

    totcnt = int(BeautifulSoup(response.text, 'xml').select_one("movieListResult totCnt").text)
    itemPerPage = 100
    # Ceiling division: exact page count, no trailing empty page when totcnt
    # is a multiple of itemPerPage.
    page_count = (totcnt + itemPerPage - 1) // itemPerPage
    for curPage in tqdm(range(1, page_count + 1), desc="영화목록"):
        response = requests.get(f"{base_url}?key={KEY}&curPage={curPage}&itemPerPage={itemPerPage}")
        movie_body = BeautifulSoup(response.text, 'xml').select("movie")
        movie_list += [{tag.name : (tag.text if tag.text else None) for tag in movie.find_all()} for movie in movie_body]
    print('성공!😻', len(movie_list), len(movie_list) == totcnt)
    return movie_list
        
def get_kobis_movie_api_info(KEY):
    """Download the KOBIS detail record of every movie in the movie list.

    The movie codes are collected from the paged list endpoint first, then one
    detail request is made per code.

    Args:
        KEY: KOBIS API key, sent as the ``key`` query parameter.

    Returns:
        A list with one entry per movie: the ``movieInfo`` dict when the
        detail request succeeded, otherwise the bare ``movieCd`` string
        (the failure is printed and the loop continues).
    """
    list_url = "http://kobis.or.kr/kobisopenapi/webservice/rest/movie/searchMovieList.json"
    info_url = "http://www.kobis.or.kr/kobisopenapi/webservice/rest/movie/searchMovieInfo.json"

    response = requests.get(f"{list_url}?key={KEY}")
    totCnt = response.json()['movieListResult']['totCnt']
    itemPerPage = 100
    # Ceiling division: exact page count, no trailing empty page.
    page_count = (totCnt + itemPerPage - 1) // itemPerPage
    movie_list = []
    for curPage in tqdm(range(1, page_count + 1), desc="영화목록"):
        response = requests.get(f"{list_url}?key={KEY}&curPage={curPage}&itemPerPage={itemPerPage}")
        movie_list += [m['movieCd'] for m in response.json()['movieListResult']['movieList']]

    # `desc` belongs to tqdm; the original passed it to enumerate(), which
    # raises TypeError before the first detail request.
    for i, movieCd in enumerate(tqdm(movie_list, desc="영화상세정보")):
        payload = None
        try:
            payload = requests.get(f"{info_url}?key={KEY}&movieCd={movieCd}").json()
            movie_list[i] = payload['movieInfoResult']['movieInfo']
        except (requests.RequestException, ValueError, KeyError) as e:
            # Best effort: report and move on. Print the payload when there is
            # one (it carries the API's error message), else the exception.
            print(movieCd, payload if payload is not None else repr(e))
    # The original fell off the end here and returned None.
    return movie_list

# Rebinds `movie_list` to whatever get_kobis_movie_api_info returns.
movie_list = get_kobis_movie_api_info(KOBIS_KEY)

100%|██████████| 963/963 [31:52<00:00,  1.99s/it]

성공!😻 96272 True





In [25]:
# Probe a single detail request for entry 970 and display the raw JSON answer.
# `KEY` exists only as a function parameter in this file; at module level the
# key is stored in KOBIS_KEY.
response = requests.get(f"http://www.kobis.or.kr/kobisopenapi/webservice/rest/movie/searchMovieInfo.json?key={KOBIS_KEY}&movieCd={movie_list[970]}")
response.json()

{'faultInfo': {'message': '키의 하루 이용량을 초과하였습니다.', 'errorCode': '320011'}}