# 1. 라이브러리 선언

In [1]:
import pandas as pd
import datetime
import requests

from bs4                            import BeautifulSoup
from time                           import sleep
from selenium                       import webdriver
from webdriver_manager.chrome       import ChromeDriverManager

# 2. kofic 모든 영화목록 크롤링

## 1) 영화목록 크롤링

In [2]:
# Crawl the KOBIS movie list page by page and collect each page's table.
#
# Imported here so the cell stays self-contained; selenium is already a
# dependency of this notebook.
from selenium.common.exceptions import NoSuchElementException

driver = webdriver.Chrome(executable_path=ChromeDriverManager().install())
url    = "https://www.kobis.or.kr/kobis/business/mast/mvie/searchMovieList.do"

page_num        = 0
page_frames     = []              # one DataFrame per crawled page
movie_detail_df = pd.DataFrame()  # stays empty if nothing could be crawled
try:
    driver.get(url)
    sleep(2)

    while True:
        # Once the 10th page link has been visited, press the "next" button
        # and restart from the first page link.
        if page_num == 10:
            sleep(1)
            next_button_xpath = '//*[@id="pagingForm"]/div/a[3]'
            # find_element(by, value) works on Selenium 3 and 4, unlike the
            # removed find_element_by_xpath helper.
            next_button       = driver.find_element("xpath", next_button_xpath)
            next_button.click()
            sleep(2)

            page_num = 0
            continue

        # Move on to the next page link.
        page_num   += 1
        page_xpath  = f'//*[@id="pagingForm"]/div/ul/li[{page_num}]/a'
        page_button = driver.find_element("xpath", page_xpath)
        page_button.click()
        sleep(1.5)

        # page_source may still hold the previous page if the browser has
        # not finished refreshing, hence the sleeps around the click.
        html  = BeautifulSoup(driver.page_source, "lxml")
        temp  = html.select("table")
        table = pd.read_html(str(temp))
        page_frames.append(table[0])
        sleep(1)
except NoSuchElementException:
    # A missing page link or "next" button is treated as the end of the list.
    # NOTE(review): assumes the pager simply omits these elements on the last
    # page -- confirm against the site's markup.
    pass
finally:
    # Always release the browser, and keep whatever was collected even if the
    # run is interrupted (e.g. KeyboardInterrupt).
    driver.quit()
    if page_frames:
        # Build the frame once; DataFrame.append was removed in pandas 2.0.
        movie_detail_df = pd.concat(page_frames, ignore_index=True)





KeyboardInterrupt: 

## 2) csv 저장하기

In [None]:
# Persist the crawled movie list without the row index.
movie_detail_df.to_csv(path_or_buf="data/movie_basic_info.csv", index=False)

# 3. kofic api에서 영화 상세정보 가져오기

## 1) 영화 상세정보 가져오기

In [316]:
import os

# KOFIC open API access keys.
# Environment variables take precedence so the keys can be changed without
# editing this file; the original literals remain as fallbacks so existing
# runs behave exactly as before.
# NOTE(review): credentials committed to source should be rotated and the
# fallbacks removed once the environment variables are in place.
key1 = os.environ.get("KOBIS_API_KEY_1", "1baad2f53504b36a98f2638e869ff944")
key2 = os.environ.get("KOBIS_API_KEY_2", "749dd0d41f0199cb036e3dfa99cc3b65")

# KOBIS open API service-information page (printed for reference).
print("http://www.kobis.or.kr/kobisopenapi/homepg/apiservice/searchServiceInfo.do")

http://www.kobis.or.kr/kobisopenapi/homepg/apiservice/searchServiceInfo.do


In [None]:
"""
kofic api 사용하여 영화 상세정보 가져오기
"""

# KOBIS open API service-information page (printed for reference).
print("http://www.kobis.or.kr/kobisopenapi/homepg/apiservice/searchServiceInfo.do")

# Only the movie codes in [start, end) are requested in this run; the range
# is also embedded in the output file name.
# NOTE(review): presumably sized to fit the daily per-key request quota.
start = 81900
end   = start + 2950
print(f"{start}_{end}")

url              = "http://www.kobis.or.kr/kobisopenapi/webservice/rest/movie/searchMovieInfo.json"
movie_basic_df   = pd.read_csv("data/movie_basic_info.csv")
movie_codes      = movie_basic_df["영화코드"].values.tolist()

# Rows are gathered in a plain list and turned into a DataFrame once:
# DataFrame.append was removed in pandas 2.0 and copied the whole frame on
# every iteration.
movie_detail_rows = []
for movie_code in movie_codes[start:end]:
    data = {
        "key"    : key1,
        "movieCd": movie_code
    }
    response = requests.get(url, params=data)

    try:
        movie_detail_info = response.json()["movieInfoResult"]["movieInfo"]
    except KeyError:
        # The expected payload is missing; the script reports this as the
        # daily key quota being exhausted and stops requesting.
        print("하루 key 사용량 초과")
        break
    movie_detail_rows.append(movie_detail_info)

movie_detail_df = pd.DataFrame(movie_detail_rows)
path            = f"data/movie_detail_data{start}_{end}.csv"
movie_detail_df.to_csv(path, index=False)
# Read the file back so the notebook displays what was written.
pd.read_csv(path)

## 2) movie_detail_info 통합

In [330]:
import os

# Load every CSV in data/ whose name contains "movie_detail_data" into a
# list of DataFrames, in the order os.listdir reports them.
file_names = os.listdir("data/")
df_list = [
    pd.read_csv(f"data/{file_name}")
    for file_name in file_names
    if "movie_detail_data" in file_name
]

## 3) movie_detail_info 통합본 csv화 시키기

In [None]:
# Stack all loaded detail frames into one frame and save it.
# pd.concat replaces the per-frame DataFrame.append loop (removed in
# pandas 2.0, and quadratic because it copied the accumulated frame each time).
# pd.concat rejects an empty list, so fall back to an empty frame.
movie_data = pd.concat(df_list) if df_list else pd.DataFrame()

movie_data.to_csv("data/movie_detail_info.csv", index=False)

# 4. 주간 TOP 10 영화 데이터 가져오기

## 1) api 호출 및 데이터 가져오기

In [237]:
# Data is available from 2003-11-10 onward.
def get_weekly_top_10_movies(key, start_date="20031110", end_date="20031117", show=False):
    """Fetch weekly top-10 box-office lists from the KOFIC open API.

    One request is sent per 7-day step, beginning at ``start_date`` and
    continuing while the current target date is strictly earlier than
    ``end_date``.

    Args:
        key: Open API key issued by KOFIC.
        start_date: First target date, formatted ``yyyymmdd``.
        end_date: Exclusive upper bound for target dates, ``yyyymmdd``.
        show: When true, print the ``showRange`` of every response.

    Returns:
        A DataFrame with one row per movie entry returned by the API; empty
        when ``start_date`` is not earlier than ``end_date``.

    Raises:
        ValueError: If a date string is not a valid ``yyyymmdd`` date.
        KeyError: If a response lacks ``boxOfficeResult`` or
            ``weeklyBoxOfficeList``.
    """
    url = "http://www.kobis.or.kr/kobisopenapi/webservice/rest/boxoffice/searchWeeklyBoxOfficeList.json"

    date_format = "%Y%m%d"
    current     = datetime.datetime.strptime(start_date, date_format)
    last        = datetime.datetime.strptime(end_date, date_format)
    one_week    = datetime.timedelta(days=7)

    # Rows are collected in a list and converted once; DataFrame.append was
    # removed in pandas 2.0.
    records = []
    while current < last:
        params = {
            "key"        : key,
            # "%Y", not "$Y": the target date must carry the 4-digit year.
            "targetDt"   : current.strftime(date_format),
            "weekGb"     : "0",
            "itemPerPage": "10",
        }
        response = requests.get(url, params=params)
        result   = response.json()["boxOfficeResult"]
        movies   = result["weeklyBoxOfficeList"]

        # Print the covered date range (one week per request).
        if show:
            print(result["showRange"])

        records.extend(movies)

        # Advance to the following week.
        current += one_week

    return pd.DataFrame(records)

In [229]:
# Request every weekly ranking from 2003-11-10 up to (not including)
# 2021-08-09, printing each week's range, then save the result as CSV.
key = "749dd0d41f0199cb036e3dfa99cc3b65"
start_date, end_date = "20031110", "20210809"
weekly_10_movie_df = get_weekly_top_10_movies(key, start_date, end_date, show=True)
weekly_10_movie_df.to_csv(path_or_buf="data/weekly_10_movie.csv", index=False)

20031110~20031116
20031117~20031123
20031124~20031130
20031201~20031207
20031208~20031214
20031215~20031221
20031222~20031228
20031229~20040104
20040105~20040111
20040112~20040118
20040119~20040125
20040126~20040201
20040202~20040208
20040209~20040215
20040216~20040222
20040223~20040229
20040301~20040307
20040308~20040314
20040315~20040321
20040322~20040328
20040329~20040404
20040405~20040411
20040412~20040418
20040419~20040425
20040426~20040502
20040503~20040509
20040510~20040516
20040517~20040523
20040524~20040530
20040531~20040606
20040607~20040613
20040614~20040620
20040621~20040627
20040628~20040704
20040705~20040711
20040712~20040718
20040719~20040725
20040726~20040801
20040802~20040808
20040809~20040815
20040816~20040822
20040823~20040829
20040830~20040905
20040906~20040912
20040913~20040919
20040920~20040926
20040927~20041003
20041004~20041010
20041011~20041017
20041018~20041024
20041025~20041031
20041101~20041107
20041108~20041114
20041115~20041121
20041122~20041128
20041129~2

## 2) 중복되는 영화 처리

In [245]:
# Reload the saved weekly rankings and count the rows whose movieCd has
# already appeared in an earlier row.
weekly_movie_df = pd.read_csv("data/weekly_10_movie.csv")
weekly_movie_df.duplicated(subset="movieCd").sum()

6205

In [283]:
# Flip the row order; reset_index() keeps the previous row labels in an
# "index" column and renumbers the rows from 0.
reverse_df = weekly_movie_df.iloc[::-1].reset_index()
reverse_df

Unnamed: 0,index,rnum,rank,rankInten,rankOldAndNew,movieCd,movieNm,openDt,salesAmt,salesShare,salesInten,salesChange,salesAcc,audiCnt,audiInten,audiChange,audiAcc,scrnCnt,showCnt
0,9258,10,10,58,OLD,20218834,그린 나이트,2021-08-05,99400010,0.6,98491010,10835.1,100309010,10803,10702,10596.0,10904,235,1065
1,9257,9,9,0,NEW,20218815,블랙핑크 더 무비,2021-08-04,154036500,0.9,154036500,100.0,154036500,11761,11761,100.0,11761,147,799
2,9256,8,8,-2,OLD,20218364,랑종,2021-07-14,126192130,0.7,-339038900,-72.9,8586730770,12347,-33394,-73.0,827673,158,928
3,9255,7,7,0,NEW,20218875,극장판 도라에몽: 진구의 신공룡,2021-08-05,337195080,2.0,337195080,100.0,337195080,38669,38669,100.0,38669,502,3142
4,9254,6,6,-1,OLD,20202185,방법: 재차의,2021-07-28,440767580,2.5,-575734170,-56.6,1476778330,45306,-73224,-61.8,166005,640,6090
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9254,4,5,5,0,NEW,20030127,스캔들-조선남녀상열지사,2003-10-02,8320000,5.5,8320000,100.0,8320000,851,851,100.0,851,3,35
9255,3,4,4,0,NEW,20030154,황산벌,2003-10-17,12723500,8.3,12723500,100.0,12723500,1307,1307,100.0,1307,5,77
9256,2,3,3,0,NEW,20030247,아이덴티티,2003-10-31,18316500,12.0,18316500,100.0,18316500,1985,1985,100.0,1985,4,78
9257,1,2,2,0,NEW,20030152,위대한 유산,2003-10-24,23113000,15.2,23113000,100.0,23113000,2557,2557,100.0,2557,5,93


In [314]:
# Keep the first row seen for each movie title in reverse_df, then narrow
# the frame to the identifying columns and the cumulative totals.
# NOTE(review): de-duplication is keyed on the title (movieNm), so different
# films sharing a title collapse into one row -- confirm movieCd was not
# intended.
movie_external_info = (
    reverse_df
    .drop_duplicates(subset="movieNm", keep="first")
    .loc[:, ["movieCd", "movieNm", "openDt", "audiAcc", "salesAcc"]]
)
movie_external_info

Unnamed: 0,movieCd,movieNm,openDt,audiAcc,salesAcc
0,20218834,그린 나이트,2021-08-05,10904,100309010
1,20218815,블랙핑크 더 무비,2021-08-04,11761,154036500
2,20218364,랑종,2021-07-14,827673,8586730770
3,20218875,극장판 도라에몽: 진구의 신공룡,2021-08-05,38669,337195080
4,20202185,방법: 재차의,2021-07-28,166005,1476778330
...,...,...,...,...,...
9250,20030402,써클,2003-11-14,47,321500
9251,20030395,최후의 만찬,2003-11-14,59,2163500
9252,20030284,깝스,2003-11-05,103,969000
9253,20030393,케이트 앤 레오폴드,2003-10-31,195,1591000


## 3) csv 저장하기

In [331]:
# Save the de-duplicated per-movie summary without the row index.
movie_external_info.to_csv(path_or_buf="data/movie_external_info.csv", index=False)