In [1]:
import numpy as np
import pandas as pd

from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By

import time
import re

In [2]:
# Configure the headless Chrome session used by the scraping functions in this notebook.
options = webdriver.ChromeOptions()
options.add_argument('headless')
options.add_argument('window-size=1920x1080')
options.add_argument("disable-gpu")

# Do not pass the driver path positionally: Selenium 4 deprecated that argument and
# 4.10+ removed it, so Chrome('chromedriver', options=...) raises a TypeError there.
# With only `options`, Selenium 3 still defaults to the 'chromedriver' executable on
# PATH, and Selenium 4 locates the driver on its own.
driver = webdriver.Chrome(options=options)
# Sets the element-lookup timeout: find_element polls for up to 3 seconds before failing.
driver.implicitly_wait(3)

# Page that the scraping functions load before issuing their JavaScript calls.
url = 'https://www.kobis.or.kr/kobis/business/mast/mvie/searchMovieList.do'

In [3]:
def get_actors(movie_df):
    """Scrape the codes of the first two listed actors for every movie.

    For each value in ``movie_df['code']`` the movie's detail layer is opened
    with the ``mstView('movie', <code>)`` JavaScript call (presumably defined by
    the loaded page), and the actor code is taken from the ``onClick`` attribute
    of the 2nd and 3rd ``<a>`` children in the first staff table.

    Parameters
    ----------
    movie_df : pandas.DataFrame
        Frame with a ``code`` column. Any index is accepted and the frame is
        left unmodified.

    Returns
    -------
    pandas.DataFrame
        A copy of ``movie_df`` with ``actor1_code`` and ``actor2_code`` columns
        appended. Entries that could not be scraped are NaN.

    Notes
    -----
    Relies on the notebook-level ``driver`` and ``url``. The driver's implicit
    wait is left at 10 seconds afterwards.
    """
    def _actor_code(movie_code, link_position):
        """Return the code behind the given staff link, or None on any failure."""
        selector = (
            "div[name=\"" + movie_code + "_staff\"] > dl > div:nth-child(2) > dd"
            " > table:nth-child(1) > tbody > tr > td > a:nth-child(" + str(link_position) + ")"
        )
        try:
            on_click = driver.find_element(By.CSS_SELECTOR, selector).get_attribute('onClick')
            # The code is the second quoted argument of the onClick call.
            return on_click.split("'")[3]
        except Exception:
            # Best effort: a missing link or an unexpected onClick format means "no actor".
            return None

    driver.get(url)
    # implicitly_wait does not pause; it sets how long find_element polls for an
    # element, so configuring it once before the loop is sufficient.
    driver.implicitly_wait(10)

    actor1_codes = []
    actor2_codes = []
    for raw_code in movie_df['code']:
        movie_code = str(raw_code)
        driver.execute_script("mstView('movie','" + movie_code + "')")

        first = _actor_code(movie_code, 2)
        # As before, the second actor is only looked up when the first was found;
        # this also avoids a second full implicit-wait timeout on empty staff tables.
        second = _actor_code(movie_code, 3) if first is not None else None

        actor1_codes.append(np.nan if first is None else first)
        actor2_codes.append(np.nan if second is None else second)

    # Work on an explicit copy and fill it in one step. The previous version wrote the
    # scraped codes into the caller's frame and returned a separate DataFrame object,
    # which only carried the values when both happened to share the same storage.
    movie_actor_df = movie_df.copy()
    movie_actor_df['actor1_code'] = actor1_codes
    movie_actor_df['actor2_code'] = actor2_codes

    return movie_actor_df


In [4]:
# Load the preprocessed movie table, keeping only the title and the movie code,
# then scrape the two actor codes for each of those movies.
movie_df = pd.read_excel('./Preprocessed_Data.xlsx')[['movie_name', 'code']]
movie_actor_df = get_actors(movie_df)

In [7]:
# Persist the movie -> actor-code table. 'utf-8-sig' prepends a BOM to the file.
# to_csv keeps its default index=True, so the row index is written as an unnamed
# first column.
movie_actor_df.to_csv('17_18_preprocessed_data/movie_actor_df.csv', encoding='utf-8-sig')
# Bare expression: displays the frame as the cell output.
movie_actor_df

Unnamed: 0,movie_name,code,actor1_code,actor2_code
0,택시운전사,20162869,10037018,10080533
1,신과함께-죄와 벌,20150976,10087253,10071209
2,공조,20151228,10088975,10054128
3,스파이더맨: 홈 커밍,20167303,20272630,10016538
4,범죄도시,20172742,10021341,10054204
...,...,...,...,...
187,프리즌 이스케이프,20207893,10009288,10056554
188,검객,20178401,10060612,20129449
189,조제,20201002,10087751,20279929
190,사라진 시간,20193859,10067353,10030161


In [8]:
# Distinct actor codes found in either actor column, with missing values dropped.
# The values pass through a set, so the order of the resulting list is arbitrary.
actor_list = list({
    *movie_actor_df['actor1_code'].dropna(),
    *movie_actor_df['actor2_code'].dropna(),
})

In [9]:
def get_actor_movies(actor_list):
    """Collect the movies in which each actor is credited with a lead role.

    For every actor code the paginated list is requested page by page with the
    ``dtlReq('people', <code>, 'filmo', 'N', <page>)`` JavaScript call
    (presumably defined by the loaded page) until a page yields no
    ``ul.fmList > li`` entries. Only entries whose first ``dl > dd`` text
    contains '주연' (Korean for "lead role") are kept.

    Parameters
    ----------
    actor_list : sequence
        Actor codes; each one is converted with ``str()`` before use.

    Returns
    -------
    pandas.DataFrame
        Columns ``actor_code``, ``movie_name``, ``movie_code``; one row per kept
        entry, in scraping order. Empty (with those columns) when
        ``actor_list`` is empty or nothing matched.

    Notes
    -----
    Relies on the notebook-level ``driver`` and ``url``. The driver's implicit
    wait is left at 10 seconds afterwards.
    NOTE(review): nothing here explicitly waits for a ``dtlReq`` response; the
    code relies on the DOM already being updated when ``page_source`` is read.
    Confirm that this holds for the target site.
    """
    columns = ['actor_code', 'movie_name', 'movie_code']
    if len(actor_list) == 0:
        # Nothing to scrape; indexing actor_list[0] below would raise IndexError.
        return pd.DataFrame(columns=columns)

    driver.get(url)
    # implicitly_wait does not pause; it only sets how long find_element polls,
    # so one call is enough for the whole function.
    driver.implicitly_wait(10)

    # Open the first actor's detail layer and switch to its second tab
    # (presumably the filmography tab, given the 'filmo' requests below).
    driver.execute_script("mstView('people','" + str(actor_list[0]) + "')")
    driver.find_element(By.CSS_SELECTOR, "ul.list_tab > li:nth-child(2)").click()

    # Rows are gathered in a plain list and turned into a frame once at the end:
    # DataFrame.append was removed in pandas 2.0 and re-copied the frame per row.
    rows = []
    for actor_code in actor_list:
        page_index = 0
        while True:
            page_index += 1
            driver.execute_script(
                "dtlReq('people','" + str(actor_code) + "','filmo','N','" + str(page_index) + "')"
            )

            html = BeautifulSoup(driver.page_source, 'html.parser')
            movie_items = html.select('ul.fmList>li')
            if not movie_items:
                # An empty page marks the end of this actor's list.
                break

            for li in movie_items:
                role = li.select_one('dl>dd')
                if role is None or '주연' not in role.get_text():
                    continue

                movie = li.select_one('dl>dt>a')
                if movie is None:
                    # Entry without a title link: there is no code to extract.
                    continue

                # The movie code is the second quoted argument of the onclick call.
                movie_code = movie['onclick'].split("'")[3]
                rows.append([actor_code, movie.get_text(), movie_code])

    return pd.DataFrame(rows, columns=columns)


In [10]:
# Scrape the lead-role movie list for every actor code collected in actor_list.
actor_movie = get_actor_movies(actor_list)

In [11]:
# Persist the actor -> movie table. 'utf-8-sig' prepends a BOM to the file.
# to_csv keeps its default index=True, so the row index is written as an unnamed
# first column.
actor_movie.to_csv('17_18_preprocessed_data/actor_movie.csv', encoding='utf-8-sig')
# Bare expression: displays the frame as the cell output.
actor_movie

Unnamed: 0,actor_code,movie_name,movie_code
0,20209526,더 문,20218935
1,20209526,언더독(Underdog),20186324
2,20209526,스윙키즈(Swing Kids),20175547
3,20209526,7호실(Room No.7),20168754
4,20209526,형(MY ANNOYING BROTHER),20154661
...,...,...,...
4391,10065531,라스베가스의 공포와 혐오(Fear And Loathing In Las Vegas),19988459
4392,10065531,도니 브래스코 (Donnie Brasco),19980158
4393,10065531,조니 뎁의 돈 쥬앙 (Don Juan Demarco),19950299
4394,10065531,길버트 그레이프(What'S Eating Gilbert Grape),20040805


In [9]:
# Reload the saved actor -> movie table, replacing any in-memory actor_movie.
# No index_col is given, so index columns stored in the file come back as regular
# 'Unnamed: ...' columns.
actor_movie = pd.read_csv('17_18_preprocessed_data/actor_movie.csv')
# Bare expression: displays the frame as the cell output.
actor_movie

Unnamed: 0.1,Unnamed: 0,actor_code,movie_name,movie_code
0,0,20224649,공조2:인터내셔날,20215601
1,1,20224649,해피뉴이어(가제),20217807
2,2,20224649,기적(Miracle),20200474
3,3,20224649,엑시트(EXIT),20184621
4,4,20224649,공조(Confidential Assignment),20151228
...,...,...,...,...
2890,2890,10054008,가위(A nightmare),20000009
2891,2891,10054008,동감(Ditto),20000011
2892,2892,10054008,MOB 2025 1편 : 전운(MOB 2025),20008019
2893,2893,10054008,주유소 습격사건(Attack The Gas Station!),19990067


In [12]:
# Distinct movie codes across all actors. The values pass through a set, so the
# order of the list (and therefore the scraping order later on) is arbitrary.
movie_code_list = list(set(actor_movie['movie_code']))
# Bare expression: shows how many distinct movies will be scraped.
len(movie_code_list)

3287

In [13]:
def get_movie_info(movie_list):
    """Scrape the release date and cumulative audience count for each movie.

    For every code the page at ``url`` is loaded and the movie's detail layer is
    opened with the ``mstView('movie', <code>)`` JavaScript call (presumably
    defined by that page). The release date is the ``<dd>`` following the
    ``<dt>`` labelled '개봉일' (Korean for "release date"). When it is present,
    the statistics tab is opened and the audience figure is parsed from its
    first table.

    Checkpointing: before processing item 30, 60, 90, ... the rows gathered
    since the previous checkpoint are written to
    ``./17_18_preprocessed_data/movie_info/movie_info_<i>.csv``, the buffer is
    cleared and ``i`` is printed.

    Parameters
    ----------
    movie_list : sequence
        Movie codes; each one is converted with ``str()`` for the JavaScript call
        and stored unchanged in the ``movie_code`` column.

    Returns
    -------
    pandas.DataFrame
        Columns ``movie_code``, ``release``, ``acc_audience_num``, holding ONLY
        the rows gathered after the last checkpoint; earlier rows exist only in
        the checkpoint files. ``release`` is '해당정보없음' ("no such
        information") and ``acc_audience_num`` is 0 when the release date is
        missing; ``acc_audience_num`` is also 0 when the audience figure cannot
        be read or parsed.

    Notes
    -----
    Relies on the notebook-level ``driver`` and ``url``. The checkpoint
    directory must already exist.
    """
    columns = ['movie_code', 'release', 'acc_audience_num']
    # Rows are buffered in a plain list and converted to a frame only when written
    # or returned: DataFrame.append was removed in pandas 2.0.
    rows = []

    for i, movie_code in enumerate(movie_list):
        if i % 30 == 0 and i != 0:
            # Flush the rows gathered so far to a checkpoint file, then start a new buffer.
            pd.DataFrame(rows, columns=columns).to_csv(
                './17_18_preprocessed_data/movie_info/movie_info_'+str(i)+'.csv',
                encoding='utf-8-sig',
            )
            rows = []
            print(i)

        driver.get(url)
        # str() keeps this working for non-string codes (e.g. integers read back from
        # a CSV file), matching how the other scraping functions build their calls.
        driver.execute_script("mstView('movie','" + str(movie_code) + "')")

        html = BeautifulSoup(driver.page_source, 'html.parser')
        label = html.find('dt', text='개봉일')
        value = None if label is None else label.find_next_sibling('dd')
        # A label without a following <dd> is treated like a missing release date.
        release = '해당정보없음' if value is None else value.text.strip()

        if release == '해당정보없음':
            rows.append([movie_code, release, 0])
            continue

        # Switch to the second tab, which holds the statistics table read below.
        driver.find_element(By.CSS_SELECTOR, "div.wrap_tab.tab2 > ul.list_tab > li:nth-child(2) > a").click()
        # Not a pause: raises the element-lookup timeout so the lookup below may poll
        # for up to 60 seconds. The setting persists for later find_element calls.
        driver.implicitly_wait(60)

        try:
            cell_text = driver.find_element(By.CSS_SELECTOR, "div.item_tab.statistics > div:nth-child(1) > table > tbody > tr:nth-child(2) > td:nth-child(4)").text
            # The regex requires a number followed by a space and a parenthesised
            # suffix; the thousands separators are stripped before int().
            number_text = re.match(r'^([0-9].+)\s\(.+\)$', cell_text).group(1).replace(',', '')
            acc_audience_num = int(number_text)
        except Exception:
            # Best effort: a missing cell or unexpected format is recorded as 0.
            acc_audience_num = 0

        rows.append([movie_code, release, acc_audience_num])

    return pd.DataFrame(rows, columns=columns)


In [16]:
# Scrape release date and audience count for every distinct movie code.
# get_movie_info writes a checkpoint file every 30 movies and returns only the
# rows gathered after the last checkpoint.
movie_info = get_movie_info(movie_code_list)

30
60
90
120
150
180
210
240
270
300
330
360
390
420
450
480
510
540
570
600
630
660
690
720
750
780
810
840
870
900
930
960
990
1020
1050
1080
1110
1140
1170
1200
1230
1260
1290
1320
1350
1380
1410
1440
1470
1500
1530
1560
1590
1620
1650
1680
1710
1740
1770
1800
1830
1860
1890
1920
1950
1980
2010
2040
2070
2100
2130
2160
2190
2220
2250
2280
2310
2340
2370
2400
2430
2460
2490
2520
2550
2580
2610
2640
2670
2700
2730
2760
2790
2820
2850
2880
2910
2940
2970
3000
3030
3060
3090
3120
3150
3180
3210
3240
3270


In [17]:
# Save the remaining rows (those after the last checkpoint) next to the checkpoint
# files, using the total number of movie codes as the file suffix.
movie_info.to_csv('./17_18_preprocessed_data/movie_info/movie_info_'+str(len(movie_code_list))+'.csv', encoding='utf-8-sig')

In [18]:
# Bare expression: displays the rows held in movie_info as the cell output.
movie_info

Unnamed: 0,movie_code,release,acc_audience_num
0,20176201,2017-06-22,71362
1,19960222,1996-07-13,0
2,20178161,2018-06-13,3152872
3,20090929,2010-02-11,270683
4,20149266,2015-04-29,1472006
5,20098095,해당정보없음,0
6,20011272,해당정보없음,0
7,19970240,1997-09-10,0
8,20113745,2012-05-30,636631
9,20135115,해당정보없음,0


In [19]:
import glob

In [20]:
# Every file in the checkpoint directory. glob returns paths in arbitrary order.
file_list = glob.glob('./17_18_preprocessed_data/movie_info/*')
# Bare expression: shows how many files were found.
len(file_list)

110

In [21]:
# Column names of the first file found, with its first CSV column consumed as the
# index so it is not listed among the data columns.
columns = pd.read_csv(file_list[0], index_col=0).columns.tolist()

In [22]:
# Merge every file in file_list into one frame restricted to `columns`.
# Each file is read and reindexed to `columns` (dropping the saved index column),
# then all pieces are concatenated in a single call: DataFrame.append was removed
# in pandas 2.0 and re-copied the accumulated frame on every iteration.
# The comprehension also avoids rebinding the notebook-level `movie_info` name.
movie_info_parts = [
    pd.read_csv(path).reindex(columns=columns)
    for path in file_list
]
# pd.concat rejects an empty list, so fall back to an empty frame with the same columns.
movie_info_total = (
    pd.concat(movie_info_parts, ignore_index=True)
    if movie_info_parts
    else pd.DataFrame(columns=columns)
)

In [23]:
# Persist the merged table. 'utf-8-sig' prepends a BOM to the file, and the default
# index=True writes the row index as an unnamed first column.
movie_info_total.to_csv('17_18_preprocessed_data/movie_info_total.csv', encoding='utf-8-sig')