## 배우 영향력 구하기
영화마다 배우 프로필에 들어가 이전 출연작 정보를 가져오는 것은 너무 비효율적이다.  
-> 따라서 정보를 가져와야 할 배우의 중복 없는(유니크한) 리스트를 먼저 만들고, 배우별로 프로필을 한 번씩만 방문하여 이전 출연작 정보를 모두 가져온다.

In [1]:
import numpy as np
import pandas as pd

from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By

import time
import re

In [11]:
# Configure the headless Chrome session shared by every scraping helper below.
options = webdriver.ChromeOptions()
options.add_argument('headless')
# Chrome parses the window size as "width,height"; the "1920x1080" form is
# not recognised and leaves the default window size in place.
options.add_argument('window-size=1920,1080')
options.add_argument("disable-gpu")

# Recent Selenium 4 releases removed the positional executable-path argument.
# Passing only `options` lets Selenium locate chromedriver itself (PATH lookup
# on older 4.x versions, which is what the 'chromedriver' string meant).
driver = webdriver.Chrome(options=options)
# Timeout applied to every subsequent find_element call on this driver.
driver.implicitly_wait(3)

# KOBIS movie search page; the scrapers call its JavaScript helpers from here.
url = 'https://www.kobis.or.kr/kobis/business/mast/mvie/searchMovieList.do'

### 파일 이름
|name|desc|
|:---|:---|
|file_name_movie_df|가져올 영화 정보 파일|
|file_name_movie_actor_df|수집해야 할 영화의 주연 배우 2명의 배우 코드|
|file_name_actor_movie|배우가 주연으로 출연한 모든 출연작|
|directory_movie_info|영화의 관람객수와 개봉일을 저장할 디렉토리|
|file_name_movie_info_total|분산되어 있던 파일 병합|

In [None]:
# File names (paths under 17_18_preprocessed_data)
# Source spreadsheet listing the movies to process.
file_name_movie_df = './Preprocessed_Data.xlsx'
# Per-movie codes of the two lead actors.
file_name_movie_actor_df = '17_18_preprocessed_data/movie_actor_df.csv'
# Every movie in which an actor appeared in a leading role.
file_name_actor_movie = '17_18_preprocessed_data/actor_movie.csv'
# Directory receiving the chunked audience-count / release-date files.
directory_movie_info = './17_18_preprocessed_data/movie_info/'
# Single file produced by merging the chunked files.
file_name_movie_info_total = '17_18_preprocessed_data/movie_info_total.csv'

In [4]:
# File names for the Netflix data set.
# These rebind the same variables as the previous cell, so whichever of the
# two cells was executed last decides where files are read from / written to.
file_name_movie_df = './Preprocessed_Data.xlsx'
file_name_movie_actor_df = './netflix_preprocessed_data/movie_actor_df.csv'
file_name_actor_movie = './netflix_preprocessed_data/actor_movie.csv'
directory_movie_info = './netflix_preprocessed_data/movie_info/'
file_name_movie_info_total = 'netflix_preprocessed_data/movie_info_total.csv'

### 모든 배우의 KOBIS code 가져오기
주연 배우 2명의 배우 코드를 영화별로 가져옴

In [5]:
def get_actors(movie_df):
    """Look up the KOBIS codes of the first two listed actors of every movie.

    Args:
        movie_df: DataFrame with a ``code`` column holding KOBIS movie codes.

    Returns:
        A copy of ``movie_df`` with two extra columns, ``actor1_code`` and
        ``actor2_code``. A cell stays NaN when the corresponding actor link
        could not be read from the movie's detail layer.
    """
    # Work on a real copy: the caller's frame is not mutated, and the scraped
    # codes are guaranteed to land in the frame that is actually returned
    # (the previous code wrote them to ``movie_df`` instead).
    movie_actor_df = movie_df.copy()
    actor_columns = ('actor1_code', 'actor2_code')
    for column in actor_columns:
        # object dtype because codes are stored as strings next to NaN gaps.
        movie_actor_df[column] = pd.Series(
            np.nan, index=movie_actor_df.index, dtype=object
        )

    # Nothing to look up: skip the page load entirely.
    if movie_actor_df.empty:
        return movie_actor_df

    driver.get(url)
    # implicitly_wait only configures the find_element timeout, so it is
    # enough to set it once instead of on every iteration.
    driver.implicitly_wait(10)

    # Iterate over index labels so a non-default index works as well.
    for row_label, movie_code in movie_actor_df['code'].items():
        movie_code = str(movie_code)
        driver.execute_script("mstView('movie','" + movie_code + "')")

        # The 2nd and 3rd children of the first staff table cell are taken as
        # actor 1 and actor 2 respectively.
        for column, child_index in zip(actor_columns, (2, 3)):
            selector = (
                'div[name="' + movie_code + '_staff"] > dl > div:nth-child(2) > dd'
                ' > table:nth-child(1) > tbody > tr > td > a:nth-child('
                + str(child_index) + ')'
            )
            try:
                elem = driver.find_element(By.CSS_SELECTOR, selector)
                on_click = elem.get_attribute('onClick')
                # The actor code is the second quoted argument of the handler.
                movie_actor_df.loc[row_label, column] = on_click.split("'")[3]
            except Exception:
                # Deliberate best effort: a missing or malformed link leaves
                # the cell as NaN. As before, when the first actor is missing
                # the second one is not attempted.
                break

    return movie_actor_df


In [4]:
# Keep only the title and KOBIS code columns of the movie spreadsheet, then
# scrape the codes of the two lead actors for each movie.
movie_df = pd.read_excel(file_name_movie_df)[['movie_name', 'code']]
movie_actor_df = get_actors(movie_df)

In [2]:
# Persist the scraped actor codes. `DataFrame.to_csv` has no `index_col`
# parameter (that keyword belongs to `read_csv`) and raised a TypeError; the
# index is written by default and is read back with `index_col=0`.
# utf-8-sig matches the other CSV exports in this notebook.
movie_actor_df.to_csv(file_name_movie_actor_df, encoding='utf-8-sig')
movie_actor_df

NameError: name 'movie_actor_df' is not defined

In [6]:
# Reload the saved movie -> actor-code table, using the first CSV column as
# the index, and display it.
movie_actor_df=pd.read_csv(file_name_movie_actor_df, index_col=0)
movie_actor_df

Unnamed: 0,movie_name,code,actor1_code,actor2_code
0,오징어 게임,,10057315,20305129
1,킹덤,,10069179,10019065
2,보건교사 안은영,,10062071,20279929
3,옥자,,10081721,10063816
4,승리호,,10037229,20201026


`actor1_code`와 `actor2_code` 컬럼의 배우 코드를 합쳐 중복 없는(유니크한) 리스트로 생성

In [7]:
# Unique actor codes across both columns. Codes are normalised to int because
# a CSV column containing NaN is loaded as float, and str(10057315.0) would
# yield '10057315.0' once the code is embedded in the site's JavaScript calls.
actor_list = list({
    int(code)
    for column in ('actor1_code', 'actor2_code')
    for code in movie_actor_df[column].dropna()
})

### 배우의 출연작 수집
배우가 **주연으로** 출연했던 모든 영화 코드를 가져옴

In [8]:
def get_actor_movies(actor_list):
    """Collect every movie in which the given actors had a leading role.

    Args:
        actor_list: Sequence of KOBIS people codes.

    Returns:
        DataFrame with the columns ``actor_code``, ``movie_name`` and
        ``movie_code``: one row per filmography entry whose role text
        contains '주연' (leading role). Empty when ``actor_list`` is empty.
    """
    columns = ['actor_code', 'movie_name', 'movie_code']

    # Nothing to scrape (also avoids indexing actor_list[0] below).
    if len(actor_list) == 0:
        return pd.DataFrame(columns=columns)

    driver.get(url)
    # implicitly_wait only configures the find_element timeout; set it once.
    driver.implicitly_wait(10)

    # Open the first actor's detail layer and activate its second tab
    # (presumably the filmography tab - TODO confirm) before paging through
    # the filmographies with dtlReq.
    driver.execute_script("mstView('people','" + str(actor_list[0]) + "')")
    driver.find_element(By.CSS_SELECTOR, "ul.list_tab > li:nth-child(2)").click()

    # Rows are gathered in a plain list and turned into a DataFrame once:
    # DataFrame.append was removed in pandas 2.0 and copied the frame on
    # every call.
    rows = []
    for actor_code in actor_list:
        page_index = 0
        while True:
            page_index += 1
            driver.execute_script(
                "dtlReq('people','" + str(actor_code) + "','filmo','N','"
                + str(page_index) + "')"
            )

            # NOTE(review): page_source is read right after the request is
            # issued and implicitly_wait does not pause here - confirm the
            # list has been refreshed by the time it is parsed.
            html = BeautifulSoup(driver.page_source, 'html.parser')
            movie_list = html.select('ul.fmList > li')
            # An empty page marks the end of this actor's filmography.
            if not movie_list:
                break

            for li in movie_list:
                role = li.select_one('dl > dd')
                if role is None or '주연' not in role.get_text():
                    continue

                movie = li.select_one('dl > dt > a')
                if movie is None:
                    continue
                # The movie code is the second quoted argument of the handler.
                movie_code = movie['onclick'].split("'")[3]
                rows.append([actor_code, movie.get_text(), movie_code])

    return pd.DataFrame(rows, columns=columns)


In [9]:
# Scrape the leading-role filmography of every unique actor code.
actor_movie = get_actor_movies(actor_list)

In [10]:
# Display the scraped actor -> movie table.
actor_movie

Unnamed: 0,actor_code,movie_name,movie_code
0,20201026,승리호(SPACE SWEEPERS),20192662
1,20201026,외계+인,20208446
2,20201026,리틀 포레스트(Little Forest),20170841
3,20201026,1987(1987: When the Day Comes),20170590
4,20201026,아가씨(The Handmaiden),20144444
...,...,...,...
190,10069179,좋은 친구들(Confession),20148745
191,10069179,결혼전야(Marriage Blue),20136061
192,10069179,나는 왕이로소이다(I am the King),20124080
193,10069179,키친(The Naked Kitchen),20090022


In [12]:
# Save the actor -> movie table (utf-8-sig, index included) and display it.
actor_movie.to_csv(file_name_actor_movie, encoding='utf-8-sig')
actor_movie

Unnamed: 0,actor_code,movie_name,movie_code
0,20201026,승리호(SPACE SWEEPERS),20192662
1,20201026,외계+인,20208446
2,20201026,리틀 포레스트(Little Forest),20170841
3,20201026,1987(1987: When the Day Comes),20170590
4,20201026,아가씨(The Handmaiden),20144444
...,...,...,...
190,10069179,좋은 친구들(Confession),20148745
191,10069179,결혼전야(Marriage Blue),20136061
192,10069179,나는 왕이로소이다(I am the King),20124080
193,10069179,키친(The Naked Kitchen),20090022


In [13]:
# Reload the saved actor -> movie table, using the first CSV column as the
# index, and display it.
actor_movie = pd.read_csv(file_name_actor_movie, index_col=0)
actor_movie

Unnamed: 0,actor_code,movie_name,movie_code
0,20201026,승리호(SPACE SWEEPERS),20192662
1,20201026,외계+인,20208446
2,20201026,리틀 포레스트(Little Forest),20170841
3,20201026,1987(1987: When the Day Comes),20170590
4,20201026,아가씨(The Handmaiden),20144444
...,...,...,...
190,10069179,좋은 친구들(Confession),20148745
191,10069179,결혼전야(Marriage Blue),20136061
192,10069179,나는 왕이로소이다(I am the King),20124080
193,10069179,키친(The Naked Kitchen),20090022


In [14]:
# Distinct movie codes across all filmographies (the set drops codes that
# occur in more than one row), followed by their count.
movie_code_list = list(set(actor_movie['movie_code']))
len(movie_code_list)

193

### 영화 정보 수집
위에서 가져온 영화 코드를 기준으로 영화의 최종 관람객수와 개봉일을 가져옴

In [15]:
def get_movie_info(movie_list):
    """Scrape the release date and cumulative audience count of each movie.

    Every 30 movies the rows gathered so far are written to
    ``directory_movie_info + 'movie_info_<i>.csv'`` and the buffer is reset,
    so the returned frame only holds the rows collected after the last
    checkpoint.

    Args:
        movie_list: Sequence of KOBIS movie codes.

    Returns:
        DataFrame with the columns ``movie_code``, ``release`` and
        ``acc_audience_num``. ``release`` is '해당정보없음' when no release date
        is shown; ``acc_audience_num`` is 0 when no release date is shown or
        the audience figure cannot be read.
    """
    columns = ['movie_code', 'release', 'acc_audience_num']
    # Leading number (digits with optional thousands separators) followed by
    # a parenthesised suffix. The previous pattern `[0-9].+` needed at least
    # two characters, so single-digit counts were silently stored as 0.
    audience_pattern = re.compile(r'^([0-9][0-9,]*)\s\(.+\)$')

    # Rows are buffered in a list; DataFrame.append was removed in pandas 2.0.
    rows = []

    for i, movie_code in enumerate(movie_list):
        # Checkpoint: flush the buffered rows to disk every 30 movies.
        if i % 30 == 0 and i != 0:
            pd.DataFrame(rows, columns=columns).to_csv(
                directory_movie_info + 'movie_info_' + str(i) + '.csv',
                encoding='utf-8-sig',
            )
            rows = []
            print(i)

        driver.get(url)
        driver.execute_script("mstView('movie','" + str(movie_code) + "')")

        html = BeautifulSoup(driver.page_source, 'html.parser')
        # `string=` is the current name of BeautifulSoup's deprecated `text=`.
        label = html.find('dt', string='개봉일')
        value = None if label is None else label.find_next_sibling('dd')
        release = '해당정보없음' if value is None else value.text.strip()

        # Without a release date the statistics tab is not visited.
        if release == '해당정보없음':
            rows.append([movie_code, release, 0])
            continue

        driver.find_element(By.CSS_SELECTOR, "div.wrap_tab.tab2 > ul.list_tab > li:nth-child(2) > a").click()
        # Raises the find_element timeout for this and all later lookups.
        driver.implicitly_wait(60)

        try:
            cell_text = driver.find_element(By.CSS_SELECTOR, "div.item_tab.statistics > div:nth-child(1) > table > tbody > tr:nth-child(2) > td:nth-child(4)").text
            audience = int(audience_pattern.match(cell_text).group(1).replace(',', ''))
        except Exception:
            # Deliberate best effort: a missing cell or unexpected text is
            # recorded as 0 instead of aborting the whole run.
            audience = 0

        rows.append([movie_code, release, audience])

    return pd.DataFrame(rows, columns=columns)


In [16]:
# Scrape release date and audience count for every distinct movie code.
# The numbers printed during the run are the 30-movie checkpoints.
movie_info = get_movie_info(movie_code_list)

30
60
90
120
150
180


In [17]:
# get_movie_info returns only the rows gathered after its last checkpoint;
# save them next to the checkpoint files, suffixed with the total code count.
movie_info.to_csv(directory_movie_info+'movie_info_'+str(len(movie_code_list))+'.csv', encoding='utf-8-sig')

In [18]:
# Display the final (post-checkpoint) chunk of movie info.
movie_info

Unnamed: 0,movie_code,release,acc_audience_num
0,20196309,2019-07-02,8021064
1,20100056,해당정보없음,0
2,20119515,2011-07-20,2945137
3,20210140,해당정보없음,0
4,20110302,2011-08-10,7470633
5,20130783,2013-10-02,188084
6,20018147,해당정보없음,0
7,20060644,해당정보없음,0
8,20182001,2018-06-21,33229
9,19970036,1997-04-26,0


In [19]:
import glob

In [20]:
# Every entry in the movie-info directory (the '*' pattern is not limited to
# CSV files), followed by the number of entries found.
file_list = glob.glob(directory_movie_info+'*')
len(file_list)

7

In [21]:
# Column names taken from the first chunk file; index_col=0 keeps the saved
# index column out of the list.
columns = list(pd.read_csv(file_list[0], index_col=0).columns)

In [22]:
# Merge all chunk files into one table. DataFrame.append was removed in
# pandas 2.0, so the chunks are collected and concatenated in a single call.
frames = []
for file in file_list:
    movie_info = pd.read_csv(file)
    # Keep only the data columns; this drops the saved index column.
    frames.append(movie_info.reindex(columns=columns))

# pd.concat raises on an empty list, so fall back to an empty frame.
if frames:
    movie_info_total = pd.concat(frames, ignore_index=True)
else:
    movie_info_total = pd.DataFrame(columns=columns)

In [23]:
# Write the merged movie-info table to a single CSV file.
movie_info_total.to_csv(file_name_movie_info_total, encoding='utf-8-sig')