In [1]:
import os

# Create the folder that the %%writefile cells write into.
# exist_ok=True keeps this cell re-runnable and independent of the shell in use.
os.makedirs('package', exist_ok=True)

In [15]:
from bs4 import BeautifulSoup 
from urllib.request import urlopen
from tqdm.notebook import tqdm

import pandas as pd
import re

In [18]:
def crawl_movie_list_top10(url = 'https://movie.naver.com/movie/sdb/rank/rmovie.naver'):
    '''
    Crawl the movie ranking page and collect rows 1-10 of its ranking table.

    Parameters
    ----------
    url : str
        Address of the ranking page. The page must contain a
        <table class="list_ranking"> with a <tbody>.

    Returns
    -------
    pandas.DataFrame
        Indexed by '순위' (rank) with the columns '영화명' (title),
        '변동폭' (rank change as text, prefixed with '-' when the direction
        image's alt text is 'down') and '링크' (detail-page link).
    '''
    records = []

    # Close the HTTP response as soon as the markup has been parsed.
    with urlopen(url) as html:
        soup = BeautifulSoup(html, "lxml")

    table_tag = soup.find('table', 'list_ranking')
    tbody_tag = table_tag.find('tbody')
    tr_tags = tbody_tag.find_all('tr')

    # Rank carried over to a row that has no rank image. It starts as None so
    # that a first row without an image no longer raises UnboundLocalError.
    previous_ranking = None

    # The first <tr> is skipped and the following ten rows are read.
    for tr_tag in tr_tags[1:11]:

        td_tags = tr_tag.find_all('td')
        if len(td_tags) < 4:
            # Not a ranking row (too few cells) - nothing to extract.
            continue

        movie = td_tags[1].find('a')
        if movie is None:
            # No title link in this row - nothing to extract.
            continue

        # Rank: taken from the alt text of the rank image when present,
        # otherwise the previous row's rank is reused.
        rank_img = td_tags[0].find('img')
        ranking = previous_ranking if rank_img is None else int(rank_img['alt'])
        previous_ranking = ranking

        # Title
        title = movie.get_text().strip()

        # Link: site-relative hrefs get the site root prepended.
        href = movie['href']
        root_url = 'https://movie.naver.com' if href.startswith('/') else ''
        link = (root_url + href).strip()

        # Rank change: the sign comes from the direction image (if any),
        # the magnitude from the next cell.
        sign_img = td_tags[2].find('img')
        direction = sign_img.get('alt') if sign_img is not None else None
        range_sign = '-' if direction == 'down' else ''
        range_ac = range_sign + td_tags[3].get_text().strip()

        records.append({'순위': ranking, '영화명': title, '변동폭': range_ac, '링크': link})

    # Explicit columns keep the frame well-formed even when no row was collected.
    top10_df = pd.DataFrame(records, columns=['순위', '영화명', '변동폭', '링크'])
    top10_df = top10_df.set_index('순위')

    return top10_df

In [19]:
# Crawl the ranking page and show the resulting table as the cell output.
url = 'https://movie.naver.com/movie/sdb/rank/rmovie.naver'
top10_df = crawl_movie_list_top10(url)
top10_df

Unnamed: 0_level_0,영화명,변동폭,링크
순위,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,범죄도시2,0,https://movie.naver.com/movie/bi/mi/basic.nave...
2,쥬라기 월드: 도미니언,0,https://movie.naver.com/movie/bi/mi/basic.nave...
3,브로커,0,https://movie.naver.com/movie/bi/mi/basic.nave...
4,닥터 스트레인지: 대혼돈의 멀티버스,0,https://movie.naver.com/movie/bi/mi/basic.nave...
5,마녀(魔女) Part2. The Other One,2,https://movie.naver.com/movie/bi/mi/basic.nave...
6,그대가 조국,0,https://movie.naver.com/movie/bi/mi/basic.nave...
6,니 부모 얼굴이 보고 싶다,-1,https://movie.naver.com/movie/bi/mi/basic.nave...
8,카시오페아,0,https://movie.naver.com/movie/bi/mi/basic.nave...
9,탑건: 매버릭,0,https://movie.naver.com/movie/bi/mi/basic.nave...
10,피는 물보다 진하다,2,https://movie.naver.com/movie/bi/mi/basic.nave...


In [20]:
import os

file_name_step1 = 'data/movie_top10.csv'

# to_csv does not create missing folders, so make sure the target directory exists first.
os.makedirs(os.path.dirname(file_name_step1), exist_ok=True)
top10_df.to_csv(file_name_step1, encoding='UTF-8')

In [21]:
def get_scores(star_scores):
    '''
    Build the one-line rating summary (audience / critics / netizens).

    Parameters
    ----------
    star_scores : iterable
        Elements exposing ``get_text()``. The first number shaped like
        ``9.07`` (1-2 digits, a dot, 1-2 digits) found in each element's
        text is taken as that element's score; elements without such a
        number are ignored.

    Returns
    -------
    str
        "관람객:{} / 평론가:{} / 네티즌:{}" filled with the first three
        scores found, in input order. When fewer than three scores are
        found, every slot is '0.00'.
    '''
    # Raw string: '\d' and '\.' are invalid escape sequences in a plain literal.
    score_pattern = re.compile(r'\d{1,2}\.\d{1,2}')

    arr_score = []
    for star_score in star_scores:
        re_score = score_pattern.search(star_score.get_text())
        if re_score is not None:
            arr_score.append(re_score.group())

    if len(arr_score) > 2:
        score1, score2, score3 = arr_score[:3]
    else:
        # Partial results are not reported: all three slots fall back to zero.
        score1 = score2 = score3 = '0.00'

    return "관람객:{} / 평론가:{} / 네티즌:{}".format(score1, score2, score3)

In [22]:
# info_spec = mv_info_area.find('dl', 'info_spec')
def get_movie_info(info_spec):
    '''
    Collect movie facts (genre, director, cast, rating, audience count)
    from an ``info_spec`` definition list.

    Parameters
    ----------
    info_spec : element exposing ``find_all``
        Its <dt> elements carry a class such as 'step1'; the <dd> at the
        same position holds the value for that label.

    Returns
    -------
    dict
        Keys 'genre', 'director', 'casting', 'rating', 'ticketing'.
        A key stays None when the page has no matching entry.
    '''
    mv_dict = {
        'genre'     : None,
        'director'  : None,
        'casting'   : None,
        'rating'    : None,
        'ticketing' : None,
    }

    # Pair each <dt> with the <dd> at the same position. zip stops at the
    # shorter list, so a count mismatch no longer raises IndexError.
    for dt_tag, dd_value in zip(info_spec.find_all('dt'), info_spec.find_all('dd')):
        dt_classes = dt_tag.get('class')
        if not dt_classes:
            # A <dt> without a class cannot be mapped to a field.
            continue

        mv_dict = collect_mv_dict(mv_dict, dt_classes[0], dd_value)

    return mv_dict
    

def collect_mv_dict(mv_dict, dt_class, dd_value):
    '''
    Store the text of one <dd> element under the field selected by ``dt_class``.

    Parameters
    ----------
    mv_dict : dict
        Dictionary to update; it is modified in place and also returned.
    dt_class : str
        'step1' -> 'genre' (text of the first <span>),
        'step2' -> 'director',
        'step3' -> 'casting' (with the '더보기' label removed),
        'step4' -> 'rating',
        'step9' -> 'ticketing' (text of the first <p>).
        Any other value leaves ``mv_dict`` untouched.
    dd_value : element exposing ``get_text`` and ``find``
        The <dd> element holding the value.

    Returns
    -------
    dict
        The same ``mv_dict`` object.
    '''
    def _drop_line_breaks(text):
        # Remove newlines, carriage returns and tabs, then trim both ends.
        for char in ('\n', '\r', '\t'):
            text = text.replace(char, '')
        return text.strip()

    if dt_class == 'step1':
        span_tag = dd_value.find('span')
        # Leave the field as-is when the expected <span> is missing.
        if span_tag is not None:
            mv_dict['genre'] = _drop_line_breaks(span_tag.get_text())

    elif dt_class == 'step2':
        mv_dict['director'] = dd_value.get_text().strip()

    elif dt_class == 'step3':
        mv_dict['casting'] = dd_value.get_text().replace('더보기', '').strip()

    elif dt_class == 'step4':
        mv_dict['rating'] = _drop_line_breaks(dd_value.get_text())

    elif dt_class == 'step9':
        p_tag = dd_value.find('p')
        # Leave the field as-is when the expected <p> is missing.
        if p_tag is not None:
            mv_dict['ticketing'] = p_tag.get_text().strip()

    return mv_dict

In [23]:
def crawl_movie_detail_page(df) :
    '''
    Crawl every movie detail page listed in ``df['링크']`` and return a new
    DataFrame with the detail columns added.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a '링크' column holding the detail-page URLs.
        The frame itself is not modified.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` with the columns '평점', '장르', '감독', '출연',
        '등급', '흥행' added and the '링크' column moved to the end.
    '''
    info_keys = ('genre', 'director', 'casting', 'rating', 'ticketing')

    Score     = []
    Genre     = []
    Director  = []
    Casting   = []
    Rating    = []
    Ticketing = []

    for url_page in tqdm(df['링크']):

        # Close the HTTP response as soon as the markup has been parsed.
        with urlopen(url_page) as html:
            soup = BeautifulSoup(html, "lxml")

        mv_info_area = soup.find('div', 'mv_info_area')

        # A page without the expected blocks yields default values instead of
        # aborting the whole crawl with an AttributeError.
        if mv_info_area is None:
            star_scores = []
            info_spec = None
        else:
            star_scores = mv_info_area.find_all('div', 'star_score')
            info_spec = mv_info_area.find('dl', 'info_spec')

        # Scores
        scores = get_scores(star_scores)

        # Movie information
        mv_dict = dict.fromkeys(info_keys) if info_spec is None else get_movie_info(info_spec)

        # Collect into the per-column lists
        Score.append(scores)
        Genre.append(mv_dict['genre'])
        Director.append(mv_dict['director'])
        Casting.append(mv_dict['casting'])
        Rating.append(mv_dict['rating'])
        Ticketing.append(mv_dict['ticketing'])

    # Work on a copy so the caller's DataFrame is left untouched.
    result_df = df.copy()
    result_df['평점'] = Score
    result_df['장르'] = Genre
    result_df['감독'] = Director
    result_df['출연'] = Casting
    result_df['등급'] = Rating
    result_df['흥행'] = Ticketing

    # Move the link column to the end by name rather than by position, so the
    # result does not depend on how many columns the input had.
    columns = [column for column in result_df.columns if column != '링크']
    columns.append('링크')

    final_df = result_df.loc[:, columns]

    return final_df

In [24]:
# Reload the step-1 CSV. index_col='순위' restores the rank column as the index;
# without it, '순위' would be read back as an ordinary column.
df = pd.read_csv(file_name_step1, encoding='UTF-8', index_col='순위')
df

Unnamed: 0_level_0,영화명,변동폭,링크
순위,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,범죄도시2,0,https://movie.naver.com/movie/bi/mi/basic.nave...
2,쥬라기 월드: 도미니언,0,https://movie.naver.com/movie/bi/mi/basic.nave...
3,브로커,0,https://movie.naver.com/movie/bi/mi/basic.nave...
4,닥터 스트레인지: 대혼돈의 멀티버스,0,https://movie.naver.com/movie/bi/mi/basic.nave...
5,마녀(魔女) Part2. The Other One,2,https://movie.naver.com/movie/bi/mi/basic.nave...
6,그대가 조국,0,https://movie.naver.com/movie/bi/mi/basic.nave...
6,니 부모 얼굴이 보고 싶다,-1,https://movie.naver.com/movie/bi/mi/basic.nave...
8,카시오페아,0,https://movie.naver.com/movie/bi/mi/basic.nave...
9,탑건: 매버릭,0,https://movie.naver.com/movie/bi/mi/basic.nave...
10,피는 물보다 진하다,2,https://movie.naver.com/movie/bi/mi/basic.nave...


In [25]:
# Crawl each movie's detail page and show the enriched table as the cell output.
final_df = crawl_movie_detail_page(df)
final_df

  0%|          | 0/10 [00:00<?, ?it/s]

Unnamed: 0_level_0,영화명,변동폭,평점,장르,감독,출연,등급,흥행,링크
순위,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1,범죄도시2,0,관람객:9.07 / 평론가:6.56 / 네티즌:9.51,"범죄, 액션",이상용,"마동석(마석도), 손석구(강해상), 최귀화(전일만)",[국내] 15세 관람가,"9,686,860명(06.09 기준)",https://movie.naver.com/movie/bi/mi/basic.nave...
2,쥬라기 월드: 도미니언,0,관람객:6.85 / 평론가:6.40 / 네티즌:6.18,"액션, 모험",콜린 트레보로우,"크리스 프랫(오웬 그래디), 브라이스 달라스 하워드(클레어 디어링)",[국내] 12세 관람가,"2,302,724명(06.09 기준)",https://movie.naver.com/movie/bi/mi/basic.nave...
3,브로커,0,관람객:7.48 / 평론가:6.80 / 네티즌:6.18,드라마,고레에다 히로카즈,"송강호, 강동원, 배두나",[국내] 12세 관람가,"257,989명(06.09 기준)",https://movie.naver.com/movie/bi/mi/basic.nave...
4,닥터 스트레인지: 대혼돈의 멀티버스,0,관람객:7.79 / 평론가:6.80 / 네티즌:7.33,"액션, 판타지, 모험",샘 레이미,"베네딕트 컴버배치(닥터 스트레인지), 엘리자베스 올슨(완다 막시모프 / 스칼렛 위치)",[국내] 12세 관람가,"5,863,047명(06.09 기준)",https://movie.naver.com/movie/bi/mi/basic.nave...
5,마녀(魔女) Part2. The Other One,2,관람객:0.00 / 평론가:0.00 / 네티즌:0.00,액션,박훈정,"신시아, 박은빈, 서은수",[국내] 15세 관람가,,https://movie.naver.com/movie/bi/mi/basic.nave...
6,그대가 조국,0,관람객:9.80 / 평론가:5.00 / 네티즌:7.21,다큐멘터리,이승준,,[국내] 12세 관람가,"303,132명(06.09 기준)",https://movie.naver.com/movie/bi/mi/basic.nave...
6,니 부모 얼굴이 보고 싶다,-1,관람객:7.75 / 평론가:6.14 / 네티즌:7.68,드라마,김지훈,설경구,[국내] 15세 관람가,,https://movie.naver.com/movie/bi/mi/basic.nave...
8,카시오페아,0,관람객:8.38 / 평론가:6.50 / 네티즌:9.22,드라마,신연식,"안성기(인우), 서현진(수진), 주예림(지나)",[국내] 12세 관람가,"18,025명(06.09 기준)",https://movie.naver.com/movie/bi/mi/basic.nave...
9,탑건: 매버릭,0,관람객:0.00 / 평론가:0.00 / 네티즌:0.00,액션,조셉 코신스키,"톰 크루즈(매버릭), 마일즈 텔러(루스터), 제니퍼 코넬리(페니)",[국내] 12세 관람가,,https://movie.naver.com/movie/bi/mi/basic.nave...
10,피는 물보다 진하다,2,관람객:5.50 / 평론가:3.00 / 네티즌:6.17,액션,김희성,"조동혁(두현), 이완(영민), 임정은(지영)",[국내] 15세 관람가,,https://movie.naver.com/movie/bi/mi/basic.nave...


In [26]:
file_name_step2 = 'data/movie_top10_final.csv'

# Write to the step-2 file. Saving to file_name_step1 here would overwrite
# the step-1 ranking CSV with the enriched table.
final_df.to_csv(file_name_step2, encoding='UTF-8')

### 1단계 : 리스트 크롤링
>데이터수집 : 순위, 영화명, 변동폭, ~평점(관람객/평론가/네티즌), 장르, 감독, 출연, 등급~

In [27]:
%%writefile ./package/mvr_step1_crawler.py
# package/mvr_step1_crawler.py
from bs4 import BeautifulSoup 
from urllib.request import urlopen
from tqdm.notebook import tqdm

import pandas as pd
import re


def crawl_movie_list_top10(url = 'https://movie.naver.com/movie/sdb/rank/rmovie.naver'):
    '''
    Crawl the movie ranking page and collect rows 1-10 of its ranking table.

    Parameters
    ----------
    url : str
        Address of the ranking page. The page must contain a
        <table class="list_ranking"> with a <tbody>.

    Returns
    -------
    pandas.DataFrame
        Indexed by '순위' (rank) with the columns '영화명' (title),
        '변동폭' (rank change as text, prefixed with '-' when the direction
        image's alt text is 'down') and '링크' (detail-page link).
    '''
    records = []

    # Close the HTTP response as soon as the markup has been parsed.
    with urlopen(url) as html:
        soup = BeautifulSoup(html, "lxml")

    table_tag = soup.find('table', 'list_ranking')
    tbody_tag = table_tag.find('tbody')
    tr_tags = tbody_tag.find_all('tr')

    # Rank carried over to a row that has no rank image. It starts as None so
    # that a first row without an image no longer raises UnboundLocalError.
    previous_ranking = None

    # The first <tr> is skipped and the following ten rows are read.
    for tr_tag in tr_tags[1:11]:

        td_tags = tr_tag.find_all('td')
        if len(td_tags) < 4:
            # Not a ranking row (too few cells) - nothing to extract.
            continue

        movie = td_tags[1].find('a')
        if movie is None:
            # No title link in this row - nothing to extract.
            continue

        # Rank: taken from the alt text of the rank image when present,
        # otherwise the previous row's rank is reused.
        rank_img = td_tags[0].find('img')
        ranking = previous_ranking if rank_img is None else int(rank_img['alt'])
        previous_ranking = ranking

        # Title
        title = movie.get_text().strip()

        # Link: site-relative hrefs get the site root prepended.
        href = movie['href']
        root_url = 'https://movie.naver.com' if href.startswith('/') else ''
        link = (root_url + href).strip()

        # Rank change: the sign comes from the direction image (if any),
        # the magnitude from the next cell.
        sign_img = td_tags[2].find('img')
        direction = sign_img.get('alt') if sign_img is not None else None
        range_sign = '-' if direction == 'down' else ''
        range_ac = range_sign + td_tags[3].get_text().strip()

        records.append({'순위': ranking, '영화명': title, '변동폭': range_ac, '링크': link})

    # Explicit columns keep the frame well-formed even when no row was collected.
    top10_df = pd.DataFrame(records, columns=['순위', '영화명', '변동폭', '링크'])
    top10_df = top10_df.set_index('순위')

    return top10_df


if __name__ == "__main__":
    import os

    url = 'https://movie.naver.com/movie/sdb/rank/rmovie.naver'
    top10_df = crawl_movie_list_top10(url)

    file_name_step1 = 'data/movie_top10.csv'
    # to_csv does not create missing folders, so make sure the target directory exists first.
    os.makedirs(os.path.dirname(file_name_step1), exist_ok=True)
    top10_df.to_csv(file_name_step1, encoding='UTF-8')

    # A bare expression shows nothing when run as a script, so print explicitly.
    print(top10_df)

Overwriting ./package/mvr_step1_crawler.py


### 2단계 : 상세페이지 크롤링
>항목 : ~순위, 영화명, 변동폭,~ 평점(관람객/평론가/네티즌), 장르, 감독, 출연, 등급

In [28]:
%%writefile ./package/mvr_step2_crawler.py
# package/mvr_step2_crawler.py
from bs4 import BeautifulSoup 
from urllib.request import urlopen
from tqdm.notebook import tqdm

import pandas as pd
import re

# star_scores = mv_info_area.find_all('div', 'star_score')
def get_scores(star_scores):
    '''
    Build the one-line rating summary (audience / critics / netizens).

    Parameters
    ----------
    star_scores : iterable
        Elements exposing ``get_text()``. The first number shaped like
        ``9.07`` (1-2 digits, a dot, 1-2 digits) found in each element's
        text is taken as that element's score; elements without such a
        number are ignored.

    Returns
    -------
    str
        "관람객:{} / 평론가:{} / 네티즌:{}" filled with the first three
        scores found, in input order. When fewer than three scores are
        found, every slot is '0.00'.
    '''
    # Raw string: '\d' and '\.' are invalid escape sequences in a plain literal.
    score_pattern = re.compile(r'\d{1,2}\.\d{1,2}')

    arr_score = []
    for star_score in star_scores:
        re_score = score_pattern.search(star_score.get_text())
        if re_score is not None:
            arr_score.append(re_score.group())

    if len(arr_score) > 2:
        score1, score2, score3 = arr_score[:3]
    else:
        # Partial results are not reported: all three slots fall back to zero.
        score1 = score2 = score3 = '0.00'

    return "관람객:{} / 평론가:{} / 네티즌:{}".format(score1, score2, score3)




# info_spec = mv_info_area.find('dl', 'info_spec')
def get_movie_info(info_spec):
    '''
    Collect movie facts (genre, director, cast, rating, audience count)
    from an ``info_spec`` definition list.

    Parameters
    ----------
    info_spec : element exposing ``find_all``
        Its <dt> elements carry a class such as 'step1'; the <dd> at the
        same position holds the value for that label.

    Returns
    -------
    dict
        Keys 'genre', 'director', 'casting', 'rating', 'ticketing'.
        A key stays None when the page has no matching entry.
    '''
    mv_dict = {
        'genre'     : None,
        'director'  : None,
        'casting'   : None,
        'rating'    : None,
        'ticketing' : None,
    }

    # Pair each <dt> with the <dd> at the same position. zip stops at the
    # shorter list, so a count mismatch no longer raises IndexError.
    for dt_tag, dd_value in zip(info_spec.find_all('dt'), info_spec.find_all('dd')):
        dt_classes = dt_tag.get('class')
        if not dt_classes:
            # A <dt> without a class cannot be mapped to a field.
            continue

        mv_dict = collect_mv_dict(mv_dict, dt_classes[0], dd_value)

    return mv_dict
    

def collect_mv_dict(mv_dict, dt_class, dd_value):
    '''
    Store the text of one <dd> element under the field selected by ``dt_class``.

    Parameters
    ----------
    mv_dict : dict
        Dictionary to update; it is modified in place and also returned.
    dt_class : str
        'step1' -> 'genre' (text of the first <span>),
        'step2' -> 'director',
        'step3' -> 'casting' (with the '더보기' label removed),
        'step4' -> 'rating',
        'step9' -> 'ticketing' (text of the first <p>).
        Any other value leaves ``mv_dict`` untouched.
    dd_value : element exposing ``get_text`` and ``find``
        The <dd> element holding the value.

    Returns
    -------
    dict
        The same ``mv_dict`` object.
    '''
    def _drop_line_breaks(text):
        # Remove newlines, carriage returns and tabs, then trim both ends.
        for char in ('\n', '\r', '\t'):
            text = text.replace(char, '')
        return text.strip()

    if dt_class == 'step1':
        span_tag = dd_value.find('span')
        # Leave the field as-is when the expected <span> is missing.
        if span_tag is not None:
            mv_dict['genre'] = _drop_line_breaks(span_tag.get_text())

    elif dt_class == 'step2':
        mv_dict['director'] = dd_value.get_text().strip()

    elif dt_class == 'step3':
        mv_dict['casting'] = dd_value.get_text().replace('더보기', '').strip()

    elif dt_class == 'step4':
        mv_dict['rating'] = _drop_line_breaks(dd_value.get_text())

    elif dt_class == 'step9':
        p_tag = dd_value.find('p')
        # Leave the field as-is when the expected <p> is missing.
        if p_tag is not None:
            mv_dict['ticketing'] = p_tag.get_text().strip()

    return mv_dict


def crawl_movie_detail_page(df) :
    '''
    Crawl every movie detail page listed in ``df['링크']`` and return a new
    DataFrame with the detail columns added.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a '링크' column holding the detail-page URLs.
        The frame itself is not modified.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` with the columns '평점', '장르', '감독', '출연',
        '등급', '흥행' added and the '링크' column moved to the end.
    '''
    info_keys = ('genre', 'director', 'casting', 'rating', 'ticketing')

    Score     = []
    Genre     = []
    Director  = []
    Casting   = []
    Rating    = []
    Ticketing = []

    for url_page in tqdm(df['링크']):

        # Close the HTTP response as soon as the markup has been parsed.
        with urlopen(url_page) as html:
            soup = BeautifulSoup(html, "lxml")

        mv_info_area = soup.find('div', 'mv_info_area')

        # A page without the expected blocks yields default values instead of
        # aborting the whole crawl with an AttributeError.
        if mv_info_area is None:
            star_scores = []
            info_spec = None
        else:
            star_scores = mv_info_area.find_all('div', 'star_score')
            info_spec = mv_info_area.find('dl', 'info_spec')

        # Scores
        scores = get_scores(star_scores)

        # Movie information
        mv_dict = dict.fromkeys(info_keys) if info_spec is None else get_movie_info(info_spec)

        # Collect into the per-column lists
        Score.append(scores)
        Genre.append(mv_dict['genre'])
        Director.append(mv_dict['director'])
        Casting.append(mv_dict['casting'])
        Rating.append(mv_dict['rating'])
        Ticketing.append(mv_dict['ticketing'])

    # Work on a copy so the caller's DataFrame is left untouched.
    result_df = df.copy()
    result_df['평점'] = Score
    result_df['장르'] = Genre
    result_df['감독'] = Director
    result_df['출연'] = Casting
    result_df['등급'] = Rating
    result_df['흥행'] = Ticketing

    # Move the link column to the end by name rather than by position, so the
    # result does not depend on how many columns the input had.
    columns = [column for column in result_df.columns if column != '링크']
    columns.append('링크')

    final_df = result_df.loc[:, columns]

    return final_df



if __name__ == "__main__":
    # Input path: the CSV saved by package/mvr_step1_crawler.py. It has to be
    # defined here because this module has no other definition of the name.
    file_name_step1 = 'data/movie_top10.csv'
    df = pd.read_csv(file_name_step1, encoding='UTF-8', index_col='순위')

    final_df = crawl_movie_detail_page(df)

    file_name_step2 = 'data/movie_top10_final.csv'
    final_df.to_csv(file_name_step2, encoding='UTF-8')

    # A bare expression shows nothing when run as a script, so print explicitly.
    print(final_df)

Overwriting ./package/mvr_step2_crawler.py
