## Page Content & Link DataFrame (BeautifulSoup)

### Development Environment

In [89]:
import re
import requests
import urllib
import pandas as pd
import time 
import tqdm
from bs4 import BeautifulSoup
from urllib.request import urlretrieve, urlopen
from urllib.error import URLError, HTTPError

### Page Information

In [90]:
def make_page_content_link_list(soup):
    """Extract article titles and print-view links from a parsed listing page.

    Every ``<a>`` element returned by ``soup.select("a")`` is converted to its
    string form and scanned with two regular expressions:

    * the title is the text between ``target="_self">`` and ``</a>``; anchors
      whose matched text contains one of ``div``, ``img``, ``description`` or
      ``thumbnail`` are not used as titles;
    * the link is the ``href`` value that is immediately followed by
      ``target="_self">``. It is prefixed with the site root and every
      ``Read/`` is rewritten to ``Read/Print/``.

    Titles and links are collected independently of each other, so the two
    returned lists are not guaranteed to have the same length.

    Args:
        soup: Object exposing ``select("a")`` that returns an iterable of
            elements whose ``str()`` is the anchor's HTML (e.g. a
            ``BeautifulSoup`` document).

    Returns:
        tuple[list[str], list[str]]: ``(title_list, link_list)``, both in
        reverse document order. ``link_list`` contains no duplicates.
    """
    html_tags = ('div', 'img', 'description', 'thumbnail')
    title_list = []
    link_list = []

    for anchor in soup.select("a"):
        anchor_html = str(anchor)

        title_match = re.search('target="_self">(.+?)</a>', anchor_html)
        if title_match is not None:
            title = title_match.group()
            # Skip anchors that wrap markup (thumbnails etc.) instead of text.
            if not any(html_tag in title for html_tag in html_tags):
                for fragment in ('/">', '</a>', 'target="_self">'):
                    title = title.replace(fragment, '')
                title_list.append(title)

        link_match = re.search('<a href=(.+?) target="_self">', anchor_html)
        if link_match is not None:
            link = link_match.group()
            link = link.replace('<a href="', 'https://www.hankookilbo.com')
            link = link.replace('target="_self">', '')
            link = link.replace("Read/", "Read/Print/")
            link = link.replace('">', '')
            link = link.replace('"', '')
            # The match includes the space that preceded `target=`; strip it
            # so the URL does not end with stray whitespace.
            link_list.append(link.strip())

    # Drop duplicate links once, keeping the first occurrence of each.
    link_list = list(dict.fromkeys(link_list))

    title_list.reverse()
    link_list.reverse()

    return title_list, link_list

In [91]:
def make_total_page_content_link_list(link_pattern, number_of_page):
    """Fetch consecutive listing pages and collect their titles and links.

    Pages ``1 .. number_of_page`` are requested from
    ``link_pattern[0] + str(page_number)``. Each page that answers with status
    200 is parsed with ``BeautifulSoup`` and passed to
    ``make_page_content_link_list``; the per-page results are concatenated in
    request order.

    Args:
        link_pattern: Sequence whose first element is the URL prefix to which
            the page number is appended.
        number_of_page: Number of pages to request, starting at page 1.

    Returns:
        tuple[list, list]: ``(total_title_list, total_link_list)`` gathered
        from all pages fetched before the loop ended.

    Notes:
        An ``HTTPError`` is reported on stdout and stops the crawl; whatever
        was collected up to that point is still returned. Other exceptions
        propagate to the caller.
    """
    total_title_list = []
    total_link_list = []

    with tqdm.tqdm(range(number_of_page)) as pbar:
        for page_num in pbar:

            # Pause briefly before each request (presumably to throttle the
            # load placed on the server).
            time.sleep(0.2)

            page_number = page_num + 1
            url = link_pattern[0] + str(page_number)

            try:
                request = urllib.request.Request(url)

                # The context manager closes the connection after the read.
                with urllib.request.urlopen(request) as response:
                    response_code = response.getcode()
                    if response_code != 200:
                        # No usable body for this page; skip it rather than
                        # parsing an unbound or stale `response_body`.
                        print("Skipped page:", page_number, "Status Code:", response_code)
                        continue
                    response_body = response.read()

            except HTTPError as e:
                print("URL:", url)
                print("The number of page:", page_number)
                print("Error Code:", e.getcode())
                print("\n")
                break

            soup = BeautifulSoup(response_body, 'html.parser')

            title_list, link_list = make_page_content_link_list(soup)

            # extend() flattens as we go, avoiding a quadratic sum(lists, []).
            total_title_list.extend(title_list)
            total_link_list.extend(link_list)

    return total_title_list, total_link_list

### Make Page Content & Link DataFrame

In [96]:
def make_page_content_link_df(titles, links):
    """Assemble the title / file-name / link table.

    Each file name is the title's position (zero-padded to at least two
    digits) followed by an underscore and the title itself.

    Args:
        titles: Sequence of title strings.
        links: Sequence of links, one per title.

    Returns:
        pandas.DataFrame: Columns ``title``, ``File Name`` and ``link``.
    """
    # zfill(2) pads 0-9 to "00".."09" and leaves longer numbers untouched.
    numbered_names = [
        str(position).zfill(2) + "_" + title
        for position, title in enumerate(titles)
    ]

    columns = {'title': titles, 'File Name': numbered_names, 'link': links}
    return pd.DataFrame(columns)

In [97]:
# URL prefix of the listing; the crawler appends the page number to it.
link_pattern = ['https://www.hankookilbo.com/Collect/1890?Page=']
# How many listing pages to request (pages 1..number_of_page).
number_of_page = 10

# NOTE(review): this placeholder frame is not referenced again in the visible
# cells — confirm whether it is still needed.
page_content_link_description_df = pd.DataFrame({'title':["title"], 'link':["link"]})

# Crawl the listing pages, then build the title / file-name / link table.
title_list, link_list = make_total_page_content_link_list(link_pattern, number_of_page)
page_content_link_df = make_page_content_link_df(title_list, link_list)

100%|██████████| 10/10 [00:14<00:00,  1.45s/it]


In [98]:
# Display the assembled table as the cell output.
page_content_link_df

Unnamed: 0,title,File Name,link
0,태국에서 증명된 우리나라 자전거 동호인의 실력,00_태국에서 증명된 우리나라 자전거 동호인의 실력,https://www.hankookilbo.com/News/Read/Print/20...
1,자전거 도로에 압정 뿌린 자전거 수리공,01_자전거 도로에 압정 뿌린 자전거 수리공,https://www.hankookilbo.com/News/Read/Print/20...
2,[박권일의 글쟁이 페달] 세상에서 가장 지루한 스포츠,02_[박권일의 글쟁이 페달] 세상에서 가장 지루한 스포츠,https://www.hankookilbo.com/News/Read/Print/20...
3,[두바퀴찬가] 일주일에 3번 자전거 출근했더니 생긴 놀라운 변화,03_[두바퀴찬가] 일주일에 3번 자전거 출근했더니 생긴 놀라운 변화,https://www.hankookilbo.com/News/Read/Print/20...
4,"끊이지 않는 위협 운전, 목숨 내놓고 달리는 자전거","04_끊이지 않는 위협 운전, 목숨 내놓고 달리는 자전거",https://www.hankookilbo.com/News/Read/Print/20...
...,...,...,...
78,바퀴에 거미줄 칠라… '자전거 권태기' 벗어나기,78_바퀴에 거미줄 칠라… '자전거 권태기' 벗어나기,https://www.hankookilbo.com/News/Read/Print/20...
79,추워지는 날씨.. 자전거족의 피난처를 찾다,79_추워지는 날씨.. 자전거족의 피난처를 찾다,https://www.hankookilbo.com/News/Read/Print/20...
80,"‘얼마짜리 자전거’는 중요하지 않다, 지금 당장 타라","80_‘얼마짜리 자전거’는 중요하지 않다, 지금 당장 타라",https://www.hankookilbo.com/News/Read/Print/20...
81,"쓰고 웃을래, 안 쓰고 맘 졸일래","81_쓰고 웃을래, 안 쓰고 맘 졸일래",https://www.hankookilbo.com/News/Read/Print/20...


In [99]:
# Write the table to a CSV file in the working directory, without the index column.
page_content_link_df.to_csv("page_content_link.csv", index=False)

### Reference

<b>Github</b>
<br><br>cheris8
<br>[동적 웹 페이지 크롤링 with Python](https://cheris8.github.io/data%20analysis/DC-Dynamic-Webpage-Crawling/)
<br><br><b>Magazine</b>
<br>[요즘IT](https://yozm.wishket.com/magazine/)