## Page Title & Hyperlink DataFrame (BeautifulSoup)

### Development Environment

In [1]:
import re
import requests
import urllib
import pandas as pd
import time 
import tqdm
from bs4 import BeautifulSoup
from urllib.request import urlretrieve, urlopen
from urllib.error import URLError, HTTPError

### Page Information

In [None]:
def make_page_title_link_list(soup, url):
    """Extract mostly-Hangul keyword titles from a page's ``<meta>`` tags.

    Every ``<meta>`` element selected from ``soup`` is converted to text and
    searched for the pattern ``meta content="..." name="keywords"/>``.  For
    each match, the share of Hangul characters in the keyword content is
    computed (spaces and commas are not counted in the total).  Content whose
    ratio is at least 0.9 is recorded as a title, together with ``url``.

    Parameters
    ----------
    soup
        Object providing ``select("meta")``; each returned item is turned
        into a string with ``str()``.
    url
        Link stored alongside every accepted title.

    Returns
    -------
    tuple
        ``(title_list, link_list, hangeul_ratio)``.  ``link_list`` is
        de-duplicated while keeping order.  ``hangeul_ratio`` is the ratio of
        the last keywords tag that was processed, or ``0.0`` when the page has
        no keywords tag at all.
    """
    title_list = []
    link_list = []
    # Default value so a page without a keywords <meta> tag returns a ratio
    # below the acceptance threshold instead of raising UnboundLocalError.
    hangeul_ratio = 0.0

    for meta_tag in soup.select("meta"):
        match = re.search('meta content="(.+?)" name="keywords"/>', str(meta_tag))
        if match is None:
            continue

        a_title = match.group(1)
        without_spaces = a_title.replace(" ", "")

        total_length = len(without_spaces.replace(",", ""))
        hangeul_length = len(re.sub(r"[^ㄱ-ㅣ가-힣\s]", "", without_spaces))
        # Content made only of spaces/commas has nothing to measure; treat it
        # as non-Hangul rather than dividing by zero.
        hangeul_ratio = hangeul_length / total_length if total_length else 0.0

        if hangeul_ratio >= 0.9:
            title_list.append(a_title)
            link_list.append(url)

    # Order-preserving de-duplication, done once instead of on every match.
    # NOTE(review): titles are not de-duplicated, so a page with several
    # accepted keywords tags yields more titles than links — confirm intended.
    link_list = list(dict.fromkeys(link_list))

    return title_list, link_list, hangeul_ratio

In [None]:
def make_total_page_title_link_list(link_pattern, number_of_page, start_num):
    """Crawl consecutively numbered pages and collect their titles and links.

    For every ``page_num`` in ``range(number_of_page)`` the id
    ``page_num + start_num + 1`` is appended to ``link_pattern[0]`` to build
    the URL that is fetched, and to ``link_pattern[1]`` to build the link that
    is stored.  Each fetched page is parsed with BeautifulSoup and passed to
    ``make_page_title_link_list``; its results are kept only when the returned
    Hangul ratio is at least 0.9.

    Parameters
    ----------
    link_pattern
        Two-item sequence of URL prefixes: ``[fetch_prefix, saved_prefix]``.
    number_of_page
        Number of consecutive page ids to request.
    start_num
        Offset of the ids; the first requested id is ``start_num + 1``.

    Returns
    -------
    tuple
        ``(total_title_list, total_link_list)`` as flat lists, in reverse
        crawl order (entries of the last crawled page come first).

    Notes
    -----
    Pages that fail with ``HTTPError`` or ``URLError``, or that answer with a
    status other than 200, are reported/skipped and the crawl continues.
    """
    total_title_list = []
    total_link_list = []

    with tqdm.tqdm(range(number_of_page)) as pbar:
        for page_num in pbar:

            # Fixed pause between requests so the server is not hammered.
            time.sleep(0.2)

            page_id = page_num + start_num + 1
            url = link_pattern[0] + str(page_id)
            save_url = link_pattern[1] + str(page_id)

            try:
                request = urllib.request.Request(url)
                # The context manager closes the connection for every page.
                with urllib.request.urlopen(request) as response:
                    if response.getcode() != 200:
                        # No usable body: skip instead of parsing an unbound
                        # or stale `response_body` from an earlier page.
                        continue
                    response_body = response.read()
            except HTTPError as e:
                # HTTPError must be handled before URLError (it is a subclass).
                print("The number of page:", page_id)
                print("Error Code:", e.getcode())
                print("\n")
                continue
            except URLError as e:
                # Connection-level failure: report it and keep crawling so the
                # results collected so far are not lost.
                print("The number of page:", page_id)
                print("Error Reason:", e.reason)
                print("\n")
                continue

            soup = BeautifulSoup(response_body, 'html.parser')
            title_list, link_list, hangeul_ratio = make_page_title_link_list(soup, save_url)

            if hangeul_ratio >= 0.9:
                total_title_list.extend(title_list)
                total_link_list.extend(link_list)

    # Reversing the flat lists once gives the same order as reversing every
    # per-page list and then the list of pages, without the quadratic
    # `sum(list_of_lists, [])` flatten.
    total_title_list.reverse()
    total_link_list.reverse()

    return total_title_list, total_link_list

### Make Page Content & Link DataFrame

In [None]:
def make_page_title_link_df(titles, links):
    """Build a DataFrame of titles, links and spreadsheet hyperlink formulas.

    Parameters
    ----------
    titles
        Sequence of page titles; becomes the ``title`` column.
    links
        Sequence of URLs, same length as ``titles``; becomes the ``link``
        column.

    Returns
    -------
    pandas.DataFrame
        Columns ``title``, ``link`` and ``hyperlink``, where ``hyperlink``
        holds the text ``=HYPERLINK("<link>")`` for each row.
    """
    # The former loop that assembled numbered file names was removed: its
    # result was never used, and it raised TypeError for non-string titles.
    page_title_link_df = pd.DataFrame({'title': titles, 'link': links})
    page_title_link_df['hyperlink'] = page_title_link_df['link'].map('=HYPERLINK("{}")'.format)

    return page_title_link_df

In [None]:
# URL prefixes to which the page id is appended:
#   [0] page that is fetched and parsed for its keywords <meta> tag,
#   [1] link that is stored in the result for that same id.
link_pattern = ['https://www.easylaw.go.kr/CSP/CsmMainBtr.laf?csmSeq=',
                'https://www.easylaw.go.kr/CSP/FileDownload.laf?flType=pdf&onhunqnaYn=N&csmSeq=']
# Requested ids run from start_num + 1 through start_num + number_of_page
# (501 through 2000 with the values below).
number_of_page = 1500
start_num = 500

# Long-running network crawl: the saved progress bar shows about 22 minutes
# for 1500 pages.
title_list, link_list = make_total_page_title_link_list(link_pattern, number_of_page, start_num)
page_title_hyperlink_df = make_page_title_link_df(title_list, link_list)

100%|██████████| 1500/1500 [21:52<00:00,  1.14it/s]


In [None]:
# Show the collected table.
# NOTE(review): the saved output lists a `link2` column, while the code above
# creates `hyperlink` — the output looks stale; re-run the notebook to refresh.
page_title_hyperlink_df

Unnamed: 0,title,link,link2
0,"법제처,생활법령, [베트남어] 소비자 안전정보",https://www.easylaw.go.kr/CSP/FileDownload.laf...,"=HYPERLINK(""https://www.easylaw.go.kr/CSP/File..."
1,"법제처,생활법령, 자동차 빌리기",https://www.easylaw.go.kr/CSP/FileDownload.laf...,"=HYPERLINK(""https://www.easylaw.go.kr/CSP/File..."
2,"법제처,생활법령, 소상공인 지원",https://www.easylaw.go.kr/CSP/FileDownload.laf...,"=HYPERLINK(""https://www.easylaw.go.kr/CSP/File..."
3,"법제처,생활법령, 전세사기 피해자 지원",https://www.easylaw.go.kr/CSP/FileDownload.laf...,"=HYPERLINK(""https://www.easylaw.go.kr/CSP/File..."
4,"법제처,생활법령, 지역상권 살리기",https://www.easylaw.go.kr/CSP/FileDownload.laf...,"=HYPERLINK(""https://www.easylaw.go.kr/CSP/File..."
...,...,...,...
342,"법제처,생활법령, 해고근로자",https://www.easylaw.go.kr/CSP/FileDownload.laf...,"=HYPERLINK(""https://www.easylaw.go.kr/CSP/File..."
343,"법제처,생활법령, 국유재산 이용자",https://www.easylaw.go.kr/CSP/FileDownload.laf...,"=HYPERLINK(""https://www.easylaw.go.kr/CSP/File..."
344,"법제처,생활법령, 미용실 창업ㆍ운영",https://www.easylaw.go.kr/CSP/FileDownload.laf...,"=HYPERLINK(""https://www.easylaw.go.kr/CSP/File..."
345,"법제처,생활법령, 어린이 식품안전",https://www.easylaw.go.kr/CSP/FileDownload.laf...,"=HYPERLINK(""https://www.easylaw.go.kr/CSP/File..."


In [None]:
# Write the table to CSV without the index column. "utf-8-sig" prepends a
# UTF-8 byte-order mark — presumably so spreadsheet applications detect the
# encoding of the Korean titles; confirm.
page_title_hyperlink_df.to_csv("page_title_hyperlink.csv", index=False, encoding="utf-8-sig")

### Reference

<b>법제처: 찾기쉬운 생활법령정보</b>
<br>[불량식품](https://www.easylaw.go.kr/CSP/CsmMainBtr.laf?csmSeq=670#srch_box_pop)
<br><br><b>Github</b>
<br>cheris8
<br>[동적 웹 페이지 크롤링 with Python](https://cheris8.github.io/data%20analysis/DC-Dynamic-Webpage-Crawling/)
<br><br><b>Newspaper</b>
<br>[한국일보 두바퀴 찬가](https://www.hankookilbo.com/Collect/1890?Page=1)
<br><br><b>Stackoverflow</b>
<br>[Python export dataframe complete column as hyperlinks (anuar)](https://stackoverflow.com/questions/73612917/python-export-dataframe-complete-column-as-hyperlinks)