## Page Content Text to TXT (BeautifulSoup)

<br>

### Development Environment

In [1]:
import re
import glob
import urllib
import requests
import pandas as pd
from bs4 import BeautifulSoup
from urllib.request import urlretrieve, urlopen
from urllib.error import URLError, HTTPError

### Page Title & Link DataFrame

In [35]:
def make_page_title_link_description_list(soup):
    """Collect article titles and absolute article URLs from a parsed listing page.

    Every ``<a>`` element whose markup contains ``pages/articles`` is inspected
    as raw HTML text. Titles and links are gathered independently of each
    other: titles are filtered by length, links are de-duplicated, so the two
    returned lists are not guaranteed to have the same length.

    Parameters
    ----------
    soup : object
        Parsed document exposing ``select("a")`` (e.g. a ``BeautifulSoup``
        instance); each returned element is converted with ``str()``.

    Returns
    -------
    tuple[list[str], list[str]]
        ``(title_list, link_list)`` - cleaned titles (in document order,
        duplicates kept) and unique links prefixed with
        ``https://www.pressian.com`` (first-seen order).
    """
    pre_title_list = []
    link_list = []

    for anchor in soup.select("a"):
        anchor_html = str(anchor)

        # Only anchors that point at an article page are of interest.
        if 'pages/articles' not in anchor_html:
            continue

        title_match = re.search('">(.+?)</a>', anchor_html)
        if title_match is not None:
            a_title = title_match.group()
            # Strip markup remnants and the characters "?", ":" and "/".
            # Removing "/" turns a surviving "</a>" into "<a>", which is only
            # stripped after the length check below.
            for fragment in ('">', '<a>', ' </a>', "?", ":", "/"):
                a_title = a_title.replace(fragment, '')
            pre_title_list.append(a_title)

        link_match = re.search('<a href=(.+?)">', anchor_html)
        if link_match is not None:
            a_link = link_match.group()
            a_link = a_link.replace('<a href="', 'https://www.pressian.com')
            a_link = a_link.replace('">', '')
            link_list.append(a_link)

    # De-duplicate once, after the loop, keeping first-seen order.
    link_list = list(dict.fromkeys(link_list))

    # The 30-character limit is applied to the text *before* the leftover
    # "<a>" is removed, so that remnant counts towards the length.
    title_list = [
        pre_title.replace('<a>', '')
        for pre_title in pre_title_list
        if len(pre_title) <= 30
    ]

    return title_list, link_list

In [18]:
def make_total_page_title_link_list(link_pattern, number_of_page):
    """Fetch consecutive listing pages and accumulate their titles and links.

    Pages are requested as ``link_pattern + "1"``, ``link_pattern + "2"``, ...
    up to ``number_of_page``. Fetching stops early at the first page that
    raises ``HTTPError`` or answers with a status other than 200; whatever was
    collected up to that point is returned.

    Parameters
    ----------
    link_pattern : str
        URL prefix to which the 1-based page number is appended.
    number_of_page : int
        Maximum number of pages to request.

    Returns
    -------
    tuple[list, list]
        ``(total_title_list, total_link_list)`` - the per-page results of
        ``make_page_title_link_description_list`` concatenated in page order.
    """
    total_title_list = []
    total_link_list = []

    for page_num in range(number_of_page):
        url = link_pattern + str(page_num + 1)

        try:
            request = urllib.request.Request(url)
            # The context manager closes the connection even if read() fails.
            with urllib.request.urlopen(request) as response:
                response_code = response.getcode()
                response_body = response.read() if response_code == 200 else None
        except HTTPError as e:
            response_code = e.getcode()
            response_body = None

        if response_body is None:
            # page_num is 0-based, so it equals the number of pages that were
            # fetched successfully before this one failed.
            print("The number of page:", page_num)
            print("Error Code:", response_code)
            print("\n")
            break

        soup = BeautifulSoup(response_body, 'html.parser')
        title_list, link_list = make_page_title_link_description_list(soup)

        # extend() flattens as we go instead of sum(list_of_lists, []).
        total_title_list.extend(title_list)
        total_link_list.extend(link_list)

    return total_title_list, total_link_list

In [19]:
def make_page_title_link_df(titles, links):
    """Build a two-column DataFrame (``title``, ``link``) from parallel sequences."""
    columns = dict(title=titles, link=links)
    return pd.DataFrame(columns)

In [36]:
# Listing URL of the serial (page number is appended) and how many pages to crawl.
link_pattern = 'https://www.pressian.com/pages/serials/1240?page='
number_of_page = 10

title_list, link_list = make_total_page_title_link_list(link_pattern, number_of_page)

# Build the frame directly and number the rows from 1, instead of seeding a
# placeholder row, concatenating, and dropping it again.
page_title_link_df = make_page_title_link_df(title_list, link_list)
page_title_link_df.index = range(1, len(page_title_link_df) + 1)

In [38]:
# Disable column-width truncation so long cell values are rendered in full.
pd.options.display.max_colwidth = None
page_title_link_df

Unnamed: 0,title,link
1,최후의 질문,https://www.pressian.com/pages/articles/56653
2,논리와 결정불가능성,https://www.pressian.com/pages/articles/56652
3,가치관념과 과학의 발전,https://www.pressian.com/pages/articles/56651
4,과학과 사회,https://www.pressian.com/pages/articles/56650
5,과학의 위험성,https://www.pressian.com/pages/articles/56649
...,...,...
86,보편 이론체계와 대칭성 깨짐,https://www.pressian.com/pages/articles/54924
87,좋은 이론이란,https://www.pressian.com/pages/articles/54921
88,과학적 사고란,https://www.pressian.com/pages/articles/54811
89,과학은 아름답다,https://www.pressian.com/pages/articles/54810


In [39]:
# Persist the title/link table as an Excel workbook (row index omitted).
page_title_link_df.to_excel(excel_writer='page_title_link_df.xlsx', index=False)

### Page Content Text to TXT

In [40]:
def make_page_content_text(url):
    """Download an article page and return its body text as a single string.

    The text is taken from the raw HTML of every ``<div>``: the span that
    starts at a ``<br/>`` and ends right before ``<br/><table align="center"``
    or ``<br/><font color=`` is extracted, its tags are stripped, and the
    distinct fragments are joined with single spaces in first-seen order.

    Parameters
    ----------
    url : str
        Address of the article page.

    Returns
    -------
    str
        The extracted text; empty if no fragment matched.

    Raises
    ------
    RuntimeError
        If the server answers with a status code other than 200.
    urllib.error.HTTPError, urllib.error.URLError
        Propagated unchanged from ``urlopen``.
    """
    request = urllib.request.Request(url)

    # The context manager guarantees the connection is closed.
    with urllib.request.urlopen(request) as response:
        response_code = response.getcode()
        if response_code != 200:
            # Previously this path fell through to an unbound response_body.
            raise RuntimeError(
                "Unexpected response code {} for {}".format(response_code, url)
            )
        response_body = response.read()

    soup = BeautifulSoup(response_body, 'html.parser')

    # Markup that immediately follows the article text, in lookup order.
    end_markers = ('<table align="center"', '<font color=')
    content_list = []

    for div in soup.select("div"):
        div_html = str(div)

        if '<br/>' not in div_html:
            continue

        for end_marker in end_markers:
            content_match = re.search('<br/>(.+?)<br/>' + end_marker, div_html)
            if content_match is None:
                continue
            fragment = re.sub(r"\<.*?\>", "", content_match.group())
            # The end marker has no closing ">" inside the match, so the tag
            # stripper above leaves it behind; remove it explicitly.
            content_list.append(fragment.replace(end_marker, ''))

    # Nested <div>s yield the same fragment repeatedly; keep first occurrences.
    content_text = " ".join(dict.fromkeys(content_list))

    return content_text

In [26]:
title_series, link_series = (page_title_link_df[column] for column in ('title', 'link'))

# Fetch every article and store its text in "<title>.txt" (UTF-8) in the
# current working directory.
for title, link in zip(title_series, link_series):
    content_text = make_page_content_text(link)
    output_name = title + ".txt"
    with open(output_name, mode='w', encoding='utf-8') as f:
        f.write(content_text)

In [5]:
# Preview the first five "."-separated sentences of one saved article.
# The file was written as UTF-8, so it must be read back with the same
# encoding rather than the platform default.
with open('동역학.txt', 'r', encoding='utf-8') as f:
    sentences = f.read().split(".")

for sentence in sentences[:5]:
    print(sentence)

모든 자연현상은 그 현상의 실체인 물질의 구성원 사이의 상호작용 때문에 생긴다고 했습니다
 간단한 예로 공을 던졌을 때 포물선을 그리며 날아가는 것도 자연현상입니다
 공이라는 구성원과 지구라는 구성원 사이의 상호작용, 곧 중력에 의해 어떻게 날아갈지 운동이 정해집니다
 마찬가지로 어떤 것은 딱딱하고 어떤 것은 물렁물렁하고 어떤 것은 빨갛고 혹은 파랗고 어떤 것은 반짝이고 등 물건의 성질도 모두 자연현상입니다
그런 것을 이해하고 해석하려면 그 물건의 구성원, 즉 분자를 생각해야 합니다


### Merge Content TXT

In [27]:
merged_name = "최무영의 과학이야기.txt"

# Skip the merged file itself: on a re-run it already exists and matches
# "*.txt", so it would be read while it is being rewritten. Sorting makes the
# merge order reproducible (glob order is otherwise arbitrary).
corpus_list = sorted(path for path in glob.glob("*.txt") if path != merged_name)

with open(merged_name, 'w', encoding='utf-8') as f:
    for corpus in corpus_list:
        with open(corpus, encoding='utf-8') as text:
            # Files are concatenated as-is; no separator is inserted.
            f.writelines(text)

<br>

### Reference

<b>Github</b>
<br>cheris8
<br>[동적 웹 페이지 크롤링 with Python](https://cheris8.github.io/data%20analysis/DC-Dynamic-Webpage-Crawling/)

<br><b>News</b>
<br>[프레시안 <최무영의 과학이야기>](https://www.pressian.com/pages/serials/1240)