In [4]:
import requests
import json
import time
from bs4 import BeautifulSoup
import os
import csv
import re

# Field names shared by the article record dict (see init_article_data)
# and the CSV header written by save().
TITLE = 'title'
CREATED_AT = 'created_at'
UPDATED_AT = 'updated_at'
CONTENT_ORIGIN = 'content_origin'
CONTENT_TEXT = 'content_text'

def init_article_data():
    """Return a new article record with every field initialised to ``None``.

    The keys are the module-level field-name constants, in the order
    title, created-at, updated-at, original content, text content.
    """
    field_names = (TITLE, CREATED_AT, UPDATED_AT, CONTENT_ORIGIN, CONTENT_TEXT)
    # dict.fromkeys defaults every value to None and keeps insertion order.
    return dict.fromkeys(field_names)

def extract_article_data(url, timeout=10):
    """Fetch an article page and extract its title, timestamps and body.

    Args:
        url: Address of the article page to download.
        timeout: Seconds to wait for the server before giving up. Passed
            straight to ``requests.get``.

    Returns:
        A dict keyed by TITLE, CREATED_AT, UPDATED_AT, CONTENT_ORIGIN and
        CONTENT_TEXT. Any field whose element is not found in the page is
        ``None``. CONTENT_ORIGIN holds the parsed ``<article>`` node itself
        (not a string); CONTENT_TEXT holds its stripped text.

    Raises:
        requests.exceptions.RequestException: If the request fails or
            times out. The HTTP status code is NOT checked, so an error
            page simply yields a record whose fields are ``None``.
    """
    article_data = init_article_data()

    # Without a timeout a stalled connection would block the crawl forever.
    response = requests.get(url, timeout=timeout)

    soup = BeautifulSoup(response.text, 'html.parser')

    # Article title.
    title_node = soup.find('h2', {'id': 'title_area'})
    article_data[TITLE] = title_node.text.strip() if title_node else None

    # Publication timestamp, read from the element's data attribute.
    created_at_node = soup.find('span', {'class': 'media_end_head_info_datestamp_time _ARTICLE_DATE_TIME'})
    article_data[CREATED_AT] = created_at_node.get('data-date-time') if created_at_node else None

    # Modification timestamp; the element is absent when the page has none.
    updated_at_node = soup.find('span', {'class': 'media_end_head_info_datestamp_time _ARTICLE_MODIFY_DATE_TIME'})
    article_data[UPDATED_AT] = updated_at_node.get('data-modify-date-time') if updated_at_node else None

    # Article body: keep both the parsed node and its plain text.
    content_node = soup.find('article', {'id': 'dic_area'})
    article_data[CONTENT_TEXT] = content_node.text.strip() if content_node else None
    # soup.find returns None when nothing matches, so no extra guard is needed.
    article_data[CONTENT_ORIGIN] = content_node

    return article_data

def load_article_list(data_cursor='20241124110000', timeout=10):
    """Load one page of the section article list and collect article URLs.

    Args:
        data_cursor: Cursor value appended to the request URL as the
            ``next`` query parameter.
        timeout: Seconds to wait for the server before giving up. Passed
            straight to ``requests.get``.

    Returns:
        A ``(next_cursor, href_list)`` tuple on success, where
        ``next_cursor`` is the ``data-cursor`` attribute of the page's
        "next" element and ``href_list`` is the sorted, de-duplicated list
        of article URLs found in the page. Returns ``None`` (after printing
        an ``ERROR>>>`` message) when the HTTP status is not 200, when the
        expected JSON key or HTML fragment is missing, or when the "next"
        cursor element cannot be found.

    Raises:
        requests.exceptions.RequestException: If the request fails or
            times out.
    """
    url = "https://news.naver.com/section/template/SECTION_ARTICLE_LIST?sid=101&sid2=&cluid=&pageNo=&date=&next=" + data_cursor

    # Without a timeout a stalled connection would block the crawl forever.
    response = requests.get(url, timeout=timeout)

    if response.status_code != 200:
        print(f"ERROR>>> Failed to retrieve the page. Status code: {response.status_code}")
        return None

    json_data = response.json()

    try:
        rendered_component = json_data['renderedComponent']
    except KeyError as e:
        print(f"ERROR>>> '{e}' 키를 찾을 수 없습니다.")
        return None

    # The list is delivered as an HTML fragment embedded in the JSON.
    section_article_list_html = rendered_component.get('SECTION_ARTICLE_LIST', '')
    if not section_article_list_html:
        print("ERROR>>> SECTION_ARTICLE_LIST 내용이 없습니다.")
        return None

    soup = BeautifulSoup(section_article_list_html, 'html.parser')

    # The cursor for the following page lives on a dedicated div; fail
    # cleanly instead of raising AttributeError when it is absent.
    next_div = soup.find('div', {'data-cursor-name': 'next'})
    if next_div is None:
        print("ERROR>>> next cursor element not found in SECTION_ARTICLE_LIST.")
        return None
    next_data_cursor = next_div.get("data-cursor")

    # Keep only links shaped like
    # https://n.news.naver.com/mnews/article/<3 digits>/<digits>, strip stray
    # backslashes/quotes from them, and de-duplicate through a set.
    href_list = sorted({
        a.get('href').replace("\\", "").replace("\"", "")
        for a in soup.find_all('a', href=True)
        if re.search(r'https://n\.news\.naver\.com/mnews/article/\d{3}/\d+', a.get('href'))
    })

    return (next_data_cursor, href_list)

def save(file_id, data=None):
    """Write article records to ``./data/article_<file_id>.csv``.

    The file is opened in write mode, so an existing file with the same
    name is replaced. A header row is always written first; previously the
    header was skipped when the file already existed, which left the
    overwritten file without column names.

    Args:
        file_id: Value interpolated into the output file name.
        data: Iterable of dicts keyed by the article field-name constants.
            ``None`` (the default) writes a header-only file.
    """
    directory = "./data"
    save_file_name = f"article_{file_id}.csv"
    save_file_path = os.path.join(directory, save_file_name)

    # Column order of the CSV.
    field_names = [TITLE, CREATED_AT, UPDATED_AT, CONTENT_ORIGIN, CONTENT_TEXT]

    # exist_ok avoids the check-then-create race of os.path.exists + makedirs.
    os.makedirs(directory, exist_ok=True)

    rows = [] if data is None else data

    # newline="" is what the csv module expects for files it writes.
    with open(save_file_path, mode="w", newline="", encoding="utf-8") as file:
        writer = csv.DictWriter(file, fieldnames=field_names)
        # Mode "w" truncates the file, so the header is needed every time.
        writer.writeheader()
        writer.writerows(rows)

In [None]:
# Crawl the article list page by page, following the cursor returned by each
# page until it is no longer greater than the stop value. Cursors are compared
# as strings (presumably fixed-width yyyymmddHHMMSS timestamps -- confirm).
data_cursor = '20241124110000'
while data_cursor > '20241123000000':
    print(f"start loading {data_cursor}...")
    result = load_article_list(data_cursor)
    if result is None:
        # load_article_list returns None after printing its own error;
        # stop here instead of failing on tuple unpacking.
        break
    # NOTE: from here on data_cursor is the cursor of the NEXT page, so the
    # messages below and the saved file name use that next-page value.
    data_cursor, href_list = result
    print(f"Complete loading {data_cursor} successfully!")
    article_data_list = []
    for url in href_list:
        print(f"Start extracting {url}...")
        article_data_list.append(extract_article_data(url))
        print("Complete extracting Successfully!")
        # Pause between article requests.
        time.sleep(1)
    print(f"Save data... file_id: {data_cursor}")
    save(data_cursor, article_data_list)
    print("Saved.")
    
    

start loading 20241124110000...
Complete loading 20241124100013 successfully!
Start extracting https://n.news.naver.com/mnews/article/001/0015063501...
Complete extracting Successfully!
Start extracting https://n.news.naver.com/mnews/article/001/0015063519...
Complete extracting Successfully!
Start extracting https://n.news.naver.com/mnews/article/003/0012920586...
Complete extracting Successfully!
Start extracting https://n.news.naver.com/mnews/article/003/0012920588...
Complete extracting Successfully!
Start extracting https://n.news.naver.com/mnews/article/003/0012920599...
Complete extracting Successfully!
Start extracting https://n.news.naver.com/mnews/article/008/0005118439...
Complete extracting Successfully!
Start extracting https://n.news.naver.com/mnews/article/008/0005118443...
Complete extracting Successfully!
Start extracting https://n.news.naver.com/mnews/article/009/0005401755...
Complete extracting Successfully!
Start extracting https://n.news.naver.com/mnews/article/00