## 1년치 뉴스 크롤링 code

- 수집기간(date) : 2022.09.01 ~ 2023.08.31  
- 페이지 개수(page) : 1 ~ 40   
- url 구성 > f"https://news.daum.net/newsbox?regDate={date}&tab_cate=NE&page={page}"

### 📅 날짜 생성

In [None]:
from bs4 import BeautifulSoup as bs
import pandas as pd
import re
import requests
import pickle
import time
from tqdm import tqdm # 진행률 확인

In [None]:
# Build one timestamp per calendar day of the collection period (inclusive).
dates = pd.date_range("2022-09-01", "2023-08-31")

# Render each day as a compact YYYYMMDD string (no hyphens), the form that
# is substituted into the regDate query parameter of the list-page URL.
kdates = [day.strftime('%Y%m%d') for day in dates]
print(kdates)

### 📰 뉴스 크롤링

In [None]:
# Seconds to wait on a single HTTP request; without a timeout `requests`
# blocks indefinitely on a stalled connection and would hang the whole crawl.
_REQUEST_TIMEOUT = 10


def _fetch_soup(url):
    """Download *url* and return its body parsed with the lxml parser.

    Raises a ``requests`` exception on connection problems, timeouts, or a
    non-success HTTP status.
    """
    res = requests.get(url, timeout=_REQUEST_TIMEOUT)
    res.raise_for_status()
    return bs(res.text, 'lxml')


def _parse_news_item(li):
    """Turn one list-page ``<li>`` element into an article record.

    Reads the headline link and the press name from the list entry, then
    downloads the linked article page and joins its paragraph texts.
    Raises (typically ``AttributeError``) when an expected element is missing.
    """
    data = li.find("a", {"class": "link_txt"})
    press = li.find("span", {"class": "info_news"}).text

    news_url = data.get("href")
    news_soup = _fetch_soup(news_url)
    section = news_soup.find("div", {"class": "article_view"}).find("section")
    # The final <p> is dropped on purpose, exactly as before.
    # NOTE(review): presumably it holds a byline/copyright line - confirm.
    paragraphs = section.find_all("p")[:-1]
    contents = " ".join(p.text for p in paragraphs)

    return {
        'title': data.text,
        'url': news_url,
        'press': press,
        'content': contents,
    }


def news_crawling(date, pages):
    """Collect the articles listed on the Daum news box for one date.

    Args:
        date: Date string substituted into the ``regDate`` query parameter
            (the callers in this file pass ``YYYYMMDD``).
        pages: Number of list pages to visit; pages ``1..pages`` are fetched.

    Returns:
        A list of dicts with the keys ``'title'``, ``'url'``, ``'press'`` and
        ``'content'``, one per article that could be fetched and parsed.

    Errors never propagate: a list page that cannot be fetched or parsed is
    reported and skipped, and an article that fails is reported and skipped
    without discarding the remaining articles of the same page.
    """
    print('date =', date)

    all_news = []

    for page in range(1, pages + 1):
        print('pages =', page)
        url = f'https://news.daum.net/newsbox?regDate={date}&tab_cate=NE&page={page}'

        # Best-effort crawl: a failure is printed and the run continues, so a
        # single bad response cannot abort a long collection job.
        try:
            soup = _fetch_soup(url)
            items = soup.find("ul", {"class": "list_arrange"}).find_all("li")
        except Exception as e:
            print('오류내용 :', e)
            continue

        for li in items:
            # Guard each article separately so one broken entry only costs
            # that entry, not the rest of the page.
            try:
                all_news.append(_parse_news_item(li))
            except Exception as e:
                print('오류내용 :', e)

    return all_news

### 🏃뉴스 수집 시작

In [None]:
# Crawl every date in kdates, visiting 40 list pages per date, and report the
# wall-clock duration of the whole run.
start_time = time.time()

crawling_news = []
for kdate in kdates:
    crawling_news.append(news_crawling(kdate, 40))

print("spending :", time.time() - start_time)

### ➡️ Dict -> DataFrame

In [None]:
# crawling_news holds one list of article dicts per date; flatten it into a
# single list so that every article becomes one record.
df_news = []

for news_list in crawling_news:
    df_news += news_list

# One row per article; the columns come from the keys of the article dicts.
news = pd.DataFrame(data=df_news)

### 🥒 피클 저장

In [None]:
# Directory that receives the pickled DataFrame.
path = r'C:\ITWILL\0_Semi_project\data'

# Serialize the collected articles to disk in binary mode.
# NOTE(review): the file name says 20230616-20230630, but the date range built
# earlier in this file is 2022-09-01 to 2023-08-31 - confirm the intended name.
with open(f'{path}/daum_news_20230616_20230630.pkl', 'wb') as f:
    pickle.dump(news, f)