# 웹 크롤링 예제

## 필요 라이브러리 설치

In [2]:
!pip install beautifulsoup4



In [3]:
!pip install requests



## 필요 라이브러리 불러오기

In [4]:
import requests
from bs4 import BeautifulSoup

## 원하는 URL의 정적 HTML 받아오기

In [None]:
url = 'https://www.naver.com'
# requests applies no timeout by default, so bound the wait explicitly to keep
# a stalled connection from hanging the notebook forever.
response = requests.get(url, timeout=10)
# Fail loudly on a 4xx/5xx reply instead of parsing an error page as if it
# were the real site.
response.raise_for_status()
html_content = response.text

### BeautifulSoup 객체 생성 후 파싱

In [6]:
# Parse the downloaded HTML with Python's built-in parser.
soup = BeautifulSoup(html_content, 'html.parser')
# soup.title is None when the document has no <title> element, so guard the
# attribute access instead of letting it raise AttributeError.
page_title = soup.title
print(page_title.string if page_title is not None else None)

NAVER


### 모든 `<a>` 태그를 찾음

In [19]:
# Gather every <a> element in the parsed page, then print the value of
# each one's href attribute.
anchors = soup.find_all('a')
for anchor in anchors:
    print(anchor.get('href'))

#topAsideButton
#shortcutArea
#newsstand
#shopping
#feed
#account
#widgetboard
#viewSetting


### 특정 클래스 이름으로 요소 선택

In [8]:
# Find every element carrying the 'search_group_inner' class and print
# the text it contains.
matches = soup.find_all(class_='search_group_inner')
for node in matches:
    print(node.get_text())

### CSS 선택자로 모든 요소 찾기

In [None]:
# Select each <li> sitting directly inside a <ul> that is itself a direct
# child of the element with id "shortcutArea", and print its markup.
shortcut_items = soup.select("#shortcutArea > ul > li")
for entry in shortcut_items:
    print(entry)

In [31]:
# Select the element(s) whose id is "search_area".
search_area = soup.select("#search_area")
# Display the result as the cell output: it is a list of matched elements
# rendered as their HTML markup (not JSON).
search_area

[<div class="search_area" id="search_area" style="border-color:#fff"> <div class="search_group"> <div class="search_group_inner" id="search"> <h1 class="search_logo" id="special-input-logo"></h1> <form action="https://search.naver.com/search.naver" id="sform" method="get" name="search" role="search"> <fieldset> <legend class="blind">검색</legend> <input name="where" type="hidden" value="nexearch"/> <input id="sm" name="sm" type="hidden" value="top_hty"/> <input id="fbm" name="fbm" type="hidden" value="0"/> <input disabled="disabled" id="acr" name="acr" type="hidden" value=""/> <input disabled="disabled" id="acq" name="acq" type="hidden" value=""/> <input disabled="disabled" id="qdt" name="qdt" type="hidden" value=""/> <input id="ie" name="ie" type="hidden" value="utf8"/> <input disabled="disabled" id="acir" name="acir" type="hidden" value=""/> <input disabled="disabled" id="os" name="os" type="hidden" value=""/> <input disabled="disabled" id="bid" name="bid" type="hidden" value=""/> <inp

### 실제 브라우저와 동일한 User-Agent로 HTTP 요청을 보내 HTML 받아오기

In [45]:
# Present the request as a desktop Chrome browser by sending its User-Agent.
headers = {"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/140.0.0.0 Safari/537.36"}
# requests applies no timeout by default, so bound the wait explicitly.
response = requests.get(url, headers=headers, timeout=10)
# Stop here on a 4xx/5xx reply rather than parsing an error page.
response.raise_for_status()
soup2 = BeautifulSoup(response.text, 'html.parser')

* 주의 : requests는 서버가 보내주는 초기 HTML 소스 코드만 가져오므로,
페이지가 로드된 후 클라이언트 측 JavaScript 코드가 실행되면서 비동기적으로 생성되는
네이버 뉴스 헤드라인은 아직 비어있거나 존재하지 않음

In [46]:
# Look for headline links inside the newsstand section of the static HTML.
# The result may be empty: the headlines are filled in by client-side
# JavaScript after the page loads, which requests never executes.
head_line = soup2.select("#newsstand > div:nth-child(2) > div > div > div > div a")
for line in head_line:
    print(line.get_text())

- 대안 : Selenium을 활용한 크롤링
  - 실제 웹 브라우저를 구동하여 JavaScript 실행을 기다림

# Selenium을 활용한 웹 크롤링

## 필요 라이브러리 설치

In [48]:
!pip install selenium webdriver_manager

Collecting selenium
  Downloading selenium-4.38.0-py3-none-any.whl.metadata (7.5 kB)
Collecting webdriver_manager
  Downloading webdriver_manager-4.0.2-py2.py3-none-any.whl.metadata (12 kB)
Collecting urllib3<3.0,>=2.5.0 (from urllib3[socks]<3.0,>=2.5.0->selenium)
  Using cached urllib3-2.5.0-py3-none-any.whl.metadata (6.5 kB)
Collecting trio<1.0,>=0.31.0 (from selenium)
  Downloading trio-0.32.0-py3-none-any.whl.metadata (8.5 kB)
Collecting trio-websocket<1.0,>=0.12.2 (from selenium)
  Downloading trio_websocket-0.12.2-py3-none-any.whl.metadata (5.1 kB)
Collecting certifi>=2025.10.5 (from selenium)
  Downloading certifi-2025.11.12-py3-none-any.whl.metadata (2.5 kB)
Collecting typing_extensions<5.0,>=4.15.0 (from selenium)
  Using cached typing_extensions-4.15.0-py3-none-any.whl.metadata (3.3 kB)
Collecting sortedcontainers (from trio<1.0,>=0.31.0->selenium)
  Using cached sortedcontainers-2.4.0-py2.py3-none-any.whl.metadata (10 kB)
Collecting outcome (from trio<1.0,>=0.31.0->selenium)

## 필요 라이브러리 불러오기

In [54]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service as ChromeService
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time

## 드라이버 설정 및 URL 불러오기

In [58]:
# Ask webdriver_manager for a chromedriver and hand what install() returns
# to the Chrome service that Selenium will launch.
driver_path = ChromeDriverManager().install()
service = ChromeService(driver_path)

# Start a Chrome session backed by that service.
driver = webdriver.Chrome(service=service)

# Open the target page in the launched browser.
url = "https://www.naver.com"
driver.get(url)

## 띄워진 브라우저에서 크롤링

In [59]:
# CSS selector for the headline links inside the newsstand section.
headline_selector = "#newsstand > div:nth-child(2) > div > div > div > div a"

try:
    # Wait up to 10 seconds for the headline links themselves, not just the
    # #newsstand container: the links are rendered asynchronously, so the
    # container can be present while the list of links is still empty.
    # The condition returns the matched elements once at least one exists.
    headline_elements = WebDriverWait(driver, 10).until(
        EC.presence_of_all_elements_located((By.CSS_SELECTOR, headline_selector)),
        message="no headline links appeared within 10 seconds",
    )

    # Print at most the first ten headlines, numbered from 1.
    for index, element in enumerate(headline_elements[:10], start=1):
        print(f"{index} : {element.text.strip()}")

except Exception as e:
    # Notebook-level boundary: report the failure (including a wait timeout)
    # instead of leaving a traceback.
    print(f"Error : {e}")

finally:
    # Always close the browser, whether or not the crawl succeeded.
    driver.quit()

1 : 구자현 신임 대검차장 "무거운 책임…조직 안정화 최우선"
