### 트럼프 관련 경제뉴스 크롤링
- Financial Times - 세계 경제 동향을 포함한 거시 경제 분야의 중요한 뉴스를 다룸.
- https://www.ft.com/donald-trump?page=1

In [1]:
from urllib.request import urlopen
import pandas as pd
import bs4
import requests

In [None]:
# 여러 페이지에서 url 추출
base_url = "https://www.ft.com/donald-trump?page="
url_list = []

for i in range(1, 11):
    url = base_url + str(i)
    url_list.append(url)

url_list

['https://www.ft.com/donald-trump?page=1',
 'https://www.ft.com/donald-trump?page=2',
 'https://www.ft.com/donald-trump?page=3',
 'https://www.ft.com/donald-trump?page=4',
 'https://www.ft.com/donald-trump?page=5',
 'https://www.ft.com/donald-trump?page=6',
 'https://www.ft.com/donald-trump?page=7',
 'https://www.ft.com/donald-trump?page=8',
 'https://www.ft.com/donald-trump?page=9',
 'https://www.ft.com/donald-trump?page=10']

In [None]:
# 각 기사별 헤드라인과 url 추출
def get_topic(url):
  headers = {'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36'}
  res = requests.get(url, headers=headers) # user agent 값 넘겨주기
  bs_obj = bs4.BeautifulSoup(res.text, 'html.parser')

  divs = bs_obj.find_all('div', {'class':'o-teaser__heading'})
  topic_list = []
  link_list = []

  for div in divs:
    try:
      a_tag = div.find('a')
      topic_list.append(a_tag.text.replace('\n',''))
      link = "https://www.ft.com"+ a_tag['href']
      link_list.append(link)
    except:
      pass

  return({'topic':topic_list, 'link':link_list})

In [None]:
# 뉴스 토픽 리스트 데이터 프레임 생성
topic_df = pd.DataFrame({
    'topic' : [],
    'link' : []
})

for i in range(len(url_list)):
  df = pd.DataFrame(get_topic(url_list[i]))
  topic_df = pd.concat([topic_df, df], axis=0, ignore_index=True)

topic_df.head()

Unnamed: 0,topic,link
0,Best of Banx 2024,https://www.ft.com/content/ecec389f-4b58-487c-...
1,The systemic financial risk at the heart of Tr...,https://www.ft.com/content/d23f3861-9659-4d02-...
2,Year in a word: Fascism,https://www.ft.com/content/5b2c1410-3645-46cf-...
3,Musk’s fight with Maga reveals split on immigr...,https://www.ft.com/content/0531a4ab-e587-421c-...
4,Trump asks Supreme Court to delay TikTok ban t...,https://www.ft.com/content/d02e7f1f-9fa3-4a3f-...


- CNBC - 글로벌 주식시장 동향과 기업 소식 등을 전문적으로 다루는 사이트
- https://www.cnbc.com/search/?query=trump&qsearchterm=trump

In [8]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

def get_cnbc_articles(query, num_articles):
    base_url = "https://www.cnbc.com/search/?query={}&qsearchterm={}".format(query, query)
    articles = []
    page = 1

    while len(articles) < num_articles:
        headers = {'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36'}
        url = base_url + "&page={}".format(page)
        res = requests.get(url, headers=headers)
        html = BeautifulSoup(res.text, "html.parser")

        # 기사 링크 추출
        article_links = [link.get("href") for link in html.find_all("a", class_="resultlink")]

        # 각 기사 링크에서 제목과 내용 추출
        for link in article_links:
            article_url = link  # CNBC 기사 링크는 이미 완전한 URL입니다.
            article_response = requests.get(article_url)
            article_soup = BeautifulSoup(article_response.text, "html.parser")

            # 기사 제목 추출
            if article_soup.find("h1", class_="ArticleHeader-headline") is not None:
              title = article_soup.find("h1", class_="ArticleHeader-headline").text.strip()
            else:
              title = None

            # 기사 내용 추출
            if article_soup.find("div", class_="group") is not None:
              content = " ".join([p.text.strip() for p in article_soup.find_all("div", class_="group")])
            else:
              content = None

            # 제목과 내용이 모두 존재하는 경우에만 articles 리스트에 추가
            if title and content is not None:
                articles.append({"title": title, "content": content})

            # 기사 수 제한
            if len(articles) >= num_articles:
                break

        # 다음 페이지로 이동
        page += 1

        # 페이지에 기사가 없는 경우 루프 종료
        if not article_links:
            break

    return pd.DataFrame(articles)

# 함수 실행
trump_articles = get_cnbc_articles("trump", 100)

# 결과 저장
trump_articles.to_csv("trump_articles.csv", index=False)

In [9]:
trump_articles


In [None]:
# !pip install selenium
# !apt-get update # to update ubuntu to correctly run apt install
# !apt install chromium-chromedriver
# !cp /usr/lib/chromium-browser/chromedriver /usr/bin

In [7]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup
import pandas as pd

def get_cnbc_articles(query, num_articles):
    options = webdriver.ChromeOptions()
    options.add_argument('--headless')  # Headless 모드로 실행
    options.add_argument('--no-sandbox')
    options.add_argument('--disable-dev-shm-usage')
    driver = webdriver.Chrome('chromedriver', options=options)  # Chromedriver 경로 설정

    base_url = "https://www.cnbc.com/search/?query={}&qsearchterm={}".format(query, query)
    articles = []
    page = 1

    while len(articles) < num_articles:
        url = base_url + "&page={}".format(page)
        driver.get(url)

        # JavaScript 로딩 대기
        WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.CLASS_NAME, "resultlink")))

        html = driver.page_source
        soup = BeautifulSoup(html, "html.parser")

        # 기사 링크 추출
        article_links = [link.get("href") for link in soup.find_all("a", class_="resultlink")]

        # 각 기사 링크에서 제목과 내용 추출
        for link in article_links:
            driver.get(link)  # 기사 페이지로 이동

            # JavaScript 로딩 대기 (기사 페이지)
            WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.CLASS_NAME, "ArticleHeader-headline")))

            article_soup = BeautifulSoup(driver.page_source, "html.parser")

            # 기사 제목 추출
            title = article_soup.find("h1", class_="ArticleHeader-headline").text.strip() if article_soup.find("h1", class_="ArticleHeader-headline") else None

            # 기사 내용 추출
            content_elements = article_soup.find_all("div", class_="group")
            content = " ".join([p.text.strip() for p in content_elements]) if content_elements else None

            if title and content:
                articles.append({"title": title, "content": content})

            if len(articles) >= num_articles:
                break

        page += 1

        # 페이지에 기사가 없는 경우 루프 종료
        if not article_links:
            break

    driver.quit()  # Selenium 종료
    return pd.DataFrame(articles)

TypeError: WebDriver.__init__() got multiple values for argument 'options'

- BBC Business : 세계 각국의 경제 동향과 이슈를 폭넓게 다루는 사이트
- https://www.bbc.com/business