In [2]:
from flask import Flask, request, jsonify
from bs4 import BeautifulSoup
import requests
import os

In [None]:
# Marketaux API 요청

MARKETAUX_API_KEY = "hMCAuYSbfahDopkGP1DmoEZdn90ky3LPWL6kJzoq"

app = Flask(__name__)

@app.route('/get-news', methods=['POST'])
def get_news():
    req_data = request.get_json()

    # 종목명 필수 체크
    symbol = req_data.get('symbol')
    if not symbol:
        return jsonify({"error": "Symbol is required"}), 400

    marketaux_url = "https://api.marketaux.com/v1/news/all"
    params = {
        "symbols": symbol,
        "filter_entities": "true",
        "published_after": "2025-04-22",
        "limit": 5,
        "language": "en",
        "api_token": MARKETAUX_API_KEY,
    }

    response = requests.get(marketaux_url, params=params)

    if response.status_code != 200:
        return jsonify({"error": "Failed to fetch data from Marketaux"}), 500

    marketaux_data = response.json()

    # 필요한 데이터만 추리기
    articles = []
    for article in marketaux_data.get('data', []):
        article_info = {
            "title": article.get('title'),
            "description": article.get('description'),  # 요약/본문
            "url": article.get('url'),
            "sentiments": []
        }

        # entity 기반 감정 점수 추출
        for entity in article.get('entities', []):
            if entity.get('sentiment_score') is not None:
                article_info['sentiments'].append(entity.get('sentiment_score'))

        articles.append(article_info)

    return jsonify({"articles": articles})

if __name__ == '__main__':
    app.run(debug=False)

 * Serving Flask app '__main__'
 * Debug mode: off


 * Running on http://127.0.0.1:5000
Press CTRL+C to quit
127.0.0.1 - - [27/Apr/2025 22:59:53] "POST /get-news HTTP/1.1" 200 -


In [None]:
'''
# 링크 별로 동작 여부 달라짐

# def crawl_article(url):
#     # 웹 페이지의 HTML 코드 가져오기
#     response = requests.get(url)
    
#     # 응답 상태 코드가 200이면 성공적으로 페이지를 가져온 것
#     if response.status_code == 200:
#         # BeautifulSoup을 사용하여 HTML 파싱
#         soup = BeautifulSoup(response.text, 'html.parser')
        
#         # 예시로 p 태그를 찾아서 본문 내용 추출하기
#         paragraphs = soup.find_all('p')
#         article_text = ''
        
#         for para in paragraphs:
#             article_text += para.get_text()
        
#         return article_text
#     else:
#         return f"Error: Unable to fetch the page. Status code: {response.status_code}"

# # 테스트 URL
# url = "https://newsdata.io/blog/access-yahoo-finance-news-api/"
# # url = "https://www.insidermonkey.com/blog/amazon-com-inc-amzn-the-best-safe-stock-to-buy-according-to-hedge-funds-1516946/"  
# article = crawl_article(url)

# # 추출된 본문 출력
# print(article)
'''


'''
# yahoo api 요청 (뉴스 섹션에서 태그 따오는 수정 해야함) 그 후 밑에 yahoo crawling이랑 결합하기
# import requests
# from bs4 import BeautifulSoup
# import time
# import re

# def get_yahoo_finance_news(ticker="AAPL", max_articles=5):
#     """Yahoo Finance에서 특정 종목의 최신 뉴스와 내용 추출"""
#     headers = {
#         "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.0.0 Safari/537.36"
#     }
    
#     # 1. 종목 뉴스 페이지에서 기사 링크 추출
#     news_url = f"https://finance.yahoo.com/quote/{ticker}/news"
#     try:
#         response = requests.get(news_url, headers=headers)
#         response.raise_for_status()
        
#         # JavaScript 렌더링 대응을 위한 정규식
#         pattern = re.compile(r'"url":"(https:\\/\\/finance.yahoo.com\\/news\\/[^"]+)"')
#         matches = pattern.findall(response.text)
        
#         if not matches:
#             print("뉴스 링크를 찾을 수 없습니다. 페이지 구조가 변경되었을 수 있습니다.")
#             return []
            
#         # URL 디코딩 및 중복 제거
#         decoded_links = [link.replace('\\/', '/') for link in matches]
#         unique_links = list(set(decoded_links))[:max_articles]
        
#         # 2. 각 기사 내용 추출
#         articles = []
#         for link in unique_links:
#             try:
#                 article_response = requests.get(link, headers=headers)
#                 soup = BeautifulSoup(article_response.text, 'html.parser')
                
#                 # 제목 추출
#                 title = soup.find('h1').get_text() if soup.find('h1') else "제목 없음"
                
#                 # 본문 추출 (최신 Yahoo Finance 구조)
#                 body_div = soup.find('div', class_=re.compile('caas-body'))
#                 if not body_div:
#                     body_div = soup.find('div', {'data-test-locator': 'mfe-article'})
                
#                 content = body_div.get_text(separator='\n').strip() if body_div else "내용을 추출할 수 없음"
                
#                 articles.append({
#                     'title': title,
#                     'url': link,
#                     'content': content
#                 })
                
#                 time.sleep(1)  # 차단 방지
                
#             except Exception as e:
#                 print(f"기사 추출 중 오류 발생 ({link}): {str(e)}")
#                 continue
                
#         return articles
        
#     except Exception as e:
#         print(f"뉴스 페이지 접근 중 오류 발생: {str(e)}")
#         return []

# def main():
#     ticker = "AAPL"  # 원하는 종목 코드 입력
#     max_articles = 3
    
#     print(f"{ticker} 관련 최신 뉴스 수집 중...")
#     news_articles = get_yahoo_finance_news(ticker, max_articles)
    
#     if not news_articles:
#         print("뉴스를 가져오지 못했습니다. 다음 방법을 시도해 보세요:")
#         print("1. Yahoo Finance 사이트 직접 접속하여 페이지 구조 확인")
#         print("2. User-Agent 변경")
#         print("3. selenium 사용 고려")
#         return
    
#     print(f"\n{ticker} 관련 최신 뉴스 {len(news_articles)}건 발견:")
#     for idx, article in enumerate(news_articles, 1):
#         print(f"\n[{idx}] {article['title']}")
#         print(f"URL: {article['url']}")
#         print(f"\n내용 (요약):\n{article['content'][:300]}...")  # 300자만 출력
#         print("-" * 50)

# if __name__ == "__main__":
#     main()
'''


'''
# yahoo 사이드바 차트나 분순물 섞여서 검색해서 이상한 URL 긁어옴

from selenium import webdriver
from selenium.webdriver.chrome.options import Options
import time
import requests
from bs4 import BeautifulSoup

# 1. Selenium으로 Yahoo Finance News 페이지 접속
options = Options()
options.add_argument('--headless')  # 창 없이
driver = webdriver.Chrome(options=options)

ticker = "AAPL"
driver.get(f"https://finance.yahoo.com/quote/{ticker}/news/")
time.sleep(3)  # 페이지 로딩 기다리기

# 2. 뉴스 URL 추출
news_links = []
elements = driver.find_elements('css selector', 'a[href*="/news/"]')
for elem in elements:
    url = elem.get_attribute('href')
    if url and '/news/' in url:
        news_links.append(url)

driver.quit()

# 중복 제거
news_links = list(set(news_links))

# 3. URL에 들어가서 기사 본문 긁기
def get_article_text(url):
    res = requests.get(url, headers={"User-Agent": "Mozilla/5.0"})
    soup = BeautifulSoup(res.text, "html.parser")
    
    paragraphs = []
    
    # Yahoo Finance News 본문은 div[data-testid="article-body"] 안에 있음
    article_body = soup.find('div', {'data-testid': 'article-body'})
    if article_body:
        for p in article_body.find_all('p'):
            paragraphs.append(p.text.strip())
    
    return '\n'.join(paragraphs)

# 4. 전체 기사들 크롤링
for news_url in news_links[:5]:  # 예시로 처음 5개만
    print(f"크롤링: {news_url}")
    article_text = get_article_text(news_url)
    print(article_text)
    print("="*50)


'''

In [None]:
# bbc-news 크롤링

import requests
from bs4 import BeautifulSoup

def extract_bbc_article_text(url):
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/112.0.0.0 Safari/537.36"
    }
    response = requests.get(url, headers=headers)
    soup = BeautifulSoup(response.content, 'html.parser')

    # 1. main-content > article 찾기
    main_content = soup.find('main', id='main-content')
    if not main_content:
        print("main-content를 찾을 수 없습니다.")
        return None

    article = main_content.find('article')
    if not article:
        print("article을 찾을 수 없습니다.")
        return None

    # 2. 모든 data-component="text-block" div 찾기
    text_blocks = article.find_all('div', attrs={'data-component': 'text-block'})
    if not text_blocks:
        print("text-block을 찾을 수 없습니다.")
        return None

    paragraphs = []
    for block in text_blocks:
        # 각각의 블록 안에 p 태그들을 찾기
        p_tags = block.find_all('p')
        for p in p_tags:
            text = p.get_text(strip=True)
            if text:  # 빈 문장 아닌 경우만 추가
                paragraphs.append(text)

    if not paragraphs:
        print("본문을 찾을 수 없습니다.")
        return None

    full_text = "\n".join(paragraphs)
    return full_text

# 테스트
# url = "https://www.bbc.com/news/articles/cm248vzg9jwo" 
url = "https://www.bbc.com/news/articles/cly1n7jz587o"
content = extract_bbc_article_text(url)
if content:
    print(content)


'''
def extract_filtered_paragraphs(url, min_length=20, banned_keywords=None):
    if banned_keywords is None:
        banned_keywords = ["Read more", "Sign up", "Subscribe", "Advertisement", "Ad", "Promoted content", "RELATED"]

    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/112.0.0.0 Safari/537.36"
    }
    response = requests.get(url, headers=headers)
    soup = BeautifulSoup(response.content, 'html.parser')

    # 모든 p 태그 긁기
    paragraphs = [p.get_text(strip=True) for p in soup.find_all('p')]

    if not paragraphs:
        print("p 태그를 찾을 수 없습니다.")
        return None

    # 필터링
    filtered_paragraphs = []
    for para in paragraphs:
        if len(para) < min_length:
            continue
        if any(banned.lower() in para.lower() for banned in banned_keywords):
            continue
        filtered_paragraphs.append(para)

    if not filtered_paragraphs:
        print("유효한 본문이 없습니다.")
        return None

    full_text = "\n".join(filtered_paragraphs)
    return full_text

# 테스트
url = "https://www.bbc.com/news/articles/cm248vzg9jwo"  # 테스트용 BBC 기사 URL
content = extract_filtered_paragraphs(url)
if content:
    print(content)
'''

'''
# News API에서 sources로 bbc-news면 잘 작동함

import requests
from bs4 import BeautifulSoup

def extract_all_paragraphs(url):
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/112.0.0.0 Safari/537.36"
    }
    response = requests.get(url, headers=headers)
    soup = BeautifulSoup(response.content, 'html.parser')

    # 그냥 모든 p 태그 다 긁기
    paragraphs = [p.get_text(strip=True) for p in soup.find_all('p')]

    if not paragraphs:
        print("p 태그를 찾을 수 없습니다.")
        return None

    full_text = "\n".join(paragraphs)
    return full_text


# 사용 예시
url = "https://www.bbc.com/news/articles/cm248vzg9jwo"  # 테스트할 BBC 기사 URL
content = extract_all_paragraphs(url)
if content:
    print(content)
'''


During last year's presidential campaign, Donald Trump constantly repeated his intention to bring about dramatic change as soon as he returned to the White House.
But few expected it to come at such breakneck speed.
In the three months since he took the oath of office, the 47th president has deployed his power in a way that compares to few predecessors.
In stacks of bound documents signed off with a presidential pen and policy announcements made in all caps on social media, his blizzard of executive actions has reached into every corner of American life.
To his supporters, the shock-and-awe approach has been a tangible demonstration of an all-action president, delivering on his promises and enacting long-awaited reforms.
But his critics fear he is doing irreparable harm to the country and overstepping his powers - crippling important government functions and perhaps permanently reshaping the presidency in the process.
Here are six turning points from the first 100 days.
For once, it wa

In [4]:
# cbs-news 크롤링

import requests
from bs4 import BeautifulSoup

def extract_cbs_news_text(url):
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/112.0.0.0 Safari/537.36"
        #"User-Agent": "Mozilla/5.0"
    }
    response = requests.get(url, headers=headers)
    soup = BeautifulSoup(response.text, 'html.parser')

    article_section = soup.select_one("article section.content__body")

    if not article_section:
        print("본문 섹션을 찾을 수 없습니다.")
        return None

    paragraphs = article_section.find_all("p")
    article_text = "\n".join(p.get_text(strip=True) for p in paragraphs if p.get_text(strip=True))

    return article_text

# 사용 예시:
url = "https://www.cbsnews.com/news/amazon-prices-rising-trump-tariffs-temu-shein/" #"https://www.cbsnews.com/news/eu-fine-apple-meta-breach-digital-markets-act-dma/"
text = extract_cbs_news_text(url)
print(text)


American consumers are starting to feel the impact of President Trump'stariffs on Chinese imports, especially online.
Since the second week in April, sellers on Amazon have raised their prices on nearly 1,000 products, according to data from SmartScout, a price analysis software tool. The average price hike — nearly 30%, according to the company's analysis.
Costs rose on a wide range of items, from tech accessories such as phone chargers to women's clothing, SmartScout founder and CEO Scott Needham told CBS MoneyWatch. Anker, a top-selling brand on Amazon that sells mobile charging devices, has increased prices on roughly 25% of its products on the site, for example.
"It's one of first concerted efforts I've seen where nothing explains the price hikes other than tariffs," Needham said.
Tariffs are paid by importers, and they typically pass on much or even all of those additional costs to consumers. Amazon CEO Andrew Jassy toldCNBCearlier this month that he expected U.S. tariffs to boos

In [None]:
# cnn 크롤링

import requests
from bs4 import BeautifulSoup

def extract_cnn_news_text(url):
    headers = {
        "User-Agent": "Mozilla/5.0"
    }
    response = requests.get(url, headers=headers)
    soup = BeautifulSoup(response.text, 'html.parser')

    # CNN의 기사 구조에 맞춰 탐색
    try:
        article_root = soup.select_one("body div section article[data-uri] section main")
        if not article_root:
            print("본문 루트(main)를 찾지 못했습니다.")
            return None

        # main 내부의 모든 <p> 태그 수집
        paragraphs = article_root.find_all("p")
        article_text = "\n".join(p.get_text(strip=True) for p in paragraphs if p.get_text(strip=True))

        return article_text

    except Exception as e:
        print(f"오류 발생: {e}")
        return None

# 사용 예시:
url = "https://edition.cnn.com/2025/04/08/asia/thailand-lese-majeste-paul-chambers-intl-hnk/index.html"
text = extract_cnn_news_text(url)
print(text)


Thailand’sarrest of a prominent American academic on charges of insulting the monarchy has “alarmed” the United States, the State Department said, in a rare case of a foreign national allegedly falling foul of the kingdom’s strict lese majeste law.
Paul Chambers, a lecturer at Naresuan University in central Thailand who writes analysis on the kingdom’s military and politics, could face years in prison after he was formally charged and detained when he presented himself to police and appeared in court on Tuesday.
Thailand has some of the world’s strictest lese majeste laws, and criticizing the king, queen, or heir apparent can lead to a maximum 15-year prison sentence for each offense. Anyone can file a lese majeste complaint and sentences for those convicted can be decades long, with hundreds of peopleprosecuted in recent years.
Chambers’ lawyer, Wannaphat Jenroumjit, said a warrant for his arrest was issued last week after a complaint was filed by a regional army command. Alongside le

In [None]:
# Yahoo News 페이지 동적 크롤링

from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from bs4 import BeautifulSoup
import time

def extract_yahoo_finance_news_links(ticker_url):
    # Selenium의 옵션 설정
    chrome_options = Options()
    chrome_options.add_argument("--headless")  # 브라우저 없이 실행
    chrome_options.add_argument("--disable-gpu")

    # 웹드라이버 시작
    driver = webdriver.Chrome(options=chrome_options)
    driver.get(ticker_url)

    # 페이지 로딩 대기
    time.sleep(5)  # 페이지 로딩 시간이 필요함

    # BeautifulSoup로 페이지 분석
    soup = BeautifulSoup(driver.page_source, 'html.parser')

    news_panel = soup.find('div', id='tabpanel-news')
    if not news_panel:
        print("tabpanel-news를 찾을 수 없습니다.")
        driver.quit()
        return []

    links = []

    # stream-item 블록 순회
    stream_items = news_panel.find_all('div', class_=['stream-item', 'yf-186c5b2'])

    for item in stream_items:
        section = item.find('section')
        if section:
            a_tag = section.find('a', class_='subtle-link', href=True)
            if a_tag:
                links.append(a_tag['href'])

    driver.quit()
    return links


url = "https://finance.yahoo.com/quote/AMZN/"
links = extract_yahoo_finance_news_links(url)

for link in links:
    print(link)


https://finance.yahoo.com/news/invested-10k-amazon-stock-10-020324248.html
https://finance.yahoo.com/news/invested-10k-amazon-stock-10-020324248.html
https://finance.yahoo.com/news/amazon-launches-first-kuiper-internet-230830750.html
https://finance.yahoo.com/news/asian-investors-eye-china-trade-233327069.html
https://finance.yahoo.com/news/oppenheimer-cuts-amazon-target-price-205744829.html
https://finance.yahoo.com/m/77cf79ed-dee0-36c1-bddb-f8dcd6139b74/stock-market-today-dow-jones.html
https://finance.yahoo.com/news/live/stock-market-today-dow-sp-500-erase-slide-to-rise-for-5th-straight-day-in-lead-up-to-big-tech-earnings-200136533.html
https://finance.yahoo.com/news/wedbush-ai-deals-insulate-tech-194415587.html
https://finance.yahoo.com/news/stock-market-sell-off-3-191500627.html
https://finance.yahoo.com/news/anthropic-taps-experts-ais-macro-190020112.html
https://finance.yahoo.com/news/analyst-highlights-alarm-bells-amazon-184145341.html
https://finance.yahoo.com/m/13249db9-2f8a-

In [None]:
# yahoo news 크롤링

import requests
from bs4 import BeautifulSoup

def extract_yahoo_finance_article_text(url):
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/112.0.0.0 Safari/537.36"
    }
    response = requests.get(url, headers=headers)
    soup = BeautifulSoup(response.content, 'html.parser')

    # <div id="svelte"> 안에 <main> > <section> 구조 찾아가기
    svelte_div = soup.find('div', id='svelte')
    if not svelte_div:
        print("div#svelte tag error")
        return None

    main_tag = svelte_div.find('main')
    if not main_tag:
        print("<main> tag error")
        return None

    section_tag = main_tag.find('section')
    if not section_tag:
        print("<section> tag error")
        return None

    # 이제 section 안의 모든 <p> 태그 긁어오기
    paragraphs = []
    for p in section_tag.find_all('p'):
        text = p.get_text(strip=True)
        if text:
            paragraphs.append(text)

    if not paragraphs:
        print("<p> tag error")
        return None

    return '\n'.join(paragraphs)

# 테스트 URL
# url = "https://finance.yahoo.com/news/asia-shares-edge-dollar-mercy-012952814.html"  # 잘 됨
url = "https://finance.yahoo.com/news/invested-10k-amazon-stock-10-020324248.html"
content = extract_yahoo_finance_article_text(url)
if content:
    print(content)


Unlock stock picks and a broker-level newsfeed that powers Wall Street.
Amazon.com (NASDAQ:AMZN) is a multinational technology company, engaged in e-commerce, cloud computing, online advertising, digital streaming, and artificial intelligence.
It is set to report its Q1 2025 earnings on May 1. Wall Street analysts expect the company to post EPS of $1.36, up from $0.98 in the prior-year period. According to Benzinga Pro, quarterly revenue is expected to reach $154.94 billion, up from $143.31 billion a year earlier.
Don't Miss:
Deloitte's fastest-growing software company partners with Amazon, Walmart & Target –Many are rushing to grab 4,000 of its pre-IPO shares for just $0.26/share!
Many are using retirement income calculators to check if they’re on pace —here’s a breakdown on what’s behind this formula.
The company's stock traded at approximately $21.93 per share 10 years ago. If you had invested $10,000, you could have bought roughly 456 shares. Currently, shares trade at $187.55, mea

In [None]:
def extract_article_text(url):
    url = url.lower()  # 대소문자 무시

    if 'bbc' in url:
        return extract_bbc_article_text(url)
    elif 'yahoo' in url:
        return extract_yahoo_finance_article_text(url)
    elif 'cbsnews' in url:
        return extract_cbs_news_text(url)
    elif 'cnn' in url:
        return extract_cnn_news_text(url)
    else:
        print("지원되지 않는 뉴스 사이트입니다.")
        return None
    

url = "https://www.cbsnews.com/news/amazon-prices-rising-trump-tariffs-temu-shein/"
text = extract_article_text(url)
print(text)