In [2]:
from flask import Flask, request, jsonify
from bs4 import BeautifulSoup
import requests
import os

In [4]:
MARKETAUX_API_KEY = "hMCAuYSbfahDopkGP1DmoEZdn90ky3LPWL6kJzoq"

In [None]:
# Marketaux API 요청

app = Flask(__name__)

@app.route('/get-news', methods=['POST'])
def get_news():
    req_data = request.get_json()

    # 종목명 필수 체크
    symbol = req_data.get('symbol')
    if not symbol:
        return jsonify({"error": "Symbol is required"}), 400

    marketaux_url = "https://api.marketaux.com/v1/news/all"
    params = {
        "symbols": symbol,
        "filter_entities": "true",
        "published_after": "2025-04-22",
        "limit": 5,
        "language": "en",
        "api_token": MARKETAUX_API_KEY,
    }

    response = requests.get(marketaux_url, params=params)

    if response.status_code != 200:
        return jsonify({"error": "Failed to fetch data from Marketaux"}), 500

    marketaux_data = response.json()

    # 필요한 데이터만 추리기
    articles = []
    for article in marketaux_data.get('data', []):
        article_info = {
            "title": article.get('title'),
            "description": article.get('description'),  # 요약/본문
            "url": article.get('url'),
            "sentiments": []
        }

        # entity 기반 감정 점수 추출
        for entity in article.get('entities', []):
            if entity.get('sentiment_score') is not None:
                article_info['sentiments'].append(entity.get('sentiment_score'))

        articles.append(article_info)

    return jsonify({"articles": articles})

if __name__ == '__main__':
    app.run(debug=False)

 * Serving Flask app '__main__'
 * Debug mode: off


 * Running on http://127.0.0.1:5000
Press CTRL+C to quit
127.0.0.1 - - [27/Apr/2025 22:59:53] "POST /get-news HTTP/1.1" 200 -


In [None]:
# 링크 별로 동작 여부 달라짐

def crawl_article(url):
    # 웹 페이지의 HTML 코드 가져오기
    response = requests.get(url)
    
    # 응답 상태 코드가 200이면 성공적으로 페이지를 가져온 것
    if response.status_code == 200:
        # BeautifulSoup을 사용하여 HTML 파싱
        soup = BeautifulSoup(response.text, 'html.parser')
        
        # 예시로 p 태그를 찾아서 본문 내용 추출하기
        paragraphs = soup.find_all('p')
        article_text = ''
        
        for para in paragraphs:
            article_text += para.get_text()
        
        return article_text
    else:
        return f"Error: Unable to fetch the page. Status code: {response.status_code}"

# 테스트 URL
url = "https://newsdata.io/blog/access-yahoo-finance-news-api/"
# url = "https://www.insidermonkey.com/blog/amazon-com-inc-amzn-the-best-safe-stock-to-buy-according-to-hedge-funds-1516946/"  
article = crawl_article(url)

# 추출된 본문 출력
print(article)


Our #1 AI Stock Pick is on a steep discount - 29.99$ instead of 99.99$! Click here to access exclusive investment research and ad free browsing!Our #1 AI Stock Pick is on a steep discount - 29.99$ instead of 99.99$! Click here to access exclusive research!We recently published a list of 11 Best Safe Stocks to Buy According to Hedge Funds. In this article, we are going to take a look at where Amazon.com, Inc. (NASDAQ:AMZN) stands against other best safe stocks to buy according to hedge funds.In times when you never know what you’ll wake up to the next morning, playing safe seems to be the wisest choice. Amid consistent market shifts and global uncertainties, it’s difficult not to lean towards reliability. With rising global recession risks and political uncertainties, protecting the capital has become a priority for many. As Charlie Munger, Vice Chairman of Berkshire Hathaway, once said,“The idea of investing in a company just because it’s safe is not necessarily a good idea. But it’s a

In [None]:
# URL 파싱 잘 안됨

import requests
from bs4 import BeautifulSoup

def crawl_article(url):
    # 웹 페이지의 HTML 코드 가져오기
    response = requests.get(url)
    
    # 응답 상태 코드가 200이면 성공적으로 페이지를 가져온 것
    if response.status_code == 200:
        # BeautifulSoup을 사용하여 HTML 파싱
        soup = BeautifulSoup(response.text, 'html.parser')
        
        # 뉴스 기사 본문을 담고 있는 다양한 태그를 대상으로 크롤링
        # 1. <article> 태그 (일반적인 기사 영역)
        # 2. <div> 태그 중 클래스가 'article', 'news-content', 'content' 등인 것
        # 3. <section> 태그 (섹션으로 나뉘어진 기사)
        # 4. <p> 태그 (본문 텍스트가 있는 문단)
        
        article_text = ''
        
        # <article> 태그에서 추출
        article = soup.find_all('article')
        for tag in article:
            article_text += tag.get_text()
        
        # <div> 태그에서 추출 (다양한 클래스 기준)
        divs = soup.find_all('div', class_=lambda x: x and 'content' in x or 'article' in x or 'news' in x)
        for div in divs:
            article_text += div.get_text()
        
        # <section> 태그에서 추출
        sections = soup.find_all('section')
        for section in sections:
            article_text += section.get_text()
        
        # <p> 태그에서 추출
        paragraphs = soup.find_all('p')
        for para in paragraphs:
            article_text += para.get_text()

        # 기사 본문 내용이 길어질 수 있으므로 일부만 반환
        return article_text.strip()[:3000]  # 예시로 첫 3000자만 반환
    else:
        return f"Error: Unable to fetch the page. Status code: {response.status_code}"

# 테스트 URL
url = "https://newsdata.io/blog/access-yahoo-finance-news-api/"  # 여기에 크롤링할 실제 기사 URL을 넣으세요
article = crawl_article(url)

# 추출된 본문 출력
print(article)


Error: Unable to fetch the page. Status code: 403


In [None]:
# 이상한 기사만 뽑아옴

import feedparser
from newspaper import Article
import requests
from bs4 import BeautifulSoup
import time

def get_yahoo_finance_news_urls(ticker="AIG", max_articles=5):
    """Yahoo Finance RSS 피드에서 특정 종목의 뉴스 URL 추출"""
    rss_url = f"https://finance.yahoo.com/news/rss/{ticker}"
    feed = feedparser.parse(rss_url)
    urls = [entry.link for entry in feed.entries[:max_articles]]
    return urls

def scrape_article_text(url):
    """기사 URL에서 원문 텍스트 추출 (newspaper3k 사용)"""
    try:
        article = Article(url)
        article.download()
        article.parse()
        return article.text
    except Exception as e:
        print(f"[Error] Failed to scrape {url}: {e}")
        return None

def main():
    # 예시: Apple (AAPL) 종목 뉴스 크롤링
    ticker = "AIG"
    urls = get_yahoo_finance_news_urls(ticker=ticker, max_articles=3)
    
    print(f"Found {len(urls)} articles for {ticker}:")
    for i, url in enumerate(urls, 1):
        print(f"\n{i}. URL: {url}")
        text = scrape_article_text(url)
        if text:
            print(f"Text (first 200 chars): {text}...")  # 일부만 출력
        time.sleep(1)  # 사이트 차단 방지

if __name__ == "__main__":
    main()

Found 3 articles for AIG:

1. URL: https://finance.yahoo.com/news/4-pipeline-stocks-buy-1-084100295.html
Text (first 200 chars): Pipeline companies remain well positioned despite the current disruption in the energy markets. By and large, these are toll-road businesses where energy prices have only a moderate direct impact on their results.

At the same time, demand for natural gas is growing. This is coming from the increased power consumption stemming from artificial intelligence (AI), as well as from export demand from Mexico and for LNG (liquified natural gas) to Asia and Europe.

Where to invest $1,000 right now? Our analyst team just revealed what they believe are the 10 best stocks to buy right now. Continue »

Let's look at four pipeline stocks that you can buy and hold for the long term.

Energy Transfer

Energy Transfer (NYSE: ET) operates one of the largest integrated midstream systems in the country, with various pipeline, storage, and processing assets. The company is part

In [45]:
import requests
from bs4 import BeautifulSoup
import time
import re

def get_yahoo_finance_news(ticker="AAPL", max_articles=5):
    """Yahoo Finance에서 특정 종목의 최신 뉴스와 내용 추출"""
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.0.0 Safari/537.36"
    }
    
    # 1. 종목 뉴스 페이지에서 기사 링크 추출
    news_url = f"https://finance.yahoo.com/quote/{ticker}/news"
    try:
        response = requests.get(news_url, headers=headers)
        response.raise_for_status()
        
        # JavaScript 렌더링 대응을 위한 정규식
        pattern = re.compile(r'"url":"(https:\\/\\/finance.yahoo.com\\/news\\/[^"]+)"')
        matches = pattern.findall(response.text)
        
        if not matches:
            print("뉴스 링크를 찾을 수 없습니다. 페이지 구조가 변경되었을 수 있습니다.")
            return []
            
        # URL 디코딩 및 중복 제거
        decoded_links = [link.replace('\\/', '/') for link in matches]
        unique_links = list(set(decoded_links))[:max_articles]
        
        # 2. 각 기사 내용 추출
        articles = []
        for link in unique_links:
            try:
                article_response = requests.get(link, headers=headers)
                soup = BeautifulSoup(article_response.text, 'html.parser')
                
                # 제목 추출
                title = soup.find('h1').get_text() if soup.find('h1') else "제목 없음"
                
                # 본문 추출 (최신 Yahoo Finance 구조)
                body_div = soup.find('div', class_=re.compile('caas-body'))
                if not body_div:
                    body_div = soup.find('div', {'data-test-locator': 'mfe-article'})
                
                content = body_div.get_text(separator='\n').strip() if body_div else "내용을 추출할 수 없음"
                
                articles.append({
                    'title': title,
                    'url': link,
                    'content': content
                })
                
                time.sleep(1)  # 차단 방지
                
            except Exception as e:
                print(f"기사 추출 중 오류 발생 ({link}): {str(e)}")
                continue
                
        return articles
        
    except Exception as e:
        print(f"뉴스 페이지 접근 중 오류 발생: {str(e)}")
        return []

def main():
    ticker = "AAPL"  # 원하는 종목 코드 입력
    max_articles = 3
    
    print(f"{ticker} 관련 최신 뉴스 수집 중...")
    news_articles = get_yahoo_finance_news(ticker, max_articles)
    
    if not news_articles:
        print("뉴스를 가져오지 못했습니다. 다음 방법을 시도해 보세요:")
        print("1. Yahoo Finance 사이트 직접 접속하여 페이지 구조 확인")
        print("2. User-Agent 변경")
        print("3. selenium 사용 고려")
        return
    
    print(f"\n{ticker} 관련 최신 뉴스 {len(news_articles)}건 발견:")
    for idx, article in enumerate(news_articles, 1):
        print(f"\n[{idx}] {article['title']}")
        print(f"URL: {article['url']}")
        print(f"\n내용 (요약):\n{article['content'][:300]}...")  # 300자만 출력
        print("-" * 50)

if __name__ == "__main__":
    main()

AAPL 관련 최신 뉴스 수집 중...
뉴스 링크를 찾을 수 없습니다. 페이지 구조가 변경되었을 수 있습니다.
뉴스를 가져오지 못했습니다. 다음 방법을 시도해 보세요:
1. Yahoo Finance 사이트 직접 접속하여 페이지 구조 확인
2. User-Agent 변경
3. selenium 사용 고려


In [None]:
# yahoo 사이드바 차트나 분순물 섞여서 검색해서 이상한 URL 긁어옴

from selenium import webdriver
from selenium.webdriver.chrome.options import Options
import time
import requests
from bs4 import BeautifulSoup

# 1. Selenium으로 Yahoo Finance News 페이지 접속
options = Options()
options.add_argument('--headless')  # 창 없이
driver = webdriver.Chrome(options=options)

ticker = "AAPL"
driver.get(f"https://finance.yahoo.com/quote/{ticker}/news/")
time.sleep(3)  # 페이지 로딩 기다리기

# 2. 뉴스 URL 추출
news_links = []
elements = driver.find_elements('css selector', 'a[href*="/news/"]')
for elem in elements:
    url = elem.get_attribute('href')
    if url and '/news/' in url:
        news_links.append(url)

driver.quit()

# 중복 제거
news_links = list(set(news_links))

# 3. URL에 들어가서 기사 본문 긁기
def get_article_text(url):
    res = requests.get(url, headers={"User-Agent": "Mozilla/5.0"})
    soup = BeautifulSoup(res.text, "html.parser")
    
    paragraphs = []
    
    # Yahoo Finance News 본문은 div[data-testid="article-body"] 안에 있음
    article_body = soup.find('div', {'data-testid': 'article-body'})
    if article_body:
        for p in article_body.find_all('p'):
            paragraphs.append(p.text.strip())
    
    return '\n'.join(paragraphs)

# 4. 전체 기사들 크롤링
for news_url in news_links[:5]:  # 예시로 처음 5개만
    print(f"크롤링: {news_url}")
    article_text = get_article_text(news_url)
    print(article_text)
    print("="*50)


크롤링: https://finance.yahoo.com/news/live/stock-market-today-dow-gains-1000-points-sp-500-and-nasdaq-gain-over-25-on-signs-of-tariff-progress-200233354.html?.tsrc=fin-notif

크롤링: https://finance.yahoo.com/news/watch-live-tesla-earnings-insights-call-snippets-and-in-depth-analysis-after-turbulent-quarter-193102862.html?.tsrc=fin-notif

크롤링: https://finance.yahoo.com/news/live/stock-market-today-dow-sp-500-nasdaq-jump-in-bid-to-recover-from-sell-off-152301711.html?.tsrc=fin-notif

크롤링: https://finance.yahoo.com/news/commentary-doge-gets-downsized-163129763.html?.tsrc=fin-notif

크롤링: https://finance.yahoo.com/news/trump-administration-pressures-europe-ditch-171051001.html



In [None]:
# yahoo finance는 api 안돼서 yahooquery로 우회했으나 못 가져옴

from yahooquery import Ticker
import requests

# 세션 설정
session = requests.Session()
session.headers.update({
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
})

ticker = Ticker('AAPL', session=session)

# 가져오기
news_data = ticker.news()

print(news_data)  # 먼저 전체 구조 확인!

# 만약 news_data가 dict라면 예를 들면 이렇게 접근
if isinstance(news_data, dict) and 'news' in news_data:
    for news in news_data['news'][:5]:  # news 리스트
        print(news['title'])
        print(news['link'])
        print('-' * 50)
else:
    print("Unexpected format:", type(news_data))


['error']
Unexpected format: <class 'list'>


In [None]:
from newsapi import NewsApiClient

# Init
newsapi = NewsApiClient(api_key='97f1e601562e49ccbe6b33f2dcf1ebc3')

# /v2/top-headlines
top_headlines = newsapi.get_top_headlines(q='bitcoin',
                                          sources='bbc-news,the-verge',
                                          language='en')

# /v2/everything
all_articles = newsapi.get_everything(q='bitcoin',
                                      sources='bbc-news,the-verge',
                                      domains='bbc.co.uk,techcrunch.com',
                                      from_param='2025-04-23',
                                      to='2025-04-27',
                                      language='en',
                                      sort_by='relevancy',
                                      page=2)

# /v2/top-headlines/sources
# sources = newsapi.get_sources()

In [None]:
# bbc-news 크롤링

import requests
from bs4 import BeautifulSoup

def extract_bbc_article_text(url):
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/112.0.0.0 Safari/537.36"
    }
    response = requests.get(url, headers=headers)
    soup = BeautifulSoup(response.content, 'html.parser')

    # 1. main-content > article 찾기
    main_content = soup.find('main', id='main-content')
    if not main_content:
        print("main-content를 찾을 수 없습니다.")
        return None

    article = main_content.find('article')
    if not article:
        print("article을 찾을 수 없습니다.")
        return None

    # 2. 모든 data-component="text-block" div 찾기
    text_blocks = article.find_all('div', attrs={'data-component': 'text-block'})
    if not text_blocks:
        print("text-block을 찾을 수 없습니다.")
        return None

    paragraphs = []
    for block in text_blocks:
        # 각각의 블록 안에 p 태그들을 찾기
        p_tags = block.find_all('p')
        for p in p_tags:
            text = p.get_text(strip=True)
            if text:  # 빈 문장 아닌 경우만 추가
                paragraphs.append(text)

    if not paragraphs:
        print("본문을 찾을 수 없습니다.")
        return None

    full_text = "\n".join(paragraphs)
    return full_text

# 테스트
# url = "https://www.bbc.com/news/articles/cm248vzg9jwo" 
url = "https://www.bbc.com/news/articles/cly1n7jz587o"
content = extract_bbc_article_text(url)
if content:
    print(content)


'''
def extract_filtered_paragraphs(url, min_length=20, banned_keywords=None):
    if banned_keywords is None:
        banned_keywords = ["Read more", "Sign up", "Subscribe", "Advertisement", "Ad", "Promoted content", "RELATED"]

    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/112.0.0.0 Safari/537.36"
    }
    response = requests.get(url, headers=headers)
    soup = BeautifulSoup(response.content, 'html.parser')

    # 모든 p 태그 긁기
    paragraphs = [p.get_text(strip=True) for p in soup.find_all('p')]

    if not paragraphs:
        print("p 태그를 찾을 수 없습니다.")
        return None

    # 필터링
    filtered_paragraphs = []
    for para in paragraphs:
        if len(para) < min_length:
            continue
        if any(banned.lower() in para.lower() for banned in banned_keywords):
            continue
        filtered_paragraphs.append(para)

    if not filtered_paragraphs:
        print("유효한 본문이 없습니다.")
        return None

    full_text = "\n".join(filtered_paragraphs)
    return full_text

# 테스트
url = "https://www.bbc.com/news/articles/cm248vzg9jwo"  # 테스트용 BBC 기사 URL
content = extract_filtered_paragraphs(url)
if content:
    print(content)
'''

'''
# News API에서 sources로 bbc-news면 잘 작동함

import requests
from bs4 import BeautifulSoup

def extract_all_paragraphs(url):
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/112.0.0.0 Safari/537.36"
    }
    response = requests.get(url, headers=headers)
    soup = BeautifulSoup(response.content, 'html.parser')

    # 그냥 모든 p 태그 다 긁기
    paragraphs = [p.get_text(strip=True) for p in soup.find_all('p')]

    if not paragraphs:
        print("p 태그를 찾을 수 없습니다.")
        return None

    full_text = "\n".join(paragraphs)
    return full_text


# 사용 예시
url = "https://www.bbc.com/news/articles/cm248vzg9jwo"  # 테스트할 BBC 기사 URL
content = extract_all_paragraphs(url)
if content:
    print(content)
'''


During last year's presidential campaign, Donald Trump constantly repeated his intention to bring about dramatic change as soon as he returned to the White House.
But few expected it to come at such breakneck speed.
In the three months since he took the oath of office, the 47th president has deployed his power in a way that compares to few predecessors.
In stacks of bound documents signed off with a presidential pen and policy announcements made in all caps on social media, his blizzard of executive actions has reached into every corner of American life.
To his supporters, the shock-and-awe approach has been a tangible demonstration of an all-action president, delivering on his promises and enacting long-awaited reforms.
But his critics fear he is doing irreparable harm to the country and overstepping his powers - crippling important government functions and perhaps permanently reshaping the presidency in the process.
Here are six turning points from the first 100 days.
For once, it wa

In [None]:
# yahoo news 크롤링

import requests
from bs4 import BeautifulSoup

def extract_yahoo_finance_article_text(url):
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/112.0.0.0 Safari/537.36"
    }
    response = requests.get(url, headers=headers)
    soup = BeautifulSoup(response.content, 'html.parser')

    # <div id="svelte"> 안에 <main> > <section> 구조 찾아가기
    svelte_div = soup.find('div', id='svelte')
    if not svelte_div:
        print("div#svelte 를 찾을 수 없습니다.")
        return None

    main_tag = svelte_div.find('main')
    if not main_tag:
        print("<main> 태그를 찾을 수 없습니다.")
        return None

    section_tag = main_tag.find('section')
    if not section_tag:
        print("<section> 태그를 찾을 수 없습니다.")
        return None

    # 이제 section 안의 모든 <p> 태그 긁어오기
    paragraphs = []
    for p in section_tag.find_all('p'):
        text = p.get_text(strip=True)
        if text:
            paragraphs.append(text)

    if not paragraphs:
        print("본문 <p> 태그를 찾을 수 없습니다.")
        return None

    return '\n'.join(paragraphs)

# 테스트 URL
# url = "https://finance.yahoo.com/news/asia-shares-edge-dollar-mercy-012952814.html"  # 잘 됨
url = "https://finance.yahoo.com/news/live/stock-market-today-sp-500-dow-nasdaq-futures-slip-ahead-of-huge-week-of-big-tech-earnings-economic-data-234959158.html"
content = extract_yahoo_finance_article_text(url)
if content:
    print(content)



Unlock stock picks and a broker-level newsfeed that powers Wall Street.
Stock futures edged lower on Monday ahead of a big week of earnings reports and macroeconomic data that will continue to paint an early picture of the US economy's response toPresident Trump's tariffs.
Futures tied to the S&P 500 (ES=F) slipped 0.2%, while futures for the Dow Jones Industrial Average (YM=F) also dipped 0.2%. Futures attached to the Nasdaq (NQ=F) fell 0.3%.
Wall Street is coming off arebounding runlast week, with the S&P 500 notching its longest daily positive streak since January. The gains came as Trumpeased pressure on Federal Reserve Chair Jerome Powell, as well as hinted atlight at the end of the tunnel for 145% tariffs on China. Wall Street looks positive,yet skittish, with plenty of room for growth before closing out the last trading week of what has been an eventful April.
Earnings are the highlight of the week ahead, with 180 S&P 500 companies expected to report quarterly financial results.