In [9]:
# Required packages: pip install feedparser pandas requests beautifulsoup4 tqdm
import feedparser
from datetime import datetime, timedelta
import pandas as pd

In [10]:
def generate_google_rss_url(query, start_date, end_date):
    """Build a Google News RSS search URL restricted to a date window.

    Args:
        query: Search terms. The value is percent-encoded before it is placed
            in the URL so that spaces or reserved characters such as ``&`` and
            ``#`` cannot truncate or corrupt the query string. ``+`` and ``:``
            are left untouched so callers that already join terms with ``+``
            or pass operators like ``site:example.com`` keep working.
        start_date: Value placed after the ``after:`` search operator
            (callers in this notebook pass ``YYYY-MM-DD`` strings).
        end_date: Value placed after the ``before:`` search operator.

    Returns:
        The complete URL as a string, using the en-US / US locale parameters.
    """
    # Function-scope import keeps this cell runnable on its own.
    from urllib.parse import quote

    base_url = "https://news.google.com/rss/search?"
    encoded_query = quote(str(query), safe="+:")
    q = f"q={encoded_query}+after:{start_date}+before:{end_date}"
    params = "&hl=en-US&gl=US&ceid=US:en"
    return base_url + q + params

def fetch_news_rss_day(query, day: datetime):
    """Fetch the Google News RSS results for ``query`` on a single day.

    The search window runs from ``day`` to the following day, both formatted
    as ``YYYY-MM-DD`` and handed to ``generate_google_rss_url``.

    Args:
        query: Search terms forwarded to the URL builder.
        day: The day to query.

    Returns:
        A list of dicts with the keys ``date`` (``YYYY-MM-DD`` string built
        from the entry's ``published_parsed``), ``title``, ``link`` and
        ``source`` (``"Unknown"`` when the entry has no ``source`` field).
        Entries whose publication time cannot be turned into a ``datetime``
        are skipped.
    """
    date_format = "%Y-%m-%d"
    window_start = day.strftime(date_format)
    window_end = (day + timedelta(days=1)).strftime(date_format)
    parsed_feed = feedparser.parse(
        generate_google_rss_url(query, window_start, window_end)
    )

    records = []
    for article in parsed_feed.entries:
        try:
            # Only year..second are needed to build the datetime.
            published = datetime(*article.published_parsed[:6])
        except Exception:
            # Best effort: an entry without a usable timestamp is dropped.
            continue

        record = {"date": published.strftime(date_format)}
        record["title"] = article.title
        record["link"] = article.link
        if "source" in article:
            record["source"] = article.source.title
        else:
            record["source"] = "Unknown"
        records.append(record)

    return records

def get_news_data(start_day, end_day, query="Tesla"):
    """Collect news items for every day from ``start_day`` to ``end_day``.

    Calls ``fetch_news_rss_day`` once per day (both endpoints included) and
    prints a one-line progress message for each day.

    Args:
        start_day: First day to fetch (``datetime``).
        end_day: Last day to fetch, inclusive (``datetime``).
        query: Search terms passed to ``fetch_news_rss_day``. Defaults to
            ``"Tesla"``, the term this function previously hard-coded.

    Returns:
        A single list containing the items of all days in chronological
        fetch order. Empty when ``start_day`` is later than ``end_day``.
    """
    all_news = []
    one_day = timedelta(days=1)
    current_day = start_day

    while current_day <= end_day:
        daily_news = fetch_news_rss_day(query, current_day)
        all_news.extend(daily_news)
        print(f"✅ {current_day.strftime('%Y-%m-%d')} - {len(daily_news)} items")
        current_day += one_day
    return all_news

def remove_duplicate_titles_by_prefix(df, prefix_length=50):
    seen = set()
    keep_rows = []

    for _, row in df.iterrows():
        prefix = row["title"][:prefix_length].strip().lower()
        if prefix not in seen:
            seen.add(prefix)
            keep_rows.append(row)

    return pd.DataFrame(keep_rows).reset_index(drop=True)

In [11]:
# Collect one feed per day for the whole of 2024 (both endpoints included).
all_news = get_news_data(
    start_day=datetime(2024, 1, 1),
    end_day=datetime(2024, 12, 31),
)

✅ 2024-01-01 - 89 items
✅ 2024-01-02 - 100 items
✅ 2024-01-03 - 100 items
✅ 2024-01-04 - 100 items
✅ 2024-01-05 - 72 items
✅ 2024-01-06 - 34 items
✅ 2024-01-07 - 43 items
✅ 2024-01-08 - 61 items
✅ 2024-01-09 - 88 items
✅ 2024-01-10 - 100 items
✅ 2024-01-11 - 93 items
✅ 2024-01-12 - 58 items
✅ 2024-01-13 - 27 items
✅ 2024-01-14 - 42 items
✅ 2024-01-15 - 82 items
✅ 2024-01-16 - 100 items
✅ 2024-01-17 - 100 items
✅ 2024-01-18 - 84 items
✅ 2024-01-19 - 39 items
✅ 2024-01-20 - 20 items
✅ 2024-01-21 - 53 items
✅ 2024-01-22 - 73 items
✅ 2024-01-23 - 100 items
✅ 2024-01-24 - 100 items
✅ 2024-01-25 - 100 items
✅ 2024-01-26 - 79 items
✅ 2024-01-27 - 13 items
✅ 2024-01-28 - 59 items
✅ 2024-01-29 - 87 items
✅ 2024-01-30 - 100 items
✅ 2024-01-31 - 100 items
✅ 2024-02-01 - 100 items
✅ 2024-02-02 - 100 items
✅ 2024-02-03 - 40 items
✅ 2024-02-04 - 66 items
✅ 2024-02-05 - 92 items
✅ 2024-02-06 - 75 items
✅ 2024-02-07 - 71 items
✅ 2024-02-08 - 63 items
✅ 2024-02-09 - 43 items
✅ 2024-02-10 - 27 items
✅ 2

In [12]:
# Build a DataFrame from the collected items, de-duplicate it and save it.
# The cell rebuilds `df` from `all_news` each time, so re-running it is safe.
output_path = "~/Downloads/tsla_news_train.csv"

# Passing `columns` fixes the column order and keeps the frame well-formed
# even when `all_news` is empty.
df = pd.DataFrame(all_news, columns=["date", "title", "link", "source"])
df["date"] = pd.to_datetime(df["date"])

# First pass: drop exact title duplicates, keeping the earliest date.
# A stable sort keeps collection order within a day, so the surviving row
# is deterministic across runs.
df = df.sort_values(by="date", kind="stable").drop_duplicates(subset="title")

# Second pass: drop titles that share the same first 50 characters.
df = remove_duplicate_titles_by_prefix(df, prefix_length=50)

df.to_csv(output_path, index=False)
# Report the path that was actually written (the message used to name a
# different file than the one saved).
print(f"✅ 저장 완료: {output_path}")
df.head(10)


✅ 저장 완료: tsla_news.csv


Unnamed: 0,date,title,link,source
0,2024-01-01,Tesla is sneakily testing Fremont-built refres...,https://news.google.com/rss/articles/CBMinAFBV...,Electrek
1,2024-01-01,Tesla Cybertruck’s eyebrow-raising windshield ...,https://news.google.com/rss/articles/CBMiigFBV...,Yahoo! Autos
2,2024-01-01,Can I Charge My Non-Tesla at a Tesla Charging ...,https://news.google.com/rss/articles/CBMihAFBV...,How-To Geek
3,2024-01-01,The World's Best Factory Drag Car Is No Match ...,https://news.google.com/rss/articles/CBMiggFBV...,Motor1.com
4,2024-01-01,"Tesla Model 3 Battery Size, Voltage, And Charg...",https://news.google.com/rss/articles/CBMihgFBV...,Top Speed
5,2024-01-01,Tesla lashes Reuters for 'wildly misleading' s...,https://news.google.com/rss/articles/CBMiqgFBV...,CarExpert
6,2024-01-01,Tesla's EV Business Faces 'Challenges' In 2024...,https://news.google.com/rss/articles/CBMiygFBV...,Investor's Business Daily
7,2024-01-01,BYD beats Tesla for a second straight year aft...,https://news.google.com/rss/articles/CBMiogFBV...,CNBC
8,2024-01-01,Here are the only Tesla vehicles that now have...,https://news.google.com/rss/articles/CBMie0FVX...,Electrek
9,2024-01-01,"Tesla car spotted on Bengaluru roads, sparks b...",https://news.google.com/rss/articles/CBMi1AFBV...,Hindustan Times


In [8]:
import requests
from bs4 import BeautifulSoup

def extract_article_text(url, timeout=5):
    """Download a page and return the text of all its ``<p>`` elements.

    Args:
        url: Address of the article page.
        timeout: Seconds passed to ``requests.get`` as its timeout.
            Defaults to 5, the value previously hard-coded.

    Returns:
        The paragraph texts joined with newlines and stripped of surrounding
        whitespace. On any failure (network error, HTTP error status, parse
        error) the function does not raise; it returns a string starting
        with ``"[ERROR] "`` followed by the exception message.
    """
    try:
        headers = {'User-Agent': 'Mozilla/5.0'}
        res = requests.get(url, headers=headers, timeout=timeout)
        # Without this, a 403/404/5xx error page would be parsed and stored
        # as if it were the article body.
        res.raise_for_status()
        soup = BeautifulSoup(res.text, 'html.parser')

        # Article body: concatenate the text of every <p> tag.
        paragraphs = soup.find_all("p")
        text = '\n'.join(p.get_text() for p in paragraphs)

        return text.strip()
    except Exception as e:
        # Deliberate best effort: callers get a marker string, not a crash.
        return "[ERROR] " + str(e)

In [None]:
from tqdm import tqdm

# Download the article body for each link and store it in a `content` column.
# Set ARTICLE_LIMIT to a small integer (e.g. 10) for a quick smoke test;
# None processes every row.
ARTICLE_LIMIT = None

links = df["link"] if ARTICLE_LIMIT is None else df["link"].head(ARTICLE_LIMIT)

# Build a Series aligned on the row index instead of assigning a bare list:
# a list shorter than the frame raises "Length of values does not match
# length of index". With a limit, the rows beyond it are left as NaN.
df["content"] = pd.Series(
    [extract_article_text(url) for url in tqdm(links)],
    index=links.index,
    dtype=object,
)
# NOTE(review): the file name says 2025 but the collection cell fetches 2024
# data — confirm the intended name before relying on it downstream.
df.to_csv("tesla_news_with_body_2025.csv", index=False)

100%|██████████| 5998/5998 [1:32:56<00:00,  1.08it/s] 
