In [None]:
# S90G4LGOSBQDQINX

In [None]:
!pip install yfinance alpha_vantage pandas
# If you haven't installed transformers (for FinBERT)
!pip install transformers torch

In [None]:
import yfinance as yf
import pandas as pd

# 1. Define stock ticker and time range (both dates are meant to be inclusive)
ticker = "AAPL"
start_date = "2023-01-01"
end_date = "2024-12-31"

# 2. Download data
# yfinance treats `end` as exclusive, so ask for one extra day; otherwise the
# final trading day (end_date itself) is silently dropped.
download_end = (pd.Timestamp(end_date) + pd.Timedelta(days=1)).strftime("%Y-%m-%d")
stock_data = yf.download(ticker, start=start_date, end=download_end)

# 3. Simple cleaning (keep only necessary columns)
# Recent yfinance versions return MultiIndex columns (field, ticker) even for a
# single ticker. Keep only the field level so columns are plain 'Open', 'Close', ...
if isinstance(stock_data.columns, pd.MultiIndex):
    stock_data.columns = stock_data.columns.get_level_values(0)

# reset_index() returns a new frame, so the assignment below never writes into
# a slice of the downloaded data.
stock_data = stock_data[['Open', 'High', 'Low', 'Close', 'Volume']].reset_index()  # 'Date' becomes a regular column
stock_data['Date'] = stock_data['Date'].dt.date  # Keep only the date part, remove time

print(f"Successfully retrieved {len(stock_data)} days of market data")
stock_data.head()

In [None]:
import os
import yfinance as yf
import pandas as pd

file_path = "GOOGL_stock_data.csv"

# === Check: if file exists, read directly; otherwise, download ===
if os.path.exists(file_path):
    print("Reading data from local CSV...")
    # index_col=0 uses the first column ('Date') as the index; parse_dates=True parses it as dates.
    # NOTE(review): a CSV written by an older version of this cell may carry a
    # multi-row header; delete it once so it is regenerated in the flat format.
    stock_data = pd.read_csv(file_path, index_col=0, parse_dates=True)
else:
    print("No local data, downloading from Yahoo Finance...")
    ticker = "GOOGL"
    # yfinance treats `end` as exclusive: 2025-01-01 is required to include 2024-12-31.
    stock_data = yf.download(ticker, start="2023-01-01", end="2025-01-01")

    # Flatten the (field, ticker) MultiIndex that recent yfinance versions return.
    # Without this, to_csv() writes a multi-row header that the read_csv() call
    # above cannot round-trip (header rows come back as data rows).
    if isinstance(stock_data.columns, pd.MultiIndex):
        stock_data.columns = stock_data.columns.get_level_values(0)
    stock_data = stock_data[['Open', 'High', 'Low', 'Close', 'Volume']]

    # === Key step: save to CSV ===
    stock_data.to_csv(file_path)
    print(f"Data saved to {file_path}")

# View first few rows
print(stock_data.head())

In [None]:
from google.colab import drive
# force_remount=True remounts Google Drive even when /content/drive is already mounted
drive.mount('/content/drive', force_remount=True)

In [None]:
import requests
import pandas as pd
import time
import os
import calendar # New: for accurately calculating days in each month
from datetime import datetime

# === Configuration Area ===
# Prefer the ALPHAVANTAGE_API_KEY environment variable so a real key never has
# to be pasted into (and saved with) the notebook; the original placeholder is
# kept as the fallback so existing usage is unchanged.
API_KEY = os.environ.get("ALPHAVANTAGE_API_KEY", "need your api")
TICKER = "AAPL"
SAVE_DIR = '/content/drive/MyDrive/StockData'
FINAL_CSV_PATH = f"{SAVE_DIR}/{TICKER}_2023_2024_Full_News.csv"

# Ensure save directory exists (exist_ok avoids the check-then-create race)
os.makedirs(SAVE_DIR, exist_ok=True)

def generate_monthly_periods(start_year, end_year):
    """Return one (label, time_from, time_to) tuple per calendar month.

    Covers every month from January of start_year through December of
    end_year (inclusive). `label` is 'YYYY-MM'; `time_from` / `time_to` are
    'YYYYMMDDTHHMM' strings spanning the first minute of the month's first day
    to the last minute of its last day. The last day comes from
    calendar.monthrange, so leap years are handled.
    """
    return [
        (
            f"{year}-{month:02d}",
            f"{year}{month:02d}01T0000",
            # monthrange(...)[1] is the number of days in the month
            f"{year}{month:02d}{calendar.monthrange(year, month)[1]}T2359",
        )
        for year in range(start_year, end_year + 1)
        for month in range(1, 13)
    ]

def fetch_historical_news():
    """Fetch 2023-2024 Alpha Vantage NEWS_SENTIMENT articles for TICKER, one month per request.

    Resume support: if FINAL_CSV_PATH already exists, its rows are reloaded and
    every month (YYYY-MM) already present in its 'Date' column is skipped.
    After each month that returns a feed, the accumulated rows are
    de-duplicated on (Title, Date), sorted by Date and rewritten to
    FINAL_CSV_PATH.

    The loop stops early when the API reports a parameter error or a
    rate-limit / quota notice. A network, HTTP or decoding failure for a single
    month is printed and the loop continues with the next month.

    Returns:
        None. Results are written to FINAL_CSV_PATH; progress is printed.
    """
    # 1. Generate all months for 2023-2024
    all_periods = generate_monthly_periods(2023, 2024)

    # 2. Check existing data for resume functionality
    existing_months = []
    all_news_data = []

    if os.path.exists(FINAL_CSV_PATH):
        print(f"Existing file detected: {FINAL_CSV_PATH}, reading for resume...")
        try:
            df_exist = pd.read_csv(FINAL_CSV_PATH)
            if 'Date' in df_exist.columns:
                # Convert to datetime objects for month extraction
                df_exist['DateObj'] = pd.to_datetime(df_exist['Date'], errors='coerce')
                # Extract existing months (format YYYY-MM)
                existing_months = df_exist['DateObj'].dt.strftime('%Y-%m').unique().tolist()
                # Restore data to list, ready to append
                # Note: drop the temporary 'DateObj' column
                all_news_data = df_exist.drop(columns=['DateObj']).to_dict('records')
                print(f"‚úÖ Detected {len(existing_months)} months of data (e.g.: {existing_months[:3]}).")
        except Exception as e:
            print(f"Failed to read existing file, starting over: {e}")

    print(f"Preparing to continue fetching news for {TICKER}...")

    for month_label, start_str, end_str in all_periods:
        if month_label in existing_months:
            continue # Skip already fetched months

        print(f"Requesting: {month_label} (Range: {start_str[:8]}-{end_str[:8]})...", end=" ")

        # Let requests build and URL-encode the query string.
        params = {
            "function": "NEWS_SENTIMENT",
            "tickers": TICKER,
            "time_from": start_str,
            "time_to": end_str,
            "limit": 1000,
            "apikey": API_KEY,
        }

        got_feed = False
        try:
            # Without a timeout a stalled connection would hang the cell forever.
            response = requests.get(
                "https://www.alphavantage.co/query", params=params, timeout=30
            )
            # Surface HTTP errors here instead of as an opaque JSON decode failure.
            response.raise_for_status()
            data = response.json()

            # Error check
            if "Error Message" in data:
                print(f"\n‚ùå Parameter error: {data['Error Message']}")
                break
            if "Information" in data or "Note" in data:
                print(f"\n‚ö†Ô∏è API limit warning: {data.get('Information', data.get('Note'))}")
                print("Stopping fetch. Please save current progress.")
                break

            if "feed" in data:
                items = data["feed"]
                print(f"Success, retrieved {len(items)} items.")

                for item in items:
                    # time_published looks like 'YYYYMMDDT...'; keep only the date part
                    raw_time = item.get('time_published', '')
                    date_val = f"{raw_time[:4]}-{raw_time[4:6]}-{raw_time[6:8]}" if len(raw_time) >= 8 else "Unknown"

                    all_news_data.append({
                        "Date": date_val,
                        "Title": item.get('title', ''),
                        "Summary": item.get('summary', ''),
                        "Source": item.get('source', ''),
                        "URL": item.get('url', ''),
                        "Sentiment_Score": item.get('overall_sentiment_score', 0),
                        "Sentiment_Label": item.get('overall_sentiment_label', 'Neutral')
                    })
                got_feed = True
            else:
                # May be an empty month, or quota exhausted without standard error
                print(f"No data or abnormal response (Keys: {list(data.keys())})")

        except Exception as e:
            print(f"Network request error: {e}")

        # Save immediately after each successful request. Kept outside the try
        # block so a file-system failure is not misreported as a network error.
        if got_feed and all_news_data:
            df_temp = pd.DataFrame(all_news_data)
            # Simple deduplication
            df_temp = df_temp.drop_duplicates(subset=['Title', 'Date']).sort_values(by='Date')
            df_temp.to_csv(FINAL_CSV_PATH, index=False, encoding='utf-8-sig')

        # Rate control: sleep 15 seconds (safer)
        time.sleep(15)

    # Final results
    if all_news_data:
        df_final = pd.DataFrame(all_news_data).drop_duplicates(subset=['Title', 'Date'])
        print(f"\nüéâ Task finished! Collected a total of {len(df_final)} news items.")
        print(f"File location: {FINAL_CSV_PATH}")
    else:
        print("\nFailed to retrieve any data.")

# Run the Alpha Vantage download when this cell executes (network calls plus a
# 15-second pause per requested month).
fetch_historical_news()

In [None]:
!pip install finnhub-python gnews

In [None]:
import finnhub
import pandas as pd
import time
from datetime import datetime
import os

# === Finnhub API key ===
# Prefer the FINNHUB_API_KEY environment variable so a real key never has to be
# saved inside the notebook; the original placeholder is kept as the fallback.
FINNHUB_API_KEY = os.environ.get("FINNHUB_API_KEY", "need your api")
# ================================

finnhub_client = finnhub.Client(api_key=FINNHUB_API_KEY)
TICKER = "AAPL"
SAVE_DIR = '/content/drive/MyDrive/StockData'
CSV_PATH = f"{SAVE_DIR}/{TICKER}_Finnhub_2023_2024.csv"

# Create the output directory here as well, so this cell does not depend on an
# earlier cell having been run first.
os.makedirs(SAVE_DIR, exist_ok=True)

def fetch_finnhub_news():
    """Download 2023-2024 Finnhub company news for TICKER and write it to CSV_PATH.

    One `company_news` request is issued per calendar month. Each returned
    item is reduced to Date / Title / Summary / Source / URL. After all months
    are processed the rows are de-duplicated on (Title, Date), sorted by Date
    and saved as a UTF-8 (BOM) CSV.

    A failure for a single month is printed and the loop continues.

    Returns:
        None. Results are written to CSV_PATH; progress is printed.
    """
    # Function-scope import: this cell's import list does not include calendar.
    import calendar

    print("Starting data retrieval from Finnhub...")

    # Generate monthly splits from 2023-01-01 to 2024-12-31
    # Finnhub format requirement: YYYY-MM-DD
    periods = []
    for year in [2023, 2024]:
        for month in range(1, 13):
            # Use the real month length. A hard-coded 29 for February produced
            # the non-existent date 2023-02-29.
            _, last_day = calendar.monthrange(year, month)
            periods.append((f"{year}-{month:02d}-01", f"{year}-{month:02d}-{last_day:02d}"))

    all_news = []

    for start_date, end_date in periods:
        print(f"Fetching: {start_date} to {end_date} ...", end=" ")

        try:
            # Finnhub API call
            res = finnhub_client.company_news(TICKER, _from=start_date, to=end_date)

            if len(res) > 0:
                print(f"‚úÖ Retrieved {len(res)} items")
                for item in res:
                    # Finnhub returns Unix timestamp, needs conversion
                    # NOTE(review): fromtimestamp() uses the machine's local
                    # timezone, so the calendar date can differ between hosts.
                    ts = int(item.get('datetime', 0))
                    date_str = datetime.fromtimestamp(ts).strftime('%Y-%m-%d')

                    all_news.append({
                        "Date": date_str,
                        "Title": item.get('headline'),
                        "Summary": item.get('summary'),
                        "Source": item.get('source'),
                        "URL": item.get('url')
                    })
            else:
                print("‚ùå No data (possibly due to free tier historical range limitation)")

        except Exception as e:
            print(f"Error: {e}")

        # Finnhub rate-limits API calls (free tier is documented as 60 calls per
        # minute - verify for your plan); a 1-second pause stays under that.
        time.sleep(1)

    if all_news:
        df = pd.DataFrame(all_news)
        df = df.drop_duplicates(subset=['Title', 'Date']).sort_values(by='Date')
        df.to_csv(CSV_PATH, index=False, encoding='utf-8-sig')
        print(f"\nüéâ Finnhub fetch complete! Total {len(df)} items. Saved to: {CSV_PATH}")
        print(df.head())
    else:
        print("\nFailed to retrieve data. Please check if Key is correct or free tier limitations.")

# Run the Finnhub download when this cell executes (one request per month, 1-second pause each).
fetch_finnhub_news()

In [None]:
!pip install gnews --quiet

import pandas as pd
from gnews import GNews
import time
import random
import os
import calendar
from datetime import datetime

# === Configuration ===
# KEYWORD is the Google News search term; SAVE_DIR / CSV_PATH say where the
# cleaned result is written.
KEYWORD = "Apple"  # Deliberately broad; off-topic hits are removed in code via NOISE_BLACKLIST
SAVE_DIR = '/content/drive/MyDrive/StockData'
CSV_PATH = f"{SAVE_DIR}/AAPL_GNews_2023_2024_Cleaned.csv"

# === Anti-fruit / anti-noise blacklist ===
# An article is discarded when its title or summary contains one of these terms.
# is_relevant() compares against lower-cased text, so entries must be lower-case.
NOISE_BLACKLIST = [
    'pie', 'tart', 'recipe', 'sauce', 'cider', 'vinegar', # Food
    'fruit', 'harvest', 'orchard', 'farm', 'agriculture', # Agriculture
    'juice', 'smoothie', 'nutrition', 'diet', 'baking',   # Diet
    'fiona apple', 'apple martin' # Celebrities (singer/star children)
]

# === Ensure directory exists ===
# Created at cell execution time so the later to_csv() call has a target directory.
if not os.path.exists(SAVE_DIR):
    os.makedirs(SAVE_DIR)

def is_relevant(text, blacklist=None):
    """Return False when `text` contains a blacklisted term as a standalone word or phrase.

    Matching is case-insensitive and uses word boundaries, so 'fruit' does not
    match 'grapefruit', while 'pie' still matches 'apple pie!' or 'pie,' (the
    earlier space-delimited check missed terms next to punctuation).

    Args:
        text: Title or summary to check. Empty / None is treated as relevant.
        blacklist: Optional iterable of terms to reject. Defaults to the
            module-level NOISE_BLACKLIST.

    Returns:
        True if no blacklisted term is present, otherwise False.
    """
    # Function-scope import: this cell's import list does not include re.
    import re

    if not text:
        return True
    if blacklist is None:
        blacklist = NOISE_BLACKLIST

    text_lower = text.lower()
    for noise_word in blacklist:
        if not noise_word:
            continue  # an empty entry would match everything
        # (?<!\w) / (?!\w): the term must not be glued to a letter, digit or
        # underscore on either side, i.e. it is a whole word or phrase.
        pattern = r"(?<!\w)" + re.escape(noise_word.lower()) + r"(?!\w)"
        if re.search(pattern, text_lower):
            return False
    return True

def fetch_gnews_smart():
    """Collect 2023-2024 Google News results for KEYWORD, month by month, and save them.

    For each month the GNews client's start/end dates are set to the first and
    last day of that month and KEYWORD is searched. Articles whose title or
    description fails is_relevant() are dropped. After all months, dates are
    normalised to YYYY-MM-DD, rows with unparseable dates are removed, the
    frame is sorted by date, de-duplicated on Title and written to CSV_PATH.

    A failure for a single month is printed and the loop continues.

    Returns:
        None. Results are written to CSV_PATH; progress is printed.
    """
    print("üöÄ Starting smart fetch: 2023-2024 Apple News")
    print(f"üõ°Ô∏è 'Anti-fruit' filter enabled: automatically discarding words like {NOISE_BLACKLIST[:3]}... etc.")

    client = GNews(language='en', country='US', max_results=100)

    month_schedule = [(yr, mo) for yr in [2023, 2024] for mo in range(1, 13)]
    total_months = 24
    collected = []

    for position, (yr, mo) in enumerate(month_schedule, start=1):
        # monthrange(...)[1] is the number of days in the month
        final_day = calendar.monthrange(yr, mo)[1]

        print(f"[{position}/{total_months}] üîç Searching: {yr}-{mo:02d} ...", end=" ")

        try:
            client.start_date = (yr, mo, 1)
            client.end_date = (yr, mo, final_day)

            batch = client.get_news(KEYWORD)

            if not batch:
                print("‚ö†Ô∏è No results")
            else:
                kept = []
                for article in batch:
                    headline = article.get('title', '')
                    blurb = article.get('description', '')

                    # Keep the article only when both fields pass the blacklist
                    if not (is_relevant(headline) and is_relevant(blurb)):
                        continue

                    kept.append({
                        "Date": article.get('published date', ''),
                        "Title": headline,
                        "Summary": blurb,
                        "Source": article.get('publisher', {}).get('title', 'Google News'),
                        "URL": article.get('url', '')
                    })

                dropped = len(batch) - len(kept)
                if dropped > 0:
                    print(f"‚úÖ Fetched {len(batch)} items (discarded {dropped} irrelevant news)")
                else:
                    print(f"‚úÖ Fetched {len(kept)} items")

                collected.extend(kept)

        except Exception as e:
            print(f"‚ùå Error: {e}")

        # Random pause between months
        time.sleep(random.uniform(5, 8))

    # === Save ===
    if not collected:
        print("\nüò≠ No data fetched.")
        return

    df = pd.DataFrame(collected)
    # Unparseable dates become NaT -> NaN after strftime and are dropped below
    df['Date'] = pd.to_datetime(df['Date'], errors='coerce').dt.strftime('%Y-%m-%d')
    df = (
        df.dropna(subset=['Date'])
          .sort_values(by='Date')
          .drop_duplicates(subset=['Title'])
    )

    df.to_csv(CSV_PATH, index=False, encoding='utf-8-sig')
    print(f"\nüéâ Complete! Fetched {len(df)} high-quality data items.")
    print(f"File location: {CSV_PATH}")

# Run the single-keyword Google News fetch when this cell executes (24 searches with a 5-8 second pause each).
fetch_gnews_smart()

In [None]:
!pip install gnews --quiet

import pandas as pd
from gnews import GNews
import time
import random
import os
import calendar
from datetime import datetime

# === Save Path ===
# Output directory for the per-ticker CSV files; created when the cell runs.
SAVE_DIR = '/content/drive/MyDrive/StockData'
if not os.path.exists(SAVE_DIR):
    os.makedirs(SAVE_DIR)

# === Core Configuration: stock-specific keywords and blacklists ===
# Format: Ticker: { 'keyword': search_term, 'blacklist': [exclusion_terms] }
# 'keyword' is the Google News search term; an article is dropped when its
# title or description contains any 'blacklist' term. Terms are compared
# against lower-cased text, so they must be lower-case.
TICKER_CONFIG = {
    "AAPL": {
        "keyword": "Apple",
        "blacklist": [
            'pie', 'tart', 'recipe', 'sauce', 'cider', 'vinegar', 'juice',
            'fruit', 'harvest', 'orchard', 'farm', 'nutrition', 'diet',
            'fiona apple', 'bake', 'cake'
        ]
    },
    "MSFT": {
        "keyword": "Microsoft",
        "blacklist": [
            # Operations/Beginner Tutorials
            'how to fix', 'error code', 'blue screen', 'bsod', 'kb50', 'update fail',
            'download', 'install', 'wallpaper', 'shortcut', 'regedit',
            # Pure Gaming/Entertainment
            'walkthrough', 'guide', 'boss fight', 'controller skin', 'giveaway',
            'deal alert', 'price drop'
        ]
    },
    "AMZN": {
        "keyword": "Amazon",
        "blacklist": [
            # Geography/Environment
            'rainforest', 'jungle', 'deforestation', 'river', 'brazil', 'tribe',
            'wildfire', 'indigenous', 'carbon sink',
            # Pure Shopping/Marketing
            'gift guide', 'best deal', 'coupon', 'discount', 'dupe',
            'bestseller', 'fashion find'
        ]
    },
    "GOOGL": {
        "keyword": "Google",
        "blacklist": [
            # Miscellaneous
            'doodle', 'easter egg', 'funny', 'meme', 'song',
            # Basic Education
            'phonics', 'toddler', 'alphabet song', 'preschool', 'soup', 'noodle'
        ]
    },
    "META": {
        "keyword": "Meta",
        "blacklist": [
            # Academic/Medical (Meta-analysis is the biggest noise source)
            'meta-analysis', 'systematic review', 'clinical trial', 'genome',
            'metabolism', 'metaphysics', 'poetry', 'fiction', 'rpg',
            'tier list', 'loadout'
        ]
    }
}

def is_relevant(text, blacklist):
    """Return False when `text` contains a blacklisted term that starts at a word boundary.

    Matching is case-insensitive. A term must begin at the start of a word
    (not preceded by a letter, digit or underscore) but may be followed by
    more characters. So 'kb50' still matches 'KB5034441' and 'update fail'
    matches 'update failed', while 'tart' no longer matches 'startup' and
    'river' no longer matches 'driver'. The previous plain substring test hit
    those mid-word occurrences and discarded legitimate business news.
    Punctuation and hyphens need no special handling: 'meta-analysis:' and
    '(meta-analysis)' both match.

    Args:
        text: Title or description to check. Empty / None is treated as relevant.
        blacklist: Iterable of terms for the current ticker.

    Returns:
        True if no blacklisted term is present, otherwise False.
    """
    # Function-scope import: this cell's import list does not include re.
    import re

    if not text:
        return True
    text_lower = text.lower()
    for noise_word in blacklist:
        if not noise_word:
            continue  # an empty entry would match everything
        # (?<!\w): the term must not continue a preceding word.
        if re.search(r"(?<!\w)" + re.escape(noise_word.lower()), text_lower):
            return False
    return True

def fetch_ticker_news(ticker):
    """Collect 2023-2024 Google News results for one ticker and save them to CSV.

    The search keyword and blacklist come from TICKER_CONFIG[ticker]. For each
    month the GNews client's start/end dates are set to that month's first and
    last day; articles whose title or description fails
    is_relevant(..., blacklist) are dropped. After all months, dates are
    normalised to YYYY-MM-DD, rows with unparseable dates are removed, the
    frame is sorted by date, de-duplicated on Title and written to
    '{SAVE_DIR}/{ticker}_GNews_2023_2024.csv'.

    Args:
        ticker: Key of TICKER_CONFIG (an unknown key raises KeyError).

    Returns:
        None. Results are written to disk; progress is printed.
    """
    settings = TICKER_CONFIG[ticker]
    keyword = settings['keyword']
    blacklist = settings['blacklist']

    csv_filename = f"{SAVE_DIR}/{ticker}_GNews_2023_2024.csv"

    divider = '=' * 50
    print(f"\n{divider}")
    print(f"üíº Processing stock: {ticker} (Search term: {keyword})")
    print(f"üö´ Blacklist loaded ({len(blacklist)} items): {blacklist[:3]}...")
    print(f"{divider}")

    client = GNews(language='en', country='US', max_results=100)

    month_schedule = [(yr, mo) for yr in [2023, 2024] for mo in range(1, 13)]
    total_months = 24
    collected = []

    for position, (yr, mo) in enumerate(month_schedule, start=1):
        # monthrange(...)[1] is the number of days in the month
        final_day = calendar.monthrange(yr, mo)[1]

        print(f"[{ticker}][{position}/{total_months}] üîç {yr}-{mo:02d} ...", end=" ")

        try:
            client.start_date = (yr, mo, 1)
            client.end_date = (yr, mo, final_day)

            batch = client.get_news(keyword)

            if not batch:
                print("‚ö†Ô∏è No data")
            else:
                kept = []
                for article in batch:
                    headline = article.get('title', '')
                    blurb = article.get('description', '')

                    # Keep the article only when both fields pass this ticker's blacklist
                    if not (is_relevant(headline, blacklist) and is_relevant(blurb, blacklist)):
                        continue

                    kept.append({
                        "Date": article.get('published date', ''),
                        "Title": headline,
                        "Summary": blurb,
                        "Source": article.get('publisher', {}).get('title', 'Google News'),
                        "URL": article.get('url', '')
                    })

                print(f"‚úÖ Fetched {len(kept)} items (original {len(batch)} items)")
                collected.extend(kept)

        except Exception as e:
            print(f"‚ùå Error: {e}")

        # Small sleep between months
        time.sleep(random.uniform(3, 6))

    # === Save data for this stock ===
    if not collected:
        print(f"\nüò≠ {ticker} No data fetched.")
        return

    df = pd.DataFrame(collected)
    # Unparseable dates become NaT -> NaN after strftime and are dropped below
    df['Date'] = pd.to_datetime(df['Date'], errors='coerce').dt.strftime('%Y-%m-%d')
    df = (
        df.dropna(subset=['Date'])
          .sort_values(by='Date')
          .drop_duplicates(subset=['Title'])
    )

    df.to_csv(csv_filename, index=False, encoding='utf-8-sig')
    print(f"\nüéâ {ticker} Complete! Total {len(df)} items. Saved to: {csv_filename}")

# === Main loop: process the five stocks one after another ===
target_tickers = ["AAPL", "AMZN", "GOOGL", "META", "MSFT"]

for i, t in enumerate(target_tickers):
    fetch_ticker_news(t)

    # No pause is needed after the final ticker.
    if i == len(target_tickers) - 1:
        continue

    # Long randomised pause between stocks, so Google is less likely to flag
    # sustained high-intensity scraping from the same IP.
    sleep_time = random.uniform(30, 60)
    print(f"\n‚òï Taking a break... Pausing for {int(sleep_time)} seconds to prevent IP blocking...\n")
    time.sleep(sleep_time)

print("\nüèÜ All tasks completed!")