In [None]:
import requests
import pandas as pd
import datetime
import time
import csv

def _ms_to_utc(timestamp_ms):
    """Convert a millisecond epoch timestamp into a timezone-aware UTC datetime."""
    # Imported locally under a private alias: another cell in this notebook runs
    # `from datetime import datetime`, which rebinds the global name `datetime`
    # to the class and would break any `datetime.datetime` lookup made here.
    import datetime as _dt

    return _dt.datetime.fromtimestamp(timestamp_ms / 1000, tz=_dt.timezone.utc)


def _candle_to_row(candle):
    """Turn one raw kline into an ``(open_time_ms, csv_row)`` pair.

    The code reads index 0 as the open time in milliseconds and indices 1-5 as
    open, high, low, close and volume. Prices are rounded to whole numbers and
    the volume to two decimals, exactly as the CSV format requires.
    """
    open_time_ms = int(candle[0])
    row = {
        "Date Time": _ms_to_utc(open_time_ms).strftime("%d-%m-%y %H:%M"),
        "Crypto": "Bitcoin",
        "Open Price": f"{float(candle[1]):.0f}",
        "Close Price": f"{float(candle[4]):.0f}",
        "High Price": f"{float(candle[2]):.0f}",
        "Low Price": f"{float(candle[3]):.0f}",
        "Volume": f"{float(candle[5]):.2f} BTC",
    }
    return open_time_ms, row


def _fetch_klines(base_url, params, timeout):
    """GET one page of klines and return the decoded JSON list.

    Raises ``requests.exceptions.RequestException`` on connection problems,
    timeouts or HTTP error statuses.
    """
    response = requests.get(base_url, params=params, timeout=timeout)
    response.raise_for_status()  # Raise an exception for HTTP errors
    return response.json()


def fetch_all_btc_data():
    """
    Fetch all available BTC/USDT historical data from Binance API at 1-hour intervals
    and save to CSV file in the specified format.

    The newest page is requested first; each following request asks for candles
    that end just before the oldest candle already collected, until the API
    returns an empty page. Rows are then ordered oldest-to-newest by their raw
    open time and written to ``btc_historical_data.csv``.

    Date/time values are rendered in UTC so the file does not depend on the
    timezone of the machine that runs the download.

    Returns:
        None. If the very first request fails or returns nothing, no file is
        written. If a later request fails, the data collected so far is saved.
    """
    # Base URL for Binance API
    base_url = "https://api.binance.com/api/v3/klines"

    # Symbol and interval
    symbol = "BTCUSDT"
    interval = "1h"

    # CSV file name
    csv_filename = "btc_historical_data.csv"

    page_limit = 1000      # Maximum allowed by Binance
    request_timeout = 30   # Seconds; without it a stalled connection blocks forever
    request_pause = 0.5    # Delay between calls to avoid rate limiting

    timed_rows = []        # (open_time_ms, row) pairs, kept so we can sort numerically
    oldest_ms = None       # Open time of the oldest candle fetched so far
    api_call_count = 0     # Number of successful API calls

    print("Starting to fetch BTC historical data from Binance...")

    while True:
        params = {"symbol": symbol, "interval": interval, "limit": page_limit}
        if oldest_ms is not None:
            time.sleep(request_pause)
            # Get data older than the oldest entry we have
            params["endTime"] = oldest_ms - 1

        try:
            data = _fetch_klines(base_url, params, request_timeout)
        except requests.exceptions.RequestException as e:
            print(f"Error fetching data: {e}")
            if oldest_ms is None:
                # Nothing collected yet, so there is nothing worth saving.
                return
            break

        if not data:
            if oldest_ms is None:
                print("No data returned from Binance API.")
                return
            # If no more data is returned, we've reached the beginning
            print("Reached the beginning of available data.")
            break

        timed_rows.extend(_candle_to_row(candle) for candle in data)

        # Pages arrive oldest-first, so the first candle marks the next cut-off.
        oldest_ms = int(data[0][0])

        api_call_count += 1
        if api_call_count % 10 == 0:
            oldest_date = _ms_to_utc(oldest_ms).strftime("%d-%m-%Y %H:%M")
            print(f"Progress: Downloaded {len(timed_rows)} records, oldest data point: {oldest_date}")

    # Sort by the raw open time (oldest to newest); this avoids re-parsing the
    # formatted strings, whose two-digit year and minute precision are lossy.
    timed_rows.sort(key=lambda pair: pair[0])
    rows = [row for _, row in timed_rows]

    # Write to CSV
    try:
        with open(csv_filename, "w", newline="") as csvfile:
            fieldnames = ["Date Time", "Crypto", "Open Price", "Close Price", "High Price", "Low Price", "Volume"]
            writer = csv.DictWriter(csvfile, fieldnames=fieldnames)

            writer.writeheader()
            writer.writerows(rows)

        print(f"Successfully downloaded {len(rows)} records of BTC historical data.")
        print(f"Data saved to {csv_filename}")
        print(f"Date range: {rows[0]['Date Time']} to {rows[-1]['Date Time']}")

    except IOError as e:
        print(f"Error writing to CSV: {e}")

# Start the download when this cell/script is executed directly (not on import).
if __name__ == "__main__":
    fetch_all_btc_data()

Starting to fetch BTC historical data from Binance...
Progress: Downloaded 10000 records, oldest data point: 12-03-2024 21:00
Progress: Downloaded 20000 records, oldest data point: 21-01-2023 04:00
Progress: Downloaded 30000 records, oldest data point: 30-11-2021 12:00
Progress: Downloaded 40000 records, oldest data point: 09-10-2020 01:00
Progress: Downloaded 50000 records, oldest data point: 18-08-2019 17:00
Progress: Downloaded 60000 records, oldest data point: 25-06-2018 21:00
Reached the beginning of available data.
Successfully downloaded 67465 records of BTC historical data.
Data saved to btc_historical_data.csv
Date range: 17-08-17 04:00 to 03-05-25 12:00


In [None]:
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from bs4 import BeautifulSoup
import pandas as pd
import time
import concurrent.futures
from datetime import datetime, timedelta
import os

def generate_dates(start_date_str="2017-04-20", end_date_str="2019-11-11"):
    """Build the ForexFactory day tokens for every day in an inclusive range.

    Both arguments are ``YYYY-MM-DD`` strings. Each returned token is the
    lowercase month abbreviation, zero-padded day, a dot and the four-digit
    year (for example ``apr20.2017``). An end date before the start date
    yields an empty list.
    """
    iso_format = "%Y-%m-%d"
    first_day = datetime.strptime(start_date_str, iso_format)
    last_day = datetime.strptime(end_date_str, iso_format)

    # Both bounds are midnight datetimes, so the gap is a whole number of days;
    # +1 makes the end date inclusive and a negative count gives an empty range.
    day_count = (last_day - first_day).days + 1

    return [
        (first_day + timedelta(days=offset)).strftime("%b%d.%Y").lower()
        for offset in range(day_count)
    ]

def _cell_text(row, selector):
    """Return the stripped text of the first node matching ``selector`` in ``row``, or ""."""
    node = row.select_one(selector)
    return node.text.strip() if node else ""


def _parse_calendar_rows(soup):
    """Extract one dict per calendar event row from a parsed calendar page.

    Day-breaker rows only update the date label carried forward to the event
    rows that follow them; rows without a time cell are skipped.
    """
    events = []
    current_date = None

    for row in soup.select("tr.calendar__row"):
        if "calendar__row--day-breaker" in row.get("class", []):
            date_span = row.select_one("td.calendar__cell span")
            if date_span:
                current_date = date_span.text.strip()
            continue

        time_span = row.select_one("td.calendar__time span")
        if not time_span:
            continue

        # The impact level lives in the span's title attribute, not its text.
        impact_span = row.select_one("td.calendar__impact span[title]")
        impact = impact_span["title"].replace(" Impact Expected", "") if impact_span else ""

        events.append({
            "Date": current_date,
            "Time": time_span.text.strip(),
            "Currency": _cell_text(row, "td.calendar__currency span"),
            "Impact": impact,
            "Event Title": _cell_text(row, "td.calendar__event span.calendar__event-title"),
            "Actual": _cell_text(row, "td.calendar__actual span"),
            "Forecast": _cell_text(row, "td.calendar__forecast span"),
            "Previous": _cell_text(row, "td.calendar__previous span"),
        })

    return events


def scrape_forex_factory(date_str):
    """Scrape data for a specific date from ForexFactory.

    Args:
        date_str: Day token placed in the ``day=`` query parameter
            (for example ``apr20.2017``).

    Returns:
        A list of dicts with the keys ``Date``, ``Time``, ``Currency``,
        ``Impact``, ``Event Title``, ``Actual``, ``Forecast`` and ``Previous``.

    Raises:
        Whatever Selenium raises while starting Chrome or loading the page.
        The browser is always shut down before the exception propagates.
    """
    url = f"https://www.forexfactory.com/calendar?day={date_str}"
    print(f"Scraping: {url}")

    # Set up headless Chrome
    options = Options()
    for argument in (
        "--headless",
        "--disable-gpu",
        "--no-sandbox",
        "start-maximized",
        "window-size=1920,1080",
        "user-agent=Mozilla/5.0",
    ):
        options.add_argument(argument)

    driver = webdriver.Chrome(options=options)
    try:
        driver.get(url)

        # Wait for JavaScript to load content
        time.sleep(5)

        html = driver.page_source
    finally:
        # Always release the browser: this function runs in many worker threads,
        # so a failed page load must not leave an orphaned Chrome process behind.
        driver.quit()

    return _parse_calendar_rows(BeautifulSoup(html, "html.parser"))

def main():
    """Scrape every generated date concurrently and checkpoint results to CSV.

    Up to ten scraper tasks run at once. After each date that yields rows the
    accumulated data is rewritten to a temporary CSV, with an extra numbered
    backup every fifth such date and one backup per clock hour. If handling a
    date raises, the data gathered so far is dumped to a timestamped emergency
    file and the run continues. The final CSV is written once all tasks finish.
    """
    # Output locations
    output_path = "forexfactory_calendar_full.csv"
    temp_output_path = "temp_" + output_path

    records = []      # rows accumulated across all finished dates
    dates_done = 0    # number of dates that returned at least one row

    # Run multiple scrapers concurrently
    with concurrent.futures.ThreadPoolExecutor(max_workers=10) as pool:
        # Queue one task per date, remembering which date each future belongs to
        pending = {}
        for day in generate_dates():
            pending[pool.submit(scrape_forex_factory, day)] = day

        # Handle results in completion order
        for finished in concurrent.futures.as_completed(pending):
            day = pending[finished]
            try:
                day_rows = finished.result()
                if not day_rows:
                    # Nothing scraped for this date: no checkpoint needed
                    continue

                records.extend(day_rows)
                dates_done += 1
                print(f"Completed scraping for {day}, got {len(day_rows)} entries")

                # Checkpoint after every completed date
                snapshot = pd.DataFrame(records)
                snapshot.to_csv(temp_output_path, index=False)
                print(f"Saved {len(records)} entries to temporary file after {dates_done} dates")

                # Numbered backup on every fifth completed date
                if dates_done % 5 == 0:
                    backup_path = f"backup_{dates_done}_{output_path}"
                    snapshot.to_csv(backup_path, index=False)
                    print(f"Created backup at {backup_path}")

                # One time-based backup per clock hour (written only if absent)
                hour_stamp = datetime.now().strftime("%Y%m%d_%H")
                hourly_backup_path = f"hourly_backup_{hour_stamp}_{output_path}"
                if not os.path.exists(hourly_backup_path):
                    snapshot.to_csv(hourly_backup_path, index=False)
                    print(f"Created hourly backup at {hourly_backup_path}")

            except Exception as exc:
                # Preserve whatever has been collected before reporting the failure
                if records:
                    failure_stamp = datetime.now().strftime('%Y%m%d_%H%M%S')
                    error_backup_path = f"error_backup_{failure_stamp}_{output_path}"
                    pd.DataFrame(records).to_csv(error_backup_path, index=False)
                    print(f"Error occurred. Created emergency backup at {error_backup_path}")
                print(f"Error scraping {day}: {exc}")

    # Final output
    df = pd.DataFrame(records)
    df.to_csv(output_path, index=False)
    print(f"Scraping complete. Saved {len(df)} entries to {output_path}")

    print(f"Temporary file {temp_output_path} kept as additional backup")

# Start the full scraping run when this cell/script is executed directly (not on import).
if __name__ == "__main__":
    main()


Scraping: https://www.forexfactory.com/calendar?day=apr20.2017Scraping: https://www.forexfactory.com/calendar?day=apr21.2017

Scraping: https://www.forexfactory.com/calendar?day=apr22.2017
Scraping: https://www.forexfactory.com/calendar?day=apr23.2017
Scraping: https://www.forexfactory.com/calendar?day=apr24.2017
Scraping: https://www.forexfactory.com/calendar?day=apr25.2017
Scraping: https://www.forexfactory.com/calendar?day=apr26.2017
Scraping: https://www.forexfactory.com/calendar?day=apr27.2017
Scraping: https://www.forexfactory.com/calendar?day=apr28.2017
Scraping: https://www.forexfactory.com/calendar?day=apr29.2017
Scraping: https://www.forexfactory.com/calendar?day=apr30.2017
Completed scraping for apr29.2017, got 1 entries
Saved 1 entries to temporary file after 1 dates
Created hourly backup at hourly_backup_20250503_19_forexfactory_calendar_full.csv
Scraping: https://www.forexfactory.com/calendar?day=may01.2017
Completed scraping for apr22.2017, got 2 entries
Saved 3 entries 

In [None]:
!pip install selenium

Collecting selenium
  Downloading selenium-4.32.0-py3-none-any.whl.metadata (7.5 kB)
Collecting trio~=0.17 (from selenium)
  Downloading trio-0.30.0-py3-none-any.whl.metadata (8.5 kB)
Collecting trio-websocket~=0.9 (from selenium)
  Downloading trio_websocket-0.12.2-py3-none-any.whl.metadata (5.1 kB)
Collecting outcome (from trio~=0.17->selenium)
  Downloading outcome-1.3.0.post0-py2.py3-none-any.whl.metadata (2.6 kB)
Collecting wsproto>=0.14 (from trio-websocket~=0.9->selenium)
  Downloading wsproto-1.2.0-py3-none-any.whl.metadata (5.6 kB)
Downloading selenium-4.32.0-py3-none-any.whl (9.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.4/9.4 MB[0m [31m49.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading trio-0.30.0-py3-none-any.whl (499 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m499.2/499.2 kB[0m [31m23.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading trio_websocket-0.12.2-py3-none-any.whl (21 kB)
Downloading outcome-1.3.0.post0-py2.py3-