In [1]:
import os
import time
from datetime import datetime, timedelta, timezone
from xml.dom.domreg import well_known_implementations

import pandas as pd
import plotly.graph_objects as go
import requests

In [12]:
# Polygon.io API key.
# The POLYGON_API_KEY environment variable takes precedence so the credential
# does not have to live in the notebook; the literal is only a fallback that
# keeps existing runs working.
# SECURITY: this fallback key is stored in plain text -- rotate it and delete
# the literal once the environment variable is configured.
API_KEY = os.environ.get('POLYGON_API_KEY', '2l_X1NgaJhbmxY0irf8XbrlrRF4Y_cy4')

# Module-level slots for fetched frames so later cells can reuse them.
# main() assigns df_1min; the other two are placeholders that nothing in this
# notebook writes to yet.
df_1min = None
df_15min = None
df_1day = None


In [13]:
def fetch_data(symbol, start_date, end_date, multiplier=1, timespan='minute', timeout=30):
    """Fetch aggregate (OHLCV) bars for one ticker from the Polygon.io REST API.

    Args:
        symbol: Ticker symbol inserted into the request path (e.g. 'SPY').
        start_date: Start of the range, formatted as the API expects in the
            URL path (callers in this notebook pass 'YYYY-MM-DD' strings).
        end_date: End of the range, same format as ``start_date``.
        multiplier: Size of each bar in units of ``timespan``. Defaults to 1.
        timespan: Bar unit placed in the URL path, e.g. 'minute' or 'day'.
            Defaults to 'minute', which matches the previous hard-coded value.
        timeout: Seconds to wait for the HTTP request before requests raises
            a timeout error, so a stalled connection cannot hang the notebook.

    Returns:
        The list stored under the 'results' key of the JSON response, or an
        empty list when that key is missing or null.

    Raises:
        requests.exceptions.RequestException: On connection errors or timeout.
        ValueError: If the response body is not valid JSON.
    """
    url = (
        f'https://api.polygon.io/v2/aggs/ticker/{symbol}/range/'
        f'{multiplier}/{timespan}/{start_date}/{end_date}'
    )
    params = {
        'adjusted': 'false',  # 'false' requests raw prices, i.e. NOT split-adjusted
        'sort': 'asc',        # oldest bar first
        'limit': 50000,       # Max data points per call
        'apiKey': API_KEY,
    }
    response = requests.get(url, params=params, timeout=timeout)

    # Surface HTTP failures (bad key, rate limit, ...) instead of silently
    # returning an empty list; callers still receive [] and can carry on.
    if not response.ok:
        print(f"Warning: Polygon.io returned HTTP {response.status_code} for {symbol}")

    data = response.json()

    # A missing key, a null value and a non-dict payload all mean "no bars".
    results = data.get('results') if isinstance(data, dict) else None
    return results or []


In [14]:
def convert_timestamp(ts):
    """Convert an epoch timestamp in milliseconds to a naive UTC datetime.

    Polygon.io reports bar times in milliseconds, so the value is divided by
    1000 before conversion.

    ``datetime.utcfromtimestamp`` is deprecated since Python 3.12; the
    timezone-aware conversion below yields the same wall-clock value, and the
    tzinfo is stripped so callers keep receiving naive datetimes as before.

    Args:
        ts: Milliseconds since the Unix epoch (int or float).

    Returns:
        A naive ``datetime`` expressed in UTC.
    """
    return datetime.fromtimestamp(ts / 1000, tz=timezone.utc).replace(tzinfo=None)

In [15]:
def is_data_complete(data, start_time, end_time):
    """Report whether a fetched chunk of bars is usable.

    Currently the only condition that fails the check is an empty ``data``.
    The per-minute gap analysis below is computed but deliberately not acted
    on: per the original author's note, the expected grid also covers
    non-market days and hours, so a non-empty ``missing`` set does not by
    itself indicate a bad download.

    Args:
        data: List of bar dicts, each with a 't' key holding epoch milliseconds.
        start_time: ``datetime`` marking the start of the requested window.
        end_time: ``datetime`` marking the end of the requested window.

    Returns:
        False (after printing a warning) when ``data`` is empty, True otherwise.
    """
    if not data:
        print("Warning: No data received!")
        return False

    # Bar times as naive UTC datetimes (avoids the deprecated utcfromtimestamp).
    received_timestamps = {
        datetime.fromtimestamp(bar['t'] / 1000, tz=timezone.utc).replace(tzinfo=None)
        for bar in data
    }

    # Build the expected one-minute grid. The start is snapped to a whole
    # minute: callers derive it from datetime.now(), and leftover seconds or
    # microseconds would make every expected value miss the received bars.
    first_minute = start_time.replace(second=0, microsecond=0)
    one_minute = timedelta(minutes=1)
    step_count = (end_time - first_minute) // one_minute  # negative if the window is inverted
    expected_timestamps = {
        first_minute + k * one_minute for k in range(step_count + 1)
    }

    # Minutes with no bar. Not sufficient as a completeness test on its own,
    # because it includes non-market days and times as well.
    missing = expected_timestamps - received_timestamps

    # Condition for checking if data is complete (disabled until the expected
    # grid is restricted to market hours).
    # if missing:
    #     print(f"Warning: {len(missing)} missing minutes of data.")
    #     return False

    return True

In [16]:
def aggregate_daily(data):
    """Aggregate intraday bars into one OHLCV row per UTC calendar date.

    Args:
        data: Iterable of bar records with keys 't' (epoch milliseconds),
            'o', 'h', 'l', 'c' and 'v'.

    Returns:
        A DataFrame with columns date, open, high, low, close, volume, one
        row per date in ascending order. Empty input yields an empty frame
        with those same columns instead of raising KeyError.
    """
    columns = ['date', 'open', 'high', 'low', 'close', 'volume']

    df = pd.DataFrame(data)
    if df.empty:
        return pd.DataFrame(columns=columns)

    # 'first'/'last' below pick the open/close by row position, so the bars
    # must be in chronological order regardless of how the caller built them.
    df = df.sort_values('t', kind='stable')

    # Vectorized ms-epoch -> naive UTC timestamp, then drop the time of day.
    df['timestamp'] = pd.to_datetime(df['t'], unit='ms')
    df['date'] = df['timestamp'].dt.date

    # Group by date and calculate daily OHLCV
    daily_data = df.groupby('date').agg(
        open=('o', 'first'),
        high=('h', 'max'),
        low=('l', 'min'),
        close=('c', 'last'),
        volume=('v', 'sum')
    ).reset_index()

    return daily_data


In [17]:
def create_daily_candlestick_plot(symbol='AAPL', days=90, output_path='daily_candlestick_plot.png'):
    """Fetch recent bars for a ticker and draw a daily candlestick chart.

    The bars come from fetch_data() (1-minute bars by default) and are
    collapsed to one candle per date with aggregate_daily(). Previously the
    raw minute bars were plotted against their date, which stacked every
    intraday bar of a day onto the same x position.

    Args:
        symbol: Ticker to plot. Defaults to 'AAPL', the previous fixed value.
        days: Length of the look-back window ending now. Defaults to 90.
        output_path: File the figure is written to with ``fig.write_image``.

    Returns:
        None. Prints "No data received" and returns early when the fetch
        yields nothing; otherwise shows the figure and saves it to disk.
    """
    end_date = datetime.now()
    start_date = end_date - timedelta(days=days)

    # The API path takes plain calendar dates.
    start_str = start_date.strftime('%Y-%m-%d')
    end_str = end_date.strftime('%Y-%m-%d')

    # NOTE(review): fetch_data requests at most 50000 bars per call, so a long
    # window of minute bars may come back truncated -- confirm for large `days`.
    data = fetch_data(symbol, start_str, end_str)

    # Check if data was received
    if not data:
        print("No data received")
        return

    # Collapse the intraday bars into one OHLCV row per date.
    daily_data = aggregate_daily(data)

    # Create the candlestick chart using Plotly
    candles = go.Candlestick(
        x=daily_data['date'],
        open=daily_data['open'],
        high=daily_data['high'],
        low=daily_data['low'],
        close=daily_data['close'],
    )
    fig = go.Figure(data=[candles])

    # Title reflects the ticker and window actually plotted.
    fig.update_layout(
        title=f'{symbol} Candlestick Chart for Last {days} Days (Daily)',
        xaxis_title='Date',
        yaxis_title='Price (USD)',
        xaxis_rangeslider_visible=False,  # Disable range slider
    )

    # Show the plot
    print(f"\n Plotting daily candles for {symbol} over the last {days} days")
    fig.show()

    # Save the graph to a file.
    # NOTE(review): static image export needs an export engine such as kaleido
    # to be installed -- confirm in the target environment.
    fig.write_image(output_path)

In [18]:
def main(symbol='SPY', duration_days=7):
    """Download recent bars for a ticker, cache them globally and save to Excel.

    The window ends yesterday and spans ``duration_days`` days. Each chunk is
    fetched with fetch_data(), checked with is_data_complete() and, if it
    passes, appended exactly once (the earlier version appended every chunk
    twice, duplicating each bar). The combined frame is stored in the global
    ``df_1min`` and written under data/generated_data/.

    Args:
        symbol: Ticker to download. Defaults to 'SPY', the previous fixed value.
        duration_days: Length of the window in days. Defaults to 7.

    Returns:
        None. When no chunk yields data, prints a message and returns without
        touching ``df_1min`` or writing a file.
    """
    global df_1min

    # Bar size used for labelling the output file.
    # When switching between minute and day bars, also change:
    # - the timespan requested by fetch_data
    # - which global df_* variable receives the result at the end
    multiplier_timespan = 1
    timespan_tick = 'minute'  # 'day' or 'minute'

    # Yesterday: per the original author's note, the free Polygon.io tier
    # serves end-of-day data, so today's bars are not available yet.
    end_date = datetime.now() - timedelta(days=1)
    start_date = end_date - timedelta(days=duration_days)

    # A single request covers the whole window. Raise this to split long
    # minute-bar windows into several calls (e.g. ~2-month chunks).
    num_calls = 1

    delta = (end_date - start_date) / num_calls
    date_ranges = [(start_date + i * delta, start_date + (i + 1) * delta) for i in range(num_calls)]

    all_data = []

    for i, (start, end) in enumerate(date_ranges):
        chunk_start_str = start.strftime('%Y-%m-%d')
        chunk_end_str = end.strftime('%Y-%m-%d')
        print(f"\nFetching data from {chunk_start_str} to {chunk_end_str} (API call {i+1}/{num_calls})")

        data = fetch_data(symbol, chunk_start_str, chunk_end_str)

        # Validate before adding; each chunk is appended once only.
        if is_data_complete(data, start, end):
            print(f"Adding Data from API call {i+1}/{num_calls} to complete data set")
            all_data.extend(data)
        else:
            print(f"Data incomplete for {chunk_start_str} to {chunk_end_str}, skipping...")

        # Enforce rate limit: wait 15 seconds between calls, but not after the last one.
        if i < num_calls - 1:
            print("Waiting 15 seconds to respect rate limit...")
            time.sleep(15)

    # Nothing to convert or save; indexing df['t'] on an empty frame would raise.
    if not all_data:
        print("No data collected; nothing to save.")
        return

    # Convert data to DataFrame
    df = pd.DataFrame(all_data)
    df['timestamp'] = df['t'].apply(convert_timestamp)
    df = df[['timestamp', 'o', 'h', 'l', 'c', 'v']]  # Keep relevant columns

    # Expose the frame to later cells.
    df_1min = df

    # Name the file after the full requested range, not the last chunk's range.
    range_start_str = start_date.strftime('%Y-%m-%d')
    range_end_str = end_date.strftime('%Y-%m-%d')
    file_name = f'{symbol}_Historical_Data_polygon_{multiplier_timespan}{timespan_tick}_dur{duration_days}d_{range_start_str}_to_{range_end_str}.xlsx'

    # Save to Excel, creating the output directory on first use.
    output_dir = os.path.join('data', 'generated_data')
    os.makedirs(output_dir, exist_ok=True)
    df.to_excel(os.path.join(output_dir, file_name), index=False)
    print(f"\nData saved to {file_name}")


In [19]:
# Entry point: start the download only when this code is run directly.
if __name__ == "__main__":
    main()



Fetching data from 2025-01-30 to 2025-02-06 (API call 1/1)
Adding Data from API call 1/1 to complete data set

Data saved to SPY_Historical_Data_polygon_1minute_dur7d_2025-01-30_to_2025-02-06.xlsx


In [20]:
# create_daily_candlestick_plot()

# daily_data = df_SPY_1day
# daily_data['date'] = daily_data['timestamp'].dt.date
#
# fig = go.Figure(data=[go.Candlestick(x=daily_data['date'],
#                                          open=daily_data['o'],
#                                          high=daily_data['h'],
#                                          low=daily_data['l'],
#                                          close=daily_data['c'])])
#
# # save the graph to a file
# fig.write_image('daily_candlestick_plot.png')
#
# print(f"\n Plotting data for 2 year using 1 day data)")
# fig.show()
