In [None]:
import requests
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq
from datetime import datetime, timedelta

In [None]:

API_KEY = 'Fjw1hBBdbh0XIrbYIQkpFZcAez1vJhDe'
BASE_URL = 'https://financialmodelingprep.com/api/v3/historical-price-full'

symbols = ['AAPL', 'NVDA', 'MSFT', 'GOOGL', 'AMZN', 'GOOG', 'META', 'TSLA', 'AVGO', 'COST', 
          'NFLX', 'ASML', 'TMUS', 'CSCO', 'LIN', 'ISRG', 'PEP', 'AZN', 'AMD', 'ADBE', 
          'QCOM', 'TXN', 'INTU', 'BKNG', 'PLTR', 'ARM', 'AMGN', 'AMAT', 'HON', 'CMCSA',
          'PDD', 'ADP', 'GILD', 'MU', 'PANW', 'APP', 'ADI', 'VRTX', 'SBUX', 'MRVL',
          'LRCX', 'KLAC', 'CEG', 'MELI', 'PYPL', 'MSTR', 'CRWD', 'INTC', 'CDNS', 'ABNB',
          'CTAS', 'MAR', 'SNPS', 'MDLZ', 'REGN', 'FTNT', 'DASH', 'ORLY', 'WDAY', 'TEAM',
          'ADSK', 'CSX', 'TTD', 'PCAR', 'CHTR', 'ROP', 'CPRT', 'NXPI', 'PAYX', 'FANG',
          'AEP', 'ROST', 'DDOG', 'MNST', 'LULU', 'BKR', 'AXON', 'FAST', 'KDP', 'ODFL',
          'GEHC', 'VRSK', 'CTSH', 'EXC', 'XEL', 'EA', 'IDXX', 'KHC', 'CCEP', 'TTWO',
          'DXCM', 'MCHP', 'ANSS', 'CSGP', 'ZS', 'CDW', 'WBD', 'ON', 'GFS', 'BIIB', 'MDB']

In [37]:
symbols = [
    'AMD', 'TTD', 'EXC', 'GRUB', 'TSM', 'BKNG'
]

In [38]:

all_data = []
from_date = (datetime.now() - timedelta(days=5*365)).strftime('%Y-%m-%d')

for symbol in symbols:
    url = f'{BASE_URL}/{symbol}?from={from_date}&apikey={API_KEY}'
    response = requests.get(url)
    data = response.json()
    
    if 'historical' in data:
        for entry in data['historical']:
            record = {
                'symbol': symbol,
                'date': entry['date'],
                'open': entry['open'],
                'high': entry['high'],
                'low': entry['low'],
                'close': entry['close'],
                'volume': entry['volume']
            }
            all_data.append(record)
    print(f"Processed {symbol}")

df_final = pd.DataFrame(all_data)
table = pa.Table.from_pandas(df_final)
pq.write_table(table, 'nasdaq_daily.parquet')

Processed AMD
Processed TTD
Processed EXC
Processed GRUB
Processed TSM
Processed BKNG


In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import math

df = pd.read_parquet('nasdaq_daily.parquet')

def plot_all_stocks(df):
    symbols = df['symbol'].unique()
    n = len(symbols)
    cols = 5
    rows = math.ceil(n/cols)
    
    fig, axs = plt.subplots(rows, cols, figsize=(20, 4*rows))
    axs = axs.ravel()
    
    for idx, symbol in enumerate(symbols):
        data = df[df['symbol'] == symbol]
        axs[idx].plot(pd.to_datetime(data['date']), data['close'])
        axs[idx].set_title(symbol)
        axs[idx].tick_params(axis='x', rotation=45)
    
    plt.tight_layout()
    plt.show()

plot_all_stocks(df)

In [54]:
import pandas as pd

df = pd.read_parquet('nasdaq_daily.parquet')

print(f"Absolute earliest date: {df['date'].min()}")
print(f"Absolute latest date: {df['date'].max()}")

symbol_count = df['symbol'].nunique()
print(f"\nTotal unique symbols: {symbol_count}")

daily_symbol_count = df.groupby('date')['symbol'].count()
complete_dates = daily_symbol_count[daily_symbol_count == symbol_count]

print(f"\nEarliest date with all symbols: {complete_dates.index.min()}")
print(f"Latest date with all symbols: {complete_dates.index.max()}")

Absolute earliest date: 2020-01-21
Absolute latest date: 2025-01-16

Total unique symbols: 96

Earliest date with all symbols: 2020-09-30
Latest date with all symbols: 2022-03-11


In [None]:
import pandas as pd
import numpy as np

df = pd.read_parquet('nasdaq_daily_cleaned.parquet')

# Convert date to datetime if not already
df['date'] = pd.to_datetime(df['date'])

# Get trading days per symbol
symbol_days = df.groupby('symbol')['date'].agg(['count', 'min', 'max'])

# Get unique trading days across all symbols
all_trading_days = df['date'].unique()
total_trading_days = len(all_trading_days)

print(f"Total unique trading days in dataset: {total_trading_days}")
print("\nPer Symbol Analysis:")
print(f"{'Symbol':<10} {'Days':<8} {'Expected':<10} {'Missing':<10} {'Start':<12} {'End':<12}")
print("-" * 70)

for symbol in df['symbol'].unique():
    symbol_data = symbol_days.loc[symbol]
    days_count = symbol_data['count']
    start_date = symbol_data['min']
    end_date = symbol_data['max']
    
    # Calculate expected trading days for this symbol's date range
    mask = (all_trading_days >= start_date) & (all_trading_days <= end_date)
    expected_days = mask.sum()
    
    missing = expected_days - days_count
    
    print(f"{symbol:<10} {days_count:<8} {expected_days:<10} {missing:<10} {start_date.strftime('%Y-%m-%d')} {end_date.strftime('%Y-%m-%d')}")

# Check if we have any dates that aren't business days
print("\nChecking for non-business days:")
all_dates_df = pd.DataFrame({'date': all_trading_days})
all_dates_df['is_business_day'] = all_dates_df['date'].dt.dayofweek < 5
weekend_dates = all_dates_df[~all_dates_df['is_business_day']]['date']

if len(weekend_dates) > 0:
    print(f"\nFound {len(weekend_dates)} weekend dates in the data!")
    print("First few weekend dates:", weekend_dates[:5].tolist())

Total unique trading days in dataset: 1256

Per Symbol Analysis:
Symbol     Days     Expected   Missing    Start        End         
----------------------------------------------------------------------
AAPL       1256     1256       0          2020-01-21 2025-01-16
ADBE       1256     1256       0          2020-01-21 2025-01-16
ADI        1256     1256       0          2020-01-21 2025-01-16
ADP        1256     1256       0          2020-01-21 2025-01-16
ADSK       1256     1256       0          2020-01-21 2025-01-16
AEP        1256     1256       0          2020-01-21 2025-01-16
AMAT       1256     1256       0          2020-01-21 2025-01-16
AMD        1257     1256       -1         2020-01-21 2025-01-16
AMGN       1256     1256       0          2020-01-21 2025-01-16
AMZN       1256     1256       0          2020-01-21 2025-01-16
ANSS       1256     1256       0          2020-01-21 2025-01-16
ASML       1256     1256       0          2020-01-21 2025-01-16
AVGO       1256     1256    

In [55]:
import pandas as pd

df = pd.read_parquet('nasdaq_daily.parquet')

# Remove GRUB and PLTR
df = df[~df['symbol'].isin(['GRUB', 'PLTR'])]

# Find duplicates
dupes = df.groupby(['symbol', 'date']).size().reset_index(name='count')
dupes = dupes[dupes['count'] > 1]
print("Duplicates found:")
print(dupes)

# Remove duplicates and sort
df = df.drop_duplicates(['symbol', 'date'])
df = df.sort_values(['symbol', 'date'])

# Verify cleanup
print("\nSymbols remaining:", df['symbol'].nunique())
print("Days per symbol:")
print(df.groupby('symbol')['date'].count().sort_values())

# Save cleaned data
df.to_parquet('nasdaq_daily_cleaned.parquet')

Duplicates found:
       symbol        date  count
10047     AMD  2025-01-16      2
21351    BKNG  2025-01-16      2
43959     EXC  2025-01-16      2
108015    TTD  2025-01-16      2

Symbols remaining: 94
Days per symbol:
symbol
AAPL    1256
ON      1256
ODFL    1256
NXPI    1256
NVDA    1256
        ... 
CSGP    1256
CSCO    1256
CRWD    1256
EXC     1256
ZS      1256
Name: date, Length: 94, dtype: int64


## Data Cleaning