In [193]:
import numpy as np
import pandas as pd
import matplotlib as plt
import seaborn as sns
import requests
import os
from datetime import datetime
from dotenv import load_dotenv

load_dotenv()

True

### Two Types of investments
####    Long Term
####    Short Term

In [194]:
# one Exchange-Traded Fund (ETF) representing a broad market index, 
#   QQQ
#   SPY
# one index mutual fund for diversified exposure to the overall market

# one Forex pair to hedge against currency risk.

# Split them into Long Term and Short Term 25%
# QQQ
# SPY
# AGI GOLD
# MARA Energy
# REGN Pharma

# DOGE
# 

In [195]:
# Database
# Portfolio (Net value, cash balance, Realized profit, Unrealized profit)
# Investment (ID, Type, #ofShares, Avg Cost, market value)
# Buy/Sell (investment_ID, #ofShares, Date)
# candle_sticks(investment_ID, date, closePrice)

In [196]:
# Date window (YYYY-MM-DD strings) passed to both data-source helpers below
start_date, end_date = '2023-01-01', '2024-07-03'

### Gather stock data through Polygon.io and clean it using a pandas DataFrame

* Convert date from unix timestamp to datetime object
* Rename Columns     

In [197]:

def polygon(stock_ticker, date_start, date_end):
    """Fetch daily OHLCV bars for one ticker from Polygon's aggregates endpoint.

    Parameters
    ----------
    stock_ticker : str
        Ticker symbol inserted into the request path (e.g. ``'QQQ'``).
    date_start, date_end : str
        Range boundaries inserted into the request path; the notebook passes
        ``'YYYY-MM-DD'`` strings.

    Returns
    -------
    pandas.DataFrame
        Columns ``date`` (datetime with the time-of-day removed), ``open``,
        ``high``, ``low``, ``close`` and ``volume``, in the row order returned
        by the API (ascending sort is requested).

    Raises
    ------
    KeyError
        If the ``polygon_api_key`` environment variable is not set, or if the
        response contains no ``results`` (the API's own error/status text is
        included in the message).
    """
    polygon_key = os.environ['polygon_api_key']

    # Path layout: /v2/aggs/ticker/{ticker}/range/1/day/{from}/{to}
    full_url = (
        'https://api.polygon.io/v2/aggs/ticker'
        f'/{stock_ticker}/range/1/day/{date_start}/{date_end}'
    )
    params = {'adjusted': 'true',
              'sort': 'asc',
              'apiKey': polygon_key}

    # A timeout keeps a stalled connection from hanging the notebook forever.
    response = requests.get(full_url, params=params, timeout=30)
    polygon_data = response.json()

    # Error payloads and empty ranges come back without usable "results";
    # surface what the API said instead of failing on a bare key lookup.
    results = polygon_data.get("results")
    if not results:
        detail = (polygon_data.get("error")
                  or polygon_data.get("message")
                  or polygon_data.get("status"))
        raise KeyError(
            f"Polygon returned no results for {stock_ticker!r} "
            f"(HTTP {response.status_code}): {detail}"
        )

    bars = pd.DataFrame(results).rename(
        columns={'v': 'volume', 'o': 'open', 'c': 'close', 'h': 'high', 'l': 'low'}
    )

    # Unix-millisecond timestamps -> 'YYYY-MM-DD' strings -> datetimes.
    # The round trip through strings drops the time-of-day component.
    day_strings = pd.to_datetime(bars['t'], unit='ms').dt.strftime('%Y-%m-%d')
    bars['date'] = pd.to_datetime(day_strings)

    # Keep only the columns used downstream; .copy() hands back an independent
    # frame so later assignments do not raise SettingWithCopyWarning.
    return bars[['date', 'open', 'high', 'low', 'close', 'volume']].copy()


### Gather stock data through Alpha Vantage (a second source used to validate the data) and clean it using a pandas DataFrame

* Filter the entries to only include dates from 01/01/2023 to 07/03/2024
* sort by date in asc order
* Convert values from object to numerics

In [198]:
def alpha_vantage(stock_ticker, date_start, date_end):
    """Fetch daily OHLCV bars for one ticker from Alpha Vantage.

    Requests the full ``TIME_SERIES_DAILY`` history and keeps only the rows
    whose date lies within ``[date_start, date_end]`` (both ends inclusive).

    Parameters
    ----------
    stock_ticker : str
        Symbol sent as the ``symbol`` query parameter.
    date_start, date_end : str
        Window boundaries in ``'YYYY-MM-DD'`` format.

    Returns
    -------
    pandas.DataFrame
        Column ``date`` (datetime) followed by the series columns with their
        numeric prefix stripped (``open``, ``high``, ``low``, ``close``,
        ``volume``), converted to numeric and sorted by ascending date with a
        fresh 0..n-1 index.

    Raises
    ------
    KeyError
        If the ``alphavantage_api_key`` environment variable is not set, if
        the response has no ``"Time Series (Daily)"`` section, or if no rows
        fall inside the requested window.
    ValueError
        If ``date_start`` / ``date_end`` (or a date key in the response) does
        not match ``'%Y-%m-%d'``.
    """
    api_key = os.environ['alphavantage_api_key']

    params = {'function': 'TIME_SERIES_DAILY',
              'symbol': stock_ticker,
              'outputsize': 'full',
              'apikey': api_key}

    # A timeout keeps a stalled connection from hanging the notebook forever.
    alpha_response = requests.get('https://www.alphavantage.co/query',
                                  params=params, timeout=30)
    alpha_data = alpha_response.json()

    time_series = alpha_data.get("Time Series (Daily)")
    if time_series is None:
        # NOTE(review): these are the keys Alpha Vantage appears to use for
        # error / throttling messages -- confirm against current API behaviour.
        detail = next(
            (alpha_data[key] for key in ("Error Message", "Note", "Information")
             if key in alpha_data),
            list(alpha_data),
        )
        raise KeyError(
            f"Alpha Vantage returned no daily time series for {stock_ticker!r}: {detail}"
        )

    # Keep only the entries inside the requested window (inclusive); each date
    # key is parsed once.
    window_start = datetime.strptime(date_start, "%Y-%m-%d")
    window_end = datetime.strptime(date_end, "%Y-%m-%d")
    filtered_data = {
        day: values
        for day, values in time_series.items()
        if window_start <= datetime.strptime(day, "%Y-%m-%d") <= window_end
    }
    if not filtered_data:
        raise KeyError(
            f"Alpha Vantage has no rows for {stock_ticker!r} "
            f"between {date_start} and {date_end}"
        )

    # One row per date: the outer dict keys become the index.
    df_alpha = pd.DataFrame.from_dict(filtered_data, orient="index")
    df_alpha.index = pd.to_datetime(df_alpha.index)

    df_alpha = (
        df_alpha
        # "1. open" -> "open", "2. high" -> "high", ...
        .rename(columns=lambda label: label.split(". ")[1])
        # Move the date index into a regular column.
        .reset_index()
        .rename(columns={'index': 'date'})
        .sort_values(by='date', ascending=True)
        .reset_index(drop=True)
    )

    # The API delivers every value as a string; unparsable entries become NaN.
    value_cols = ['open', 'high', 'low', 'close', 'volume']
    df_alpha[value_cols] = df_alpha[value_cols].apply(pd.to_numeric, errors='coerce')

    return df_alpha
 

## Data Validation
- Summary statistics
- Comparing to other source (Alpha Vantage)

### Check for Null and NaN values through summary statistics

In [199]:
# Check for nulls
def null_check(df):
    """Return the number of null/NaN entries in each column of ``df``."""
    return df.isna().sum()

In [200]:
# df_1 holds the Polygon data, df_2 the Alpha Vantage data for the same ticker and window
df_1 = polygon('QQQ', start_date, end_date)
df_2 = alpha_vantage('QQQ', start_date, end_date)

# Per-column null counts for each source, separated by a rule
print(null_check(df_1), '________________________', null_check(df_2), sep='\n')

date      0
open      0
high      0
low       0
close     0
volume    0
dtype: int64
________________________
date      0
open      0
high      0
low       0
close     0
volume    0
dtype: int64



### Compare the dataframes obtained
* If the data matches, load it to our SQL database
* If some of the data doesn't match, replace it with the average of both dfs, then store in SQL database

In [201]:
# Strict comparison of the complete Polygon (df_1) and Alpha Vantage (df_2) frames,
# volume column included
print("DataFrames are identical." if df_1.equals(df_2) else "DataFrames are different.")

DataFrames are different.


In [202]:
# Keep only date + OHLC prices (volume dropped) for the source-to-source comparison.
# NOTE(review): df_1 comes from polygon(), so the `av` in this name is misleading;
# the name is kept because later cells reference it.
df_QQQ_av_no_vol = df_1[['date','open', 'high', 'low', 'close']]
df_QQQ_av_no_vol.head()

Unnamed: 0,date,open,high,low,close
0,2023-01-03,268.65,270.155,262.13,264.48
1,2023-01-04,266.64,267.45,262.53,265.74
2,2023-01-05,264.04,264.21,261.26,261.58
3,2023-01-06,263.35,269.94,260.34,268.8
4,2023-01-09,270.83,275.29,269.92,270.54


In [203]:
# Keep only date + OHLC prices (volume dropped) for the source-to-source comparison.
# NOTE(review): df_2 comes from alpha_vantage(), so the `poly` in this name is
# misleading; the name is kept because later cells reference it.
df_QQQ_poly_no_vol = df_2[['date','open', 'high', 'low', 'close']]
df_QQQ_poly_no_vol.head()

Unnamed: 0,date,open,high,low,close
0,2023-01-03,268.65,270.155,262.13,264.48
1,2023-01-04,266.64,267.45,262.53,265.74
2,2023-01-05,264.04,264.21,261.26,261.58
3,2023-01-06,263.35,269.94,260.34,268.8
4,2023-01-09,270.83,275.29,269.92,270.54


In [204]:
# Strict comparison of the two volume-free (date + OHLC) frames
print(
    "DataFrames are identical."
    if df_QQQ_poly_no_vol.equals(df_QQQ_av_no_vol)
    else "DataFrames are different."
)

DataFrames are identical.


In [205]:
# Cell-by-cell comparison: True wherever the two frames hold different values
mask = df_QQQ_av_no_vol != df_QQQ_poly_no_vol

# Rows containing at least one differing cell, taken from each frame in turn
rows_with_diff = mask.any(axis=1)
diff_df = df_QQQ_av_no_vol[rows_with_diff]
diff_df2 = df_QQQ_poly_no_vol[rows_with_diff]

print(
    "Differences:",
    diff_df,
    '_________________________________________',
    diff_df2,
    sep='\n',
)


Differences:
Empty DataFrame
Columns: [date, open, high, low, close]
Index: []
________________-----_________________
Empty DataFrame
Columns: [date, open, high, low, close]
Index: []
