<a href="https://colab.research.google.com/github/muchcreative/15Yr-Free-Historical-Data-and-Cleaning/blob/main/Missing_EDA.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import numpy as np
import pandas as pd
import datetime as dt
import math

import copy
import seaborn as sns
import matplotlib.pyplot as plt

from google.colab import drive
drive.mount('/content/drive')

import importlib
import sys
sys.path.append('/content/drive/MyDrive/Colab Notebooks/April/pipelines')

import loaders
import filters
import interpolators

print("Importing Complete")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Importing Complete


In [None]:
# Load the three inputs used by the rest of the notebook through the project's
# `loaders` module:
#   changes_in_sp500       - result of loaders.load_changes_in_sp500(); later
#                            indexed per ticker as [0] and [1] to slice a date range
#   iex_sp500_constituents - result of loaders.load_iex_sp500_constituents()
#   historicals            - per-ticker mapping (later code calls .keys()/.values()
#                            and indexes it by ticker symbol)
# NOTE(review): '5Yhistoricals' presumably selects the 5-year dataset - confirm
# against loaders.load_sp500_historicals.
changes_in_sp500 = loaders.load_changes_in_sp500()
iex_sp500_constituents = loaders.load_iex_sp500_constituents()
historicals = loaders.load_sp500_historicals(iex_sp500_constituents, '5Yhistoricals')
print('Data Loaded')

Data Loaded


In [None]:
# Filter out incomplete data and interpolate the rest when available.
# For now, don't edit the database directly; just run this cell when needed,
# in case you want to change the interpolators or filters later.
#
# NOTE: `historicals` is rebound three times below (after ticker removal, after
# linear interpolation, after inserting interpolated dates), so the statement
# order in this cell matters.

# Ticker passed to the filter as the one with full data.
# NOTE(review): presumably used as the reference for a complete date index - confirm in `filters`.
ticker_with_full_data = 'AAL'
# NOTE(review): presumably the maximum fraction of missing data tolerated before
# a ticker is removed - confirm in `filters`.
tolerance = 0.05

# Returns two values: the tickers/dates that need interpolation and the tickers to remove.
missing_historicals, removed_tickers = filters.find_tickers_that_require_interpolation(historicals, 
                                            changes_in_sp500, 
                                            ticker_with_full_data,
                                            tolerance)

# Drop the removed tickers before preparing any interpolation inputs.
historicals = filters.delete_removed_tickers_from_historicals(historicals, removed_tickers)

# Split the data into the known and unknown inputs used by the interpolator.
known_interpolation_data, unknown_interpolation_data = interpolators.prep_data_for_interpolation(historicals, missing_historicals)

# Linear interpolation returns the updated historicals and the solved values.
historicals, solved_interpolated_data = interpolators.linearly_interpolate_to_historicals_df(known_interpolation_data, 
                                                unknown_interpolation_data, 
                                                historicals)

# Insert the solved values back into historicals.
historicals = interpolators.insert_interpolated_dates_into_historicals(historicals, solved_interpolated_data)

In [None]:
#Length of missing data
def find_amount_of_missing_data_for_each_ticker(backtest_missing_historicals):
  """Return the number of entries stored for each ticker.

  Args:
    backtest_missing_historicals: mapping whose values are sized collections
      (anything that supports ``len``).

  Returns:
    A list holding ``len(value)`` for every value, in the mapping's own
    iteration order.
  """
  return [len(entries) for entries in backtest_missing_historicals.values()]

def find_days_spent_in_the_sp500(missing_data_summary, historicals, date_range_for_tickers_in_sp500, reference_ticker='AAL'):
  """Count the reference ticker's rows that fall inside each ticker's date range.

  For every ticker in ``missing_data_summary['Ticker']`` the reference ticker's
  data is sliced with ``.loc[start:end]`` and the number of rows is recorded.

  Args:
    missing_data_summary: object whose ``'Ticker'`` entry iterates over ticker
      symbols (e.g. a DataFrame column).
    historicals: mapping from ticker symbol to an object supporting
      ``.loc[start:end]`` label slicing.
    date_range_for_tickers_in_sp500: mapping from ticker symbol to a sequence
      whose element 0 is the slice start and element 1 is the slice end.
    reference_ticker: key in ``historicals`` whose index is used as the
      calendar. Defaults to ``'AAL'``, the value that was previously
      hard-coded, so existing callers are unaffected.

  Returns:
    A list of row counts, one per ticker, in the order of the ``'Ticker'``
    entries.

  Raises:
    KeyError: if ``reference_ticker`` is missing from ``historicals`` or a
      ticker is missing from ``date_range_for_tickers_in_sp500``.
  """
  days_spent_in_the_sp500 = []
  for ticker in missing_data_summary['Ticker']:
    date_range = date_range_for_tickers_in_sp500[ticker]
    start, end = date_range[0], date_range[1]
    # Row count of the reference calendar between start and end.
    days_spent_in_the_sp500.append(len(historicals[reference_ticker].loc[start:end]))
  return days_spent_in_the_sp500

def remove_tickers_not_in_sp500(missing_data_summary):
  """Keep only the rows whose 'Days Spent in SP500' value is non-zero.

  Args:
    missing_data_summary: DataFrame with a 'Days Spent in SP500' column.

  Returns:
    The filtered DataFrame. The input is not modified and the original index
    labels of the surviving rows are kept.
  """
  spent_time_in_index = missing_data_summary['Days Spent in SP500'] != 0
  return missing_data_summary[spent_time_in_index]

#For interval sensitivity analysis
def find_missing_data_at_different_intervals(tickers_with_missing_data, interval_size, max_days=1300):
  """Histogram the 'Missing Days Amount' values into fixed-width intervals.

  Each value ``v`` is counted in the interval starting at
  ``floor(v / interval_size) * interval_size``.

  Args:
    tickers_with_missing_data: object whose ``'Missing Days Amount'`` entry
      iterates over per-ticker missing-day counts (e.g. a DataFrame column).
    interval_size: width of every interval, in days.
    max_days: minimum upper bound of the generated intervals. Defaults to
      1300, the bound that was previously hard-coded (chosen to cover all
      missing data at 1258 days).

  Returns:
    A list of ``[interval_start, ticker_count]`` pairs in ascending
    ``interval_start`` order. It always covers ``range(0, max_days,
    interval_size)`` and is extended when a value falls beyond that bound.
    Previously such a value raised ``IndexError``.
  """
  bin_indices = [
      math.floor(missing_days_amount / interval_size)
      for missing_days_amount in tickers_with_missing_data['Missing Days Amount']
  ]

  # Keep the historical default bins, but grow to fit the largest value so
  # longer data sets (e.g. 15 years) do not index past the end of the list.
  bins_needed = len(range(0, max_days, interval_size))
  if bin_indices:
    bins_needed = max(bins_needed, max(bin_indices) + 1)

  missing_data_at_each_interval = [[i * interval_size, 0] for i in range(bins_needed)]
  for interval in bin_indices:
    missing_data_at_each_interval[interval][1] += 1
  return missing_data_at_each_interval

# Find Tickers with Missing Data #
# Build one summary row per key of `historicals`.
# NOTE(review): the 'Missing Dates' and 'Missing Days Amount' columns are built
# from `historicals` (its values and their lengths), not from the
# `missing_historicals` mapping produced in the filtering cell - confirm this
# is the intended source.
missing_data_summary = pd.DataFrame(historicals.keys(), columns=['Ticker'])
missing_data_summary['Missing Dates'] = pd.Series(historicals.values(), dtype='object')
missing_data_summary['Missing Days Amount'] = pd.Series(find_amount_of_missing_data_for_each_ticker(historicals), dtype = 'int64')
# `changes_in_sp500` is passed as the per-ticker date-range mapping.
missing_data_summary['Days Spent in SP500'] = pd.Series(find_days_spent_in_the_sp500(missing_data_summary, historicals, changes_in_sp500), dtype = 'int64')
# Drop rows whose 'Days Spent in SP500' is 0.
missing_data_summary = remove_tickers_not_in_sp500(missing_data_summary)

# Find Cumulative Sum of Missing Data if Allotted Tickers Increase #
# Sort ascending and reset the index first, so the running total follows the
# sorted order and the index can serve as the x-axis of the line plot below.
missing_data_summary = missing_data_summary.sort_values('Missing Days Amount', ascending = True)
missing_data_summary = missing_data_summary.reset_index(drop = True)
missing_data_summary['Cumulative Sum of Missing Days'] = missing_data_summary['Missing Days Amount'].cumsum()

# Find Missing Data for Different Intervals #
# Bin width, in days, for the interval histogram.
interval_size = 50
missing_data_at_each_interval = find_missing_data_at_different_intervals(missing_data_summary, interval_size)
# Column 1 is each interval's start, column 2 is the number of tickers in it.
missing_data_at_each_interval_columns = ['Missing Days Amount', 'Tickers Amount']
missing_data_at_each_interval = pd.DataFrame(missing_data_at_each_interval, columns = missing_data_at_each_interval_columns)

# Plot Both Missing Data at Each Interval and Cum Sum of Missing Data
# Left axis: bar chart of ticker counts per interval.
# Right axis: cumulative missing days against the sorted row index.
fig, ax = plt.subplots(1, 2, figsize = (30,10))
interval_plot = sns.barplot(x = 'Missing Days Amount', y = 'Tickers Amount', data = missing_data_at_each_interval, ax = ax[0])
interval_plot.set_title('Missing Days and Ticker Amount for each Interval')
cum_sum_plot = sns.lineplot(x = missing_data_summary.index, y = 'Cumulative Sum of Missing Days', data = missing_data_summary, ax=ax[1])
cum_sum_plot.set_title('Missing Days as Allotted Tickers Increase')
cum_sum_plot.set_xlabel('Ticker Amount')
plt.show()

In [None]:
#Sort by average market cap
#Sort by average realized volatility
#Sort by implied volatility
#Sort versus volatility realized and implied between months

In [None]:
## Introspective EDA and Internal Correlations ##
#Triple Correlation Between Stocks 

#Get 15 year data now
#Add VIX
#Add SPY

#Hypothesis: Similarly grouped industries and sectors have high Kendall correlations and may be able to utilize similar trading strategies
#Stocks of companies in the same industry will usually trade in the same direction, as their fundamentals are affected by market factors in the same way.

#Check Skew of stocks, month to month movements?? HUH?
#Model distribution and check for outliers and what dates those happened in
#Skew of earnings-date gains / losses
#Earning dates, and fundamentals, can you check revised earning estimates, priced in vs what is not priced in

#Should try to understand some weakness of the model

#SP500 Common Industries and Stock Correlations
#SP500 Financials
#SP500 Valuation Fundamentals
#Company

In [None]:
## Interspective EDA and External Correlations ##

#Indicators and Top Momentum Stocks
#Indicators and Top Market Cap Stocks

In [None]:
## Volatility EDA ##
#Will need to pull vix from yahoo, can de-noise areas as well for future model ensembling

#Correlation between indicators and realized volatility
#Implied volatility from VIX and historical realized volatility, take a look at Bollinger Bands and average true range
#Internal spikes and deviations from VIX, try de-noising (Wavelet Denoising or Moving Average Denoising)

In [None]:
#Basic EDA and Fundamental Theory, Over long periods of time, companies will obtain their true valuation

#Define what fundamentals you want
#Fundamental valuations endpoint
#Fundamental filing date and earnings report date

#Only EDA on current SP500 and every 5 years
#Start EDA with current SP500

#What will EDA look like?

#Percentage of stocks that generated the top returns, graphed
#How high volatility is
#Compare growth metrics and volatility for the top companies
#Beta of the stocks
#Compare market cap
#Compare frequency of volatility

#1.	Percentage gains for each stock in comparison to market cap
#2.	Volatility distribution of each stock
#3.	Beta, covariance, and variance compared to sp500 index and nasdaq