# Data Collection

### Imports

In [2]:
import pandas as pd
import yfinance as yf
import os
import numpy as np
import pickle

### Constants

In [3]:
# S&P500 Data
# Every path below is a bare file name resolved against the current working
# directory, so the notebook must be started from the folder holding the CSVs.
SANDP_500_CSV = os.path.abspath('SAndP500.csv')      # input: constituent list
SP_500_DATA_CSV = os.path.abspath('SP500_data.csv')  # output: price table as CSV
SP_500_DATA_PKL = os.path.abspath('SP500_data.pkl')  # output: pickled arrays

# Russell 3000 Data
RUSSELL_3000_CSV = os.path.abspath('russell3000StockTickersList.csv')  # input: ticker list
RUSSELL_3000_DATA_PKL = os.path.abspath('Russell3000_data.pkl')        # output: pickled arrays

### S&P500 Data Fetching

In [4]:
def get_sp500_data(start='2018-12-31', end='2020-01-01'):
    """Download daily price history for the S&P 500 constituents.

    Tickers are read from the ``Symbol`` column of ``SANDP_500_CSV`` and
    fetched with a single ``yf.download`` call using ``group_by='ticker'``;
    the result is indexed by two-level ``(ticker, field)`` columns.

    Parameters
    ----------
    start, end : str
        Date bounds forwarded unchanged to ``yf.download``. The defaults
        reproduce the window this notebook was originally run with.

    Returns
    -------
    pandas.DataFrame
        Rows sorted by their date index. Only tickers whose ``Open`` column
        contains no missing values are kept, in the order they first appear
        in the CSV.
    """
    constituents = pd.read_csv(SANDP_500_CSV)

    # Drop repeated tickers (keeping first-seen order); a repeated label would
    # otherwise be selected twice in the returned frame.
    symbol_list = list(dict.fromkeys(constituents['Symbol']))

    # NOTE(review): the captured output of this notebook shows share-class
    # tickers such as BRK.B and BF.B failing to download. Presumably the data
    # source spells them with a hyphen (BRK-B) -- confirm before relying on them.
    data = yf.download(
        tickers=symbol_list,
        start=start,
        end=end,
        interval='1d',
        group_by='ticker',
        auto_adjust=False,
        prepost=False,
        threads=True,
        proxy=None)

    # Order rows chronologically via the index itself rather than by its name.
    data = data.sort_index()

    # Keep a ticker only if it was actually returned and has an 'Open' value
    # on every row; failed downloads show up as missing/all-NaN columns.
    valid_symbols = [
        symbol for symbol in symbol_list
        if (symbol, 'Open') in data.columns
        and data[(symbol, 'Open')].notna().all()
    ]

    return data[valid_symbols]

In [5]:
# Fetch the S&P 500 price table (this triggers the yfinance download inside
# get_sp500_data) and keep it in `data` for the cells below.
data = get_sp500_data()

[*********************100%***********************]  505 of 505 completed

29 Failed downloads:
- ARNC: Data doesn't exist for startDate = 1546243200, endDate = 1577865600
- ESRX: No data found for this date range, symbol may be delisted
- APC: No data found, symbol may be delisted
- HRS: No data found, symbol may be delisted
- TSS: No data found, symbol may be delisted
- EVHC: No data found for this date range, symbol may be delisted
- MON: No data found for this date range, symbol may be delisted
- CSRA: No data found for this date range, symbol may be delisted
- KORS: No data found for this date range, symbol may be delisted
- SYMC: No data found, symbol may be delisted
- DPS: No data found for this date range, symbol may be delisted
- GGP: No data found for this date range, symbol may be delisted
- BRK.B: No data found, symbol may be delisted
- BF.B: No data found for this date range, symbol may be delisted
- VIAB: No data found, symbol may be delisted
- CELG: No data found, symbol 

In [136]:
# Write the filtered S&P 500 price table to SP_500_DATA_CSV.
data.to_csv(SP_500_DATA_CSV)

In [19]:
# Distinct first elements of the column tuples (the ticker labels), sorted.
symbols = sorted({column[0] for column in data.columns})

In [17]:
# Distinct second elements of the column tuples (the per-ticker field names), sorted.
dimensions = sorted({column[1] for column in data.columns})

In [28]:
# One 2-D array per ticker (same order as `symbols`): each row is the value
# series of one entry of `dimensions`, stacked in that order.
organized_data = []
for sym in symbols:
    ticker_frame = data[sym]
    field_rows = [ticker_frame[dim].values for dim in dimensions]
    organized_data.append(np.vstack(field_rows))

In [34]:
# Persist the S&P 500 bundle as a single (symbols, dimensions, arrays) tuple.
with open(SP_500_DATA_PKL, mode='wb') as file:
    pickle.dump(
        (symbols, dimensions, organized_data),
        file,
    )

In [38]:
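# Optional sanity check (left disabled): uncomment to reload the pickle
# written above and print its contents. Only unpickle files you created yourself.
# with open(SP_500_DATA_PKL, 'rb') as file:
#     print(pickle.load(file))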

### Russell 3000 Index Data Fetching

In [30]:
def get_russell_3000_data(start='2018-12-31', end='2020-01-01'):
    """Download daily price history for the Russell 3000 constituents.

    Tickers are read from the ``Ticker`` column of ``RUSSELL_3000_CSV`` and
    fetched with a single ``yf.download`` call using ``group_by='ticker'``;
    the result is indexed by two-level ``(ticker, field)`` columns.

    Parameters
    ----------
    start, end : str
        Date bounds forwarded unchanged to ``yf.download``. The defaults
        reproduce the window this notebook was originally run with.

    Returns
    -------
    pandas.DataFrame
        Rows sorted by their date index. Only tickers whose ``Open`` column
        contains no missing values are kept, in the order they first appear
        in the CSV.
    """
    # skiprows=3 skips the first three lines of the file (presumably a preamble
    # above the header row -- TODO confirm against the CSV).
    constituents = pd.read_csv(RUSSELL_3000_CSV, skiprows=3)[['Ticker', 'Company']]

    # Drop repeated tickers (keeping first-seen order); a repeated label would
    # otherwise be selected twice in the returned frame.
    symbol_list = list(dict.fromkeys(constituents['Ticker']))

    # NOTE(review): the captured output of this notebook shows share-class
    # tickers such as BF.A and BF.B failing to download. Presumably the data
    # source spells them with a hyphen (BF-B) -- confirm before relying on them.
    data = yf.download(
        tickers=symbol_list,
        start=start,
        end=end,
        interval='1d',
        group_by='ticker',
        auto_adjust=False,
        prepost=False,
        threads=True,
        proxy=None)

    # Order rows chronologically via the index itself rather than by its name.
    data = data.sort_index()

    # Keep a ticker only if it was actually returned and has an 'Open' value
    # on every row; failed downloads show up as missing/all-NaN columns.
    valid_symbols = [
        symbol for symbol in symbol_list
        if (symbol, 'Open') in data.columns
        and data[(symbol, 'Open')].notna().all()
    ]

    return data[valid_symbols]

In [31]:
# Fetch the Russell 3000 price table (this triggers the yfinance download
# inside get_russell_3000_data) and keep it in `russell_data` for the cells below.
russell_data = get_russell_3000_data()

[*********************100%***********************]  2938 of 2938 completed

409 Failed downloads:
- ESV: No data found, symbol may be delisted
- LFGR: No data found for this date range, symbol may be delisted
- LEXEA: No data found, symbol may be delisted
- UCP: Data doesn't exist for startDate = 1546243200, endDate = 1577865600
- ACTA: No data found for this date range, symbol may be delisted
- SFR: No data found for this date range, symbol may be delisted
- BF.B: No data found for this date range, symbol may be delisted
- BBT: No data found, symbol may be delisted
- SFS: No data found, symbol may be delisted
- CHFN: No data found for this date range, symbol may be delisted
- CASC: No data found for this date range, symbol may be delisted
- BF.A: No data found, symbol may be delisted
- AREX: No data found, symbol may be delisted
- PAH: No data found for this date range, symbol may be delisted
- NXEO: No data found, symbol may be delisted
- GLBL: No data found for this date range, symb

In [38]:
# Distinct first elements of the column tuples (the ticker labels), sorted.
# This rebinds `symbols`, replacing the S&P 500 list built earlier.
symbols = sorted({column[0] for column in russell_data.columns})

In [39]:
# Distinct second elements of the column tuples (the per-ticker field names), sorted.
dimensions = sorted({column[1] for column in russell_data.columns})

In [40]:
# One 2-D array per ticker (same order as `symbols`): each row is the value
# series of one entry of `dimensions`, stacked in that order.
organized_data = []
for sym in symbols:
    ticker_frame = russell_data[sym]
    field_rows = [ticker_frame[dim].values for dim in dimensions]
    organized_data.append(np.vstack(field_rows))

In [41]:
# Persist the Russell 3000 bundle as a single (symbols, dimensions, arrays) tuple.
with open(RUSSELL_3000_DATA_PKL, mode='wb') as file:
    pickle.dump(
        (symbols, dimensions, organized_data),
        file,
    )

In [24]:
# Number of per-ticker arrays currently held in organized_data.
# NOTE(review): this cell's execution count (24) is lower than that of the cell
# building the Russell organized_data (40), so the saved output below may be
# stale -- re-run after a fresh kernel to confirm.
len(organized_data)

100

In [29]:
# Total column count divided by 6.
# NOTE(review): 6 is presumably the number of price fields downloaded per
# ticker, which would make this the ticker count -- confirm against `dimensions`.
len(russell_data.columns)/6

2938.0