<a href="https://colab.research.google.com/github/Amelrich/Capstone-Fall-2020/blob/kassie-preprocessing/preprocessing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import numpy as np
import pandas as pd
import pandas_datareader.data as web
import matplotlib.pyplot as plt
import tensorflow as tf

In [None]:
!pip install pandas_market_calendars

In [3]:
import pandas_market_calendars as mcal

In [4]:
# First and last calendar day of the stock history to download
start_date, end_date = (
    pd.to_datetime(day) for day in ('1999-11-18', '2020-09-02')
)

In [5]:
# Download the S&P 500 constituents table and keep its ticker symbols,
# sorted alphabetically
symbols = sorted(
    pd.read_csv(
        'https://raw.githubusercontent.com/Amelrich/Capstone-Fall-2020/kassie-preprocessing/sp500.csv',
        index_col=False,
    )['Symbol'].values
)

In [6]:
# The two dotted share-class tickers are rewritten with a dash; every other
# symbol is kept unchanged.
symbols = [
    'BF-B' if x == 'BF.B' else 'BRK-B' if x == 'BRK.B' else x
    for x in symbols
]

### Set start & end date and find the trading days between them
Using the `pandas_market_calendars` package, find all the trading days within a given range of dates.

In [7]:
# get trading days calendar
def create_market_cal(start, end):
    """Return the NYSE trading days between ``start`` and ``end``.

    Parameters
    ----------
    start, end : date-like
        Bounds handed to the NYSE calendar's ``schedule`` method.

    Returns
    -------
    list of pandas.Timestamp
        One timezone-naive timestamp per schedule entry, with the time of day
        reset to midnight.
    """
    nyse = mcal.get_calendar('NYSE')
    schedule = nyse.schedule(start, end)
    market_cal = mcal.date_range(schedule, frequency='1D')
    # Strip the timezone, then zero the whole time-of-day. Replacing only the
    # hour would leave any non-zero minutes/seconds in place, and such a
    # timestamp would not be a clean midnight date.
    market_cal = market_cal.tz_localize(None).normalize()
    return list(market_cal)


# Create the trading-day calendar for the date range. start_date and end_date
# are already defined in the configuration cell near the top of the notebook;
# they are not re-declared here so the two copies cannot drift apart.
calendar = create_market_cal(start_date, end_date)



### Split the calendar dates in chunks of length 150
We do this because we are interested in data points that capture multiple time scales. We want to capture a stock's behavior over 50, 100, and 150 days, and have all of this information in one data point that has a fixed length of 50.

In [8]:
# divide the calendar in 150 day chunks
def divide_chunks(l, n):
    """Lazily yield consecutive slices of ``l`` holding at most ``n`` items.

    The final slice is shorter than ``n`` when ``len(l)`` is not a multiple
    of ``n``.
    """
    # start offset of every slice: 0, n, 2n, ...
    starts = range(0, len(l), n)
    yield from (l[start:start + n] for start in starts)

# split the trading days between start and end date into chunks of n days
n = 150
list_of_date_chunk = list(divide_chunks(calendar, n))

# keep only complete chunks: a trailing chunk shorter than n days is dropped,
# but a final chunk that is exactly n days long is no longer thrown away
list_of_date_chunk = [chunk for chunk in list_of_date_chunk if len(chunk) == n]
len(list_of_date_chunk[-1])

150

### How we achieve multiscale data points

Two methods

* By picking prices every day, every 2nd day, and every 3rd day (over the last 50, 100, and 150 days respectively)

* By picking consecutive prices for the last 50, 100, and 150 days (the 50- and 100-day series are padded with leading zeros to a length of 150).

Please see methods `get_multiscale_skipped_values` and `get_multiscale_consecutive_values`



In [9]:
def get_multiscale_skipped_values(df_start_end):
  """Sample the 'Adj Close' column at three strides from the end of the frame.

  Returns a dict with keys '50_days', '100_days' and '150_days' holding the
  last 50 rows, every 2nd row of the last 100 rows, and every 3rd row of the
  last 150 rows respectively.
  """
  adj_close = df_start_end['Adj Close'].values
  return {
      '50_days': adj_close[-50:],
      '100_days': adj_close[-100::2],
      '150_days': adj_close[-150::3],
  }

def get_multiscale_consecutive_values(df_start_end):
  """Return the last 50/100/150 consecutive 'Adj Close' prices, zero-padded.

  Each series is left-padded with zeros to a common length of 150, so the
  three can be stacked as columns of one data point.

  Parameters
  ----------
  df_start_end : pandas.DataFrame
      Frame with an 'Adj Close' column. Other columns are ignored, so the
      frame no longer has to contain exactly three columns.

  Returns
  -------
  dict
      Keys '50_days', '100_days' and '150_days', each a float numpy array of
      length 150.
  """
  window = 150
  # Work on the price column only: padding the whole frame forced the numeric
  # prices through a mixed-type array alongside the string 'Symbol' column.
  adj_close = df_start_end['Adj Close'].to_numpy(dtype=float)

  def _left_pad(values):
    # prepend as many zeros as needed to reach `window` entries
    return np.concatenate([np.zeros(window - len(values)), values])

  return {
      '50_days': _left_pad(adj_close[-50:]),
      '100_days': _left_pad(adj_close[-100:]),
      '150_days': _left_pad(adj_close[-150:]),
  }

def scrape_yahoo(stock_name, start_date, end_date):
  """Download the price history of one stock from Yahoo.

  Parameters
  ----------
  stock_name : str
      Ticker symbol passed to the data reader.
  start_date, end_date : date-like
      Bounds of the requested history.

  Returns
  -------
  (pandas.DataFrame, int)
      On success, a frame with columns 'Adj Close', 'Volume' and 'Symbol' and
      the flag 1. If the lookup raises ``KeyError``, an empty frame and the
      flag 0. Any other exception propagates to the caller.
  """
  try:
    raw = web.DataReader(stock_name, 'yahoo', start_date, end_date)
    # copy the column subset so adding 'Symbol' does not write into a slice
    # of the downloaded frame
    df = raw[['Adj Close', 'Volume']].copy()
    df['Symbol'] = stock_name
    return df, 1
  except KeyError:
    # the message previously lacked the '{}' placeholder, so the stock name
    # was never printed
    print("Could not find data on {}".format(stock_name))
    return pd.DataFrame(), 0

# Per-stock results: each entry appended below is the list of data points
# (or names) collected for one symbol.
total_prices_list_skipped_values = []
# NOTE: despite its name, this list receives the consecutive-values price
# data points, not labels. The name is kept because the flattening cell
# further down reads it.
total_labels_list_consecutive_values = []
total_labels_list = []
# A dedicated counter is used for progress reporting so that the chunk size
# `n` defined in an earlier cell is not overwritten by this loop.
for stocks_done, stock_name in enumerate(symbols, start=1):
  if stocks_done%100 == 0:
    print("{} stocksout of {} completed".format(stocks_done,len(symbols)))
  stock_df, find_flag = scrape_yahoo(stock_name, start_date, end_date)

  if find_flag == 0:
    print("Could not find data on {}".format(stock_name))
    continue

  prices_list_skipped_values = []
  prices_list_consecutive_values = []
  stock_name_list_values = []

  for item in list_of_date_chunk:
    start = item[0]
    end = item[-1]

    df_start_end = stock_df.loc[start:end]
    # skip chunks for which the stock has fewer than 150 rows of prices
    if len(df_start_end) < 150:
      continue

    dictionary_skipped_values = get_multiscale_skipped_values(df_start_end)
    dictionary_consecutive_values = get_multiscale_consecutive_values(df_start_end)

    # one 2-D array per chunk, with one column per time scale
    datapoint_skipped_values = pd.DataFrame(dictionary_skipped_values).to_numpy()
    datapoint_consecutive_values = pd.DataFrame(dictionary_consecutive_values).to_numpy()

    prices_list_skipped_values.append(datapoint_skipped_values)
    prices_list_consecutive_values.append(datapoint_consecutive_values)
    stock_name_list_values.append(stock_name)

  total_prices_list_skipped_values.append(prices_list_skipped_values)
  total_labels_list_consecutive_values.append(prices_list_consecutive_values)
  total_labels_list.append(stock_name_list_values)
  #print('Stock {} is done'.format(stock_name))

100 stocksout of 505 completed
200 stocksout of 505 completed
300 stocksout of 505 completed
400 stocksout of 505 completed
500 stocksout of 505 completed


In [12]:
# Remove the per-stock nesting: each flattened list holds one entry per data
# point, across all stocks, in the order the stocks were processed.
flattened_list_skipped_values = [
    datapoint
    for per_stock in total_prices_list_skipped_values
    for datapoint in per_stock
]
flattened_list_consecutive_values = [
    datapoint
    for per_stock in total_labels_list_consecutive_values
    for datapoint in per_stock
]
flattened_list_stock_names = [
    name
    for per_stock in total_labels_list
    for name in per_stock
]

In [15]:
# Persist the three flattened lists as .npy files
np.save(
    file="/content/drive/My Drive/capstone/skipped_values.npy",
    arr=flattened_list_skipped_values,
)
np.save(
    file="/content/drive/My Drive/capstone/consecutive_values.npy",
    arr=flattened_list_consecutive_values,
)
np.save(
    file="/content/drive/My Drive/capstone/stock_names.npy",
    arr=flattened_list_stock_names,
)