In [2]:
import pandas as pd
import os

For demonstration, we use only the stocks in the S&P 500 Index.

In [3]:
# Ticker symbols read from the 'Symbol' column of the constituents file.
ticks_snp = (
    pd.read_csv('../data/raw/S&P500_constituents.csv')
    .loc[:, 'Symbol']
    .values
)

The raw data consists of .csv files, one per stock, each containing market data such as `$open`, `$close`, `$high`, `$low` and `$volume`. We convert them into data frames covering all stocks of interest, one data frame per data type. To keep training time short in this demo, we also restrict the data to a limited time frame, the years 2018 and 2019.

In [4]:
in_path = '../data/raw/stocks'
out_path = '../data/processed'
# Create the output folder up front so a fresh checkout can run end to end.
os.makedirs(out_path, exist_ok=True)

# Read each constituent's raw file once and keep only the 2018-2019 rows.
# Reading inside the per-column loop would repeat the same file I/O five times.
# Keyed by ticker, so a ticker listed twice still contributes a single column.
frames_by_tick = {}
for tick in ticks_snp: # Subset to stocks in S&P500
    file_path = os.path.join(in_path, tick + '.csv')
    if not os.path.exists(file_path):
        continue  # constituents without a raw file are skipped
    df = pd.read_csv(file_path, index_col=0)
    # Subset to Year 2018-2019. This is a string comparison on the index;
    # it assumes the first CSV column holds ISO-style dates (YYYY-MM-DD) -- TODO confirm.
    frames_by_tick[tick] = df[(df.index > '2018') & (df.index < '2020')]

if not frames_by_tick:
    # pd.concat would fail below with an opaque "No objects to concatenate".
    raise ValueError('No raw stock files found in ' + in_path + ' for the given tickers')

# Write one wide table per data type: rows from the file index, one column per ticker.
for col in ['Open', 'Close', 'High', 'Low', 'Volume']:
    dfs = [
        df_year[[col]].round(2).rename(columns={col: tick})
        for tick, df_year in frames_by_tick.items()
    ]
    df_comb = pd.concat(dfs, axis=1).sort_index()
    # Order columns alphabetically and drop rows where every stock is missing.
    df_comb = df_comb[sorted(df_comb.columns)].dropna(how='all')
    df_comb.to_csv(os.path.join(out_path, col + '.csv'))

We use the $open data to compute the daily forward return, which will be used as the response that we want to predict.

In [5]:
# Load the processed open-price table, using its first column as the index.
open_csv = os.path.join(out_path, 'Open.csv')
df_open = pd.read_csv(open_csv, index_col=0)
# Shift rows up by one so each row holds the following row's open;
# the last row has no successor and becomes NaN.
df_open_forward1 = df_open.shift(-1)

In [6]:
# Relative change from each row's open to the next row's open, rounded to 4 decimals.
return_csv = os.path.join(out_path, 'ForwardReturn.csv')
open_ratio = df_open_forward1 / df_open
forward_return = (open_ratio - 1).round(4)
forward_return.to_csv(return_csv)