In [21]:
import os
from tqdm.notebook import tqdm
import pandas as pd
import warnings
warnings.filterwarnings('ignore')

In [22]:
# Build one column of daily "open -> +2h30m close" returns per ticker file in
# DATA_DIR, align all tickers on their common timestamps, and save the result.
DATA_DIR = './data/'
COMBINED_NAME = 'combined.csv'  # written by this cell; must never be read back as input

# Only the per-ticker CSV exports are inputs. The combined output lives in the
# same folder, so without this filter a second run of the cell would try to
# parse its own output (and any non-CSV entry) as a ticker file.
files = [
    name for name in os.listdir(DATA_DIR)
    if name.lower().endswith('.csv') and name != COMBINED_NAME
]
ticker_returns = list()

for file in tqdm(files):
    # processing file name: the ticker is the name without its extension
    # (splitext also copes with extra dots in the name, e.g. "BRK.B.csv")
    ticker, _ext = os.path.splitext(file)

    # processing dates: column labels are taken from row index 3 of the file,
    # only the first 8 columns are read, rows without a date are dropped and
    # the parsed 'Dates' column becomes the index
    df = (
        pd.read_csv(os.path.join(DATA_DIR, file), header=3, usecols=range(8))
        .dropna(subset=['Dates'])
        .assign(Dates=lambda frame: pd.to_datetime(frame['Dates']))
        .set_index('Dates')
    )

    # bounds => the first timestamp on record is used as the daily "open" time
    # (assumed to be the 9.30 am bar) and the "close" time is 2h30m later
    # (12.00 pm under that assumption)
    open_ts, last_ts = df.index.min(), df.index.max()
    close_ts = open_ts + pd.DateOffset(hours=2, minutes=30)
    day = pd.DateOffset(days=1)

    returns = dict()
    missing = 0  # calendar days for which one of the two timestamps is absent
    # iterating over every single calendar day; close_ts is always later than
    # open_ts, so bounding close_ts alone is sufficient
    while close_ts < last_ts:
        try:
            open_price = df.loc[open_ts, 'Open']
            close_price = df.loc[close_ts, 'Close']
        except KeyError:
            # no bar at that timestamp (weekend, holiday, gap in the data);
            # any other exception is a real problem and is allowed to surface
            missing += 1
        else:
            returns[close_ts] = close_price / open_price - 1

        # incrementing both timestamps by 1 day to move forward in time
        open_ts, close_ts = open_ts + day, close_ts + day

    # we will print out how many days we failed to parse just for a sanity check
    if missing > 0: print(f'Failed to parse {missing} returns for {ticker}.')

    # a ticker without a single usable return cannot contribute a column
    if not returns:
        print(f'No returns could be computed for {ticker}; skipping.')
        df = returns = None
        continue

    # save returns per ticker as defined in HW4: one column named after the
    # ticker, indexed by the "close" timestamp of each day
    ticker_returns.append(pd.Series(returns, name=ticker).to_frame())
    df = returns = None  # release the per-ticker data before the next file

if not ticker_returns:
    raise ValueError(f'No usable ticker CSV files found in {DATA_DIR}')

# keep only the timestamps for which every ticker has a return
dfs = pd.concat(ticker_returns, axis=1).dropna(how='any')
dfs.to_csv(os.path.join(DATA_DIR, COMBINED_NAME), index=True)

  0%|          | 0/8 [00:00<?, ?it/s]

Failed to parse 69 returns for UBS.
Failed to parse 61 returns for AMZN.
Failed to parse 59 returns for NVDA.
Failed to parse 66 returns for ABR.
Failed to parse 61 returns for GS.
Failed to parse 59 returns for NIO.
Failed to parse 58 returns for NFLX.
Failed to parse 59 returns for TSLA.
