In [1]:
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [2]:
import datetime as dt
import gc
import logging
import os
import sys
import time

import numpy as np  # Arrays
import pandas as pd  # DataFrames
import pandas_datareader as web  # Gets stock data from Yahoo
from tqdm import tqdm  # Progress bar

In [3]:
import requests  # HTTP requests
import matplotlib.pyplot as plt  # Plots q
import bs4 as bs  # BeautifulSoup, HTML scraping
from IPython.display import display  # IPython display
# Pandas fancy tables
pd.set_option('display.notebook_repr_html', True)
pd.set_option('max_rows', 10)
# Matplotlib fancy plots
plt.style.use('ggplot')
# Logger setup
importlib.reload(logging)

# FORMAT = "[%(filename)s:%(lineno)s - %(funcName)20s() ] %(message)s"
logging.basicConfig(format='%(levelname)s | line %(lineno)s '
                    '| %(funcName)s | %(message)s',
                    level=logging.INFO, stream=sys.stdout,
                    datefmt='%H:%M:%S')

# Numpy printing setup
np.set_printoptions(threshold=10, linewidth=79, edgeitems=5)

NameError: name 'importlib' is not defined

In [4]:
def get_stock_data(start_dt, end_dt, reload_tickers=False,
                   max_tries=50, timeout=2, provider='yahoo',
                   ticker_folder=os.path.join('data', 'tickers'),
                   ticker_fname='tickers.csv',
                   dest_folder=os.path.join('data', 'stocks')):
    '''
    Gets stock data of S&P500 from yahoo and saves it in the {folder}/{tick}.csv

    Throws an exception is anything goes wrong.
    '''
    logging.debug('Obtaining the tickers...')
    if reload_tickers:
        tickers = save_tickers()
    else:
        tickers = pd.read_csv(os.path.join(ticker_folder, ticker_fname))
    logging.debug('Obtained the tickers...')
    logging.debug(tickers)
    # We have to check whether the dest folder exists
    dest_path = dest_folder
    if not os.path.exists(dest_path):
        logging.debug('Creating the destination folder')
        os.makedirs(dest_path)

    # Downloading the prices
    logging.debug('Starting processing tickers...')
    down_cnt, to_down_cnt = 0, len(tickers)
    for index, ticker in tqdm(tickers.itertuples(), desc='Tickers processed',
                              leave=False, file=sys.stderr, unit='company',
                              total=tickers.shape[0]):
        df = None
        logging.debug(f'Starting a new outer loop iteration for {ticker}')
        dest_fpath = os.path.join(dest_path, f'{ticker}.csv')
        if not os.path.exists(dest_fpath):
            # Try to download the stock for max_tries tries, waiting
            # for timeout in between tries
            pbar = tqdm(range(max_tries), desc='Number of tries',
                        leave=False, file=sys.stderr, unit='try')
            tries = max_tries
            while tries > 0:
                pbar.update(1)
                try:
                    logging.debug(f'Trying to get {ticker} data'
                                  f' from {provider}...')
                    df = web.DataReader(ticker, provider, start_dt, end_dt)
                    tries = 0
                    down_cnt += 1
                except Exception as e:
                    tries -= 1
                    logging.debug(e)
                    logging.debug(f'{provider} has denied our request - '
                                  f'sleeping for {timeout} seconds')
                    time.sleep(timeout)
            pbar.close()

            if df is None:
                logging.debug(f'Couldn\'t get the {ticker} data. Continuing')
                continue

            logging.debug(f'Successfully got {ticker} data from {provider}. '
                          'Now saving it...')
            df.to_csv(dest_fpath)
            logging.debug(f'Saved the {ticker} data.')
        else:
            to_down_cnt -= 1
            logging.debug(f'Not downloading data for {ticker}, '
                          'since we already have it')
    logging.info('Finished processing all tickers!')
    logging.info(f'Downloaded: {down_cnt}/{to_down_cnt} items')
    logging.info(f'You can find the results in the folder {dest_path}')


In [5]:
def list_csv(path):
    for f in os.listdir(path):
        if f.endswith('.csv'):
            yield f

def get_timearr(stock_dir, example_timefile='A.csv'):
    time_df = pd.read_csv(os.path.join(stock_dir, example_timefile), index_col=0)
    return time_df.index.values

def reindex_csv(stock_dir, save_dir, time_arr, reload_data=False):
    if not os.path.isdir(save_dir):
        os.makedirs(save_dir)

    for stock_fname in tqdm(list_csv(stock_dir)):
        if not reload_data and os.path.isfile(os.path.join(save_dir,
                                                           stock_fname)):
            continue
        stock = pd.read_csv(os.path.join(stock_dir, stock_fname), index_col=0)
        stock = stock.reindex(time_arr, fill_value=np.nan)
        stock.to_csv(os.path.join(save_dir, stock_fname))
    gc.collect()

In [6]:
def get_per_diff(old, new):
    return abs(new - old) / old


def merge_dfs(stock_folder, save_folder, save_fname='stocks_all_merged.csv',
              reload_data=False, add_per_oc=True, add_per_lohi=True,
              add_volume=True):
    '''
    Merges the stock csv files into one big file with all adj. closes

    Raises an exception if something goes wrong.
    '''
    if (os.path.isfile(os.path.join(save_folder, save_fname)) and
            not reload_data):
        logging.warning('The target file is already present in the save_folder.'
                        ' Please use the reload_data argument to overwrite it.')
        return
    logging.debug('Started merging the stock data - getting the files')
    fnames = sorted(list(list_csv(stock_folder)))
    logging.debug('Number of csv files in the folder: {}'.format(len(fnames)))
    logging.debug(f'Filelist: {fnames}')

    time_arr = get_timearr(stock_folder, fnames[0])
    to_stack = []
    col_names = []

    logging.debug('Starting merging dataframes')
    for cur_fname in tqdm(fnames, desc='Files processed', file=sys.stdout,
                          leave=True, unit='file'):
        cur_fpath = os.path.join(stock_folder, cur_fname)
        cur_ticker = cur_fname[:-4]
        col_names.append(cur_ticker)

        logging.debug(f'Processing the file {cur_fname}')

        cur_df = pd.read_csv(cur_fpath, index_col=0)

        if add_volume:
            col_names.append(f'{cur_ticker}_Vol')
        else:
            cur_df.drop(['Volume'], inplace=True, axis=1)

        if add_per_oc:
            cur_df['PerOC'] = get_per_diff(cur_df['Open'], cur_df['Close'])
            col_names.append(f'{cur_ticker}_OC')

        if add_per_lohi:
            cur_df['PerLH'] = get_per_diff(cur_df['Low'], cur_df['High'])
            col_names.append(f'{cur_ticker}_LH')

        cur_df.drop(['Open', 'Close', 'High', 'Low'],
                    inplace=True, axis=1)

        to_stack.append(cur_df.as_matrix())

    # It's faster to just stack a list of numpy arrays than to try and merge dfs
    merged_df = pd.DataFrame(np.concatenate(to_stack, axis=1),
                             index=time_arr, columns=col_names)
    logging.debug('Finished merging dataframes')
    save_path = os.path.join(save_folder, save_fname)

    logging.debug(f'Saving the data to {save_path}')
    if not os.path.exists(save_folder):
        os.makedirs(save_folder)

    merged_df.to_csv(save_path)
    return merged_df

In [7]:
def make_corr_matrix(merge_folder, save_folder,
                     merged_close_fname='stocks_close_merged.csv',
                     save_fname='corr_matrix.csv', reload_data=False):
    if (os.path.isfile(os.path.join(save_folder, save_fname)) and
            not reload_data):
        logging.warning('The target file is already present in the save_folder.'
                        ' Please use the reload_data argument to overwrite it.')
        return
    merged_path = os.path.join(merge_folder, merged_close_fname)
    save_path = os.path.join(save_folder, save_fname)
    logging.debug(f'Opening the merged closes folder at {merged_path}')
    merged_df = pd.read_csv(merged_path)
    corr_df = merged_df.corr()
    logging.debug(f'Saving the corr_df to {save_path}')
    corr_df.to_csv(save_path)
    return corr_df

In [8]:
# We want to get the data from 2000 till 2017
START_DT = dt.datetime(2000, 1, 1)
END_DT = dt.datetime(2017, 1, 1)
# We want to place the merged file in data/merged
STOCK_FOLDER = os.path.join('data', 'stocks')
MERGED_FOLDER = os.path.join('data', 'merged')
# Downloading the data
# get_stock_data(START_DT, END_DT, max_tries=5,
#                ticker_fname='NYSE.csv',
#                dest_folder=STOCK_FOLDER, timeout=0.1,
#                provider='yahoo')
# Reindexing the csvs so we can np.concatenate them later
# reindex_csv(STOCK_FOLDER, STOCK_FOLDER, get_timearr(STOCK_FOLDER, 'A.csv'), True)
# Merging the dataframes
merge_dfs(STOCK_FOLDER, MERGED_FOLDER, reload_data=True)
merge_dfs(STOCK_FOLDER, MERGED_FOLDER, save_fname='stocks_close_merged.csv', reload_data=True, add_per_oc=False, add_per_lohi=False, add_volume=False)
# Making the correlation matrix
make_corr_matrix(MERGED_FOLDER, MERGED_FOLDER, reload_data=True)

Files processed: 100%|██████████| 2557/2557 [00:28<00:00, 89.70file/s]


Unnamed: 0,A,A_Vol,A_OC,A_LH,AA,AA_Vol,AA_OC,AA_LH,AAC,AAC_Vol,...,ZTR_OC,ZTR_LH,ZTS,ZTS_Vol,ZTS_OC,ZTS_LH,ZX,ZX_Vol,ZX_OC,ZX_LH
1999-12-31,49.174622,1931100.0,0.027516,0.048361,74.057159,434900.0,0.030800,0.033780,,,...,0.040000,0.060000,,,,,,,,
2000-01-03,45.795609,4674300.0,0.085714,0.171614,71.915764,1291300.0,0.028916,0.038883,,,...,0.000000,0.019231,,,,,,,,
2000-01-04,42.297325,4765000.0,0.023853,0.063707,72.451111,1859900.0,0.007444,0.016459,,,...,0.019048,0.019048,,,,,,,,
2000-01-05,39.673607,5758600.0,0.058491,0.102591,76.733910,2598000.0,0.059113,0.065679,,,...,0.009524,0.019048,,,,,,,,
2000-01-06,38.162998,2534400.0,0.026369,0.066667,75.449089,3740800.0,0.016744,0.020350,,,...,0.009434,0.038095,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2016-12-23,45.756180,727700.0,0.002604,0.006532,29.709999,2881800.0,0.002686,0.026926,7.30,100600.0,...,0.002519,0.009275,53.476963,1942400.0,0.010902,0.011282,1.39,6800.0,0.037313,0.052239
2016-12-27,46.063206,1535500.0,0.003019,0.007340,29.650000,1708900.0,0.002366,0.021769,7.16,210600.0,...,0.008410,0.014382,53.417301,1345000.0,0.001116,0.009320,1.37,5900.0,0.007246,0.029630
2016-12-28,45.280796,1177300.0,0.016986,0.022329,29.430000,2751100.0,0.011753,0.027939,7.00,248000.0,...,0.016667,0.018676,53.138874,1241900.0,0.005952,0.009372,1.22,73300.0,0.109489,0.177966
2016-12-29,45.332439,891000.0,0.000876,0.009436,28.889999,3224400.0,0.018349,0.029371,6.86,186500.0,...,0.005055,0.011036,53.317860,1046700.0,0.002805,0.007116,1.20,30800.0,0.034483,0.110092


Files processed: 100%|██████████| 2557/2557 [00:27<00:00, 93.76file/s]


Unnamed: 0,A,AA,AAC,AAN,AAP,AAT,AAV,AB,ABB,ABBV,...,ZBH,ZBK,ZEN,ZF,ZNH,ZOES,ZTO,ZTR,ZTS,ZX
1999-12-31,49.174622,74.057159,,5.156340,5.156340,,,9.404634,9.404634,,...,,,,6.505584,6.883125,,,3.746678,,
2000-01-03,45.795609,71.915764,,4.979754,4.979754,,,9.345733,9.345733,,...,,,,6.465176,6.502842,,,3.818731,,
2000-01-04,42.297325,72.451111,,4.962097,4.962097,,,9.306464,9.306464,,...,,,,6.424769,6.578899,,,3.854756,,
2000-01-05,39.673607,76.733910,,4.962097,4.962097,,,9.345733,9.345733,,...,,,,6.583098,6.388757,,,3.859124,,
2000-01-06,38.162998,75.449089,,4.962097,4.962097,,,9.502802,9.502802,,...,,,,6.624763,6.540870,,,3.895531,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2016-12-23,45.756180,29.709999,7.30,32.143578,32.143578,41.619080,6.75,22.116867,22.116867,60.145332,...,101.477570,27.279884,21.040001,9.544504,25.777685,24.799999,12.57,10.284743,53.476963,1.39
2016-12-27,46.063206,29.650000,7.16,32.203434,32.203434,41.746628,6.85,22.163527,22.163527,60.299698,...,102.290810,27.011965,21.740000,9.812712,25.954514,25.070000,12.59,10.426487,53.417301,1.37
2016-12-28,45.280796,29.430000,7.00,31.924099,31.924099,41.383610,6.80,21.930227,21.930227,60.077797,...,102.449860,27.021532,21.240000,9.812712,25.463324,24.690001,12.31,10.261262,53.138874,1.22
2016-12-29,45.332439,28.889999,6.86,31.973978,31.973978,41.687759,6.75,22.116867,22.116867,60.521603,...,102.887260,27.031103,21.190001,9.723579,25.492796,24.330000,12.44,10.269961,53.317860,1.20


Unnamed: 0,A,AA,AAC,AAN,AAP,AAT,AAV,AB,ABB,ABBV,...,ZBH,ZBK,ZEN,ZF,ZNH,ZOES,ZTO,ZTR,ZTS,ZX
A,1.000000,-0.230773,-0.211821,0.471405,0.471405,0.852060,-0.322920,0.097909,0.097909,0.806842,...,0.811611,0.706695,0.640190,0.773598,0.395823,-0.233898,-0.665559,0.615614,0.667253,-0.708029
AA,-0.230773,1.000000,-0.063026,-0.656833,-0.656833,0.046464,0.845029,0.539875,0.539875,0.370344,...,-0.151300,-0.274659,-0.132481,-0.245353,-0.358901,-0.172039,-0.510062,-0.497066,0.067708,-0.086316
AAC,-0.211821,-0.063026,1.000000,0.449891,0.449891,0.280041,-0.152314,0.500929,0.500929,0.425118,...,0.240722,-0.129365,-0.008063,0.403188,0.656847,0.483968,0.681022,0.095834,0.305121,-0.409313
AAN,0.471405,-0.656833,0.449891,1.000000,1.000000,0.272298,-0.601351,0.020502,0.020502,0.052518,...,0.646415,-0.257072,-0.425802,0.653546,0.663922,0.286100,-0.817864,0.792557,-0.047408,-0.138856
AAP,0.471405,-0.656833,0.449891,1.000000,1.000000,0.272298,-0.601351,0.020502,0.020502,0.052518,...,0.646415,-0.257072,-0.425802,0.653546,0.663922,0.286100,-0.817864,0.792557,-0.047408,-0.138856
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ZOES,-0.233898,-0.172039,0.483968,0.286100,0.286100,0.191771,-0.263458,0.399782,0.399782,0.279601,...,0.052727,0.025530,-0.118678,0.201420,0.478493,1.000000,-0.548349,0.051871,0.127011,-0.280936
ZTO,-0.665559,-0.510062,0.681022,-0.817864,-0.817864,-0.812814,-0.145890,-0.483099,-0.483099,-0.649692,...,0.264899,0.761619,0.622878,-0.290295,0.713255,-0.548349,1.000000,-0.624908,-0.807038,-0.675704
ZTR,0.615614,-0.497066,0.095834,0.792557,0.792557,0.965503,-0.387791,0.208806,0.208806,0.940357,...,0.816945,0.884259,0.707480,0.872096,0.694363,0.051871,-0.624908,1.000000,0.904016,-0.877283
ZTS,0.667253,0.067708,0.305121,-0.047408,-0.047408,0.925741,0.671359,0.689461,0.689461,0.873570,...,0.793962,0.870964,0.616750,0.512922,0.721997,0.127011,-0.807038,0.904016,1.000000,-0.840080
