In [None]:
import yfinance as yf
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import json
import os
import datetime as dt
from datetime import datetime
import bz2
import pickle
import _pickle as cPickle

%matplotlib inline

In [None]:
# Read the ticker list. header=None tells pandas the file has no header row,
# so the columns are labelled 0, 1, ... instead of using the first line as names.
tickers = pd.read_csv('russel2000.csv', header=None)

In [None]:
# Rebind `tickers` from the DataFrame to a plain list of the values in its
# column labelled 0. NOTE(review): re-running this cell alone fails, because
# `tickers` is then already a list.
tickers = tickers[0].to_list()

In [None]:
#tickers = ['SPY', 'EZA', 'ASTGRPJ.CO']
# NOTE(review): download_tickers and the helpers it calls are defined in later
# cells, so those cells must be run before this one on a fresh kernel.
stocks = download_tickers(tickers)

In [None]:
# Display the frame returned by download_tickers.
stocks

In [None]:
# Plot only the columns whose second column-level label is 'Volume'.
sns.lineplot(data=stocks.loc[:, stocks.columns.get_level_values(1).isin(['Volume'])])

In [None]:
def download_tickers(tickers):
    """Return price data for ``tickers``, downloading what is missing or stale.

    A ticker that already has a ``raw_data/<ticker>.pbz2`` cache file is only
    downloaded again when ``update_ticker`` reports it as out of date; a ticker
    without a cache file is always downloaded. Tickers that ``download_dump``
    reports as failed are left out of the result.

    Parameters
    ----------
    tickers : list of str
        Ticker symbols to load.

    Returns
    -------
    Whatever ``load_stocks`` returns for the requested tickers that were not
    reported as failed, in the order they were requested.
    """
    # Create the cache directory on first use so listing it cannot fail.
    os.makedirs('raw_data', exist_ok=True)

    # Cached tickers are the '.pbz2' file names with the extension stripped.
    # Only names that end in '.pbz2' count; any other file is ignored.
    suffix = '.pbz2'
    downloaded_tickers = {
        name[:-len(suffix)]
        for name in os.listdir('raw_data/')
        if name.endswith(suffix)
    }

    # Fetch a ticker when it has no cache file or its cache is out of date.
    # update_ticker is only consulted for tickers that do have a cache file.
    tickers_to_download = [
        ticker for ticker in tickers
        if ticker not in downloaded_tickers or update_ticker(ticker)
    ]

    not_downloaded = set(download_dump(tickers_to_download))
    tickers_to_load = [ticker for ticker in tickers if ticker not in not_downloaded]
    # Read the cached files back and return the combined data.
    return load_stocks(tickers_to_load)

In [None]:
def load_stocks(tickers):
    """Load cached price frames and combine them into one wide DataFrame.

    Each ticker is read from ``raw_data/<ticker>.pbz2``. The frames are joined
    side by side and the columns are relabelled positionally with a two-level
    index ``('Stock ticker', 'Data type')``.

    Parameters
    ----------
    tickers : list of str
        Tickers whose cache files should be loaded.

    Returns
    -------
    pandas.DataFrame
        One block of six columns per ticker, indexed by date. An empty
        ``tickers`` list yields an empty frame.

    Notes
    -----
    Columns are relabelled by position, so every cached frame is assumed to
    hold exactly the six columns listed in ``cols``, in that order; a different
    column count makes the relabelling raise ``ValueError``.
    """
    cols = ['Open', 'High', 'Low', 'Close', 'Adj Close', 'Volume']
    index = pd.MultiIndex.from_product([tickers, cols], names=['Stock ticker', 'Data type'])

    # Nothing to load (e.g. every download failed): return an empty frame
    # rather than failing on the first-element lookup.
    if len(tickers) == 0:
        return pd.DataFrame(index=pd.DatetimeIndex([]), columns=index)

    frames = [decompress_pickle('raw_data/' + x + '.pbz2') for x in tickers]
    # Align all frames on the union of their indexes; sort=True keeps the
    # combined index ordered when the frames cover different dates.
    data = pd.concat(frames, axis=1, sort=True)

    data.columns = index
    try:
        # Cached indexes may be epoch milliseconds ...
        data.index = pd.to_datetime(data.index, unit='ms')
    except ValueError:
        # ... or something pandas must parse without a unit (e.g. date strings).
        data.index = pd.to_datetime(data.index)
    return data

In [None]:
def update_ticker(ticker):
    """Report whether the cached data for ``ticker`` needs to be downloaded again.

    The last index label of ``raw_data/<ticker>.pbz2`` is compared, as a plain
    calendar date, with the date returned by ``last_weekday``.

    Parameters
    ----------
    ticker : str
        Ticker whose cache file is inspected.

    Returns
    -------
    bool
        ``True`` when the cache is empty or its newest row is not dated on the
        last weekday, ``False`` when it is up to date.
    """
    last_bday = last_weekday()
    cached_index = decompress_pickle('raw_data/' + ticker + '.pbz2').index
    # An empty cache has no newest row, so it always needs a refresh.
    if len(cached_index) == 0:
        return True

    newest_date = cached_index[-1]
    if not isinstance(newest_date, dt.date):
        # Not date-like (pandas Timestamps are datetime subclasses): interpret
        # it the same way load_stocks does, epoch milliseconds first, then a
        # unit-less parse.
        try:
            newest_date = pd.to_datetime(newest_date, unit='ms')
        except ValueError:
            newest_date = pd.to_datetime(newest_date)
    if isinstance(newest_date, dt.datetime):
        # Reduce Timestamp/datetime to a date so the comparison below is
        # date-to-date; a Timestamp never compares equal to a datetime.date
        # in recent pandas versions.
        newest_date = newest_date.date()

    return newest_date != last_bday

In [None]:
def download_dump(tickers):
    """Download price history for ``tickers`` and cache each one on disk.

    All tickers are fetched with a single ``yf.download`` call. Each ticker's
    frame is passed through ``clean_df`` and, when more than one row remains,
    written with ``compressed_pickle``.

    Parameters
    ----------
    tickers : list of str
        Tickers to download. An empty list performs no download.

    Returns
    -------
    list of str
        The tickers for which no usable data was obtained (missing from the
        download result, or at most one row left after cleaning).
    """
    not_downloaded = []
    if len(tickers) == 0:
        return not_downloaded

    # 'ticker' is the documented group_by value for ticker-first column grouping.
    data = yf.download(tickers, period='100y', group_by='ticker')

    # A multi-ticker result has (ticker, field) MultiIndex columns. A
    # single-ticker result may come back with flat field columns instead, in
    # which case the whole frame belongs to that one ticker.
    grouped = isinstance(data.columns, pd.MultiIndex)
    for ticker in tickers:
        if grouped:
            if ticker not in data.columns.get_level_values(0):
                not_downloaded.append(ticker)
                continue
            frame = data[ticker]
        else:
            frame = data

        clean_data = clean_df(frame)
        if len(clean_data) > 1:
            compressed_pickle(ticker, clean_data)
        else:
            # Record only the ticker that failed, not the whole batch.
            not_downloaded.append(ticker)

    return not_downloaded

In [None]:
def last_weekday(today=None):
    """Return the most recent Monday-to-Friday date on or before ``today``.

    Parameters
    ----------
    today : datetime.date, optional
        Reference date. Defaults to the current local date.

    Returns
    -------
    datetime.date
        ``today`` itself on a weekday, otherwise the preceding Friday.

    Notes
    -----
    Only weekends are skipped; exchange holidays are not taken into account.
    """
    if today is None:
        # Read the clock once so all the arithmetic uses the same day.
        today = dt.date.today()

    # date.weekday(): Monday == 0 ... Friday == 4, Saturday == 5, Sunday == 6.
    days_past_friday = today.weekday() - 4
    if days_past_friday > 0:
        return today - dt.timedelta(days=days_past_friday)
    return today


In [None]:
def clean_df(df):
    """Return ``df`` without the rows in which every column is missing.

    Rows that hold at least one non-missing value are kept; the input frame
    itself is not modified.
    """
    cleaned = df.dropna(how='all', axis='index')
    return cleaned

In [None]:
def compressed_pickle(ticker, data):
    """Pickle ``data`` into the bz2-compressed file ``raw_data/<ticker>.pbz2``.

    An existing file with that name is overwritten.
    """
    target = ''.join(('raw_data/', ticker, '.pbz2'))
    with bz2.BZ2File(target, 'w') as handle:
        cPickle.dump(data, handle)

In [None]:
def decompress_pickle(file):
    """Load and return the object pickled in the bz2-compressed file ``file``.

    Parameters
    ----------
    file : str
        Path of a file written in the format ``compressed_pickle`` produces.

    Returns
    -------
    The unpickled object.
    """
    # NOTE: unpickling can execute arbitrary code; only load cache files that
    # were written locally by this notebook.
    # The context manager closes the file handle even if unpickling fails.
    with bz2.BZ2File(file, 'rb') as f:
        return cPickle.load(f)

In [281]:
# Load one cached ticker file directly to inspect its contents.
test = decompress_pickle('raw_data/Y.pbz2')

In [282]:
# Display the frame loaded from the cache file.
test

Unnamed: 0_level_0,Open,High,Low,Close,Adj Close,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1980-01-02,0.000000,15.985246,15.910548,15.985246,2.992994,3300.0
1980-01-03,0.000000,15.835851,15.088877,15.835851,2.965021,12400.0
1980-01-04,0.000000,15.686457,15.537062,15.537062,2.909078,12200.0
1980-01-07,0.000000,15.537062,15.238272,15.537062,2.909078,84400.0
1980-01-08,0.000000,15.761153,15.387667,15.761153,2.951036,45700.0
...,...,...,...,...,...,...
2020-08-31,560.000000,560.000000,554.229980,554.559998,554.559998,51000.0
2020-09-01,554.000000,561.070007,551.000000,557.880005,557.880005,39200.0
2020-09-02,557.760010,571.890015,557.760010,567.210022,567.210022,55800.0
2020-09-03,573.059998,584.989990,553.460022,555.789978,555.789978,49900.0


In [None]:
# Plot the 'Adj Close' column of the frame loaded above.
sns.lineplot(data=test['Adj Close'])

In [None]:
import os

def clean_raw_files():
    """Tidy every ``.pbz2`` cache file in ``raw_data``.

    For each cache file:

    * if the column at position 4 holds fewer than two non-missing values, the
      file is deleted;
    * otherwise rows in which every column is missing are dropped, and the
      file is rewritten only when that actually removed rows.

    Files without the ``.pbz2`` extension are left alone.

    Notes
    -----
    Position 4 is 'Adj Close' in the six-column layout ``load_stocks`` assumes;
    a cached frame with fewer than five columns raises ``IndexError``.
    """
    suffix = '.pbz2'
    for filename in os.listdir('raw_data'):
        if not filename.endswith(suffix):
            continue

        path = 'raw_data/' + filename
        df = pd.DataFrame(decompress_pickle(path))

        # Count the non-missing values in the column at position 4.
        valid_values = df.iloc[:, 4].notna().sum()
        if valid_values < 2:
            os.remove(path)
            continue

        cleaned = df.dropna(how='all', axis=0)
        # Recompressing is slow, so skip files that lost no rows.
        if len(cleaned) != len(df):
            # Strip only the trailing extension to recover the ticker name.
            compressed_pickle(filename[:-len(suffix)], cleaned)
                  
# Run the clean-up pass over the cached files in raw_data, using the
# routine defined earlier in this cell.
clean_raw_files()