# This notebook builds the stock market / numerai dataset

#### Make cells wider

In [3]:
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

### Imports
All other necessary imports are imported via python scripts

In [1]:
#####################
###### Imports ######
#####################

import os
from configparser import ConfigParser
import sys
from IPython.display import display
from datetime import datetime
import time
import numerapi

start_time = time.time()

if not os.getcwd().endswith('trading'): os.chdir('../../..') # local machine

assert os.getcwd().endswith('trading'), 'Wrong path!'
os.environ['NUMEXPR_MAX_THREADS'] = '32'
os.environ['NUMEXPR_NUM_THREADS'] = '16'

sys.path.append(os.getcwd())
from dev.scripts.ML_utils import * # run if on local machine
from dev.scripts.trading_utils import * # run if on local machine
from numerai.dev.scripts.numerai_utils import *
from numerai.dev.configs.build_numerai_dataset_cfg import *


###  pd options / configs ###

pd.set_option('display.float_format', lambda x: '%.5f' % x)
pd.set_option('display.max_columns', 10)
config = ConfigParser()
config.read('numerai/numerai_keys.ini')

### connect to the numerai signals API ###

napi = numerapi.SignalsAPI(config['KEYS']['NUMERAI_PUBLIC_KEY'], config['KEYS']['NUMERAI_SECRET_KEY'])

In [2]:
### download data ###

if DOWNLOAD_NUMERAI_COMPETITION_DATA:
    # napi = numerapi.NumerAPI(NUMERAI_PUBLIC_KEY, NUMERAI_SECRET_KEY)
    napi.download_current_dataset(unzip=True)
if LOAD_NUMERAI_COMPETITION_DATA:
    df_numerai_comp = dd.read_csv(DF_NUMERAI_COMP_TRAIN_PATH).compute()


### Load eligible tickers ###

eligible_tickers = pd.Series(napi.ticker_universe(), name='ticker')

ticker_map = pd.read_csv('https://numerai-signals-public-data.s3-us-west-2.amazonaws.com/signals_ticker_map_w_bbg.csv')
ticker_map = ticker_map[ticker_map[TICKER_COL].isin(eligible_tickers)]

if VERBOSE:
    print(f"Number of eligible tickers: {len(eligible_tickers)}")
    print(f"Number of eligible tickers in map: {len(ticker_map)}")

# Remove null / empty tickers from the yahoo tickers
valid_tickers = [i for i in ticker_map['yahoo']
     if not pd.isnull(i)
     and not str(i).lower()=='nan' \
     and not str(i).lower()=='null' \
     and not str(i).lower()==''\
     and len(i) > 0\
]

if VERBOSE: print('tickers before cleaning:', ticker_map.shape) # before removing bad tickers
ticker_map = ticker_map[ticker_map['yahoo'].isin(valid_tickers)]
if VERBOSE: print('tickers after cleaning:', ticker_map.shape)

Number of eligible tickers: 5435
Number of eligible tickers in map: 5435
tickers before cleaning: (5435, 3)
tickers after cleaning: (5385, 3)


In [3]:

DOWNLOAD_YAHOO_DATA

False

In [7]:
### Download or load in yahoo finance data in the expected numerai format using the yfinance library ###
# Yahoo Finance wrappers: https://github.com/ranaroussi/yfinance and https://pypi.org/project/yfinance/.
# Downloading ~2 hours on a single-thread

if DOWNLOAD_YAHOO_DATA:
    df_yahoo = download_yfinance_data(list(ticker_map['yahoo']), **DOWNLOAD_YFINANCE_DATA_PARAMS) # all valid yahoo tickers
else: # read in file
    if YAHOO_READ_FILEPATH.lower().endswith('pq') or YAHOO_READ_FILEPATH.lower().endswith('parquet'):
        df_yahoo = dd.read_parquet(YAHOO_READ_FILEPATH, DASK_NPARTITIONS=DASK_NPARTITIONS).compute()
    elif YAHOO_READ_FILEPATH.lower().endswith('feather'):
        df_yahoo = pd.read_feather(YAHOO_READ_FILEPATH)
# df_yahoo = df_yahoo.tail(1000000)# debugging

In [8]:
if VERBOSE: print(df_yahoo.info())
gc.collect()
if CONVERT_DF_DTYPES:
    print('\nconverting dtypes...\n')
    df_yahoo = convert_df_dtypes(df_yahoo, **CONVERT_DTYPE_PARAMS)

if CREATE_BLOOMBERG_TICKER_FROM_YAHOO or DOWNLOAD_YAHOO_DATA:
    if ('yahoo_ticker' not in df_yahoo.columns) or ('ticker' in df_yahoo.columns):
        df_yahoo.rename(columns={'ticker': 'yahoo_ticker'}, inplace=True)
    df_yahoo.loc[:, 'bloomberg_ticker'] = df_yahoo['yahoo_ticker'].map(dict(zip(ticker_map['yahoo'], ticker_map['bloomberg_ticker'])))


### Ensure no [DATETIME_COL, TICKER_COL] are duplicated. If so then there is an issue. ###

print('\nvalidating unique date + ticker index...\n')

datetime_ticker_cat = (df_yahoo[DATETIME_COL].astype(str) + ' ' + df_yahoo[TICKER_COL].astype(str)).tolist()
assert len(datetime_ticker_cat) == len(set(datetime_ticker_cat)), 'TICKER_COL and DATETIME_COL do not make a unique index!'
del datetime_ticker_cat

if DROP_DUPLICATE_ROWS: df_yahoo.drop_duplicates(inplace=True)

print('\nsorting...\n')

df_yahoo.sort_values(by=[DATETIME_COL, TICKER_COL], inplace=True)

print('\nsaving...\n')

if INIT_SAVE_FILEPATH.endswith('feather'):
    if 'date' in df_yahoo.index.names or 'ticker' in df_yahoo.index.names:
        df_yahoo.reset_index().to_feather(INIT_SAVE_FILEPATH)
    else:
        df_yahoo.reset_index(drop=True).to_feather(INIT_SAVE_FILEPATH)
elif INIT_SAVE_FILEPATH.endswith('pq') or INIT_SAVE_FILEPATH.endswith('parquet'):
    df_yahoo.to_parquet(INIT_SAVE_FILEPATH)

print('\nreading targets...\n')

targets = pd.read_csv(NUMERAI_TARGETS_URL).assign(date=lambda df: pd.to_datetime(df['friday_date'], format='%Y%m%d'))

if VERBOSE: targets['target'].value_counts(), targets['target'].value_counts(normalize=True)

### Merge targets into df_yahoo ###

# - From an inner join on `['date', 'bloomberg_ticker']` we lose about 85% of rows.
# - If we drop rows with NAs we have 0 rows left no matter what.
# - The best bet seems to be an outer join without dropping NA rows.
# df_yahoo.set_index(DATETIME_COL, inplace=True)
# df_yahoo.sort_index(inplace=True)
print('\nmerging numerai target...\n')
df_yahoo = pd.merge(df_yahoo, targets, on=TARGET_JOIN_COLS, how=TARGET_JOIN_METHOD)
del targets # reduce memory
TICKERS = df_yahoo[TICKER_COL].unique().tolist()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21198389 entries, 0 to 21198388
Columns: 153 entries, date to bloomberg_ticker
dtypes: category(2), datetime64[ns](1), float32(150)
memory usage: 12.1 GB
None

converting dtypes...


validating unique date + ticker index...


sorting...


saving...


reading targets...


merging numerai target...



In [37]:
df_yahoo['day_name'] = pd.to_datetime(df_yahoo['date'], format='%Y%m%d').dt.day_name()
df_yahoo['friday_date_name'] = pd.to_datetime(df_yahoo['friday_date'], format='%Y%m%d').dt.day_name()
df_yahoo['is_friday'] = 0
df_yahoo.loc[df_yahoo['day_name'].str.lower() == 'friday', 'is_friday'] = 1
df_yahoo.head()

Unnamed: 0,date,yahoo_ticker,adj_close_1d,close_1d,high_1d,...,data_type,target,day_name,friday_date_name,is_friday
0,1990-01-01,ABF.L,112.04471,236.93201,236.93201,...,,,Monday,,0
1,1990-01-01,AHT.L,22.82121,50.407,50.407,...,,,Monday,,0
2,1990-01-01,ALPHA.AT,72.28684,81.671,81.671,...,,,Monday,,0
3,1990-01-01,ANN.AX,14.14669,22.51082,22.51082,...,,,Monday,,0
4,1990-01-01,ANTO.L,8.75124,19.3654,19.3654,...,,,Monday,,0


In [39]:
df_yahoo['friday_date'].notnull().sum(), df_yahoo['is_friday'].sum(), df_yahoo[df_yahoo['target'].notnull()]['is_friday'].sum()

(4305154, 6114763, 4305154)

In [40]:
# run submit test
df_val = pd.read_csv(napi.download_validation_data())
df_val.head()

./numerai_signals_historical.csv: 100%|█████████▉| 127M/127M [00:16<00:00, 9.58MB/s] 

Unnamed: 0,bloomberg_ticker,friday_date,data_type,target
0,000270 KS,20030131,train,0.5
1,000810 KS,20030131,train,0.5
2,000830 KS,20030131,train,0.5
3,002790 KS,20030131,train,0.25
4,003450 KS,20030131,train,0.25


In [41]:
df_val['is_friday'] = pd.to_datetime(df_val['friday_date'], format='%Y%m%d').dt.day_name()
df_val['is_friday'].value_counts()

Friday    4305154
Name: is_friday, dtype: int64

./numerai_signals_historical.csv: 127MB [00:29, 9.58MB/s]                           

In [44]:
df_val = df_val[df_val['data_type']=='validation']

In [53]:
df_to_submit = pd.merge(df_yahoo, df_val, on=['bloomberg_ticker', 'friday_date'], how='inner')
df_to_submit.shape, df_val.shape, df_yahoo[(df_yahoo['target'].notnull()) & (df_yahoo['is_friday'] == 1) & (df_yahoo['data_type'] == 'validation')].shape

((2156196, 162), (2156196, 5), (2156196, 159))

In [None]:
### conditionally drop NAs ###

if RUN_CONDITIONAL_DROPNA:
    print('\nrunning conditional drop_nas...\n')
    df_yahoo = drop_nas(df_yahoo, **DROPNA_PARAMS)
gc.collect()

### create naive features ###

print('\ncreating naive features...\n')

df_yahoo = df_yahoo.groupby(TICKER_COL, group_keys=False).apply(lambda df: create_naive_features_single_symbol(df, **NAIVE_FEATURES_PARAMS)) # Create naive features (e.g. moves, ranges, etc...)

### create diff features ###

diff_params = eval(DIFF_PARAMS_STRING)
df_yahoo = df_yahoo.groupby(TICKER_COL, group_keys=False).apply(lambda df: calc_diffs(df, **diff_params))

### create pct_change features ###

pct_change_params = eval(PCT_CHG_PARAMS_STRING)
df_yahoo = df_yahoo.groupby(TICKER_COL, group_keys=False).apply(lambda df: calc_pct_changes(df, **pct_change_params))

### create custom targets ###

print('\ncreating custom targets...\n')

df_yahoo = CreateTargets(df_yahoo, copy=False).create_targets_HL3(**TARGETS_HL3_PARAMS) # create target_HL3
df_yahoo = CreateTargets(df_yahoo, copy=False).create_targets_HL5(**TARGETS_HL5_PARAMS) # create target_HL5

if VERBOSE:
    display(df_yahoo[TARGETS_HL3_PARAMS['target_suffix']].value_counts()), display(df_yahoo[TARGETS_HL3_PARAMS['target_suffix']].value_counts(normalize=True))
    display(df_yahoo[TARGETS_HL5_PARAMS['target_suffix']].value_counts()), display(df_yahoo[TARGETS_HL5_PARAMS['target_suffix']].value_counts(normalize=True))


### For each ticker, for non-numerai data, shift the target backwards one timestamp, where each row is the unit of measure (e.g. each row is a day) ###

if SHIFT_TARGET_HL_UP_TO_PRED_FUTURE:
    df_yahoo[TARGETS_HL3_PARAMS['target_suffix']] = df_yahoo.groupby(TICKER_COL)[TARGETS_HL3_PARAMS['target_suffix']].transform(lambda col: col.shift(-1))
    df_yahoo[TARGETS_HL5_PARAMS['target_suffix']] = df_yahoo.groupby(TICKER_COL)[TARGETS_HL5_PARAMS['target_suffix']].transform(lambda col: col.shift(-1))

### save memory 2 ###

if CONVERT_DF_DTYPES:
    print('\nconverting dtypes...\n')
    df_yahoo = convert_df_dtypes(df_yahoo, **CONVERT_DTYPE_PARAMS)

### Create lagging features ###
print('\ncreating lagging features...\n')
df_yahoo = df_yahoo.groupby(TICKER_COL, group_keys=False).apply(lambda df: create_lagging_features(df, **LAGGING_FEATURES_PARAMS))

gc.collect()

### Create rolling features ###

# df_yahoo = create_rolling_features(df_yahoo, **ROLLING_FEATURES_PARAMS)
print('\ncreating rolling features...\n')
df_yahoo = df_yahoo.groupby(TICKER_COL, group_keys=False).apply(lambda df: create_rolling_features(df, **ROLLING_FEATURES_PARAMS))

gc.collect()

### Create move_iar features ###

print('\ncalculating move_iar...\n')
df_yahoo = df_yahoo.groupby(TICKER_COL, group_keys=False).apply(lambda df: calc_move_iar(df, **IAR_PARAMS))
gc.collect()

### save memory 3 ###

if CONVERT_DF_DTYPES:
    print('\nconverting dtypes...\n')
    df_yahoo = convert_df_dtypes(df_yahoo, **CONVERT_DTYPE_PARAMS)

### Save df ###

print('\nfinal save...\n')
if FINAL_SAVE_FILEPATH.endswith('feather'):
    if 'date' in df_yahoo.index.names or 'ticker' in df_yahoo.index.names:
        df_yahoo.reset_index().to_feather(FINAL_SAVE_FILEPATH)
    else:
        df_yahoo.reset_index(drop=True).to_feather(FINAL_SAVE_FILEPATH)
elif FINAL_SAVE_FILEPATH.endswith('pq') or FINAL_SAVE_FILEPATH.endswith('parquet'):
    df_yahoo.to_parquet(FINAL_SAVE_FILEPATH)

end_time = time.time()
if VERBOSE: print('Script took:', round((end_time - start_time) / 60, 3), 'minutes')


### Only run the below if on google colab notebook

In [3]:
# from google.colab import drive
# drive.mount('/content/gdrive/')

In [4]:
# sys.path.append('/content/gdrive/trading/dev/scripts/')

In [5]:
# from gdrive.MyDrive.trading.dev.scripts.ML_utils import *

###  pd options / configs

In [6]:
pd.set_option('display.float_format', lambda x: '%.5f' % x)
pd.set_option('display.max_columns', 10)
config = ConfigParser()

config.read('numerai/numerai_keys.ini')

['numerai/numerai_keys.ini']

### connect to the numerai signals API

In [7]:
napi = numerapi.SignalsAPI(config['KEYS']['NUMERAI_PUBLIC_KEY'], config['KEYS']['NUMERAI_SECRET_KEY'])

### download data ###

if DOWNLOAD_NUMERAI_COMPETITION_DATA:
    # napi = numerapi.NumerAPI(NUMERAI_PUBLIC_KEY, NUMERAI_SECRET_KEY)
    napi.download_current_dataset(unzip=True)
if LOAD_NUMERAI_COMPETITION_DATA:
    df_numerai_comp = dd.read_csv(DF_NUMERAI_COMP_TRAIN_PATH).compute()

### Load eligible tickers

In [8]:
eligible_tickers = pd.Series(napi.ticker_universe(), name='ticker')

ticker_map = pd.read_csv('https://numerai-signals-public-data.s3-us-west-2.amazonaws.com/signals_ticker_map_w_bbg.csv')
ticker_map = ticker_map[ticker_map[TICKER_COL].isin(eligible_tickers)]

if VERBOSE:
    print(f"Number of eligible tickers: {len(eligible_tickers)}")
    print(f"Number of eligible tickers in map: {len(ticker_map)}")

# Remove null / empty tickers from the yahoo tickers
valid_tickers = [i for i in ticker_map['yahoo']
     if not pd.isnull(i)
     and not str(i).lower()=='nan' \
     and not str(i).lower()=='null' \
     and not str(i).lower()==''\
]

if VERBOSE: print('tickers before:', ticker_map.shape) # before removing bad tickers
ticker_map = ticker_map[ticker_map['yahoo'].isin(valid_tickers)]
if VERBOSE: print('tickers after:', ticker_map.shape)

Number of eligible tickers: 5430
Number of eligible tickers in map: 5430
tickers before: (5430, 3)
tickers after: (5380, 3)


## Download or load in yahoo finance data in the expected numerai format using the yfinance library
Yahoo Finance wrappers: https://github.com/ranaroussi/yfinance and https://pypi.org/project/yfinance/. <br>
Downloading ~2 hours on a single-thread

In [9]:
intervals_to_download = ['1d', '1h']
join_method = 'outer'
max_intraday_lookback_days = 363
n_chunks = 600
yfinance_params = {'start': '2021-01-01', 'threads': False}

In [10]:
delayed_list = []
i = intervals_to_download[0]
yfinance_params['interval'] = i
yfinance_params2 = yfinance_params.copy()
yfinance_params2['interval'] = i
intraday_lookback_days = datetime.datetime.today().date() - datetime.timedelta(days=max_intraday_lookback_days)
if pd.to_datetime(yfinance_params2['start']) < intraday_lookback_days:
    yfinance_params2['start'] = str(intraday_lookback_days)
tickers = list(ticker_map['yahoo'])
num_workers=32

In [11]:
tickers2 = tickers[0:100]

In [25]:
dl = download_yfinance_data(tickers2, n_chunks=30,
                            num_workers=32,
                            yfinance_params={'threads':False, 'start': '2021-01-01'})

 *** Pulling yfinance data using 32 threads! ***
Running safer-parellel


In [99]:
tickers2 = tickers[0:100]

In [100]:
chunk_len = len(tickers2) // num_workers
ticker_chunks = [' '.join(tickers2[i:i+chunk_len]) for i in range(0, len(tickers2), chunk_len)]
chunk_len

3

In [13]:
def download_yfinance_data(tickers,
                           intervals_to_download=['1d', '1h'],
                           num_workers=1,
                           join_method='outer',
                           max_intraday_lookback_days=363,
                           n_chunks=600,
                           yfinance_params={}):
    """

    Parameters
    __________

    See yfinance.download docs for a detailed description of yfinance parameters

    tickers : list of tickers to pass to yfinance.download - it will be parsed to be in the format "AAPL MSFT FB"
    intervals_to_download : list of intervals to download OHLCV data for each stock (e.g. ['1w', '1d', '1h'])
    num_workers : number of threads used to download the data
        so far only 1 thread is implemented
    join_method : can be 'inner', 'left', 'right' or 'outer'
        if 'outer' then all dates will be present
        if 'left' then all dates from the left table will be present
        if 'right' then all dates from the right table will be present
        if 'inner' then all dates must match for each ticker
    **yfinance_params : dict - passed to yfinance.dowload(yfinance_params)
        set threads = True for faster performance, but tickers will fail, scipt may hang
        set threads = False for slower performance, but more tickers will succeed

    NOTE: passing some intervals return unreliable stock data (e.g. '3mo' returns many NA data points when they should not be NA)
    """

    intraday_lookback_days = datetime.datetime.today().date() - datetime.timedelta(days=max_intraday_lookback_days)
    start_date = yfinance_params['start']

    if num_workers == 1:

        list_of_dfs = []

        for i in intervals_to_download:

            yfinance_params['interval'] = i

            if (i.endswith('m') or i.endswith('h')) and (pd.to_datetime(yfinance_params['start']) < intraday_lookback_days):
                yfinance_params['start'] = str(intraday_lookback_days)

            if yfinance_params['threads'] == True:

                df_i = yfinance.download(' '.join(tickers), **yfinance_params)\
                               .stack()\
                               .rename_axis(index=['date', 'ticker'])\
                               .add_suffix('_' + i)\
                               .reset_index()
            else:

                ticker_chunks = [' '.join(tickers[i:i+n_chunks]) for i in range(0, len(tickers), n_chunks)]
                chunk_dfs_lst = []

                for chunk in ticker_chunks:
                    try:
                        df_tmp = yfinance.download(chunk, **yfinance_params)\
                                         .stack()\
                                         .rename_axis(index=['date', 'ticker'])\
                                         .add_suffix('_' + i)\
                                         .reset_index()
                        chunk_dfs_lst.append(df_tmp)
                    except simplejson.errors.JSONDecodeError:
                        pass

                df_i = pd.concat(chunk_dfs_lst)
                del chunk_dfs_lst
                yfinance_params['start'] = start_date

            if i.endswith('m') or i.endswith('h'):
                # Go long-to-wide on the min/hour bars
                df_i = df_i.pivot_table(index=[df_i['date'].dt.date, 'ticker'], columns=[df_i['date'].dt.hour], aggfunc='first',
                                        values=[i for i in df_i.columns if not i in ['date', 'ticker']])
                df_i.columns = list(pd.Index([str(e[0]).lower() + '_' + str(e[1]).lower() for e in df_i.columns.tolist()]).str.replace(' ', '_'))
                df_i.reset_index(inplace=True)
                df_i['date'] = pd.to_datetime(df_i['date']) # pivot table sets the index, and reset_index changes 'date' to an object

            df_i.columns = [col.replace(' ', '_').lower() for col in df_i.columns]

            list_of_dfs.append(df_i)

        df_yahoo = reduce(lambda x, y: pd.merge(x, y, how=join_method, on=['date', 'ticker']), list_of_dfs)
        date_plus_ticker = df_yahoo['date'].astype(str) + df_yahoo['ticker'].astype(str) # one last quality check to ensure date + ticker is unique

        assert len(date_plus_ticker) == len(set(date_plus_ticker)), i + ' date + ticker is not unique in df_yahoo!'

    else:

        print(' *** Pulling yfinance data using', num_workers, 'threads! ***')
        list_of_dfs = []
        chunk_len = len(tickers) // num_workers
        ticker_chunks = [' '.join(tickers[i:i+chunk_len]) for i in range(0, len(tickers), chunk_len)]

        for i in intervals_to_download:

            yfinance_params['interval'] = i

            if (i.endswith('m') or i.endswith('h')) and (pd.to_datetime(yfinance_params['start']) < intraday_lookback_days):
                yfinance_params['start'] = str(intraday_lookback_days)

            if yfinance_params['threads'] == True:

                print('Parallelizing using both dask and yfinance threads - some tickers may return a JSONDecodeError. If so, set threads to False in yfinance_params')
                
                delayed_list = [delayed(yfinance.download)(' '.join(chunk), **yfinance_params)\
                                                              .stack()\
                                                              .rename_axis(index=['date', 'ticker'])\
                                                              .add_suffix('_' + i)\
                                                              .reset_index()\
                                for chunk in ticker_chunks]
                # tuple_of_dfs = dask.compute(*delayed_list, num_workers=num_workers)
                # df_i = reduce(lambda x, y: pd.merge(x, y, how=join_method, on=['date', 'ticker']), tuple_of_dfs)
                # del tuple_of_dfs

            else:

                print('Running safer-parallel')

                def safe_yfinance_pull(ticker_chunks, yfinance_params):

                    chunk_dfs_lst = []

                    for chunk in ticker_chunks:
                        try:
                            df_tmp = yfinance.download(chunk, **yfinance_params)\
                                             .stack()\
                                             .rename_axis(index=['date', 'ticker'])\
                                             .add_suffix('_' + i)\
                                             .reset_index()
                            chunk_dfs_lst.append(df_tmp)
                        except simplejson.errors.JSONDecodeError:
                            pass

                    df_out = pd.concat(chunk_dfs_lst)
                    return df_out

                delayed_list = [delayed(safe_yfinance_pull)(chunk, yfinance_params) for chunk in ticker_chunks]

            tuple_of_dfs = dask.compute(*delayed_list, num_workers=num_workers)

            df_i = reduce(lambda x, y: pd.merge(x, y, how=join_method, on=['date', 'ticker']), tuple_of_dfs)
            del tuple_of_dfs
            yfinance_params['start'] = start_date

            if i.endswith('m') or i.endswith('h'):
                # Go long-to-wide on the min/hour bars
                df_i = df_i.pivot_table(index=[df_i['date'].dt.date, 'ticker'], columns=[df_i['date'].dt.hour], aggfunc='first',
                                        values=[i for i in df_i.columns if not i in ['date', 'ticker']])
                df_i.columns = list(pd.Index([str(e[0]).lower() + '_' + str(e[1]).lower() for e in df_i.columns.tolist()]).str.replace(' ', '_'))
                df_i.reset_index(inplace=True)
                df_i['date'] = pd.to_datetime(df_i['date']) # pivot table sets the index, and reset_index changes 'date' to an object

            df_i.columns = [col.replace(' ', '_').lower() for col in df_i.columns]

            list_of_dfs.append(df_i)

        df_yahoo = reduce(lambda x, y: pd.merge(x, y, how=join_method, on=['date', 'ticker']), list_of_dfs)
        date_plus_ticker = df_yahoo['date'].astype(str) + df_yahoo['ticker'].astype(str) # one last quality check to ensure date + ticker is unique

    return df_yahoo

In [16]:
df = download_yfinance_data(tickers2, num_workers=1, yfinance_params={'start':'2021-02-01', 'threads':True})

[*********************100%***********************]  100 of 100 completed
[*********************100%***********************]  100 of 100 completed


In [13]:
%%time
DOWNLOAD_YAHOO_DATA = False
if DOWNLOAD_YAHOO_DATA:
    df_yahoo = dd.from_pandas(download_yfinance_data(list(ticker_map['yahoo']), start='2006-01-01')) # all valid yahoo tickers
else:
    DF_YAHOO_FILEPATH = '/media/melgazar9/HDD_10TB/trading/data/yfinance/df_yahoo_2021-04-07.pq'
    NPARTITIONS=16
    if DF_YAHOO_FILEPATH.lower().endswith('pq') or DF_YAHOO_FILEPATH.lower().endswith('parquet'):
        df_yahoo = dd.read_parquet(DF_YAHOO_FILEPATH,
                                    npartitions=NPARTITIONS).compute()
    elif DF_YAHOO_FILEPATH.lower().endswith('feather'):
        df_yahoo = dd.from_pandas(delayed(feather.read_dataframe)(DF_YAHOO_FILEPATH).compute(),
                                   npartitions=NPARTITIONS).compute()

df_yahoo.tail(2)

CPU times: user 18.4 s, sys: 33.6 s, total: 52 s
Wall time: 31.3 s


Unnamed: 0,date,yahoo_ticker,adj_close_1d,close_1d,high_1d,low_1d,open_1d,volume_1d,adj_close_1h_0,adj_close_1h_1,...,volume_1h_15,volume_1h_16,volume_1h_17,volume_1h_18,volume_1h_19,volume_1h_20,volume_1h_21,volume_1h_22,volume_1h_23,bloomberg_ticker
17616895,2021-04-06,ZZZ.TO,31.879999,31.879999,32.240002,31.629999,31.98,1107286000.0,,,...,6800.0,3943.0,4128.0,6819.0,6188.0,,,,,ZZZ CN
17616896,2021-04-06,ZZZ.TO,31.879999,31.879999,32.240002,31.629999,31.98,1107286000.0,,,...,6800.0,3943.0,4128.0,6819.0,6188.0,,,,,ZZZ CN


In [14]:
df_yahoo.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 17616897 entries, 0 to 17616896
Columns: 153 entries, date to bloomberg_ticker
dtypes: datetime64[ns](1), float64(150), object(2)
memory usage: 20.2+ GB


### Map the yahoo tickers to bloomberg tickers in the ddf_yahoo
Set to True if downloading data. The mapping should already be saved in the dumped parquet file

In [15]:
CREATE_BLOOMBERG_TICKER_FROM_YAHOO = False
if CREATE_BLOOMBERG_TICKER_FROM_YAHOO:
    df_yahoo.loc[:, 'bloomberg_ticker'] = df_yahoo['yahoo_ticker'].map(dict(zip(ticker_map['yahoo'], ticker_map['bloomberg_ticker'])))

### Save df_yahoo to a feather or parquet file for faster loading

In [16]:
%%time
SAVE_DF_YAHOO_TO_FEATHER = False
SAVE_DF_YAHOO_TO_PARQUET = False

DF_YAHOO_OUTPATH = 'data/yfinance/df_yahoo_' + str(datetime.datetime.today().date())
if SAVE_DF_YAHOO_TO_FEATHER:
    df_yahoo.reset_index().to_feather(DF_YAHOO_OUTPATH + '.feather')
if SAVE_DF_YAHOO_TO_PARQUET:
    df_yahoo.to_parquet(DF_YAHOO_OUTPATH + '.pq')

CPU times: user 13 µs, sys: 23 µs, total: 36 µs
Wall time: 42.9 µs


### Load in the numerai targets

In [17]:
%%time
# targets_address = 'https://numerai-signals-public-data.s3-us-west-2.amazonaws.com/signals_train_val.csv' # old
targets_address = 'https://numerai-signals-public-data.s3-us-west-2.amazonaws.com/signals_train_val_bbg.csv'

targets = pd.read_csv(targets_address)\
            .assign(date = lambda df: pd.to_datetime(df['friday_date'], format='%Y%m%d'))
targets.tail(2)

CPU times: user 1.3 s, sys: 602 ms, total: 1.91 s
Wall time: 16.8 s


Unnamed: 0,bloomberg_ticker,friday_date,data_type,target,date
4299721,ZYXI US,20210326,validation,0.5,2021-03-26
4299722,ZZZ CN,20210326,validation,0.5,2021-03-26


In [18]:
targets['target'].value_counts(), targets['target'].value_counts(normalize=True)

(0.50    2151203
 0.25     859478
 0.75     859032
 1.00     215071
 0.00     214939
 Name: target, dtype: int64,
 0.50    0.500312
 0.25    0.199891
 0.75    0.199788
 1.00    0.050020
 0.00    0.049989
 Name: target, dtype: float64)

In [19]:
%%time
print(df_yahoo.shape)
print(df_yahoo.dropna().shape)
print(df_yahoo.dropna(axis=1).shape)
print(df_yahoo[[i for i in df_yahoo.columns if i.endswith('d')]].dropna().shape)

(17616897, 153)
(0, 153)
(17616897, 3)
(17551417, 6)
CPU times: user 10.2 s, sys: 2.06 s, total: 12.2 s
Wall time: 12.2 s


## First iteration (reduced dataset size)

### Merge targets into ddf_yahoo
- From an inner join on `['date', 'bloomberg_ticker']` we lose about 85% of rows. <br>
- If we drop rows with NAs we have 0 rows left no matter what. <br>
- The best bet seems to be an outer join without dropping NA rows.

#### Inner Join
- By doing an inner join we lose about 85% of the rows

In [20]:
%%time
# print('Before: ', df_yahoo.shape[0].compute(), df_yahoo.shape[1])
df_yahoo = pd.merge(df_yahoo, targets, on=['date', 'bloomberg_ticker'], how='inner')

# print('After: ', df_yahoo.shape[0].compute(), df_yahoo.shape[1])
df_yahoo.tail()

CPU times: user 5.14 s, sys: 1.03 s, total: 6.17 s
Wall time: 6.18 s


Unnamed: 0,date,yahoo_ticker,adj_close_1d,close_1d,high_1d,low_1d,open_1d,volume_1d,adj_close_1h_0,adj_close_1h_1,...,volume_1h_18,volume_1h_19,volume_1h_20,volume_1h_21,volume_1h_22,volume_1h_23,bloomberg_ticker,friday_date,data_type,target
2633674,2021-03-26,ZURN.SW,402.100006,402.100006,404.700012,400.899994,401.0,512509.0,,,...,,,,,,,ZURN SW,20210326,validation,0.25
2633675,2021-03-26,ZYXI,14.88,14.88,15.5,14.4,15.5,456200.0,,,...,65519.0,63185.0,,,,,ZYXI US,20210326,validation,0.5
2633676,2021-03-26,ZYXI,14.88,14.88,15.5,14.4,15.5,456200.0,,,...,65519.0,63185.0,,,,,ZYXI US,20210326,validation,0.5
2633677,2021-03-26,ZZZ.TO,32.060001,32.060001,32.060001,31.625,31.709999,40900.0,,,...,5822.0,5578.0,,,,,ZZZ CN,20210326,validation,0.5
2633678,2021-03-26,ZZZ.TO,32.060001,32.060001,32.060001,31.625,31.709999,40900.0,,,...,5822.0,5578.0,,,,,ZZZ CN,20210326,validation,0.5


In [21]:
%%time
df_yahoo.set_index('date', inplace=True)
df_yahoo.sort_index(inplace=True)

CPU times: user 5.11 ms, sys: 0 ns, total: 5.11 ms
Wall time: 4.58 ms


#### Drop rows where the daily prices are NA
By dropping rows where the daily prices are NA we lose 0% rows 

In [22]:
def drop_suffix_nas(df, col_suffix='1d', id_cols=['date', 'bloomberg_ticker']):
    
    df_ids = df[[col for col in df.columns \
                 if col.endswith(col_suffix) \
                 or col in id_cols]\
               ].dropna()[id_cols].isin(df[id_cols])
    
    df = df[df[id_cols].isin(df_ids[id_cols])]
    return df

In [23]:
%%time
DROP_1D_NAS = False
if DROP_1D_NAS:
    df_yahoo = drop_suffix_nas(df_yahoo, col_suffix='1d')

DROP_1H_NAS = False
if DROP_1H_NAS:
    df_yahoo = drop_suffix_nas(df_yahoo, col_suffix='1h')

CPU times: user 2 µs, sys: 3 µs, total: 5 µs
Wall time: 9.06 µs


# Create Features
### Create naive features

In [24]:
%%time
TICKERS = df_yahoo['bloomberg_ticker'].unique().tolist()
TICKERS[0:10]

CPU times: user 134 ms, sys: 4.7 ms, total: 138 ms
Wall time: 138 ms


['1 HK',
 '000100 KS',
 '2 HK',
 '000210 KS',
 '000240 KS',
 '000270 KS',
 '3 HK',
 '4 HK',
 '6 HK',
 '000660 KS']

In [25]:
def create_naive_features_single_symbol(df,\
                                        symbol='',\
                                        symbol_sep='',\
                                        open_col='open_1d',\
                                        high_col='high_1d',\
                                        low_col='low_1d',\
                                        close_col='adj_close_1d',\
                                        volume_col='volume_1d',\
                                        new_col_suffix='_1d',\
                                        copy=True):
    
    """
    Parameters
    __________ 
    
    df: Pandas-like / dask dataframe
        For the stacked yfinance data used for numerai, the syntax is <groupby('bloomberg_ticker').apply(func)>
    
    """
    
    if copy: df = df.copy()

    df['move' + new_col_suffix] = df[close_col] - df[open_col]
    df['move_pct' + new_col_suffix] = df['move' + new_col_suffix] / df[open_col]
    df['move_pct_change' + new_col_suffix] = df['move' + new_col_suffix].pct_change()
    df['open_minus_prev_close' + new_col_suffix] = df[open_col] - df[close_col].shift()
    df['prev_close_pct_chg' + new_col_suffix] = df['move' + new_col_suffix] / df[close_col].shift()

    df['range' + new_col_suffix] = df[high_col] - df[low_col]
    df['range_pct_change' + new_col_suffix] = df['range' + new_col_suffix].pct_change()

    df['high_move' + new_col_suffix] = df[high_col] - df[open_col]
    df['high_move_pct' + new_col_suffix] = df['high_move' + new_col_suffix] / df[open_col]
    df['high_move_pct_change' + new_col_suffix] = df['high_move' + new_col_suffix].pct_change()

    df['low_move' + new_col_suffix] = df[low_col] - df[open_col]
    df['low_move_pct' + new_col_suffix] = df['low_move' + new_col_suffix] / df[open_col]
    df['low_move_pct_change' + new_col_suffix] = df['low_move' + new_col_suffix].pct_change()

    df['volume_diff' + new_col_suffix] = df[volume_col] - df[volume_col].shift()
    df['volume_pct_change' + new_col_suffix] = df[volume_col].pct_change()

    df['close_minus_low' + new_col_suffix] = df[close_col] - df[low_col]
    df['high_minus_close' + new_col_suffix] = df[high_col] - df[close_col]

    df['prev_close_minus_low_minus' + new_col_suffix] = df[close_col].shift() - df[low_col]
    df['high_minus_prev_close' + new_col_suffix] = df[high_col] - df[close_col].shift()

    return df

In [26]:
%%time
df_yahoo = df_yahoo.groupby('bloomberg_ticker', group_keys=False).apply(create_naive_features_single_symbol)
df_yahoo.tail()

CPU times: user 1min 7s, sys: 2.85 s, total: 1min 10s
Wall time: 1min 10s


Unnamed: 0_level_0,yahoo_ticker,adj_close_1d,close_1d,high_1d,low_1d,open_1d,volume_1d,adj_close_1h_0,adj_close_1h_1,adj_close_1h_2,...,high_move_pct_change_1d,low_move_1d,low_move_pct_1d,low_move_pct_change_1d,volume_diff_1d,volume_pct_change_1d,close_minus_low_1d,high_minus_close_1d,prev_close_minus_low_minus_1d,high_minus_prev_close_1d
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2021-03-12,ZZZ.TO,30.799999,30.799999,30.82,30.0,30.59,123700.0,,,,...,0.0,-0.59,-0.019287,0.0,0.0,0.0,0.799999,0.02,0.799999,0.02
2021-03-19,ZZZ.TO,31.73,31.73,31.885,30.879999,31.27,138800.0,,,,...,1.673917,-0.390001,-0.012472,-0.338981,15100.0,0.12207,0.85,0.155001,-0.08,1.085001
2021-03-19,ZZZ.TO,31.73,31.73,31.885,30.879999,31.27,138800.0,,,,...,0.0,-0.390001,-0.012472,0.0,0.0,0.0,0.85,0.155001,0.85,0.155001
2021-03-26,ZZZ.TO,32.060001,32.060001,32.060001,31.625,31.709999,40900.0,,,,...,-0.43089,-0.084999,-0.002681,-0.782054,-97900.0,-0.705331,0.435001,0.0,0.105,0.330002
2021-03-26,ZZZ.TO,32.060001,32.060001,32.060001,31.625,31.709999,40900.0,,,,...,0.0,-0.084999,-0.002681,0.0,0.0,0.0,0.435001,0.0,0.435001,0.0


### Create my own rule based targets as a feature
These are things that I would be looking for before I make a trade

In [27]:
class CreateTargets():

    def __init__(self, df, copy = True):

        """
        Parameters
        __________

        df : pandas df
        copy : Boolean whether to make a copy of the df before applying transformations
        
        Note: to compute the target based on pct, pass the pct column names into the individual functions
        """

        self.df = df
        self.copy = copy

        if self.copy: self.df = self.df.copy()

    def create_targets_HL5(self,\
                           strong_buy,\
                           med_buy,\
                           med_sell,\
                           strong_sell,\
                           threshold,\
                           stop,\
                           move_col = 'move_pct',\
                           lm_col = 'low_move_pct',\
                           hm_col = 'high_move_pct',\
                           target_suffix = 'target_HL5'):


        # hm stands for high move, lm stands for low move
        # Strong Buy
        self.df.loc[(self.df[hm_col] >= strong_buy) &\
                            (self.df[lm_col] >= (-1)*stop),\
                            target_suffix] = 4

        # Strong Sell
        self.df.loc[(self.df[lm_col] <= (-1)*strong_sell) &\
                    (self.df[hm_col] <= stop) &\
                    (self.df[target_suffix] != 4),\
                    target_suffix] = 0

        # Medium Buy
        self.df.loc[(self.df[hm_col] >= med_buy) &\
                            (self.df[lm_col] >= (-1)*stop) &\
                            (self.df[target_suffix] != 4) &\
                            (self.df[target_suffix] != 0),\
                            target_suffix] = 3

        # Medium Sell
        self.df.loc[(self.df[lm_col] <= (-1)*med_sell) &\
                            (self.df[hm_col] <= stop) &\
                            (self.df[target_suffix] != 4) &\
                            (self.df[target_suffix] != 0) &\
                            (self.df[target_suffix] != 3),\
                            target_suffix] = 1

        # No Trade
        self.df.loc[(self.df[target_suffix] != 0) &\
                            (self.df[target_suffix] != 1) &\
                            (self.df[target_suffix] != 3) &\
                            (self.df[target_suffix] != 4),\
                            target_suffix] = 2


        return self.df


    def create_targets_HL3(self,\
                           buy,\
                           sell,\
                           threshold,\
                           stop,\
                           move_col = 'move_pct',\
                           lm_col = 'low_move_pct',\
                           hm_col = 'high_move_pct',\
                           target_suffix = 'target_HL3'):


        # hm stands for high move, lm stands for low move
        # Buy
        self.df.loc[(self.df[hm_col] >= buy) &\
                            (self.df[lm_col] >= (-1)*stop),\
                            target_suffix] = 2

        # Sell
        self.df.loc[(self.df[lm_col] <= (-1)*sell) &\
                            (self.df[hm_col] <= stop) &\
                            (self.df[target_suffix] != 2),\
                            target_suffix] = 0

        # No Trade
        self.df.loc[(self.df[target_suffix] != 0) &\
                            (self.df[target_suffix] != 2),\
                            target_suffix] = 1

        return self.df

In [28]:
%%time
df_yahoo = CreateTargets(df_yahoo).create_targets_HL3(buy=0.03,\
                                                      sell=0.03,\
                                                      threshold=0.25,\
                                                      stop=.01,\
                                                      move_col = 'move_pct_1d',\
                                                      lm_col = 'low_move_pct_1d',\
                                                      hm_col = 'high_move_pct_1d')

CPU times: user 378 ms, sys: 288 ms, total: 666 ms
Wall time: 666 ms


In [29]:
display(df_yahoo['target_HL3'].value_counts()), display(df_yahoo['target_HL3'].value_counts(normalize=True))

1.0    2224340
0.0     213419
2.0     195920
Name: target_HL3, dtype: int64

1.0    0.844575
0.0    0.081035
2.0    0.074390
Name: target_HL3, dtype: float64

(None, None)

In [30]:
%%time
df_yahoo = CreateTargets(df_yahoo).create_targets_HL5(strong_buy=0.035,\
                                                      med_buy=0.015,\
                                                      med_sell=0.015,\
                                                      strong_sell=0.035,\
                                                      threshold=0.25,\
                                                      stop=.025,\
                                                      move_col = 'move_pct_1d',\
                                                      lm_col = 'low_move_pct_1d',\
                                                      hm_col = 'high_move_pct_1d')
df_yahoo.tail()

CPU times: user 460 ms, sys: 285 ms, total: 745 ms
Wall time: 746 ms


Unnamed: 0_level_0,yahoo_ticker,adj_close_1d,close_1d,high_1d,low_1d,open_1d,volume_1d,adj_close_1h_0,adj_close_1h_1,adj_close_1h_2,...,low_move_pct_1d,low_move_pct_change_1d,volume_diff_1d,volume_pct_change_1d,close_minus_low_1d,high_minus_close_1d,prev_close_minus_low_minus_1d,high_minus_prev_close_1d,target_HL3,target_HL5
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2021-03-12,ZZZ.TO,30.799999,30.799999,30.82,30.0,30.59,123700.0,,,,...,-0.019287,0.0,0.0,0.0,0.799999,0.02,0.799999,0.02,1.0,1.0
2021-03-19,ZZZ.TO,31.73,31.73,31.885,30.879999,31.27,138800.0,,,,...,-0.012472,-0.338981,15100.0,0.12207,0.85,0.155001,-0.08,1.085001,1.0,3.0
2021-03-19,ZZZ.TO,31.73,31.73,31.885,30.879999,31.27,138800.0,,,,...,-0.012472,0.0,0.0,0.0,0.85,0.155001,0.85,0.155001,1.0,3.0
2021-03-26,ZZZ.TO,32.060001,32.060001,32.060001,31.625,31.709999,40900.0,,,,...,-0.002681,-0.782054,-97900.0,-0.705331,0.435001,0.0,0.105,0.330002,1.0,2.0
2021-03-26,ZZZ.TO,32.060001,32.060001,32.060001,31.625,31.709999,40900.0,,,,...,-0.002681,0.0,0.0,0.0,0.435001,0.0,0.435001,0.0,1.0,2.0


In [31]:
display(df_yahoo['target_HL5'].value_counts()), display(df_yahoo['target_HL5'].value_counts(normalize=True))

2.0    1069661
3.0     585406
1.0     585000
0.0     201187
4.0     192425
Name: target_HL5, dtype: int64

2.0    0.406147
3.0    0.222277
1.0    0.222123
0.0    0.076390
4.0    0.073063
Name: target_HL5, dtype: float64

(None, None)

## Create some more features before applying preprocessing

In [32]:
def create_lagging_features(df, lagging_map, groupby_cols=None, new_col_prefix='prev', copy=True):
    
    """
    
    Parameters
    __________
    
    df : pandas df
    groupby_cols : str or list of cols to groupby before creating lagging transformation cols
    lagging_map : dict with keys as colnames and values as a list of periods for computing lagging features
    periods : periods to look back

    """
    
    if copy: df = df.copy()

    unique_lagging_values = list(sorted({k for v in lagging_map.values() for k in v}))
    
    if groupby_cols is None or len(groupby_cols) == 0:
        for period in unique_lagging_values:
            new_col_prefix_tmp = new_col_prefix + str(period) + '_'
            cols_to_lag = [k for k,v in lagging_map.items() if period in v]
            df[[new_col_prefix_tmp + c for c in cols_to_lag]] = df[cols_to_lag].transform(lambda s: s.shift(periods=period))
    
    else:
        for period in unique_lagging_values:
            new_col_prefix_tmp = new_col_prefix + str(period) + '_'
            cols_to_lag = [k for k,v in lagging_map.items() if period in v]
            
            df[[new_col_prefix_tmp + c for c in cols_to_lag]] = df.groupby(groupby_cols)[cols_to_lag]\
                                                                  .transform(lambda s: s.shift(periods=period))
    return df

In [33]:
LAGGING_MAP = {'target': [1, 2, 3, 4, 5],\
               'target_HL5': [1, 2, 3, 4, 5],\
               'volume_1d': [1, 2, 3, 4, 5],\
               'adj_close_1d' : [1, 2, 3, 4, 5],\
               'move_1d':[1,2,3,4,5]}
LAGGING_MAP

{'target': [1, 2, 3, 4, 5],
 'target_HL5': [1, 2, 3, 4, 5],
 'volume_1d': [1, 2, 3, 4, 5],
 'adj_close_1d': [1, 2, 3, 4, 5],
 'move_1d': [1, 2, 3, 4, 5]}

In [34]:
%%time
df_yahoo = create_lagging_features(df_yahoo, groupby_cols='bloomberg_ticker', lagging_map=LAGGING_MAP)
df_yahoo.tail()

CPU times: user 41.9 s, sys: 4.17 s, total: 46.1 s
Wall time: 46.1 s


Unnamed: 0_level_0,yahoo_ticker,adj_close_1d,close_1d,high_1d,low_1d,open_1d,volume_1d,adj_close_1h_0,adj_close_1h_1,adj_close_1h_2,...,prev4_target,prev4_target_HL5,prev4_volume_1d,prev4_adj_close_1d,prev4_move_1d,prev5_target,prev5_target_HL5,prev5_volume_1d,prev5_adj_close_1d,prev5_move_1d
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2021-03-12,ZZZ.TO,30.799999,30.799999,30.82,30.0,30.59,123700.0,,,,...,0.75,3.0,106100.0,26.93,0.6,0.75,3.0,106100.0,26.93,0.6
2021-03-19,ZZZ.TO,31.73,31.73,31.885,30.879999,31.27,138800.0,,,,...,0.5,1.0,340600.0,30.549999,-0.380001,0.75,3.0,106100.0,26.93,0.6
2021-03-19,ZZZ.TO,31.73,31.73,31.885,30.879999,31.27,138800.0,,,,...,0.5,1.0,340600.0,30.549999,-0.380001,0.5,1.0,340600.0,30.549999,-0.380001
2021-03-26,ZZZ.TO,32.060001,32.060001,32.060001,31.625,31.709999,40900.0,,,,...,0.75,1.0,123700.0,30.799999,0.209999,0.5,1.0,340600.0,30.549999,-0.380001
2021-03-26,ZZZ.TO,32.060001,32.060001,32.060001,31.625,31.709999,40900.0,,,,...,0.75,1.0,123700.0,30.799999,0.209999,0.75,1.0,123700.0,30.799999,0.209999


### Create rolling features

In [35]:
def create_rolling_features(df,\
                            rolling_fn,\
                            rolling_params,\
                            ewm_fn,\
                            ewm_params,\
                            rolling_cols = 'all_numeric',\
                            ewm_cols='all_numeric',\
                            join_method='outer',\
                            groupby_cols=None,\
                            create_diff_cols=True,
                            copy=True):
    
    """
    
    Parameters
    __________
    df : pandas df
    
    rolling_fn : str called from df.rolling().rolling_fn (e.g. df.rolling.mean() is called with getattr)
    rolling_params : dict params passed to df.rolling()
    
    ewm_fn : str called from df.ewm().ewm_fn (e.g. df.ewm.mean() is called with getattr)
    ewm_params : dict params passed to df.ewm()
    
    rolling_cols : cols to apply rolling_fn
    ewm_cols : cols to apply ewm_fn
    
    join_method : str 'inner', 'outer', 'left', or 'right' - how to join the dfs
    groupby_cols : list or str cols to group by before applying rolling transformations
        example: pass groupby_cols to the stacked ticker numerai dataset, but not a wide df 
    
    copy : bool whether or not to make a copy of the df
    
    """
    
    if copy: df = df.copy()
    
    if isinstance(rolling_cols, str) and rolling_cols.lower() == 'all_numeric':
        rolling_cols = list(df.select_dtypes(include=np.number).columns)
    
    if isinstance(rolling_cols, str) and ewm_cols.lower() == 'all_numeric':
        ewm_cols = list(df.select_dtypes(include=np.number).columns)
    
    lag_dfs_lst = []
    
    if groupby_cols is None or len(groupby_cols) == 0:
        
        # rolling
        lag_dfs_lst.append(getattr(df[rolling_cols].rolling(**rolling_params), rolling_fn)().add_suffix('_rolling_' + rolling_fn))
        
        # ewm
        lag_dfs_lst.append(getattr(df[ewm_cols].ewm(**ewm_params), ewm_fn)().add_suffix('_ewm_' + ewm_fn))
    else:
        
        if isinstance(groupby_cols, list):
            assert(len(groupby_cols) == len(set(groupby_cols))), 'There are duplicates in groupby_cols!'
            rolling_cols_to_select = [i for i in list(set(groupby_cols + rolling_cols)) if i in df.columns] # could be index name
            ewm_cols_to_select = [i for i in list(set(groupby_cols + ewm_cols)) if i in df.columns] # could be index name
        elif isinstance(groupby_cols, str):
            rolling_cols_to_select = [i for i in list(set([groupby_cols] + rolling_cols)) if i in df.columns]
            ewm_cols_to_select = [i for i in list(set([groupby_cols] + ewm_cols)) if i in df.columns]
        else:
            raise('Input param groupby_cols is not a list, string, or None!')
        
        # rolling
        lag_dfs_lst.append(
            df[rolling_cols_to_select].\
            groupby(groupby_cols).\
            apply(lambda x: getattr(x.rolling(**rolling_params), rolling_fn)()).\
            add_suffix('_rolling_' + rolling_fn)\
        )
        
        # ewm
        lag_dfs_lst.append(
            df[ewm_cols_to_select].\
            groupby(groupby_cols).\
            apply(lambda x: getattr(x.ewm(**ewm_params), ewm_fn)()).\
            add_suffix('_ewm_' + ewm_fn)\
        )

    df_lag = reduce(lambda x, y: pd.merge(x, y, how=join_method, left_index=True, right_index=True), lag_dfs_lst)    
    del lag_dfs_lst
    df = pd.merge(df, df_lag, how=join_method, left_index=True, right_index=True)
    
    del df_lag
    
    if create_diff_cols:
        if groupby_cols is None or len(groupby_cols) == 0:
            df = pd.concat([df, df[[i for i in df.columns if 'ewm' in i or 'rolling' in i]].diff().add_suffix('_diff')], axis=1)
        else:
            diff_cols = [i for i in df.columns if 'ewm' in i or 'rolling' in i]
            df[[i + '_diff' for i in diff_cols]] = df.groupby(groupby_cols)[diff_cols].transform(lambda col: col.diff())
    return df

In [57]:
gc.collect()

20

#### Note: Below will use over 130gb of ram if running through jupyter notebook. This notebook will be converted to a py script, which is less memory greedy

In [None]:
%%time
df_yahoo = create_rolling_features(df_yahoo,\
                                   rolling_params={'window':30},\
                                   rolling_fn='mean',\
                                   ewm_params={'com':.5},\
                                   ewm_fn='mean',\
                                   rolling_cols = ['open_1d', 'high_1d', 'low_1d', 'adj_close_1d', 'volume_1d', 'prev1_target', 'prev1_target_HL5'],\
                                   ewm_cols = ['open_1d', 'high_1d', 'low_1d', 'adj_close_1d', 'volume_1d', 'prev1_target', 'prev1_target_HL5'],\
                                   join_method='outer',\
                                   groupby_cols = 'bloomberg_ticker',\
                                   create_diff_cols=True)
df_yahoo.tail()

### This is a good checkpoint to save the df

In [56]:
df_yahoo.reset_index(drop=True).to_feather(OUTPUT_PATH + 'df_numerai_' + str(datetime.datetime.today().date()) + '.feather')

In [8]:
import pandas as pd
import numpy as np

In [40]:
df = pd.DataFrame({'price': [127, 128, 131, 132, 133, 132, 130, 130, 128, 127, 126, 129]})
df['price_diff'] = df['price'].diff()
df

Unnamed: 0,price,price_diff
0,127,
1,128,1.0
2,131,3.0
3,132,1.0
4,133,1.0
5,132,-1.0
6,130,-2.0
7,130,0.0
8,128,-2.0
9,127,-1.0


In [51]:
def calc_move_iar(df, iar_cols, iar_suffix='_iar', copy=True):

    if copy: df = df.copy()

    tmp1 = df[iar_cols].transform(lambda x: x.cumsum().sub(x.cumsum().mask(x >= 0).ffill(), fill_value=0), axis=0).replace(0, np.nan)
    tmp2 = df[iar_cols].transform(lambda x: x.cumsum().sub(x.cumsum().mask(x <= 0).ffill(), fill_value=0), axis=0).replace(0, np.nan)
    
    assert isinstance(iar_cols, str) or isinstance(iar_cols, list), 'iar_cols must be a str or list!'
    
    if isinstance(iar_cols, str):
        df[iar_cols + iar_suffix] = tmp1.fillna(tmp2).ffill()
    else:
        df[[i + iar_suffix for i in iar_cols]] = tmp1.fillna(tmp2).ffill()

    return df

In [52]:
calc_move_iar(df, 'price_diff')

Unnamed: 0,price,price_diff,price_diff_iar
0,127,,
1,128,1.0,1.0
2,131,3.0,4.0
3,132,1.0,5.0
4,133,1.0,6.0
5,132,-1.0,-1.0
6,130,-2.0,-3.0
7,130,0.0,-3.0
8,128,-2.0,-5.0
9,127,-1.0,-6.0


In [54]:
calc_move_iar(df, 2)

KeyError: 2

In [53]:
calc_move_iar(df, ['price','price_diff'])

Unnamed: 0,price,price_diff,price_iar,price_diff_iar
0,127,,127.0,
1,128,1.0,255.0,1.0
2,131,3.0,386.0,4.0
3,132,1.0,518.0,5.0
4,133,1.0,651.0,6.0
5,132,-1.0,783.0,-1.0
6,130,-2.0,913.0,-3.0
7,130,0.0,1043.0,-3.0
8,128,-2.0,1171.0,-5.0
9,127,-1.0,1298.0,-6.0


In [39]:
df[['price_iar', 'price_diff_iar']] = calc_move_iar(df, ['price','price_diff'])
df

ValueError: Columns must be same length as key

In [10]:
iar_col='price_diff'

In [31]:
tmp1 = df[iar_col].transform(lambda x: x.cumsum().sub(x.cumsum().mask(x >= 0).ffill(), fill_value=0), axis=0).replace(0, np.nan)
tmp2 = df[iar_col].transform(lambda x: x.cumsum().sub(x.cumsum().mask(x <= 0).ffill(), fill_value=0), axis=0).replace(0, np.nan)
df['move_iar'] = tmp1.fillna(tmp2).ffill()

df

Unnamed: 0,price,price_diff,move_iar
0,127,,
1,128,1.0,1.0
2,131,3.0,4.0
3,132,1.0,5.0
4,133,1.0,6.0
5,132,-1.0,-1.0
6,130,-2.0,-3.0
7,130,0.0,-3.0
8,128,-2.0,-5.0
9,127,-1.0,-6.0


0     NaN
1     NaN
2     NaN
3     NaN
4     NaN
5    -1.0
6    -3.0
7    -3.0
8    -2.0
9    -3.0
10   -4.0
11    3.0
Name: price_diff, dtype: float64

In [None]:
df[iar_col] = tmp1.fillna(tmp2).ffill()

### There are a lot of missing targets. What do we do with them?
- This becomes a semi-supervised learning problem since there is likely predictive information where there is no numerai target <br>
- To fill them in, I'm going to take an educated guess and say that Numerai's targets are created based on profitable up moves in the market. <br>
- The target they created is likely the following multi-class groups: **strong-short**, **short**, **no-trade**, **buy**, **strong-buy** - Let's find out

In [84]:
tickers_with_target = df_yahoo.loc[df_yahoo['target'].notnull(), 'bloomberg_ticker'].unique().tolist()
tickers_without_target = df_yahoo.loc[df_yahoo['target'].isnull(), 'bloomberg_ticker'].unique().tolist()
len(tickers_with_target), len(tickers_without_target)

(5337, 0)

In [None]:
ticker_groups = full_data.groupby('ticker')

#create lagged features, lag 0 is that day's value, lag 1 is yesterday's value, etc
num_days = 5
for day in range(num_days+1):
    full_data[f'RSI_quintile_lag_{day}'] = ticker_groups['RSI_quintile'].transform(lambda group: group.shift(day))
full_data.tail()

In [None]:
# create difference of the lagged features (change in RSI quintile by day)
for day in range(num_days):
    full_data[f'RSI_diff_{day}'] = full_data[f'RSI_quintile_lag_{day}'] - full_data[f'RSI_quintile_lag_{day + 1}']
    full_data[f'RSI_abs_diff_{day}'] = np.abs(full_data[f'RSI_quintile_lag_{day}'] - full_data[f'RSI_quintile_lag_{day + 1}'])

In [None]:
feature_names = [f'RSI_quintile_lag_{num}' for num in range(num_days)] + [f'RSI_diff_{num}' for num in range(num_days)] + [f'RSI_abs_diff_{num}' for num in range(num_days)]
print(f'Features for training:\n {feature_names}')

In [None]:
TARGET_NAME = 'target'

In [None]:
# read in Signals targets
numerai_targets = 'https://numerai-signals-public-data.s3-us-west-2.amazonaws.com/signals_train_val.csv'
targets = pd.read_csv(numerai_targets)
targets['date'] = pd.to_datetime(targets['friday_date'], format='%Y%m%d')
targets.head()

In [None]:
# the number of tickers per era has generally increased
targets.groupby('date').apply(lambda x: len(x)).plot(kind='line', figsize=(10,4), title='Number of tickers per era')

In [None]:
# the target classes are imbalanced, but we can treat this like a regression problem
targets.target.value_counts()

In [None]:
# the imbalance is consistent across eras with a constant class ratio of: 5%, 20%, 50%, 20%, 5%
pivot_target = targets.groupby(['date','target']).apply(lambda x: len(x)).reset_index(1).pivot(columns='target',values=0)
pivot_target.iloc[::20].plot(kind='bar', stacked=True, figsize=(9,3), title='Number of tickers in each class per era')

stacked_data = pivot_target.apply(lambda x: x/sum(x), axis=1)
stacked_data.iloc[::20].plot(kind='bar', stacked=True, figsize=(9,3), title='Proportion of tickers in each class per era')

In [None]:
targets.head()

In [None]:
# merge our feature data with Numerai targets
ML_data = pd.merge(full_data.reset_index(), targets, on=['date','ticker']).set_index('date')
# print(f'Number of eras in data: {len(ML_data.index.unique())}')

# for training and testing we want clean, complete data only
ML_data.dropna(inplace=True)
ML_data = ML_data[ML_data.index.weekday==4] # ensure we have only fridays
ML_data = ML_data[ML_data.index.value_counts() > 200] # drop eras with under 200 observations per era

In [None]:
print(f'Number of eras in data: {len(ML_data.index.unique())}')
ML_data.head()

In [None]:
train_data = ML_data[ML_data['data_type'] == 'train']
test_data = ML_data[ML_data['data_type'] == 'validation']

In [None]:
model = GradientBoostingRegressor()
model.fit(train_data[feature_names], train_data['target'])

In [None]:
plt.figure(figsize=(15,3))
plt.bar(feature_names, model.feature_importances_)
plt.xticks(rotation=70)
plt.show()

In [None]:
PREDICTION_NAME = 'prediction'

In [None]:
train_data[PREDICTION_NAME] = model.predict(train_data[feature_names])
test_data[PREDICTION_NAME] = model.predict(test_data[feature_names])

#show prediction distribution, most should around the center
test_data[PREDICTION_NAME].hist(bins=30)

In [None]:
def score(df):
    '''Takes df and calculates spearm correlation from pre-defined cols'''
    # method="first" breaks ties based on order in array
    return np.corrcoef(
        df[TARGET_NAME],
        df[PREDICTION_NAME].rank(pct=True, method="first")
    )[0,1]

def run_analytics(era_scores):
    print(f"Mean Correlation: {era_scores.mean():.4f}")
    print(f"Median Correlation: {era_scores.median():.4f}")
    print(f"Standard Deviation: {era_scores.std():.4f}")
    print('\n')
    print(f"Mean Pseudo-Sharpe: {era_scores.mean()/era_scores.std():.4f}")
    print(f"Median Pseudo-Sharpe: {era_scores.median()/era_scores.std():.4f}")
    print('\n')
    print(f'Hit Rate (% positive eras): {era_scores.apply(lambda x: np.sign(x)).value_counts()[1]/len(era_scores):.2%}')

    era_scores.rolling(10).mean().plot(kind='line', title='Rolling Per Era Correlation Mean', figsize=(15,4))
    plt.axhline(y=0.0, color="r", linestyle="--"); plt.show()

    era_scores.cumsum().plot(title='Cumulative Sum of Era Scores', figsize=(15,4))
    plt.axhline(y=0.0, color="r", linestyle="--"); plt.show()

In [None]:
# spearman scores by era
train_era_scores = train_data.groupby(train_data.index).apply(score)
test_era_scores = test_data.groupby(test_data.index).apply(score)

In [None]:
#train scores, in-sample and will be significantly overfit
run_analytics(train_era_scores)

In [None]:
#test scores, out of sample
run_analytics(test_era_scores)

In [None]:
# choose data as of most recent friday
last_friday = datetime.now() + relativedelta(weekday=FR(-1))
date_string = last_friday.strftime('%Y-%m-%d')

live_data = full_data.loc[date_string].copy()
live_data.dropna(subset=feature_names, inplace=True)

In [None]:
print(f"Number of live tickers to submit: {len(live_data)}")

In [None]:
live_data.tail()

In [None]:
live_data[PREDICTION_NAME] = model.predict(live_data[feature_names])

In [None]:
diagnostic_df = pd.concat([test_data, live_data])
diagnostic_df.tail()

In [None]:
diagnostic_df['friday_date'] = diagnostic_df.friday_date.fillna(last_friday.strftime('%Y%m%d')).astype(int)
diagnostic_df['data_type'] = diagnostic_df.data_type.fillna('live')
diagnostic_df[['ticker','friday_date','data_type','prediction']].reset_index(drop=True).to_csv('example_signal_upload.csv', index=False)
diagnostic_df.tail()

In [None]:
# format predictions to match Numerai submission format
predictions = live_data[['ticker', PREDICTION_NAME]].copy()

# choose account
ACCOUNT_NAME = 'ENTER_ACCOUNT_NAME'

# write predictions to csv
live_data[['ticker', PREDICTION_NAME]].to_csv(f"{ACCOUNT_NAME} {datetime.now().strftime('%Y%m%d')}.csv", index=False)

In [None]:
def submit_model(account_name):
    filename = f"{account_name} {datetime.now().strftime('%Y%m%d')}.csv"
    model_id = napi.get_models()[f'{account_name}']
    submission = napi.upload_predictions(filename, model_id=model_id)
    print(submission)

In [None]:
submit_model(ACCOUNT_NAME)

In [55]:
import sys
def sizeof_fmt(num, suffix='B'):
    ''' by Fred Cirera,  https://stackoverflow.com/a/1094933/1870254, modified'''
    for unit in ['','Ki','Mi','Gi','Ti','Pi','Ei','Zi']:
        if abs(num) < 1024.0:
            return "%3.1f %s%s" % (num, unit, suffix)
        num /= 1024.0
    return "%.1f %s%s" % (num, 'Yi', suffix)

for name, size in sorted(((name, sys.getsizeof(value)) for name, value in locals().items()),
                         key= lambda x: -x[1])[:10]:
    print("{:>30}: {:>8}".format(name, sizeof_fmt(size)))

                      df_yahoo:  4.4 GiB
                       targets: 623.9 MiB
                    ticker_map:  1.0 MiB
              eligible_tickers: 338.2 KiB
                 valid_tickers: 47.3 KiB
                       TICKERS: 42.0 KiB
                           _40:  9.1 KiB
                           _34:  8.7 KiB
                           _30:  7.8 KiB
                           _26:  7.7 KiB


In [26]:
import inspect as i
import sys
sys.stdout.write(i.getsource(download_yfinance_data))

def download_yfinance_data(tickers,
                           intervals_to_download=['1d', '1h'],
                           num_workers=1,
                           join_method='outer',
                           max_intraday_lookback_days=363,
                           **yfinance_params):
    """
    Parameters
    __________

    See yfinance.download docs for a detailed description of yfinance parameters

    tickers : string separated by space tickers to pass to yfinance.download (e.g. "AAPL MSFT FB")
    intervals_to_download : list of intervals to download OHLCV data for each stock (e.g. ['1w', '1d', '1h'])
    num_workers : number of threads used to download the data
        so far only 1 thread is implemented
    join_method : can be 'inner', 'left', 'right' or 'outer'
        if 'outer' then all dates will be present
        if 'left' then all dates from the left most table will be present
        if 'right' then all dates from the left most table will be present
        i