# This notebook builds the stock market / numerai dataset using dask

### Imports

In [1]:
import os

### Only run the below if on google colab notebook

In [3]:
# from google.colab import drive
# drive.mount('/content/gdrive/')

In [4]:
# sys.path.append('/content/gdrive/trading/dev/scripts/')

In [5]:
# from gdrive.MyDrive.trading.dev.scripts.ML_utils import *

### Only run if on local machine

In [2]:
os.chdir('../..') # local

In [3]:
os.environ['NUMEXPR_MAX_THREADS'] = '32'
os.environ['NUMEXPR_NUM_THREADS'] = '16'

In [4]:
%%time
from dev.scripts.ML_utils import * # run if on local machine
from dev.scripts.numerai_utils import *

Wall time: 7.28 s


## Global Variables

#### Read in the numerai keys via config parser

In [9]:
import configparser
config = configparser.ConfigParser()
# ConfigParser.read() silently skips files it cannot open and returns the list
# of files it actually parsed. Fail here with a clear message instead of with a
# KeyError on config['KEYS'] further down.
config_files_read = config.read('numerai/numerai_keys.ini')
if not config_files_read:
    raise FileNotFoundError("Could not read 'numerai/numerai_keys.ini' (path is relative to the current working directory)")
config_files_read

['numerai/numerai_keys.ini']

In [12]:
%%time
# Flags for the (optional) Numerai tournament dataset.
# DOWNLOAD_... fetches and unzips the current dataset; USE_... loads the training csv
# from DF_NUMERAI_COMP_TRAIN_PATH. They are independent, so previously downloaded data
# can be loaded without downloading it again.
DOWNLOAD_NUMERAI_COMPETITION_DATA = False
USE_NUMERAI_COMPETITION_DATA = False

# NOTE(review): machine-specific absolute path - move to a configurable data directory.
DF_NUMERAI_COMP_TRAIN_PATH = 'C:/Users/Matt/trading/numerai/data/numerai_dataset_255/numerai_training_data.csv' # local

# Signals client; used below for the ticker universe and for uploads.
napi = numerapi.SignalsAPI(config['KEYS']['NUMERAI_PUBLIC_KEY'], config['KEYS']['NUMERAI_SECRET_KEY'])

# download data
if DOWNLOAD_NUMERAI_COMPETITION_DATA:
    # download_current_dataset() belongs to the tournament client (numerapi.NumerAPI),
    # not to SignalsAPI, so a dedicated client is created for the download.
    numerapi.NumerAPI(config['KEYS']['NUMERAI_PUBLIC_KEY'],
                      config['KEYS']['NUMERAI_SECRET_KEY']).download_current_dataset(unzip=True)

if USE_NUMERAI_COMPETITION_DATA:
    df_numerai_comp = dd.read_csv(DF_NUMERAI_COMP_TRAIN_PATH).compute()
    # an expression inside an `if` is not auto-displayed by the notebook
    display(df_numerai_comp.tail(2))

Wall time: 1.01 ms


## Load in eligible tickers

In [13]:
# Universe of tickers currently accepted by Numerai Signals
eligible_tickers = pd.Series(napi.ticker_universe(), name='ticker')
print(f"Number of eligible tickers: {len(eligible_tickers)}")

# Public ticker map, restricted to rows whose bloomberg ticker is in the universe
ticker_map = (
    pd.read_csv('https://numerai-signals-public-data.s3-us-west-2.amazonaws.com/signals_ticker_map_w_bbg.csv')
    .loc[lambda mapping: mapping['bloomberg_ticker'].isin(eligible_tickers)]
)
print(f"Number of eligible tickers in map: {len(ticker_map)}")
ticker_map.tail(2)

Number of eligible tickers: 5431
Number of eligible tickers in map: 5431


Unnamed: 0,ticker,bloomberg_ticker,yahoo
5429,ZYXI,ZYXI US,ZYXI
5430,ZZZ.,ZZZ CN,ZZZ.TO


#### Remove null tickers

In [11]:
# A yahoo ticker is valid when it is not missing and is not the literal text 'nan' / 'null'
valid_tickers = ticker_map['yahoo'][
    ticker_map['yahoo'].notnull()
    & ~ticker_map['yahoo'].astype(str).str.lower().isin(['nan', 'null'])
].tolist()

print('tickers before:', ticker_map.shape) # before removing bad tickers
ticker_map = ticker_map.loc[ticker_map['yahoo'].isin(valid_tickers)]
print('tickers after:', ticker_map.shape)

tickers before: (5431, 3)
tickers after: (5380, 3)


## Download yahoo finance data in the expected numerai format using the yfinance library
Yahoo Finance wrappers: https://github.com/ranaroussi/yfinance and https://pypi.org/project/yfinance/. <br>
This takes ~2 hours on a single thread

### Convert the yahoo df to a dask df
- If we don't do this the computation will be very slow. There are ~20 million rows of daily data alone. <br>
- Once I add in intraday data and create additional features, it is necessary to use a lazy computation such as dask or spark. <br>

- Lastly, we'll merge in the numerai target variable and save the ddf as a parquet file

In [28]:
%%time
# Build ddf_yahoo either from a fresh yfinance download or from a previously saved file.
DOWNLOAD_YAHOO_DATA = False
NPARTITIONS = 16  # defined up front because every branch below needs it
if DOWNLOAD_YAHOO_DATA:
    # dd.from_pandas needs an explicit partitioning (npartitions or chunksize)
    ddf_yahoo = dd.from_pandas(download_yfinance_data(list(ticker_map['yahoo']), start='2006-01-01'), # all yahoo tickers
                               npartitions=NPARTITIONS)
else:
    DF_YAHOO_FILEPATH = 'data/yfinance/df_yahoo_2021-04-07.pq'
    if DF_YAHOO_FILEPATH.lower().endswith(('pq', 'parquet')):
        # NOTE(review): `npartitions` is not a documented dd.read_parquet argument - confirm it has
        # any effect; use .repartition(npartitions=NPARTITIONS) if a fixed partition count is needed.
        ddf_yahoo = dd.read_parquet(DF_YAHOO_FILEPATH,
                                    npartitions=NPARTITIONS)
    elif DF_YAHOO_FILEPATH.lower().endswith('feather'):
        # read the configured file (previously a different, hard-coded feather path was loaded)
        ddf_yahoo = dd.from_pandas(feather.read_dataframe(DF_YAHOO_FILEPATH),
                                   npartitions=NPARTITIONS)
    else:
        # without this, ddf_yahoo would be left undefined (or stale from an earlier run)
        raise ValueError(f"Unsupported file extension for DF_YAHOO_FILEPATH: {DF_YAHOO_FILEPATH!r}")

ddf_yahoo.tail()

Wall time: 4.23 s


Unnamed: 0,date,yahoo_ticker,adj_close_1d,close_1d,high_1d,low_1d,open_1d,volume_1d,adj_close_1h_0,adj_close_1h_1,...,volume_1h_15,volume_1h_16,volume_1h_17,volume_1h_18,volume_1h_19,volume_1h_20,volume_1h_21,volume_1h_22,volume_1h_23,bloomberg_ticker
17616892,2021-04-06,ZURN.SW,406.200012,406.200012,410.799988,405.899994,409.0,1617723000000.0,,,...,60604.0,,,,,,,,,ZURN SW
17616893,2021-04-06,ZYXI,15.29,15.29,15.42,14.86,15.38,285869.0,,,...,100865.0,8730.0,33090.0,19464.0,49338.0,,,,,ZYXI US
17616894,2021-04-06,ZYXI,15.29,15.29,15.42,14.86,15.38,285869.0,,,...,100865.0,8730.0,33090.0,19464.0,49338.0,,,,,ZYXI US
17616895,2021-04-06,ZZZ.TO,31.879999,31.879999,32.240002,31.629999,31.98,1107286000.0,,,...,6800.0,3943.0,4128.0,6819.0,6188.0,,,,,ZZZ CN
17616896,2021-04-06,ZZZ.TO,31.879999,31.879999,32.240002,31.629999,31.98,1107286000.0,,,...,6800.0,3943.0,4128.0,6819.0,6188.0,,,,,ZZZ CN


In [32]:
%%time

# Materialising the whole frame with ddf_yahoo.compute() exhausts memory, so every
# count below is reduced by dask and only scalars / one small Series reach pandas.
daily_cols = [i for i in ddf_yahoo.columns if i.endswith('d')]
n_rows = len(ddf_yahoo)
n_cols = len(ddf_yahoo.columns)
null_counts = ddf_yahoo.isnull().sum().compute()  # one entry per column

print((n_rows, n_cols))                                     # full shape
print((len(ddf_yahoo.dropna()), n_cols))                    # rows without any NA
print((n_rows, int((null_counts == 0).sum())))              # columns without any NA
print((len(ddf_yahoo[daily_cols].dropna()), len(daily_cols)))  # columns ending in 'd', rows without NA

ArrowMemoryError: malloc of size 1321268416 failed

In [36]:
ddf_yahoo.info()

<class 'dask.dataframe.core.DataFrame'>
Columns: 153 entries, date to bloomberg_ticker
dtypes: datetime64[ns](1), object(2), float64(150)

### Map the yahoo tickers to bloomberg tickers in the ddf_yahoo
Only set `SET_BLOOMBERG_TICKERS_AS_INDEX` to True for freshly downloaded data. Leave it False when reading the dumped parquet file - I already saved the bloomberg ticker in it.

In [37]:
def bloomberg_yahoo_tickermap(df,
                              ticker_map_dict = None):
    """
    Add a 'bloomberg_ticker' column derived from the 'yahoo_ticker' column.

    Parameters
    __________
    df : pandas DataFrame (e.g. one dask partition) with a 'yahoo_ticker' column
    ticker_map_dict : dict mapping yahoo ticker -> bloomberg ticker
        if None, the mapping is built from the notebook-level `ticker_map` when the
        function is called, so it cannot go stale if `ticker_map` is rebuilt after
        this cell has run

    Returns
    _______
    A new DataFrame; the input is not modified. Yahoo tickers missing from the
    mapping get a null bloomberg ticker.
    """
    if ticker_map_dict is None:
        ticker_map_dict = dict(zip(ticker_map['yahoo'], ticker_map['bloomberg_ticker']))

    # assign() returns a copy, so the partition handed in by map_partitions is left untouched
    return df.assign(bloomberg_ticker=df['yahoo_ticker'].map(ticker_map_dict))

In [38]:
# NOTE(review): despite its name, this flag does not set an index here - when True it
# applies bloomberg_yahoo_tickermap to every partition, which writes a
# 'bloomberg_ticker' column.
SET_BLOOMBERG_TICKERS_AS_INDEX = False
if SET_BLOOMBERG_TICKERS_AS_INDEX:
    ddf_yahoo = ddf_yahoo.map_partitions(bloomberg_yahoo_tickermap)

ddf_yahoo.tail()

Unnamed: 0,date,yahoo_ticker,adj_close_1d,close_1d,high_1d,low_1d,open_1d,volume_1d,adj_close_1h_0,adj_close_1h_1,...,volume_1h_15,volume_1h_16,volume_1h_17,volume_1h_18,volume_1h_19,volume_1h_20,volume_1h_21,volume_1h_22,volume_1h_23,bloomberg_ticker
17616892,2021-04-06,ZURN.SW,406.200012,406.200012,410.799988,405.899994,409.0,1617723000000.0,,,...,60604.0,,,,,,,,,ZURN SW
17616893,2021-04-06,ZYXI,15.29,15.29,15.42,14.86,15.38,285869.0,,,...,100865.0,8730.0,33090.0,19464.0,49338.0,,,,,ZYXI US
17616894,2021-04-06,ZYXI,15.29,15.29,15.42,14.86,15.38,285869.0,,,...,100865.0,8730.0,33090.0,19464.0,49338.0,,,,,ZYXI US
17616895,2021-04-06,ZZZ.TO,31.879999,31.879999,32.240002,31.629999,31.98,1107286000.0,,,...,6800.0,3943.0,4128.0,6819.0,6188.0,,,,,ZZZ CN
17616896,2021-04-06,ZZZ.TO,31.879999,31.879999,32.240002,31.629999,31.98,1107286000.0,,,...,6800.0,3943.0,4128.0,6819.0,6188.0,,,,,ZZZ CN


### Save df_yahoo to a feather or parquet file for faster loading

In [39]:
%%time
# Optionally persist ddf_yahoo; the output file name is stamped with today's date.
SAVE_DF_YAHOO_TO_FEATHER = False
SAVE_DF_YAHOO_TO_PARQUET = False

DDF_YAHOO_OUTPATH = 'data/yfinance/df_yahoo_' + str(datetime.datetime.today().date())
if SAVE_DF_YAHOO_TO_FEATHER:
    # dask DataFrames have no to_feather(); materialise to pandas first, then reset the
    # index so the frame has the default RangeIndex that feather requires.
    # NOTE(review): this loads the whole dataset into memory.
    ddf_yahoo.compute().reset_index().to_feather(DDF_YAHOO_OUTPATH + '.feather')
if SAVE_DF_YAHOO_TO_PARQUET:
    # Options tried previously:
    #   engine='fastparquet'                              -> fails on windows
    #   storage_options={'key':key, 'secret':secret}      -> used to store on server
    ddf_yahoo.to_parquet(DDF_YAHOO_OUTPATH + '.pq')

Wall time: 0 ns


### Load in the numerai targets

In [21]:
%%time
# targets_address = 'https://numerai-signals-public-data.s3-us-west-2.amazonaws.com/signals_train_val.csv' # old
targets_address = 'https://numerai-signals-public-data.s3-us-west-2.amazonaws.com/signals_train_val_bbg.csv'

# Read the Signals targets and add a datetime `date` column parsed from the YYYYMMDD `friday_date`
targets = pd.read_csv(targets_address)
targets['date'] = pd.to_datetime(targets['friday_date'], format='%Y%m%d')
targets.tail(2)

Wall time: 8.56 s


Unnamed: 0,bloomberg_ticker,friday_date,data_type,target,date
4294278,ZYXI US,20210319,validation,0.75,2021-03-19
4294279,ZZZ CN,20210319,validation,0.5,2021-03-19


In [22]:
targets['target'].value_counts()

0.50    2148478
0.25     858391
0.75     857948
1.00     214798
0.00     214665
Name: target, dtype: int64

### Merge targets into ddf_yahoo
- From an inner join on `['date', 'bloomberg_ticker']` we lose about 85% of rows. <br>
- If we drop rows with NAs we have 0 rows left no matter what. <br>
- The best bet seems to be an outer join without dropping NA rows.

In [23]:
%%time
ddf_yahoo.shape[1], ddf_yahoo.shape[0].compute()

Wall time: 1.83 s


(153, 17616897)

In [24]:
%%time
# Keep only the (date, bloomberg_ticker) pairs present in both the price data and the targets
ddf_yahoo = ddf_yahoo.merge(targets, how='inner', on=['date', 'bloomberg_ticker'])
ddf_yahoo.tail()

Wall time: 2.12 s


Unnamed: 0,date,yahoo_ticker,adj_close_1d,close_1d,high_1d,low_1d,open_1d,volume_1d,adj_close_1h_0,adj_close_1h_1,...,volume_1h_18,volume_1h_19,volume_1h_20,volume_1h_21,volume_1h_22,volume_1h_23,bloomberg_ticker,friday_date,data_type,target
180084,2021-03-19,ZURN.SW,394.899994,394.899994,396.899994,392.700012,394.299988,1122367.0,,,...,,,,,,,ZURN SW,20210319,validation,0.5
180085,2021-03-19,ZYXI,16.1,16.1,16.15,15.32,15.65,547200.0,,,...,26440.0,125466.0,,,,,ZYXI US,20210319,validation,0.75
180086,2021-03-19,ZYXI,16.1,16.1,16.15,15.32,15.65,547200.0,,,...,26440.0,125466.0,,,,,ZYXI US,20210319,validation,0.75
180087,2021-03-19,ZZZ.TO,31.73,31.73,31.885,30.879999,31.27,138800.0,,,...,12237.0,25319.0,,,,,ZZZ CN,20210319,validation,0.5
180088,2021-03-19,ZZZ.TO,31.73,31.73,31.885,30.879999,31.27,138800.0,,,...,12237.0,25319.0,,,,,ZZZ CN,20210319,validation,0.5


In [27]:
ddf_yahoo.shape[0].compute(), ddf_yahoo.shape[1]

(2623095, 156)

### First iteration - drop rows where the daily prices are NA

In [53]:
# %%time
# df_yahoo_reduced = df_yahoo[df_yahoo.index.isin(df_yahoo[[i for i in df_yahoo.columns if i.endswith('d')]].dropna().index)]
# print(df_yahoo_reduced.shape)

# df_yahoo_reduced.tail()

In [146]:
def drop_suffix_nas(df, col_suffix='1d', id_cols=['date', 'bloomberg_ticker']):
    """
    Drop the rows of df that have a missing value in any column whose name ends
    with `col_suffix`, or in any of the `id_cols`.

    Parameters
    __________
    df : pandas DataFrame (e.g. one dask partition when used with map_partitions)
    col_suffix : str suffix selecting the columns that must be populated
        (e.g. '1d' for the daily price/volume columns)
    id_cols : list of identifier columns that must be populated as well
        names that are not columns of df are ignored

    Returns
    _______
    A new DataFrame with all original columns and only the complete rows.
    """
    # The previous implementation compared id values against a boolean frame with
    # DataFrame.isin and then masked df with the result, which blanked every value
    # to NaN instead of removing rows. dropna(subset=...) does the intended filter.
    required_cols = [col for col in df.columns
                     if col.endswith(col_suffix) or col in id_cols]
    return df.dropna(subset=required_cols)

In [108]:
ddf_yahoo = ddf_yahoo.map_partitions(drop_suffix_nas)

In [None]:
ddf_yahoo.map_partitions(drop_suffix_nas).shape[0].compute(), ddf_yahoo.map_partitions(drop_suffix_nas).shape[1]

In [None]:
# NOTE(review): `df_yahoo` is not assigned anywhere in the cells shown above (they build
# the dask frame `ddf_yahoo`), and the previews above show a `yahoo_ticker` column rather
# than `ticker`. This cell looks like a leftover from a pandas-only version - confirm
# before running it on a fresh kernel.
SET_BLOOMBERG_TICKERS_AS_INDEX = True
if SET_BLOOMBERG_TICKERS_AS_INDEX:
    df_yahoo.reset_index(inplace=True)
    # map each yahoo ticker to its bloomberg ticker using ticker_map
    df_yahoo.loc[:, 'bloomberg_ticker'] = df_yahoo['ticker'].map(dict(zip(ticker_map['yahoo'], ticker_map['bloomberg_ticker'])))
    df_yahoo.set_index(['date', 'ticker'], inplace=True)
df_yahoo.tail()

In [None]:
def _resolve_feature_cols(df, cols):
    """Turn a column spec ('all_numeric', a single column name, or a list of names) into a list of names."""
    if isinstance(cols, str):
        if cols.lower() == 'all_numeric':
            return list(df.select_dtypes(include=np.number).columns)
        return [cols]
    return list(cols)


def create_rolling_features(df,\
                            rolling_params,\
                            rolling_fn,\
                            ewm_params,\
                            ewm_fn,\
                            rolling_cols = 'all_numeric',\
                            ewm_cols = 'all_numeric',\
                            join_method='outer',\
                            groupby_cols = None,\
                            copy=True):
    """
    Append rolling-window and exponentially-weighted features to df.

    New columns are named '<col>_rolling_<rolling_fn>' and '<col>_ewm_<ewm_fn>' and are
    joined back onto df by index.

    Parameters
    __________
    df : pandas DataFrame
    groupby_cols : list or str cols to group by before applying rolling transformations
        example: pass groupby_cols to the stacked ticker numerai dataset, but not a wide df
        may also name index levels; group key columns themselves are never transformed
    rolling_cols : 'all_numeric', a column name, or a list of column names to apply rolling_fn to
    ewm_cols : 'all_numeric', a column name, or a list of column names to apply ewm_fn to
    rolling_params : dict params passed to df.rolling()
    rolling_fn : str called from df.rolling().rolling_fn (e.g. df.rolling.mean() is called with getattr)
    ewm_params : dict params passed to df.ewm()
    ewm_fn : str called from df.ewm().ewm_fn (e.g. df.ewm.mean() is called with getattr)
    join_method : str 'inner', 'outer', 'left', or 'right' - how to join the dfs
    copy : bool whether or not to make a copy of the df

    Returns
    _______
    DataFrame holding the original columns plus the new feature columns.

    Raises
    ______
    TypeError if groupby_cols is not a list, a string, or None.
    AssertionError if groupby_cols is a list containing duplicates.
    """
    from functools import reduce  # local import keeps this cell self-contained

    if copy: df = df.copy()

    # Accept the 'all_numeric' sentinel, a single name, or an explicit list
    # (calling .lower() on a list used to raise AttributeError).
    rolling_cols = _resolve_feature_cols(df, rolling_cols)
    ewm_cols = _resolve_feature_cols(df, ewm_cols)

    if groupby_cols is None:
        group_names = []
    elif isinstance(groupby_cols, str):
        group_names = [groupby_cols]
    elif isinstance(groupby_cols, list):
        assert(len(groupby_cols) == len(set(groupby_cols))), 'There are duplicates in groupby_cols!'
        group_names = groupby_cols
    else:
        # raising a plain string is itself a TypeError; raise a real exception with the message
        raise TypeError('Input param groupby_cols is not a list, string, or None!')

    def _windowed(cols, window_method, window_params, fn_name, suffix):
        """Apply df.<window_method>(**window_params).<fn_name>() to cols, per group when grouping."""
        def _aggregate(frame):
            return getattr(getattr(frame, window_method)(**window_params), fn_name)()

        if groupby_cols is None:
            return _aggregate(df[cols]).add_suffix(suffix)

        # Keep the caller's column order (a set() made it arbitrary) and leave the group
        # keys out of the windowed columns - they may be non-numeric or index levels.
        value_cols = [col for col in cols if col in df.columns and col not in group_names]

        # group_keys=False keeps the original index on the result, so the index-based
        # merges below line up regardless of how pandas handles group keys in apply().
        return df.groupby(groupby_cols, group_keys=False)[value_cols].\
            apply(_aggregate).\
            add_suffix(suffix)

    lag_dfs_lst = [
        _windowed(rolling_cols, 'rolling', rolling_params, rolling_fn, '_rolling_' + rolling_fn),
        _windowed(ewm_cols, 'ewm', ewm_params, ewm_fn, '_ewm_' + ewm_fn),
    ]

    df_lag = reduce(lambda x, y: pd.merge(x, y, how=join_method, left_index=True, right_index=True), lag_dfs_lst)

    # NOTE(review): merging on a non-unique index multiplies rows - confirm the index is unique
    df = pd.merge(df, df_lag, how=join_method, left_index=True, right_index=True)

    return df

In [None]:
df_yahoo.shape

In [None]:
df_yahoo = create_rolling_features(df_yahoo,\
                                   rolling_params={'window':30},\
                                   rolling_fn='mean',\
                                   ewm_params={'com':.5},\
                                   ewm_fn='mean',\
                                   rolling_cols = 'all_numeric',\
                                   ewm_cols = 'all_numeric',\
                                   join_method='outer',\
                                   groupby_cols = 'ticker')
df_yahoo.tail(2)

In [None]:
df_yahoo.shape

In [None]:
# Parse the YYYYMMDD friday_date into a datetime `date` column
targets['date'] = pd.to_datetime(targets['friday_date'], format='%Y%m%d')
# NOTE(review): the signals_train_val_bbg.csv preview shown earlier in this notebook has a
# `bloomberg_ticker` column and no `ticker` column - confirm which targets file this cell expects.
targets.set_index(['date', 'ticker'], inplace=True)
# set_index already names the levels after the columns, so this re-states the same names
targets.index.names = ['date', 'ticker']

targets.tail(2)

In [None]:
df_full = pd.merge(df_yahoo, targets, how='outer', left_index=True, right_index=True)
df_full.drop('friday_date', axis=1, inplace=True)
df_full.tail()

In [None]:
df_full.shape

In [None]:
del df_yahoo, targets

### There are a lot of missing targets. What do we do with them?
- This becomes a semi-supervised learning problem since there is likely predictive information where there is no numerai target <br>
- To fill them in, I'm going to take an educated guess and say that Numerai's targets are created based on profitable up moves in the market. <br>
- The target they created is likely the following multi-class groups: **strong-short**, **short**, **no-trade**, **buy**, **strong-buy** - Let's find out

#### First get the tickers available in the target df

In [188]:
# targets_address = 'https://numerai-signals-public-data.s3-us-west-2.amazonaws.com/signals_train_val.csv' # old
targets_address = 'https://numerai-signals-public-data.s3-us-west-2.amazonaws.com/signals_train_val_bbg.csv'
targets = pd.read_csv(targets_address)

targets.tail(2)

Unnamed: 0,bloomberg_ticker,friday_date,data_type,target
4294278,ZYXI US,20210319,validation,0.75
4294279,ZZZ CN,20210319,validation,0.5


In [16]:
# Tickers from the map (bloomberg or yahoo spelling) that also appear in the targets file
target_tickers = set(targets['ticker'].unique().tolist())
mapped_tickers = set(ticker_map['bloomberg_ticker'].unique().tolist()) | set(ticker_map['yahoo'].unique().tolist())
tickers_with_target = list(mapped_tickers & target_tickers)
len(tickers_with_target)

3931

In [None]:
df_full[df_full['target'].notnull()]

In [None]:
df_full.dropna()

In [None]:
ticker_groups = full_data.groupby('ticker')

# Lagged RSI quintiles per ticker: lag 0 is that day's value, lag 1 is yesterday's value, etc
num_days = 5
for day in range(0, num_days + 1):
    lag_col = f'RSI_quintile_lag_{day}'
    full_data[lag_col] = ticker_groups['RSI_quintile'].shift(day)
full_data.tail()

In [None]:
# Day-over-day change in RSI quintile (signed and absolute) for each lag
for day in range(num_days):
    lag_delta = full_data[f'RSI_quintile_lag_{day}'] - full_data[f'RSI_quintile_lag_{day + 1}']
    full_data[f'RSI_diff_{day}'] = lag_delta
    full_data[f'RSI_abs_diff_{day}'] = lag_delta.abs()

In [None]:
# Model inputs: the lag, diff and abs-diff columns for lags 0 .. num_days-1 (grouped by family)
feature_names = [
    f'{prefix}_{num}'
    for prefix in ('RSI_quintile_lag', 'RSI_diff', 'RSI_abs_diff')
    for num in range(num_days)
]
print(f'Features for training:\n {feature_names}')

In [None]:
TARGET_NAME = 'target'

In [None]:
# read in Signals targets
numerai_targets = 'https://numerai-signals-public-data.s3-us-west-2.amazonaws.com/signals_train_val.csv'
targets = pd.read_csv(numerai_targets)
targets['date'] = pd.to_datetime(targets['friday_date'], format='%Y%m%d')
targets.head()

In [None]:
# the number of tickers per era has generally increased
era_sizes = targets.groupby('date').size()
era_sizes.plot(kind='line', figsize=(10,4), title='Number of tickers per era')

In [None]:
# the target classes are imbalanced, but we can treat this like a regression problem
targets.target.value_counts()

In [None]:
# the imbalance is consistent across eras with a constant class ratio of: 5%, 20%, 50%, 20%, 5%
# Count rows per (date, target), then pivot so each target value becomes a column (one row per date)
pivot_target = targets.groupby(['date','target']).apply(lambda x: len(x)).reset_index(1).pivot(columns='target',values=0)
# plot every 20th era to keep the bar chart readable
pivot_target.iloc[::20].plot(kind='bar', stacked=True, figsize=(9,3), title='Number of tickers in each class per era')

# normalise each era's counts by the era total to get class proportions
stacked_data = pivot_target.apply(lambda x: x/sum(x), axis=1)
stacked_data.iloc[::20].plot(kind='bar', stacked=True, figsize=(9,3), title='Proportion of tickers in each class per era')

In [None]:
targets.head()

In [None]:
# merge our feature data with Numerai targets
# (pd.merge defaults to an inner join on ['date','ticker']; the result is indexed by date)
ML_data = pd.merge(full_data.reset_index(), targets, on=['date','ticker']).set_index('date')
# print(f'Number of eras in data: {len(ML_data.index.unique())}')

# for training and testing we want clean, complete data only
ML_data.dropna(inplace=True)
ML_data = ML_data[ML_data.index.weekday==4] # ensure we have only fridays
# NOTE(review): the mask below has one entry per unique date while ML_data has one row per
# (date, ticker); this relies on pandas aligning the boolean Series to the index - verify
# it behaves as intended on the installed pandas version.
ML_data = ML_data[ML_data.index.value_counts() > 200] # drop eras with under 200 observations per era

In [None]:
print(f'Number of eras in data: {len(ML_data.index.unique())}')
ML_data.head()

In [None]:
# Split by Numerai's data_type label. Copy the slices because prediction columns are
# assigned to them later; writing to a slice of ML_data triggers SettingWithCopyWarning
# and may not stick.
train_data = ML_data[ML_data['data_type'] == 'train'].copy()
test_data = ML_data[ML_data['data_type'] == 'validation'].copy()

In [None]:
# Fix the estimator's random_state so a Restart & Run All reproduces the same fitted model
model = GradientBoostingRegressor(random_state=0)
model.fit(train_data[feature_names], train_data['target'])

In [None]:
# Bar chart of the fitted model's importance for each feature
fig, ax = plt.subplots(figsize=(15,3))
ax.bar(feature_names, model.feature_importances_)
plt.setp(ax.get_xticklabels(), rotation=70)
plt.show()

In [None]:
PREDICTION_NAME = 'prediction'

In [None]:
# Score both splits with the fitted model
for split_frame in (train_data, test_data):
    split_frame[PREDICTION_NAME] = model.predict(split_frame[feature_names])

# show prediction distribution, most should be around the center
test_data[PREDICTION_NAME].hist(bins=30)

In [None]:
def score(df):
    '''Correlation between the target column and the percentile-ranked prediction column.

    Reads the columns named by the notebook-level TARGET_NAME and PREDICTION_NAME.
    '''
    # method="first" breaks ties based on order in array
    ranked_prediction = df[PREDICTION_NAME].rank(pct=True, method="first")
    correlation_matrix = np.corrcoef(df[TARGET_NAME], ranked_prediction)
    # off-diagonal entry = correlation between the two inputs
    return correlation_matrix[0,1]

def run_analytics(era_scores):
    '''Print summary statistics for per-era scores and plot their rolling mean and cumulative sum.

    era_scores : pandas Series with one correlation score per era
    '''
    print(f"Mean Correlation: {era_scores.mean():.4f}")
    print(f"Median Correlation: {era_scores.median():.4f}")
    print(f"Standard Deviation: {era_scores.std():.4f}")
    print('\n')
    print(f"Mean Pseudo-Sharpe: {era_scores.mean()/era_scores.std():.4f}")
    print(f"Median Pseudo-Sharpe: {era_scores.median()/era_scores.std():.4f}")
    print('\n')
    # Count positive eras directly: looking up value_counts()[1] raises KeyError
    # when no era has a positive score.
    hit_rate = (era_scores > 0).sum() / len(era_scores)
    print(f'Hit Rate (% positive eras): {hit_rate:.2%}')

    era_scores.rolling(10).mean().plot(kind='line', title='Rolling Per Era Correlation Mean', figsize=(15,4))
    plt.axhline(y=0.0, color="r", linestyle="--"); plt.show()

    era_scores.cumsum().plot(title='Cumulative Sum of Era Scores', figsize=(15,4))
    plt.axhline(y=0.0, color="r", linestyle="--"); plt.show()

In [None]:
# spearman scores by era
train_era_scores = train_data.groupby(train_data.index).apply(score)
test_era_scores = test_data.groupby(test_data.index).apply(score)

In [None]:
#train scores, in-sample and will be significantly overfit
run_analytics(train_era_scores)

In [None]:
#test scores, out of sample
run_analytics(test_era_scores)

In [None]:
# choose data as of most recent friday
last_friday = datetime.now() + relativedelta(weekday=FR(-1))
date_string = last_friday.strftime('%Y-%m-%d')

# rows for that date, keeping only those with every model feature populated
live_data = full_data.loc[date_string].dropna(subset=feature_names)

In [None]:
print(f"Number of live tickers to submit: {len(live_data)}")

In [None]:
live_data.tail()

In [None]:
live_data[PREDICTION_NAME] = model.predict(live_data[feature_names])

In [None]:
diagnostic_df = pd.concat([test_data, live_data])
diagnostic_df.tail()

In [None]:
diagnostic_df['friday_date'] = diagnostic_df.friday_date.fillna(last_friday.strftime('%Y%m%d')).astype(int)
diagnostic_df['data_type'] = diagnostic_df.data_type.fillna('live')
diagnostic_df[['ticker','friday_date','data_type','prediction']].reset_index(drop=True).to_csv('example_signal_upload.csv', index=False)
diagnostic_df.tail()

In [None]:
# format predictions to match Numerai submission format
predictions = live_data[['ticker', PREDICTION_NAME]].copy()

# choose account
ACCOUNT_NAME = 'ENTER_ACCOUNT_NAME'

# write predictions to csv, named "<account> <YYYYMMDD>.csv"
submission_filename = f"{ACCOUNT_NAME} {datetime.now().strftime('%Y%m%d')}.csv"
predictions.to_csv(submission_filename, index=False)

In [None]:
def submit_model(account_name):
    """Upload today's "<account_name> <YYYYMMDD>.csv" predictions file through the notebook-level `napi` client.

    The model id is looked up by account name in napi.get_models(); the value
    returned by the upload call is printed.
    """
    today_stamp = datetime.now().strftime('%Y%m%d')
    predictions_file = f"{account_name} {today_stamp}.csv"
    available_models = napi.get_models()
    upload_result = napi.upload_predictions(predictions_file, model_id=available_models[f'{account_name}'])
    print(upload_result)

In [None]:
submit_model(ACCOUNT_NAME)

In [26]:
import inspect as i
import sys
sys.stdout.write(i.getsource(download_yfinance_data))

def download_yfinance_data(tickers,
                           intervals_to_download=['1d', '1h'],
                           num_workers=1,
                           join_method='outer',
                           max_intraday_lookback_days=363,
                           **yfinance_params):
    """
    Parameters
    __________

    See yfinance.download docs for a detailed description of yfinance parameters

    tickers : string separated by space tickers to pass to yfinance.download (e.g. "AAPL MSFT FB")
    intervals_to_download : list of intervals to download OHLCV data for each stock (e.g. ['1w', '1d', '1h'])
    num_workers : number of threads used to download the data
        so far only 1 thread is implemented
    join_method : can be 'inner', 'left', 'right' or 'outer'
        if 'outer' then all dates will be present
        if 'left' then all dates from the left most table will be present
        if 'right' then all dates from the left most table will be present
        i