In [1]:
from functools import reduce
import pandas as pd
import numpy as np
import simplejson
import yfinance
import datetime

In [2]:
# Location of the cached Yahoo price frame (feather format).
# NOTE(review): absolute, machine-specific path - consider deriving it from a
# configurable data directory so the notebook runs on other machines.
numerai_filepath = 'D:/trading/data/numerai/datasets/build_dataset_dfs/df_yahoo_tmp.feather'
# Read the whole file into memory as a DataFrame.
df_yahoo = pd.read_feather(numerai_filepath)
# Show the last few rows as a quick sanity check of the load.
df_yahoo.tail()

Unnamed: 0,index,date,yahoo_ticker,adj_close_1d,close_1d,high_1d,low_1d,open_1d,volume_1d,adj_close_1h_0,...,volume_1h_14,volume_1h_15,volume_1h_16,volume_1h_17,volume_1h_18,volume_1h_19,volume_1h_20,volume_1h_21,volume_1h_22,volume_1h_23
23819794,23819794,2021-05-13 00:00:00-05:00,TENERGY.AT,,,,,,,,...,,,,,,,,,,
23819795,23819795,2021-05-13 00:00:00-05:00,U11.SI,,,,,,,,...,,,,,,,863600.0,1086000.0,403700.0,41600.0
23819796,23819796,2021-05-13 00:00:00-05:00,U96.SI,,,,,,,,...,,,,,,,1633100.0,372400.0,480700.0,2400.0
23819797,23819797,2021-05-13 00:00:00-05:00,V03.SI,,,,,,,,...,,,,,,,155400.0,116600.0,101000.0,8600.0
23819798,23819798,2021-05-13 00:00:00-05:00,Z74.SI,,,,,,,,...,,,,,,,9847200.0,2575800.0,3613300.0,278600.0


In [3]:
# One string key per row: the date rendered as text immediately followed by the
# Yahoo ticker (no separator). Used to check whether (date, ticker) is unique.
date_plus_ticker_before = df_yahoo['date'].astype(str) + df_yahoo['yahoo_ticker'].astype(str)

In [4]:
# True when at least one date+ticker key occurs more than once.
not date_plus_ticker_before.is_unique

True

In [5]:
# Number of rows that have no Yahoo ticker at all.
df_yahoo['yahoo_ticker'].isna().sum()

0

In [8]:
import os

# Step into the project directory only when we are not already there. An
# unconditional os.chdir('trading') raises FileNotFoundError as soon as the
# cell is run a second time (or the kernel was started inside the project).
if not os.getcwd().endswith('trading'):
    os.chdir('trading')

In [9]:
#####################
###### Imports ######
#####################

import os
from configparser import ConfigParser
import sys
from IPython.display import display
from datetime import datetime
import time
import numerapi

# Wall-clock timestamp taken when this setup cell starts running.
start_time = time.time()

# If the working directory does not already end with 'trading', move three
# levels up; the assert below then verifies we ended up in the right place.
if not os.getcwd().endswith('trading'): os.chdir('../../..') # local machine

assert os.getcwd().endswith('trading'), 'Wrong path!'
# Thread limits for numexpr, passed through environment variables.
# NOTE(review): pandas is imported in an earlier cell; confirm these are set
# early enough for numexpr to pick them up.
os.environ['NUMEXPR_MAX_THREADS'] = '32'
os.environ['NUMEXPR_NUM_THREADS'] = '16'

# Put the current directory on sys.path so the project-local packages imported
# below can be resolved. Must stay after the chdir/assert above.
sys.path.append(os.getcwd())
# Wildcard imports from project modules. Presumably these supply names used in
# later cells that are not defined in the visible notebook (e.g. gc, VERBOSE,
# TICKER_COL, DATETIME_COL, download_ticker_map) - TODO confirm and replace
# with explicit imports.
from dev.scripts.ML_utils import * # run if on local machine
from dev.scripts.trading_utils import * # run if on local machine
from numerai.dev.scripts.numerai_utils import *
from numerai.dev.configs.build_numerai_dataset_cfg import *


###  pd options / configs ###

# Show floats with five decimals and cap the number of displayed columns.
pd.set_option('display.float_format', lambda x: '%.5f' % x)
pd.set_option('display.max_columns', 10)

# ConfigParser.read() silently skips files it cannot open and returns the list
# of files it actually parsed. Check that list so a missing key file is
# reported directly instead of surfacing later as an opaque KeyError: 'KEYS'.
config = ConfigParser()
if not config.read('numerai/numerai_keys.ini'):
    raise FileNotFoundError(
        "Could not read 'numerai/numerai_keys.ini' (cwd: %s)" % os.getcwd()
    )

### connect to the numerai signals API ###

# Keys come from the [KEYS] section of the ini file; they are never hardcoded here.
napi = numerapi.SignalsAPI(config['KEYS']['NUMERAI_PUBLIC_KEY'], config['KEYS']['NUMERAI_SECRET_KEY'])

### Load eligible tickers ###

# download_ticker_map and DOWNLOAD_VALID_TICKERS_PARAMS are not defined in the
# visible cells; presumably they come from the project star imports - TODO confirm.
ticker_map = download_ticker_map(napi, **DOWNLOAD_VALID_TICKERS_PARAMS)

Number of eligible tickers: 5421
Number of eligible tickers in map: 5420
tickers before cleaning: (5420, 3)
tickers after cleaning: (5373, 3)


In [12]:
# DataFrame.info() writes its report to stdout itself and returns None, so
# wrapping it in print() only appends a stray "None" line to the output.
if VERBOSE:
    df_yahoo.info()
# Run a garbage-collection pass; as the last expression of the cell, the count
# it returns is displayed as the cell output.
gc.collect()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 23819799 entries, 0 to 23819798
Columns: 153 entries, index to volume_1h_23
dtypes: datetime64[ns, US/Central](1), float64(150), int64(1), object(1)
memory usage: 27.2+ GB
None


8

In [15]:
if CREATE_BLOOMBERG_TICKER_FROM_YAHOO or DOWNLOAD_YAHOO_DATA:
    # Normalise the ticker column name when the frame still uses 'ticker'.
    if 'ticker' in df_yahoo:
        df_yahoo.rename({'ticker': 'yahoo_ticker'}, axis='columns', inplace=True)
    # Translate each Yahoo ticker to its Bloomberg ticker through ticker_map;
    # Yahoo tickers that are absent from the map become NaN.
    df_yahoo.loc[:, 'bloomberg_ticker'] = df_yahoo['yahoo_ticker'].map(
        {
            yahoo_symbol: bloomberg_symbol
            for yahoo_symbol, bloomberg_symbol in zip(ticker_map['yahoo'], ticker_map['bloomberg_ticker'])
        }
    )

In [40]:
# Run a garbage-collection pass; the returned count is shown as the cell output.
gc.collect()

1538

In [18]:
# (rows, columns) of the frame at this point in the session.
df_yahoo.shape

(23819799, 154)

In [17]:
# Missing-value counts, in (yahoo_ticker, bloomberg_ticker) order.
tuple(df_yahoo[column].isnull().sum() for column in ('yahoo_ticker', 'bloomberg_ticker'))

(0, 132717)

In [19]:
# Fraction of rows without a Bloomberg ticker, computed from the frame itself
# instead of from numbers copied by hand out of earlier cell outputs (those go
# stale as soon as the data changes).
df_yahoo['bloomberg_ticker'].isnull().mean()

0.0055717094841984185

In [22]:
# Announce the validation stage.
print('\nvalidating unique date + ticker index...\n')
# Optionally discard rows whose ticker is missing before the uniqueness check.
# Done in place, so re-running the cell leaves the already-filtered frame as is.
if DROP_NULL_TICKERS:
    df_yahoo.dropna(subset=[TICKER_COL], inplace=True)


validating unique date + ticker index...



In [23]:
# One "<datetime> <ticker>" string per row, in row order.
datetime_ticker_cat = [
    f'{datetime_text} {ticker_text}'
    for datetime_text, ticker_text in zip(
        df_yahoo[DATETIME_COL].astype(str), df_yahoo[TICKER_COL].astype(str)
    )
]
# A set keeps each key once, so any size difference means a repeated key.
assert len(set(datetime_ticker_cat)) == len(datetime_ticker_cat), 'TICKER_COL and DATETIME_COL do not make a unique index!'
# Only reached when the assertion passes; frees the list of keys.
del datetime_ticker_cat

AssertionError: TICKER_COL and DATETIME_COL do not make a unique index!

In [29]:
# Rebuild the "<datetime> <ticker>" keys straight from df_yahoo so this cell
# does not depend on a variable left behind by another cell, and build the key
# Series once instead of twice. The index is reset so that the labels of `tmp`
# are row positions.
datetime_ticker_keys = (
    df_yahoo[DATETIME_COL].astype(str) + ' ' + df_yahoo[TICKER_COL].astype(str)
).reset_index(drop=True)
# Keep the second and later occurrences of every repeated key.
tmp = datetime_ticker_keys[datetime_ticker_keys.duplicated()]
# Boolean indexing returned a copy, so the full key Series can be released.
del datetime_ticker_keys

In [35]:
# Duplicated keys whose text does not end in 'nan' (i.e. the ticker part is
# not the string form of a missing value).
list(filter(lambda key: not key.endswith('nan'), tmp))

['2021-05-14 00:00:00-05:00 AEM SP',
 '2018-12-24 00:00:00-06:00 ALA CN',
 '2021-05-14 00:00:00-05:00 ALM SM',
 '2021-05-14 00:00:00-05:00 ATO FP',
 '2021-05-14 00:00:00-05:00 BEKB BB',
 '2021-05-14 00:00:00-05:00 BMW GR',
 '2021-05-14 00:00:00-05:00 BOKA NA',
 '2021-05-14 00:00:00-05:00 EDEN FP',
 '2021-05-14 00:00:00-05:00 GNDI3 BZ',
 '2021-05-14 00:00:00-05:00 KINDSDB SS',
 '2021-05-14 00:00:00-05:00 SOF BB',
 '2021-05-14 00:00:00-05:00 UHR SW',
 '2018-12-24 00:00:00-06:00 WFG CN']

In [None]:
# Missing-value counts, in (yahoo_ticker, bloomberg_ticker) order.
tuple(df_yahoo[column].isnull().sum() for column in ('yahoo_ticker', 'bloomberg_ticker'))