In [1]:
import os
from configparser import ConfigParser
import sys
from IPython.display import display
from datetime import datetime
import time
import numerapi

# Wall-clock reference taken at the top of the notebook run.
start_time = time.time()

# The paths used below are relative to the `trading` project root, so step up
# three directory levels when the kernel was started somewhere else.
cwd_is_project_root = os.getcwd().endswith('trading')
if not cwd_is_project_root:
    os.chdir('../../..') # local machine

assert os.getcwd().endswith('trading'), 'Wrong path!'

# numexpr thread limits, exported through the environment ahead of the project imports.
os.environ.update({
    'NUMEXPR_MAX_THREADS': '32',
    'NUMEXPR_NUM_THREADS': '16',
})

# Put the project root on sys.path so the `dev` and `numerai` packages resolve.
sys.path.append(os.getcwd())
from dev.scripts.ML_utils import * # run if on local machine
from dev.scripts.trading_utils import * # run if on local machine
from numerai.dev.scripts.numerai_utils import *
from numerai.dev.configs.build_numerai_dataset_cfg import *


###  pd options / configs ###

# NOTE(review): `pd` is not imported explicitly in this cell; it presumably
# arrives through one of the star imports above - confirm.
def _five_decimals(value):
    """Render a float with exactly five decimal places for DataFrame display."""
    return '%.5f' % value


pd.set_option('display.float_format', _five_decimals)
pd.set_option('display.max_columns', 10)

# `ConfigParser.read` returns the list of files it managed to parse; leaving the
# call as the last expression shows that list as the cell output.
config = ConfigParser()
config.read('numerai/numerai_keys.ini')

['numerai/numerai_keys.ini']

In [2]:
# Load the raw Yahoo snapshots (files dated 2021-05-02): daily bars first, then hourly bars.
df_yahoo_1d, df_yahoo_1h = (
    pd.read_feather(feather_path)
    for feather_path in (
        '/media/melgazar9/HDD_10TB/trading/data/numerai/datasets/raw_yahoo_dfs/df_yahoo_1d_2021-05-02.feather',
        '/media/melgazar9/HDD_10TB/trading/data/numerai/datasets/raw_yahoo_dfs/df_yahoo_1h_2021-05-02.feather',
    )
)
# Peek at the first two daily rows.
df_yahoo_1d.head(n=2)

Unnamed: 0,Date,yahoo_ticker,Adj Close,Close,High,Low,Open,Volume
0,2000-01-04,000060.KS,277.67206,738.72388,808.19916,703.54657,703.54657,1365652.0
1,2000-01-05,000060.KS,319.32285,849.53247,849.53247,791.48993,835.46155,2067240.0


In [3]:
# Peek at the first two hourly rows to compare their layout with the daily frame.
df_yahoo_1h.head(n=2)

Unnamed: 0,Date,yahoo_ticker,Adj Close,Close,High,Low,Open,Volume
0,2020-05-04 14:00:00+09:00,000060.KS,13600.0,13600.0,13600.0,13500.0,13550.0,0.0
1,2020-05-06 09:00:00+09:00,000060.KS,13700.0,13700.0,13800.0,13550.0,13650.0,0.0


In [4]:
# Daily bars carry timezone-naive dates; label them as US/Central so the column
# shares a dtype with the hourly frame's `Date`.
# `tz_localize` raises TypeError on data that is already tz-aware, so the guard
# keeps this cell safe to re-run on a live kernel (an aware column is left as is).
daily_dates = pd.to_datetime(df_yahoo_1d['Date'])
if daily_dates.dt.tz is None:
    daily_dates = daily_dates.dt.tz_localize('US/Central')
df_yahoo_1d['Date'] = daily_dates

In [5]:
# Hourly timestamps already carry a UTC offset, so they are converted (not merely
# relabelled) to US/Central wall-clock time.
hourly_timestamps = pd.to_datetime(df_yahoo_1h['Date'])
df_yahoo_1h['Date'] = hourly_timestamps.dt.tz_convert('US/Central')

In [6]:
# Both `Date` columns should now report the same tz-aware dtype (daily first, hourly second).
tuple(frame['Date'].dtype for frame in (df_yahoo_1d, df_yahoo_1h))

(datetime64[ns, US/Central], datetime64[ns, US/Central])

In [7]:
# (rows, columns) of the daily frame followed by the hourly frame.
tuple(frame.shape for frame in (df_yahoo_1d, df_yahoo_1h))

((23507715, 8), (9392057, 8))

### Cleaning

In [13]:
# Reshape the hourly bars to one row per (calendar date, ticker): every price/volume
# field is spread across columns by hour of day, e.g. `Adj_Close_9`, `Volume_14`.
# Date and hour are both read from the `Date` column as it stands when this cell runs.
# NOTE(review): the daily frame's dates appear to be exchange-local calendar dates that
# were only labelled US/Central, while the dates here come from converted timestamps.
# For non-US exchanges a single local session can straddle two Central dates, so the
# later join on `Date` may pair a daily bar with hours from a different session - confirm.
tmp = df_yahoo_1h.pivot_table(index=[df_yahoo_1h['Date'].dt.date, 'yahoo_ticker'],
                              columns=[df_yahoo_1h['Date'].dt.hour],
                              aggfunc='first',  # keep the first bar found per (date, ticker, hour)
                              values=[col for col in df_yahoo_1h.columns if col not in ['Date', 'yahoo_ticker']])
# Flatten the (field, hour) column MultiIndex into `<field>_<hour>` names, with spaces
# replaced by underscores.
tmp.columns = [(str(field) + '_' + str(hour)).replace(' ', '_') for field, hour in tmp.columns]
# `Date` and `yahoo_ticker` are index levels at this point, so no column rename is
# needed; resetting the index turns them back into ordinary columns.
tmp = tmp.reset_index()
# The index held plain `datetime.date` objects; rebuild tz-aware midnights so the
# dtype matches `df_yahoo_1d['Date']` for the merge.
tmp['Date'] = pd.to_datetime(tmp['Date']).dt.tz_localize('US/Central')
tmp.tail()

Unnamed: 0,Date,yahoo_ticker,Adj_Close_0,Adj_Close_1,Adj_Close_2,...,Volume_19,Volume_20,Volume_21,Volume_22,Volume_23
1385114,2021-05-02 00:00:00-05:00,WSP.AX,,,,...,35863.0,95.0,,,
1385115,2021-05-02 00:00:00-05:00,WTC.AX,,,,...,90976.0,2762.0,,,
1385116,2021-05-02 00:00:00-05:00,Z1P.AX,,,,...,985261.0,262740.0,,,
1385117,2021-05-02 00:00:00-05:00,Z74.SI,,,,...,,4023800.0,,,
1385118,2021-05-02 00:00:00-05:00,ZEL.NZ,,,,...,34618.0,0.0,,,


In [17]:
# (rows, columns) of the daily frame, the hourly frame and the pivoted hourly frame.
tuple(frame.shape for frame in (df_yahoo_1d, df_yahoo_1h, tmp))

((23507715, 8), (9392057, 8), (1385119, 146))

In [19]:
# Attach the per-hour columns to every daily bar. The left join keeps all daily
# rows; rows with no matching (Date, yahoo_ticker) in `tmp` get NaN hourly fields.
# NOTE(review): the match relies on both `Date` columns denoting the same calendar
# day per ticker - verify this for exchanges outside US/Central.
merge_keys = ['Date', 'yahoo_ticker']
tmp2 = df_yahoo_1d.merge(tmp, how='left', on=merge_keys)

In [26]:
# Show the merge keys and hourly columns (everything that came from `tmp`) next to the daily Close.
tmp2[[*tmp.columns, 'Close']]

Unnamed: 0,Date,yahoo_ticker,Adj_Close_0,Adj_Close_1,Adj_Close_2,...,Volume_20,Volume_21,Volume_22,Volume_23,Close
0,2000-01-04 00:00:00-06:00,000060.KS,,,,...,,,,,738.72388
1,2000-01-05 00:00:00-06:00,000060.KS,,,,...,,,,,849.53247
2,2000-01-06 00:00:00-06:00,000060.KS,,,,...,,,,,822.27008
3,2000-01-07 00:00:00-06:00,000060.KS,,,,...,,,,,826.66724
4,2000-01-10 00:00:00-06:00,000060.KS,,,,...,,,,,949.78790
...,...,...,...,...,...,...,...,...,...,...,...
23507710,2021-04-26 00:00:00-05:00,ZZZ.TO,,,,...,,,,,34.07000
23507711,2021-04-27 00:00:00-05:00,ZZZ.TO,,,,...,,,,,34.38000
23507712,2021-04-28 00:00:00-05:00,ZZZ.TO,,,,...,,,,,34.75000
23507713,2021-04-29 00:00:00-05:00,ZZZ.TO,,,,...,,,,,35.19000


In [29]:
# Size of the merged frame restricted to rows dated after 2021-01-01.
recent_rows = tmp2['Date'] > '2021-01-01'
tmp2.loc[recent_rows].shape

(428102, 152)

In [34]:
# Display the merged daily + hourly frame; pandas elides the middle rows and columns of a frame this size.
tmp2

Unnamed: 0,Date,yahoo_ticker,Adj Close,Close,High,...,Volume_19,Volume_20,Volume_21,Volume_22,Volume_23
0,2000-01-04 00:00:00-06:00,000060.KS,277.67206,738.72388,808.19916,...,,,,,
1,2000-01-05 00:00:00-06:00,000060.KS,319.32285,849.53247,849.53247,...,,,,,
2,2000-01-06 00:00:00-06:00,000060.KS,309.07556,822.27008,910.21338,...,,,,,
3,2000-01-07 00:00:00-06:00,000060.KS,310.72830,826.66724,834.58215,...,,,,,
4,2000-01-10 00:00:00-06:00,000060.KS,357.00708,949.78790,949.78790,...,,,,,
...,...,...,...,...,...,...,...,...,...,...,...
23507710,2021-04-26 00:00:00-05:00,ZZZ.TO,34.07000,34.07000,34.28000,...,,,,,
23507711,2021-04-27 00:00:00-05:00,ZZZ.TO,34.38000,34.38000,34.45000,...,,,,,
23507712,2021-04-28 00:00:00-05:00,ZZZ.TO,34.75000,34.75000,34.82000,...,,,,,
23507713,2021-04-29 00:00:00-05:00,ZZZ.TO,35.19000,35.19000,35.38000,...,,,,,


In [42]:
# Spot check on one ticker: TSLA rows after 2021-01-01 that have an hour-10 bar.
tsla_recent_rows = (
    (tmp2['Date'] > '2021-01-01')
    & (tmp2['yahoo_ticker'] == 'TSLA')
    & tmp2['Adj_Close_10'].notnull()
)
# `dropna(axis=1)` discards every column holding at least one NaN in the selected
# rows, leaving only the hours that are populated on all of them.
tmp2.loc[tsla_recent_rows, [*tmp.columns, 'Close']].dropna(axis=1)

Unnamed: 0,Date,yahoo_ticker,Adj_Close_8,Adj_Close_9,Adj_Close_10,...,Volume_11,Volume_12,Volume_13,Volume_14,Close
21805141,2021-01-04 00:00:00-06:00,TSLA,740.03003,735.62000,723.60999,...,4484030.00000,3906976.00000,3745736.00000,2459595.00000,729.77002
21805142,2021-01-05 00:00:00-06:00,TSLA,732.07001,738.59003,737.66998,...,2736774.00000,1920131.00000,2556215.00000,2583717.00000,735.10999
21805143,2021-01-06 00:00:00-06:00,TSLA,757.30920,769.41998,771.14459,...,2642121.00000,3645134.00000,8285672.00000,3161001.00000,755.97998
21805144,2021-01-07 00:00:00-06:00,TSLA,788.90997,807.44000,799.81219,...,3737545.00000,5040339.00000,5416631.00000,3965979.00000,816.03998
21805145,2021-01-08 00:00:00-06:00,TSLA,859.34003,882.45001,879.12000,...,6758947.00000,10362588.00000,5746255.00000,6258909.00000,880.02002
...,...,...,...,...,...,...,...,...,...,...,...
21805218,2021-04-26 00:00:00-05:00,TSLA,738.34521,739.12500,738.24011,...,2224910.00000,2477680.00000,2720000.00000,3273865.00000,738.20001
21805219,2021-04-27 00:00:00-05:00,TSLA,708.39752,713.88171,713.21997,...,1991700.00000,2136883.00000,2103525.00000,2241704.00000,704.73999
21805220,2021-04-28 00:00:00-05:00,TSLA,702.19342,698.06000,697.25531,...,1987170.00000,2068922.00000,2713357.00000,1631717.00000,694.40002
21805221,2021-04-29 00:00:00-05:00,TSLA,688.28052,679.52002,671.78998,...,2892704.00000,2679420.00000,3447699.00000,1617562.00000,677.00000


In [31]:
# Missing-value count per hourly column (plus daily Close) for rows after 2021-01-01.
recent_rows = tmp2['Date'] > '2021-01-01'
tmp2.loc[recent_rows, [*tmp.columns, 'Close']].isnull().sum()

Date                 0
yahoo_ticker         0
Adj_Close_0     372759
Adj_Close_1     399579
Adj_Close_2     331493
                 ...  
Volume_20       337621
Volume_21       335964
Volume_22       336159
Volume_23       344688
Close                1
Length: 147, dtype: int64