In [1]:
import pandas as pd 
import requests 
import os
import requests
import numpy as np
import pickle
import yfinance as yf
from tqdm.notebook import tqdm
from sklearn.preprocessing import OneHotEncoder
import h5py
import warnings
from sklearn.preprocessing import StandardScaler

# Suppress pandas PerformanceWarning messages for the rest of the session.
warnings.simplefilter(action='ignore', category=pd.errors.PerformanceWarning)
# Helper function to make directory
def make_directory(path):
    """Create ``path`` (and any missing parent directories) if it is absent.

    Calling it again for a directory that already exists is a no-op.

    Args:
        path: Directory path to create.

    Raises:
        FileExistsError: If ``path`` exists but is not a directory.
    """
    # exist_ok=True replaces the separate os.path.exists() check, which could
    # race with another process creating the directory in between.
    os.makedirs(path, exist_ok=True)



In [8]:
def get_data():
    """Download the full daily price history for every listed symbol.

    Reads the ``Symbol`` column of ``data/NASDAQ_preprocess.csv``, requests
    each symbol through ``yfinance`` and writes one CSV per symbol to
    ``data/price/<symbol>.csv``. A failure for a single symbol is printed and
    the loop continues with the next symbol.
    """
    raw_symbols = pd.read_csv('data/NASDAQ_preprocess.csv', index_col=False)['Symbol'].to_list()

    # Normalise once and use the SAME names for the request, the ticker lookup
    # and the file name. Previously only the request string was normalised, so
    # symbols containing '/' (or the missing-value one) failed in the loop.
    symbols = []
    for symbol in raw_symbols:
        # Parse bug of csv: a symbol can come back as a missing value
        # (presumably the literal ticker "NA" — verify against the CSV).
        if pd.isna(symbol):
            symbol = "NA"
        symbols.append(str(symbol).replace('/', '-'))

    tickers = yf.Tickers(' '.join(symbols))
    make_directory('data/price')

    # The context manager closes the progress bar even if the loop is interrupted.
    with tqdm(total=len(symbols)) as p_bar:
        for symbol in symbols:
            p_bar.set_description(symbol)
            try:
                symbol_df = tickers.tickers[symbol].history(interval='1d', period='max')
                symbol_df.to_csv(f'data/price/{symbol}.csv')
            except Exception as e:
                # Best effort: report which symbol failed and keep going.
                print(f'{symbol}: {e}')
            p_bar.update(1)
# Run the download for every symbol listed in data/NASDAQ_preprocess.csv.
get_data()

  0%|          | 0/2281 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [24]:
# Load one downloaded price file (AACG) as a sample frame; the cells below
# try each preprocessing step on it.
test_df = pd.read_csv('data/price/AACG.csv')
display(test_df)

Unnamed: 0,Date,Open,High,Low,Close,Volume,Dividends,Stock Splits
0,2008-01-29,0.762811,0.802156,0.688136,0.702589,1489000,0.0,0
1,2008-01-30,0.702590,0.734708,0.666456,0.682516,219000,0.0,0
2,2008-01-31,0.681712,0.827048,0.681712,0.766826,182300,0.0,0
3,2008-02-01,0.797339,0.798142,0.762811,0.763614,28200,0.0,0
4,2008-02-04,0.762811,0.779674,0.762811,0.762811,8300,0.0,0
...,...,...,...,...,...,...,...,...
4085,2024-04-22,0.990000,1.010000,0.870000,0.890000,12600,0.0,0
4086,2024-04-23,0.960000,0.980000,0.890000,0.890000,16900,0.0,0
4087,2024-04-24,0.890000,0.930000,0.870000,0.870000,6300,0.0,0
4088,2024-04-25,0.870000,0.920000,0.870000,0.880000,11900,0.0,0


In [25]:
# Pipeline: MACD -> find crossovers -> label each crossover (20 days after)
# -> percentage change -> truncate windows & concat encoded features
# -> ? standard scaling (global)
def MACD(df, factor_1=12, factor_2=26, signal_line=9):
    """Add MACD indicator columns to ``df`` in place and return the same frame.

    Columns added: ``DIF_MACD`` (fast EMA minus slow EMA of ``Close``),
    ``DEM_MACD`` (EMA of ``DIF_MACD``), ``Histogram_MACD`` (twice their
    difference) and ``Bullish_Crossover`` (1 where the histogram goes from
    negative to non-negative, else 0). Rows containing NaN in any column are
    dropped afterwards.

    Args:
        df: Frame with a ``Close`` column; it is modified in place.
        factor_1: Span (and minimum periods) of the fast EMA.
        factor_2: Span (and minimum periods) of the slow EMA.
        signal_line: Span (and minimum periods) of the signal-line EMA.

    Returns:
        The same ``df`` object.
    """
    closing = df["Close"]
    ema_spans = {"EMA1": factor_1, "EMA2": factor_2}
    for ema_name, span in ema_spans.items():
        df[ema_name] = closing.ewm(span=span, min_periods=span).mean()

    # MACD line: fast EMA minus slow EMA
    macd_line = df["EMA1"] - df["EMA2"]
    df["DIF_MACD"] = macd_line

    # Signal line: EMA of the MACD line
    signal = macd_line.ewm(span=signal_line, min_periods=signal_line).mean()
    df["DEM_MACD"] = signal

    histogram = (macd_line - signal) * 2
    df['Histogram_MACD'] = histogram

    # Bullish crossover: previous histogram value negative, current one >= 0
    turned_up = (histogram >= 0) & (histogram.shift() < 0)
    df['Bullish_Crossover'] = turned_up.map({True: 1, False: 0})

    # The helper EMA columns are not part of the result
    df.drop(columns=list(ema_spans), inplace=True)
    df.dropna(inplace=True)
    return df
    
# Try the indicator on the sample frame (MACD modifies test_df in place),
# then show only the rows flagged as bullish crossovers.
MACD(test_df)
display(test_df[test_df['Bullish_Crossover']==True])

Unnamed: 0,Date,Open,High,Low,Close,Volume,Dividends,Stock Splits,DIF_MACD,DEM_MACD,Histogram_MACD,Bullish_Crossover
46,2008-04-04,0.758797,0.956325,0.758797,0.884861,35100,0.0,0,-0.008982,-0.009335,0.000706,1
48,2008-04-08,0.779674,0.819019,0.765220,0.819019,21000,0.0,0,-0.006844,-0.008951,0.004214,1
69,2008-05-07,0.863181,0.883255,0.863181,0.867196,9900,0.0,0,0.012305,0.011692,0.001226,1
121,2008-07-22,0.979610,1.240572,0.978005,1.226922,54800,0.0,0,-0.019854,-0.036345,0.032981,1
179,2008-10-13,0.647988,0.721057,0.647988,0.702589,46900,0.0,0,-0.070781,-0.075653,0.009744,1
...,...,...,...,...,...,...,...,...,...,...,...,...
4006,2023-12-27,0.850000,1.190000,0.850000,1.050000,101600,0.0,0,-0.009312,-0.023350,0.028076,1
4035,2024-02-08,1.170000,1.180000,1.150000,1.180000,15100,0.0,0,0.034465,0.031007,0.006916,1
4051,2024-03-04,1.550000,1.550000,1.400000,1.490000,69000,0.0,0,0.066716,0.062902,0.007628,1
4084,2024-04-19,1.030000,1.030000,0.920000,0.970000,11400,0.0,0,-0.084558,-0.087307,0.005498,1


In [30]:
def pct_change(df, columns=['Open','High','Low','Close','Volume']):
    """Add a ``<col>_pct`` percentage-change column for each requested column.

    The frame is modified in place: the new columns are appended and every row
    that contains NaN in any column is dropped afterwards.

    Args:
        df: Frame holding the columns named in ``columns``.
        columns: Names of the columns to compute the change for.

    Returns:
        The same ``df`` object.
    """
    for source_name in columns:
        target_name = '{}_pct'.format(source_name)
        df[target_name] = df[source_name].pct_change()

    df.dropna(inplace=True)
    return df

# Apply the percentage-change step to the sample frame (in place) and show
# only the newly added *_pct columns.
pct_change(test_df)
display(test_df[['Open_pct','High_pct','Low_pct','Close_pct','Volume_pct']])

Unnamed: 0,Open_pct,High_pct,Low_pct,Close_pct,Volume_pct
35,-0.075938,-0.051235,-0.080392,-0.068628,0.374486
36,0.003960,0.036644,-0.003198,0.004211,-0.395210
37,-0.071992,-0.023256,0.003209,0.069182,-0.762376
38,0.068013,0.000952,0.066098,0.029412,4.604167
39,0.039801,-0.000952,-0.112000,-0.141905,1.561338
...,...,...,...,...,...
4085,-0.038835,-0.019417,-0.054348,-0.082474,0.105263
4086,-0.030303,-0.029703,0.022988,0.000000,0.341270
4087,-0.072917,-0.051020,-0.022472,-0.022472,-0.627219
4088,-0.022472,-0.010753,0.000000,0.011494,0.888889


In [31]:
# ** Filter out only MACD Bullish Crossover
# time_series = How many frames before each crossover are kept as the input window
# label_time = How many frames after each crossover the label is measured at
#
def save_truncate_data(df, time_series=20, label_time=20, higher_than=0.03, symbol=None, merge_features=True, test_mode=False, collector=None):
    """Cut one window per bullish crossover out of ``df`` and save them to HDF5.

    For every row flagged in ``Bullish_Crossover`` the ``time_series`` rows
    immediately before it form one sample (the crossover row itself is not
    part of the window). The sample is labelled 1 when the close
    ``label_time`` rows after the crossover is more than ``higher_than`` (as a
    fraction) above the close on the crossover row, otherwise 0. Windows that
    contain infinite values are skipped. The samples are written to
    ``data/train_data/<symbol>.h5`` as datasets ``data`` and ``label``.

    Args:
        df: Frame produced by ``MACD`` and ``pct_change``. It must still hold
            the raw price and MACD columns, which are dropped here. Unless
            ``test_mode`` is set the frame is modified in place.
        time_series: Number of rows in each window.
        label_time: Number of rows after the crossover used for the label.
        higher_than: Relative rise of the close required for label 1.
        symbol: Ticker symbol; required, it names the output file.
        merge_features: When true, the symbol's row of
            ``data/NASDAQ_preprocess.csv`` is broadcast onto every row of ``df``.
        test_mode: Work on a copy so the caller's frame stays untouched.
        collector: Optional list that receives every kept window. When omitted,
            a module-level list named ``global_data`` is used if one exists.

    Returns:
        Number of samples written, or 0 when no window qualified (nothing is
        written in that case).

    Raises:
        ValueError: If ``symbol`` is None.
        IndexError: If ``merge_features`` is set and the symbol has no row in
            the feature file.
    """
    if symbol is None:
        raise ValueError('symbol is required: it names the output file data/train_data/<symbol>.h5')
    safe_symbol = str(symbol).replace('/', '-')

    if test_mode:
        df = df.copy()

    # Fall back to the notebook-level list so existing callers keep collecting
    # windows, without failing when that list has not been created yet.
    if collector is None:
        shared = globals().get('global_data')
        collector = shared if isinstance(shared, list) else None

    # Use the crossovers of THIS frame (not the global sample frame), and keep
    # them as positions: after the earlier dropna() calls the index labels no
    # longer match row positions, while iloc and the close list are positional.
    crossover_positions = np.flatnonzero(df['Bullish_Crossover'].to_numpy() == 1)

    if merge_features:
        feature_df = pd.read_csv(f'data/NASDAQ_preprocess.csv',index_col='index')
        # Compare normalised names so symbols stored with '/' (or as a missing
        # value, treated as "NA") still match their file-name form.
        listed = feature_df['Symbol'].fillna('NA').astype(str).str.replace('/', '-', regex=False)
        features = feature_df[listed == safe_symbol]
        if features.empty:
            raise IndexError(f'no row for symbol {symbol!r} in data/NASDAQ_preprocess.csv')
        features = features.drop(columns=['Symbol','Name','Lastsale','Netchange','Pctchange','Marketcap','Ipoyear','Volume','Url']).iloc[0]

        for column in features.index:
            df[column] = features[column]

    # Keep the raw closes for labelling before the column is dropped.
    close_copy = df['Close'].to_list()
    # Drop non normalized Columns
    df.drop(columns=['Date','Open','High','Low','Close','Volume','Dividends','Stock Splits','DIF_MACD','DEM_MACD','Histogram_MACD'],inplace=True)

    result_data = []
    result_label = []
    row_count = len(df)

    for pos in crossover_positions:
        # Need a full window before and a label row after the crossover.
        if pos < time_series or (pos + label_time) >= row_count:
            continue

        # Truncate Data
        result_df = df.iloc[pos - time_series:pos]

        # pct_change yields inf when the previous value was 0; skip such windows.
        if np.isinf(result_df).any().any():
            continue

        window = result_df.to_numpy()
        result_data.append(window)
        if collector is not None:
            collector.append(window)

        # Get Label
        close_original = close_copy[pos]
        close_new = close_copy[pos + label_time]
        label = 1 if close_new > close_original * (1 + higher_than) else 0
        result_label.append(label)

    if not result_data:
        return 0
    result_data = np.array(result_data)

    # Create the output directory only when there is something to write.
    make_directory('data/train_data')
    with h5py.File(f"data/train_data/{safe_symbol}.h5", "w") as out:
        out.create_dataset(f'data',data=result_data,dtype='f')
        out.create_dataset(f'label',data=result_label)
    return result_data.shape[0]
    
# Smoke test on the sample frame: test_mode=True makes the function work on a
# copy of test_df; the printed value is the number of samples it returned.
print(save_truncate_data(test_df,symbol='AACG',test_mode=True))
# print(save_truncate_data(test_df,symbol='AAPL',test_mode=True,merge_features=False))


122


In [32]:
# Extract Data for all symbols
# Every saved window is also appended to this list by save_truncate_data, so a
# single scaler can be fitted on all of them afterwards.
global_data = []

def extract_data(merge_features):
    """Build training windows for every price CSV and write the sample index.

    Each ``*.csv`` file found under ``data/price`` is loaded, run through
    ``MACD`` and ``pct_change`` and handed to ``save_truncate_data``. For every
    sample that was saved, one row (position inside the symbol's file, symbol)
    is recorded, and the table is written to ``data/meta.csv``.

    Args:
        merge_features: Forwarded to ``save_truncate_data``.
    """
    price_path = 'data/price'

    # Collect the files first so the progress bar knows its real total.
    csv_paths = []
    for root, _, files in os.walk(price_path):
        # Sorted for a reproducible processing order.
        for file in sorted(files):
            if file.endswith('.csv'):
                csv_paths.append(os.path.join(root, file))

    index_col = []
    symbol_col = []
    with tqdm(total=len(csv_paths)) as p_bar:
        for csv_path in csv_paths:
            # splitext keeps symbols that contain a dot intact.
            symbol = os.path.splitext(os.path.basename(csv_path))[0]
            p_bar.set_description(symbol)
            p_bar.update(1)
            df = pd.read_csv(csv_path)

            #Price Preprocessing
            MACD(df)
            pct_change(df)
            data_length = save_truncate_data(df,symbol=symbol,merge_features=merge_features)

            if data_length != 0:
                #Meta Data: plain lists avoid re-copying arrays on every file
                index_col.extend(range(data_length))
                symbol_col.extend([symbol] * data_length)

    meta_df = pd.DataFrame({'Index': index_col, 'Symbol': symbol_col})
    meta_df.to_csv('data/meta.csv')

            
# Build the training windows without merging the per-symbol feature columns.
merge_features=False
extract_data(merge_features)

0it [00:00, ?it/s]

In [34]:
# Stack all collected windows row-wise into a single array for the scaler.
# Note: this rebinds global_data from a list of windows to one ndarray.
global_data = np.vstack(global_data)

def standardize(data):
    """Fit a ``StandardScaler`` on ``data`` and pickle it to ``data/scalar.pkl``.

    Args:
        data: Samples to fit the scaler on.

    Returns:
        None; the fitted scaler is only written to disk.
    """
    fitted_scaler = StandardScaler()
    fitted_scaler.fit(data)

    # Persist the fitted scaler for later usage
    target_path = 'data/scalar.pkl'
    with open(target_path, 'wb') as sink:
        pickle.dump(fitted_scaler, sink)

# usage: fit the scaler on the stacked windows and save it to disk.
# `directory` is assigned here but not used by the call below.
directory = 'data/train_data'
standardize(global_data)

array([[ 0.00000000e+00, -5.10018112e-02, -3.18182535e-02,
        -9.50577712e-03, -9.41617993e-03, -4.67741935e-01],
       [ 0.00000000e+00,  8.63742619e-03,  2.72302231e-02,
         7.67771116e-03, -1.90101043e-03, -3.33333333e-01],
       [ 0.00000000e+00, -9.51619783e-04, -2.19379434e-02,
        -1.23357759e-07,  9.52372132e-03,  1.04545455e+00],
       [ 0.00000000e+00,  1.23357774e-07, -4.67279232e-03,
        -4.76180016e-03, -9.43387572e-03,  1.77777778e-01],
       [ 0.00000000e+00,  1.33332727e-02,  1.78403326e-02,
         4.78461480e-03,  1.61903899e-02,  5.09433962e-01],
       [ 0.00000000e+00, -8.45874192e-03, -2.67528434e-02,
        -8.19277217e-08, -1.40581518e-02, -5.00000000e-01],
       [ 0.00000000e+00, -4.73930406e-03,  7.58298207e-03,
         5.06619526e-08,  7.60460620e-03, -3.00000000e-01],
       [ 0.00000000e+00,  3.80947585e-03,  2.53997197e-02,
        -4.41554298e-08, -4.71704290e-03,  4.28571429e-01],
       [ 0.00000000e+00,  1.13852331e-02, -2.201