In [135]:
import pandas as pd 
import requests 
import os
import requests
import numpy as np
import pickle
import yfinance as yf
from tqdm.notebook import tqdm
from sklearn.preprocessing import OneHotEncoder
import h5py
import warnings

warnings.simplefilter(action='ignore', category=pd.errors.PerformanceWarning)
#Helper function to make directory
def make_directory(path):
    if not os.path.exists(path):
        os.makedirs(path)

In [None]:
def get_data():
    symbols = pd.read_csv('data/NASDAQ_preprocess.csv',index_col=False)['Symbol'].to_list()
    symbol_text = ''
    for symbol in symbols:
        # Parse Bug of csv
        if symbol is np.nan: 
            symbol = "NA"
        symbol = symbol.replace('/','-')
        symbol_text= symbol_text + ' ' + symbol
    tickers =  yf.Tickers(symbol_text)
    make_directory('data/price')
    p_bar = tqdm(total=len(symbols))
    for symbol in symbols:
        try:
            p_bar.update(1)
            p_bar.set_description(symbol)
            symbol_df = tickers.tickers[symbol].history(interval='1d', period='max')
            symbol_df.to_csv(f'data/price/{symbol}.csv')
        except Exception as e:
            print(e)
            pass
get_data()

In [136]:
test_df = pd.read_csv('data/price/AAPL.csv')
display(test_df)

Unnamed: 0,Date,Open,High,Low,Close,Volume,Dividends,Stock Splits
0,1980-12-12,0.099192,0.099623,0.099192,0.099192,469033600,0.0,0.0
1,1980-12-15,0.094448,0.094448,0.094017,0.094017,175884800,0.0,0.0
2,1980-12-16,0.087548,0.087548,0.087117,0.087117,105728000,0.0,0.0
3,1980-12-17,0.089273,0.089704,0.089273,0.089273,86441600,0.0,0.0
4,1980-12-18,0.091861,0.092292,0.091861,0.091861,73449600,0.0,0.0
...,...,...,...,...,...,...,...,...
10926,2024-04-17,169.610001,170.649994,168.000000,168.000000,50901200,0.0,0.0
10927,2024-04-18,168.029999,168.639999,166.550003,167.039993,43122900,0.0,0.0
10928,2024-04-19,166.210007,166.399994,164.080002,165.000000,67772100,0.0,0.0
10929,2024-04-22,165.520004,167.259995,164.770004,165.839996,48116400,0.0,0.0


In [137]:
# MACD -> Cross -> Get Cross-> Get Label (20 days after) -> Pct Change ->
# Truncate & Concat Encode Features  -> ? Standard Scaling (Global)
def MACD(df, factor_1=12, factor_2=26, signal_line=9):
    df["EMA1"]=df["Close"].ewm(span=factor_1, min_periods=factor_1).mean()
    df["EMA2"]=df["Close"].ewm(span=factor_2, min_periods=factor_2).mean()
    
    #MACD Line
    df["DIF_MACD"]=df["EMA1"]-df["EMA2"]
    
    # Signal Line
    df["DEM_MACD"]=df["DIF_MACD"].ewm(span=signal_line, min_periods=signal_line).mean()
    
    df['Histogram_MACD']=(df["DIF_MACD"]-df["DEM_MACD"])*2
    
    # Bullish Cross over = Histogram changes from negative to positive
    df['Bullish_Crossover'] = (df['Histogram_MACD'] >= 0) & (df['Histogram_MACD'].shift() < 0)
    df['Bullish_Crossover'] = df['Bullish_Crossover'].map({True: 1, False: 0})
    
    df.drop(columns=["EMA1","EMA2"],inplace=True)
    df.dropna(inplace=True)
    return df
    
MACD(test_df)
display(test_df[test_df['Bullish_Crossover']==True])

Unnamed: 0,Date,Open,High,Low,Close,Volume,Dividends,Stock Splits,DIF_MACD,DEM_MACD,Histogram_MACD,Bullish_Crossover
52,1981-02-27,0.091430,0.092292,0.091430,0.091430,14761600,0.0,0.0,-0.004338,-0.004637,0.000598,1
65,1981-03-18,0.088842,0.089704,0.088842,0.088842,36937600,0.0,0.0,-0.003562,-0.004216,0.001307,1
88,1981-04-21,0.094880,0.095311,0.094880,0.094880,28537600,0.0,0.0,0.001104,0.000989,0.000231,1
110,1981-05-21,0.103505,0.103937,0.103505,0.103505,32211200,0.0,0.0,0.001269,0.000959,0.000621,1
149,1981-07-17,0.089273,0.089704,0.089273,0.089273,19824000,0.0,0.0,-0.005067,-0.005109,0.000084,1
...,...,...,...,...,...,...,...,...,...,...,...,...
10813,2023-11-02,175.065670,177.319815,175.005828,177.110367,77334800,0.0,0.0,-1.130044,-1.369145,0.478202,1
10838,2023-12-08,193.952501,195.740228,193.423178,195.460587,53377300,0.0,0.0,3.588110,3.502732,0.170756,1
10841,2023-12-13,194.841365,197.747660,194.601681,197.707718,70404200,0.0,0.0,3.632703,3.510924,0.243559,1
10865,2024-01-19,189.088718,191.705375,188.579374,191.315872,68741000,0.0,0.0,-1.018216,-1.217770,0.399107,1


In [138]:
def pct_change(df, columns=['Open','High','Low','Close','Volume','DIF_MACD','DEM_MACD','Histogram_MACD']):
    for col in columns:
        df[f'{col}_pct'] = df[col].pct_change()
        
    df.dropna(inplace=True)
    return df

pct_change(test_df)
display(test_df[['Open_pct','High_pct','Low_pct','Close_pct','Volume_pct','DIF_MACD_pct','DEM_MACD_pct','Histogram_MACD_pct']])

Unnamed: 0,Open_pct,High_pct,Low_pct,Close_pct,Volume_pct,DIF_MACD_pct,DEM_MACD_pct,Histogram_MACD_pct
34,-0.061400,-6.139972e-02,-0.057518,-0.057518,-0.485451,1.058912,-4.325138,0.514516
35,0.032708,3.738069e-02,0.037556,0.037556,-0.194156,0.266663,1.306751,0.035770
36,0.036197,3.603374e-02,0.036197,0.036197,0.454971,0.054779,0.469941,-0.150476
37,0.000000,4.347585e-03,0.000000,0.000000,-0.715434,0.029144,0.260769,-0.169003
38,0.004366,-2.448950e-08,0.004366,0.004366,0.748588,-0.000746,0.160908,-0.210557
...,...,...,...,...,...,...,...,...
10926,-0.012460,-1.789825e-02,-0.001605,-0.008147,-0.309451,0.158809,-0.047661,-0.443686
10927,-0.009315,-1.177846e-02,-0.008631,-0.005714,-0.152812,0.152339,-0.015669,-0.686920
10928,-0.010831,-1.328277e-02,-0.014830,-0.012213,0.571603,0.196117,0.023992,-2.507214
10929,-0.004151,5.168273e-03,0.004205,0.005091,-0.290026,0.081283,0.036524,0.558878


In [176]:
# ** Filter out only MACD Bullish Crossover
# time_series = How many frames in previous to truncate 
# label_time = How many frames after to define the label 
# 
def save_truncate_data(df, time_series=20, label_time=20, higher_than=0.03, symbol=None, merge_features=True, test_mode=False):
    if test_mode:
        df=df.copy()
    marked_rows=test_df[test_df['Bullish_Crossover']==True]
    make_directory('data/train_data')
    
    if merge_features and symbol is not None:
        feature_df = pd.read_csv(f'data/NASDAQ_preprocess.csv',index_col='index')
        features = feature_df[feature_df['Symbol']==symbol]
        features = features.drop(columns=['Symbol','Name','Lastsale','Netchange','Pctchange','Marketcap','Ipoyear','Volume','Url']).iloc[0]

        for column in features.index:
            df[column] = features[column]
    
    close_copy = df['Close'].to_list()
    #Drop non normalized Columns
    df.drop(columns=['Date','Open','High','Low','Close','Volume','DIF_MACD','DEM_MACD','Histogram_MACD'],inplace=True)

    symbol = symbol.replace('/','-')
    result_data = []
    result_label = []

    for i in marked_rows.index:
        if i >= time_series and (i+label_time) < len(df):
            # Truncate Data
            result_df = df.iloc[i-time_series:i]
            result_data.append(result_df.to_numpy())

            # Get Label
            close_original = close_copy[i]
            close_new = close_copy[i + label_time]
            result_label.append(close_new > close_original * (1+higher_than)) 

    if len(result_data) >= 1:
        result_data = np.array(result_data)
    else:
        return 0
    with h5py.File(f"data/train_data/{symbol}.h5", "w") as out:
        out.create_dataset(f'data',data=result_data,dtype='f')
        out.create_dataset(f'label',data=result_label)
    return result_data.shape[0]
    
print(save_truncate_data(test_df,symbol='AAPL',test_mode=True))
print(save_truncate_data(test_df,symbol='AAPL',test_mode=True,merge_features=False))


410
410


In [179]:
# Extract Data for all symbols
def extract_data(merge_features):
    price_path = 'data/price'
    symbol_col = None
    index_col = None
    p_bar = tqdm()
    for root, _, files in os.walk(price_path):
        p_bar.total = len(files)
        for file in files:
            if f'.csv' in file:
                symbol = file.split('.')[0]
                p_bar.set_description(symbol)
                p_bar.update(1)
                df = pd.read_csv(f'data/price/{symbol}.csv')
                
                #Price Preprocessing
                MACD(df)
                pct_change(df)
                data_length = save_truncate_data(df,symbol=symbol,merge_features=merge_features)
                
                #Meta Data
                index_col = np.arange(data_length) if index_col is None else np.concatenate([index_col, np.arange(data_length)])
                symbol_col_temp = np.full(data_length, symbol)
                symbol_col = symbol_col_temp if symbol_col is None else np.concatenate([symbol_col, symbol_col_temp])
                
    meta_df = pd.DataFrame()
    meta_df['Index'] = index_col
    meta_df['Symbol'] = symbol_col
    meta_df.to_csv('data/meta.csv')

            
merge_features=True
extract_data(merge_features)

0it [00:00, ?it/s]

IndexError: single positional indexer is out-of-bounds