In [53]:
import pandas as pd 
import requests 
import os
import requests
import numpy as np
import pickle
import yfinance as yf
from tqdm.notebook import tqdm
from sklearn.preprocessing import OneHotEncoder
import h5py
import warnings
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
warnings.simplefilter(action='ignore', category=pd.errors.PerformanceWarning)
#Helper function to make directory
def make_directory(path):
    if not os.path.exists(path):
        os.makedirs(path)

In [80]:
def get_data(symbol_dir='data/NASDAQ_preprocess.csv'):
    symbols = pd.read_csv(symbol_dir,index_col=False)['Symbol'].to_list()
    symbol_text = ''
    for symbol in symbols:
        # Parse Bug of csv
        if symbol is np.nan: 
            symbol = "NA"
        symbol = symbol.replace('/','-')
        symbol = symbol.replace('.','-')
        symbol_text= symbol_text + ' ' + symbol
    tickers =  yf.Tickers(symbol_text)
    make_directory('data/price')
    p_bar = tqdm(total=len(symbols))
    for symbol in symbols:
        symbol = symbol.replace('/','-')
        symbol = symbol.replace('.','-')
        try:
            p_bar.update(1)
            p_bar.set_description(symbol)
            symbol_df = tickers.tickers[symbol].history(interval='1d', period='10y')
            symbol_df.to_csv(f'data/price/{symbol}.csv')
        except Exception as e:
            print(e)
            pass
get_data('data/SP500.csv')

  0%|          | 0/503 [00:00<?, ?it/s]

In [82]:
test_df = pd.read_csv('data/price/AAPL.csv')
display(test_df)

Unnamed: 0,Date,Open,High,Low,Close,Volume,Dividends,Stock Splits
0,2014-04-28,18.018966,18.740920,18.011100,18.688700,669485600,0.0,0.0
1,2014-04-29,18.677687,18.748152,18.544621,18.633331,337377600,0.0,0.0
2,2014-04-30,18.643084,18.856682,18.553744,18.562866,456640800,0.0,0.0
3,2014-05-01,18.622955,18.711036,18.445534,18.606598,244048000,0.0,0.0
4,2014-05-02,18.633653,18.692164,18.550918,18.641201,191514400,0.0,0.0
...,...,...,...,...,...,...,...,...
2513,2024-04-22,165.520004,167.259995,164.770004,165.839996,48116400,0.0,0.0
2514,2024-04-23,165.350006,167.050003,164.919998,166.899994,49537800,0.0,0.0
2515,2024-04-24,166.539993,169.300003,166.210007,169.020004,48251800,0.0,0.0
2516,2024-04-25,169.529999,170.610001,168.149994,169.889999,50558300,0.0,0.0


In [83]:
# MACD -> Cross -> Get Cross-> Get Label (20 days after) -> Pct Change ->
# Truncate & Concat Encode Features  -> ? Standard Scaling (Global)
def MACD(df, factor_1=12, factor_2=26, signal_line=9):
    df["EMA1"]=df["Close"].ewm(span=factor_1, min_periods=factor_1).mean()
    df["EMA2"]=df["Close"].ewm(span=factor_2, min_periods=factor_2).mean()
    
    #MACD Line
    df["DIF_MACD"]=df["EMA1"]-df["EMA2"]
    
    # Signal Line
    df["DEM_MACD"]=df["DIF_MACD"].ewm(span=signal_line, min_periods=signal_line).mean()
    
    df['Histogram_MACD']=(df["DIF_MACD"]-df["DEM_MACD"])*2
    
    # Bullish Cross over = Histogram changes from negative to positive
    df['Bullish_Crossover'] = (df['Histogram_MACD'] >= 0) & (df['Histogram_MACD'].shift() < 0)
    df['Bullish_Crossover'] = df['Bullish_Crossover'].map({True: 1, False: 0})
    
    df.drop(columns=["EMA1","EMA2"],inplace=True)
    df.dropna(inplace=True)
    return df
    
MACD(test_df)
display(test_df[test_df['Bullish_Crossover']==True])

Unnamed: 0,Date,Open,High,Low,Close,Volume,Dividends,Stock Splits,DIF_MACD,DEM_MACD,Histogram_MACD,Bullish_Crossover
46,2014-07-02,20.785994,20.828065,20.613274,20.699635,113860000,0.0,0.0,0.179204,0.177516,0.003375,1
61,2014-07-24,21.487933,21.549934,21.350643,21.485718,182916000,0.0,0.0,0.244735,0.232687,0.024097,1
76,2014-08-14,21.659353,21.712761,21.541410,21.697184,112464000,0.0,0.0,0.157724,0.157103,0.001242,1
123,2014-10-21,22.925568,22.925568,22.536132,22.803175,378495600,0.0,0.0,-0.031150,-0.044435,0.026570,1
170,2014-12-29,25.432084,25.651113,25.411968,25.458904,110395600,0.0,0.0,0.065794,0.054790,0.022009,1
...,...,...,...,...,...,...,...,...,...,...,...,...
2422,2023-12-08,193.952501,195.740228,193.423178,195.460587,53377300,0.0,0.0,3.588110,3.502732,0.170756,1
2425,2023-12-13,194.841365,197.747660,194.601681,197.707718,70404200,0.0,0.0,3.632703,3.510924,0.243559,1
2449,2024-01-19,189.088718,191.705375,188.579374,191.315872,68741000,0.0,0.0,-1.018216,-1.217770,0.399107,1
2489,2024-03-18,175.570007,177.710007,173.520004,173.720001,75604200,0.0,0.0,-3.498562,-3.662908,0.328692,1


In [185]:
def pct_change(df, columns=['Open','High','Low','Close','Volume'], threshold=0.05):
    for col in columns:
        df[f'{col}_pct'] = df[col].pct_change()
        df[f'{col}_pct'] = df[f'{col}_pct'].clip(lower=-threshold, upper=threshold)
    
    df.dropna(inplace=True)
    return df

def pct_change_relative(df, row, columns=['Open','High','Low','Close','Volume'],threshold=0.1):
    for col in columns:
        if row[col] != 0:
            df[f'{col}_pct'] = (df[col] - row[col]) / row[col] 
        else:
            df[f'{col}_pct'] = 0
        df[f'{col}_pct'] = df[f'{col}_pct'].clip(lower=-threshold, upper=threshold)
    return df

pct_change_relative(test_df,test_df.iloc[20])
display(test_df[['Open_pct','High_pct','Low_pct','Close_pct','Volume_pct']].head(21))

Unnamed: 0,Open_pct,High_pct,Low_pct,Close_pct,Volume_pct
38,-0.033056,-0.025789,-0.03009,-0.032255,0.1
39,-0.038636,-0.035579,-0.033291,-0.033106,0.1
40,-0.044636,-0.034316,-0.037666,-0.038961,-0.0011
41,-0.050321,-0.045263,-0.043427,-0.038109,-0.056552
42,-0.048636,-0.041579,-0.041827,-0.032361,-0.1
43,-0.043899,-0.031579,-0.031477,-0.020864,0.1
44,-0.030424,-0.013368,-0.017392,-0.010751,0.1
45,-0.015475,-0.009789,-0.006295,-0.004471,-0.021904
46,-0.01179,-0.009894,-0.006722,-0.004896,-0.1
47,-0.013896,-0.009474,-0.005548,0.000958,-0.1


In [89]:
# Data to non-numerical
# The event occurs is more important than its value
# Diviidends, Stock Splits 
def to_non_numerical(df, columns=['Dividends','Stock Splits']):
    for col in columns:
        df.loc[df[col] != 0, col] = 1 

to_non_numerical(test_df)
display(test_df.loc[test_df["Stock Splits"]!=0])

Unnamed: 0,Date,Open,High,Low,Close,Volume,Dividends,Stock Splits,DIF_MACD,DEM_MACD,Histogram_MACD,Bullish_Crossover,Open_pct,High_pct,Low_pct,Close_pct,Volume_pct
1598,2020-08-31,124.990502,128.341085,123.44257,126.42086,225702700,0.0,1.0,6.903377,6.467787,0.87118,0,0.012439,0.036044,0.011418,0.033912,0.05


In [131]:
# "# ** Filter out only MACD Bullish Crossover
# # time_series = How many frames in previous to truncate 
# # label_time = How many frames after to define the label 
# # 
# def save_truncate_data(df, time_series=20, label_time=20, higher_than=0.03, symbol=None, merge_features=True, test_mode=False):
#     if test_mode:
#         df=df.copy()
#     marked_rows=test_df[test_df['Bullish_Crossover']==True]
#     make_directory('data/train_data')
    
#     # So Filter will be done even merge_features is off
#     if symbol is not None:
#         feature_df = pd.read_csv(f'data/NASDAQ_preprocess.csv')
#         features = feature_df[feature_df['Symbol']==symbol]
#         if len(features) == 0:
#             return 0, None
    
#     if merge_features:
#         features = features.drop(columns=['Symbol','Name','Lastsale','Netchange','Pctchange','Marketcap','Ipoyear','Volume','Url']).iloc[0]

#         for column in features.index:
#             df[column] = features[column]
    
#     close_copy = df['Close'].to_list()
#     #Drop non normalized Columns
#     df.drop(columns=['Date','Open','High','Low','Close','Volume','Dividends','Stock Splits','DIF_MACD','DEM_MACD','Histogram_MACD'],inplace=True)
#     # df.drop(columns=['Date'],inplace=True)
    
#     symbol = symbol.replace('/','-')
#     result_data = []
#     result_label = []

#     for i in marked_rows.index:
#         if i >= time_series and (i+label_time) < len(df):
#             # Truncate Data
#             result_df = df.iloc[i-time_series:i]
            
#             infinite_mask = np.isinf(result_df)
#             has_infinite = infinite_mask.any().any()
#             if has_infinite:
#                 # print(f'{symbol} {i} has inf values')
#                 continue
#             result_data.append(result_df.to_numpy())
#             global_data.append(result_df.to_numpy())
            
#             # Get Label
#             close_original = close_copy[i]
#             close_new = close_copy[i + label_time]
#             label = 1 if close_new > close_original * (1+higher_than) else 0
#             result_label.append(label) 

#     if len(result_data) >= 1:
#         result_data = np.array(result_data)
#     else:
#         return 0, None

#     with h5py.File(f"data/train_data/{symbol}.h5", "w") as out:
#         out.create_dataset(f'data',data=result_data,dtype='f')
#         out.create_dataset(f'label',data=result_label)
#     return result_data.shape[0], result_label
    
# print(save_truncate_data(test_df,symbol='AACG',test_mode=True))
# # print(save_truncate_data(test_df,symbol='AAPL',test_mode=True,merge_features=False))
"

SyntaxError: unterminated string literal (detected at line 64) (1557175852.py, line 64)

In [181]:
# ** Filter out only MACD Bullish Crossover
# time_series = How many frames in previous to truncate 
# label_time = How many frames after to define the label 
# 
def save_truncate_data_SP500(df, time_series=20, label_time=20, higher_than=0.03, symbol=None, merge_features=True, test_mode=False,regression=False):
    if test_mode:
        df=df.copy()
    marked_rows=test_df[test_df['Bullish_Crossover']==True]
    make_directory('data/train_data')
    
    # So Filter will be done even merge_features is off
    if symbol is not None:
        feature_df = pd.read_csv(f'data/SP500_preprocess.csv')
        features = feature_df[feature_df['Symbol']==symbol]
        if len(features) == 0:
            return 0, None
    
    if merge_features:
        features = features.drop(columns=['Symbol','Security','Date added','CIK','Founded']).iloc[0]
        for column in features.index:
            df[column] = features[column]
    
    close_copy = df['Close'].to_list()
    #Drop non normalized Columns
#     df.drop(columns=['Date','Open','High','Low','Close','Volume','Dividends','Stock Splits','DIF_MACD','DEM_MACD','Histogram_MACD'],inplace=True)
    # df.drop(columns=['Date'],inplace=True)
    
    symbol = symbol.replace('/','-')
    result_data = []
    result_label = []
    if regression:
        result_label_regression = []
    for i in marked_rows.index:
        if i >= (time_series+1) and (i+label_time) < len(df):
            # Truncate Data
            result_df = df.iloc[i-time_series-1:i].copy()
            ref_row = result_df.iloc[-1]
            result_df = pct_change_relative(result_df, ref_row)
            result_df = result_df.iloc[:-1]
            
            result_df.drop(columns=['Date','Open','High','Low','Close','Volume','Dividends','Stock Splits','DIF_MACD','DEM_MACD','Histogram_MACD'],inplace=True)

            
            infinite_mask = np.isinf(result_df)
            has_infinite = infinite_mask.any().any()
            if has_infinite:
                # print(f'{symbol} {i} has inf values')
                continue
            result_data.append(result_df.to_numpy())
            global_data.append(result_df.to_numpy())
            
            # Get Label
            close_original = close_copy[i]
            close_new = close_copy[i + label_time]
            label = 1 if close_new > close_original * (1+higher_than) else 0
            result_label.append(label) 
            if regression:
                regression_df = df.iloc[i:i+label_time].copy()
                regression_df = pct_change_relative(regression_df, ref_row)
                
                regression_df.drop(columns=['Date','Open','High','Low','Close','Volume','Dividends','Stock Splits','DIF_MACD','DEM_MACD','Histogram_MACD'],inplace=True)
                
                regression_df = regression_df['Close_pct']
                result_label_regression.append(regression_df.to_numpy())
                #global_data.append(regression_df.to_numpy())

    if len(result_data) >= 1:
        result_data = np.array(result_data)
        if regression:
            result_label_regression = np.array(result_label_regression)
            
    else:
        return 0, None

    with h5py.File(f"data/train_data/{symbol}.h5", "w") as out:
        out.create_dataset(f'data',data=result_data,dtype='f')
        if not regression:
            out.create_dataset(f'label',data=result_label)
        else:
            out.create_dataset(f'label',data=result_label_regression)

    return result_data.shape[0], result_label
    
print(save_truncate_data_SP500(test_df,symbol='AAPL',test_mode=True,regression=True))
#print(save_truncate_data_SP500(test_df,symbol='AAPL',test_mode=True,merge_features=False))


AttributeError: 'numpy.ndarray' object has no attribute 'append'

In [182]:
# Extract Data for all symbols
global_data = []

def extract_data(merge_features, time_series, save_data_func, file_dir='data/meta_SP500.csv', regression=False,label_time=20):
    price_path = 'data/price'
    symbol_col = None
    index_col = None
    label_col = None
    p_bar = tqdm()
    for root, _, files in os.walk(price_path):
        p_bar.total = len(files)
        for file in files:
            if f'.csv' in file:
                file_split = file.split('.')
                if len(file_split) == 2:
                    symbol = file_split[0]
                else:
                    # String Exception: Some Symbol contains .
                    symbol = f'{file_split[0]}-{file_split[1]}'
                p_bar.set_description(symbol)
                p_bar.update(1)
                df = pd.read_csv(f'data/price/{symbol}.csv')
                
                #Price Preprocessing
                MACD(df)
                #pct_change(df)
                to_non_numerical(df)
                data_length, label = save_data_func(df,symbol=symbol,merge_features=merge_features,time_series=time_series,regression=regression,label_time=label_time)
                
                if data_length != 0:
                    #Meta Data
                    index_col = np.arange(data_length) if index_col is None else np.concatenate([index_col, np.arange(data_length)])
                    symbol_col_temp = np.full(data_length, symbol)
                    symbol_col = symbol_col_temp if symbol_col is None else np.concatenate([symbol_col, symbol_col_temp])
                    label_col = np.array(label) if label_col is None else np.concatenate([label_col, np.array(label)])
                
    meta_df = pd.DataFrame()
    meta_df['Index'] = index_col
    meta_df['Symbol'] = symbol_col
    meta_df['Label'] = label_col
    meta_df.to_csv(file_dir)

regression=True
merge_features=False
time_series=20
label_time=5
extract_data(
    merge_features=merge_features, 
    time_series=time_series, 
    label_time=label_time,
    regression=regression,
    save_data_func=save_truncate_data_SP500)

0it [00:00, ?it/s]

In [189]:
global_data = np.vstack(global_data)

def standardize(data):
    scaler = StandardScaler()
    scaler.fit(data)
    #Save onehotencoder for later usage
    with open('data/scaler_SP500.pkl', 'wb') as to_write:
        pickle.dump(scaler, to_write)
    return scaler.transform(data)

def minmax_scaler(data):
    minmax = MinMaxScaler()
    minmax.fit(data)
    #Save onehotencoder for later usage
    with open('data/minmax_SP500.pkl', 'wb') as to_write:
        pickle.dump(minmax, to_write)
    return minmax.transform(data)


# usage
directory = 'data/train_data'
std_data = standardize(global_data)
minmax_data = minmax_scaler(global_data)

In [195]:
np.percentile(std_data,50, axis=0)

array([-0.2015239 ,  0.00043544,  0.0098153 , -0.00202838, -0.00093593,
        0.05770299])

In [184]:
idx = 25
print(global_data[idx])
print(std_data[idx])
print(minmax_data[idx])

[[ 0.00000000e+00 -3.91149209e-02 -4.60385459e-02 -1.21121802e-02
  -3.70476136e-02 -1.00000000e-01]
 [ 0.00000000e+00 -3.06920463e-02 -3.23397428e-02 -9.89623209e-03
  -3.60460696e-02 -1.00000000e-01]
 [ 1.00000000e+00 -3.59742596e-02 -4.20842892e-02 -7.38541771e-03
  -3.14689910e-02 -1.00000000e-01]
 [ 0.00000000e+00 -3.19769961e-02 -3.95421360e-02 -4.28325821e-03
  -2.77497566e-02 -1.00000000e-01]
 [ 0.00000000e+00 -3.21197294e-02 -4.06720022e-02 -8.86241917e-03
  -3.46158329e-02 -1.00000000e-01]
 [ 0.00000000e+00 -3.29763017e-02 -3.72826469e-02 -2.36308612e-03
  -2.68915687e-02 -1.00000000e-01]
 [ 0.00000000e+00 -2.64097454e-02 -3.34697335e-02  2.06798723e-03
  -2.70348101e-02 -1.00000000e-01]
 [ 0.00000000e+00 -2.81226057e-02 -3.43170135e-02  1.62500992e-03
  -2.54612173e-02 -1.00000000e-01]
 [ 0.00000000e+00 -2.79798980e-02 -3.02215021e-02  4.57931995e-03
  -2.20281218e-02 -1.00000000e-01]
 [ 0.00000000e+00 -2.54106471e-02 -3.33287701e-02  2.51102462e-03
  -2.41741073e-02 -1.0000

In [198]:
global_data.shape 

(798500, 6)

In [197]:
global_data[20]

array([ 0.        , -0.0390015 , -0.03719711, -0.04160134, -0.03916443,
       -0.02596987])