In [1]:
import numpy as np
import pandas as pd
import yfinance as yf
import talib
import matplotlib.pyplot as plt
from datetime import datetime, timedelta
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error, accuracy_score, f1_score
import os

In [2]:
def get_technical_indicator(df):
    """Append TA-Lib technical-indicator columns to a price DataFrame.

    The function reads the 'Open', 'High', 'Low', 'Close' and 'Volume'
    columns of ``df`` and adds one column per indicator output (overlap
    studies, momentum, volume, cycle, price transform, volatility,
    aggregated candlestick patterns and statistic functions).

    Args:
        df: DataFrame holding the five columns listed above.

    Returns:
        The same DataFrame object, modified in place, with the indicator
        columns added. Rows inside an indicator's warm-up window hold NaN
        for that indicator.
    """

    #Overlap Studies

    # Bollinger Bands: Indicates overbought/oversold conditions
    df['upperband'], df['middleband'], df['lowerband'] = talib.BBANDS(df['Close'], timeperiod=20, nbdevup=2, nbdevdn=2, matype=0)
    # Value range: [negative infinity, positive infinity]

    # Double Exponential Moving Average: Smooths price data
    df['DEMA'] = talib.DEMA(df['Close'], timeperiod=30)
    # Value range: [negative infinity, positive infinity]

    # Exponential Moving Average: Smooths price data
    df['EMA'] = talib.EMA(df['Close'], timeperiod=30)
    # Value range: [negative infinity, positive infinity]

    # Hilbert Transform - Instantaneous Trendline: Identifies trend direction
    df['HT_TRENDLINE'] = talib.HT_TRENDLINE(df['Close'])
    # Value range: [negative infinity, positive infinity]

    # Kaufman Adaptive Moving Average: Adjusts to market volatility
    df['KAMA'] = talib.KAMA(df['Close'], timeperiod=30)
    # Value range: [negative infinity, positive infinity]

    # Moving Average: Smooths price data
    df['MA'] = talib.MA(df['Close'], timeperiod=30, matype=0)
    # Value range: [negative infinity, positive infinity]

    # MESA Adaptive Moving Average: Adapts to market cycles
    df['MAMA'], df['FAMA'] = talib.MAMA(df['Close'], fastlimit=0.5, slowlimit=0.05)
    # Value range: [negative infinity, positive infinity]

    # Moving Average with Variable Period: Smooths price data with variable periods
    # NOTE(review): the second argument is the per-bar period series, and
    # Volume is passed here. Volume values are presumably far above
    # maxperiod, so this likely degenerates to a fixed-period average;
    # confirm whether a real period series was intended.
    df['MAVP'] = talib.MAVP(df['Close'], df['Volume'], minperiod=2, maxperiod=30, matype=0)
    # Value range: [negative infinity, positive infinity]

    # MidPoint over Period: Midpoint of the highest and lowest Close over the window
    df['MIDPOINT'] = talib.MIDPOINT(df['Close'], timeperiod=14)
    # Value range: [negative infinity, positive infinity]

    # Midpoint Price over Period: Average of the highest and lowest prices
    df['MIDPRICE'] = talib.MIDPRICE(df['High'], df['Low'], timeperiod=14)
    # Value range: [negative infinity, positive infinity]

    # Parabolic SAR: Identifies potential reversal points
    df['SAR'] = talib.SAR(df['High'], df['Low'], acceleration=0.02, maximum=0.2)
    # Value range: [negative infinity, positive infinity]

    # Parabolic SAR - Extended: Identifies potential reversal points with extended parameters
    df['SAREXT'] = talib.SAREXT(df['High'], df['Low'], startvalue=0, offsetonreverse=0, accelerationinitlong=0.02, accelerationlong=0.02, accelerationmaxlong=0.2, accelerationinitshort=0.02, accelerationshort=0.02, accelerationmaxshort=0.2)
    # Value range: [negative infinity, positive infinity]

    # Simple Moving Average: Smooths price data
    df['SMA'] = talib.SMA(df['Close'], timeperiod=30)
    # Value range: [negative infinity, positive infinity]

    # Triple Exponential Moving Average (T3): Smooths price data with less lag
    df['T3'] = talib.T3(df['Close'], timeperiod=5, vfactor=0.7)
    # Value range: [negative infinity, positive infinity]

    # Triple Exponential Moving Average: Smooths price data
    df['TEMA'] = talib.TEMA(df['Close'], timeperiod=30)
    # Value range: [negative infinity, positive infinity]

    # Triangular Moving Average: Smooths price data
    df['TRIMA'] = talib.TRIMA(df['Close'], timeperiod=30)
    # Value range: [negative infinity, positive infinity]

    # Weighted Moving Average: Smooths price data
    df['WMA'] = talib.WMA(df['Close'], timeperiod=30)
    # Value range: [negative infinity, positive infinity]

    # Momentum Indicators

    # Average Directional Movement Index: Measures trend strength
    df['ADX'] = talib.ADX(df['High'], df['Low'], df['Close'], timeperiod=14)
    # Value range: [0, 100]

    # Average Directional Movement Index Rating: Measures trend strength
    df['ADXR'] = talib.ADXR(df['High'], df['Low'], df['Close'], timeperiod=14)
    # Value range: [0, 100]

    # Absolute Price Oscillator: Measures momentum
    df['APO'] = talib.APO(df['Close'], fastperiod=12, slowperiod=26, matype=0)
    # Value range: [negative infinity, positive infinity]

    # Aroon: Identifies trend changes
    df['AROON_down'], df['AROON_up'] = talib.AROON(df['High'], df['Low'], timeperiod=14)
    # Value range: [0, 100]

    # Aroon Oscillator: Measures trend strength
    df['AROONOSC'] = talib.AROONOSC(df['High'], df['Low'], timeperiod=14)
    # Value range: [-100, 100]

    # Balance Of Power: Measures buying and selling pressure
    df['BOP'] = talib.BOP(df['Open'], df['High'], df['Low'], df['Close'])
    # Value range: [-1, 1]

    # Commodity Channel Index: Identifies cyclical trends
    df['CCI'] = talib.CCI(df['High'], df['Low'], df['Close'], timeperiod=14)
    # Value range: [negative infinity, positive infinity]

    # Chande Momentum Oscillator: Measures momentum
    df['CMO'] = talib.CMO(df['Close'], timeperiod=14)
    # Value range: [-100, 100]

    # Directional Movement Index: Measures trend strength
    df['DX'] = talib.DX(df['High'], df['Low'], df['Close'], timeperiod=14)
    # Value range: [0, 100]

    # Moving Average Convergence/Divergence: Measures momentum
    df['MACD'], df['MACD_signal'], df['MACD_hist'] = talib.MACD(df['Close'], fastperiod=12, slowperiod=26, signalperiod=9)
    # Value range: [negative infinity, positive infinity]

    # MACD with controllable MA type: Measures momentum
    df['MACDEXT'], df['MACDEXT_signal'], df['MACDEXT_hist'] = talib.MACDEXT(df['Close'], fastperiod=12, fastmatype=0, slowperiod=26, slowmatype=0, signalperiod=9, signalmatype=0)
    # Value range: [negative infinity, positive infinity]

    # MACD Fix 12/26: Measures momentum
    df['MACDFIX'], df['MACDFIX_signal'], df['MACDFIX_hist'] = talib.MACDFIX(df['Close'], signalperiod=9)
    # Value range: [negative infinity, positive infinity]

    # Money Flow Index: Measures buying and selling pressure
    df['MFI'] = talib.MFI(df['High'], df['Low'], df['Close'], df['Volume'], timeperiod=14)
    # Value range: [0, 100]

    # Minus Directional Indicator: Measures trend strength
    df['MINUS_DI'] = talib.MINUS_DI(df['High'], df['Low'], df['Close'], timeperiod=14)
    # Value range: [0, 100]

    # Minus Directional Movement: Measures trend strength
    df['MINUS_DM'] = talib.MINUS_DM(df['High'], df['Low'], timeperiod=14)
    # Value range: [negative infinity, positive infinity]

    # Momentum: Measures momentum
    df['MOM'] = talib.MOM(df['Close'], timeperiod=10)
    # Value range: [negative infinity, positive infinity]

    # Plus Directional Indicator: Measures trend strength
    df['PLUS_DI'] = talib.PLUS_DI(df['High'], df['Low'], df['Close'], timeperiod=14)
    # Value range: [0, 100]

    # Plus Directional Movement: Measures trend strength
    df['PLUS_DM'] = talib.PLUS_DM(df['High'], df['Low'], timeperiod=14)
    # Value range: [negative infinity, positive infinity]

    # Percentage Price Oscillator: Measures momentum
    df['PPO'] = talib.PPO(df['Close'], fastperiod=12, slowperiod=26, matype=0)
    # Value range: [negative infinity, positive infinity]

    # Rate of Change: Measures rate of change
    df['ROC'] = talib.ROC(df['Close'], timeperiod=10)
    # Value range: [negative infinity, positive infinity]

    # Rate of Change Percentage: Measures rate of change percentage
    df['ROCP'] = talib.ROCP(df['Close'], timeperiod=10)
    # Value range: [negative infinity, positive infinity]

    # Rate of Change Ratio: Measures rate of change ratio
    df['ROCR'] = talib.ROCR(df['Close'], timeperiod=10)
    # Value range: [negative infinity, positive infinity]

    # Rate of Change Ratio 100 Scale: Measures rate of change ratio scaled by 100
    df['ROCR100'] = talib.ROCR100(df['Close'], timeperiod=10)
    # Value range: [negative infinity, positive infinity]

    # Relative Strength Index: Measures momentum
    df['RSI'] = talib.RSI(df['Close'], timeperiod=14)
    # Value range: [0, 100]

    # Stochastic: Measures momentum
    df['STOCH_slowk'], df['STOCH_slowd'] = talib.STOCH(df['High'], df['Low'], df['Close'], fastk_period=14, slowk_period=3, slowk_matype=0, slowd_period=3, slowd_matype=0)
    # Value range: [0, 100]

    # Stochastic Fast: Measures momentum
    df['STOCHF_fastk'], df['STOCHF_fastd'] = talib.STOCHF(df['High'], df['Low'], df['Close'], fastk_period=14, fastd_period=3, fastd_matype=0)
    # Value range: [0, 100]

    # Stochastic Relative Strength Index: Measures momentum
    df['STOCHRSI_fastk'], df['STOCHRSI_fastd'] = talib.STOCHRSI(df['Close'], timeperiod=14, fastk_period=14, fastd_period=3, fastd_matype=0)
    # Value range: [0, 100]

    # TRIX: Measures rate of change of a triple smoothed EMA
    df['TRIX'] = talib.TRIX(df['Close'], timeperiod=30)
    # Value range: [negative infinity, positive infinity]

    # Ultimate Oscillator: Measures momentum
    df['ULTOSC'] = talib.ULTOSC(df['High'], df['Low'], df['Close'], timeperiod1=7, timeperiod2=14, timeperiod3=28)
    # Value range: [0, 100]

    # Williams' %R: Measures overbought/oversold conditions
    df['WILLR'] = talib.WILLR(df['High'], df['Low'], df['Close'], timeperiod=14)
    # Value range: [-100, 0]

    #Volume Indicators

    # Chaikin A/D Line: Measures accumulation/distribution
    df['AD'] = talib.AD(df['High'], df['Low'], df['Close'], df['Volume'])
    # Value range: [negative infinity, positive infinity]

    # Chaikin A/D Oscillator: Measures momentum of the A/D line
    df['ADOSC'] = talib.ADOSC(df['High'], df['Low'], df['Close'], df['Volume'], fastperiod=3, slowperiod=10)
    # Value range: [negative infinity, positive infinity]

    # On Balance Volume: Measures buying and selling pressure
    df['OBV'] = talib.OBV(df['Close'], df['Volume'])
    # Value range: [negative infinity, positive infinity]

    #Cycle Indicators

    # Hilbert Transform - Dominant Cycle Period: Identifies dominant cycle period
    df['HT_DCPERIOD'] = talib.HT_DCPERIOD(df['Close'])
    # Value range: [negative infinity, positive infinity]

    # Hilbert Transform - Dominant Cycle Phase: Identifies dominant cycle phase
    df['HT_DCPHASE'] = talib.HT_DCPHASE(df['Close'])
    # Value range: [negative infinity, positive infinity]

    # Hilbert Transform - Phasor Components: Identifies phasor components
    df['HT_PHASOR_inphase'], df['HT_PHASOR_quadrature'] = talib.HT_PHASOR(df['Close'])
    # Value range: [negative infinity, positive infinity]

    # Hilbert Transform - SineWave: Identifies sinewave components
    df['HT_SINE_sine'], df['HT_SINE_leadsine'] = talib.HT_SINE(df['Close'])
    # Value range: [negative infinity, positive infinity]

    # Hilbert Transform - Trend vs Cycle Mode: Identifies trend vs cycle mode
    df['HT_TRENDMODE'] = talib.HT_TRENDMODE(df['Close'])
    # Value range: [0, 1]

    # Price Transform

    # Average Price: Calculates average price
    df['AVGPRICE'] = talib.AVGPRICE(df['Open'], df['High'], df['Low'], df['Close'])
    # Value range: [negative infinity, positive infinity]

    # Median Price: Calculates median price
    df['MEDPRICE'] = talib.MEDPRICE(df['High'], df['Low'])
    # Value range: [negative infinity, positive infinity]

    # Typical Price: Calculates typical price
    df['TYPPRICE'] = talib.TYPPRICE(df['High'], df['Low'], df['Close'])
    # Value range: [negative infinity, positive infinity]

    # Weighted Close Price: Calculates weighted close price
    df['WCLPRICE'] = talib.WCLPRICE(df['High'], df['Low'], df['Close'])
    # Value range: [negative infinity, positive infinity]

    # Volatility Indicators

    # Average True Range: Measures volatility
    df['ATR'] = talib.ATR(df['High'], df['Low'], df['Close'], timeperiod=14)
    # Value range: [0, positive infinity] (assuming High >= Low on every bar)

    # Normalized Average True Range: Measures normalized volatility
    df['NATR'] = talib.NATR(df['High'], df['Low'], df['Close'], timeperiod=14)
    # Value range: [0, positive infinity] (assuming positive prices and High >= Low)

    # True Range: Measures true range
    df['TRANGE'] = talib.TRANGE(df['High'], df['Low'], df['Close'])
    # Value range: [0, positive infinity] (assuming High >= Low on every bar)

    # Pattern Recognition

    # Names of every candlestick pattern function TA-Lib reports in its
    # 'Pattern Recognition' group
    candlestick_patterns = talib.get_function_groups()['Pattern Recognition']

    # Initialize Pattern_Sum column
    df['PATTERN_SUM'] = 0

    # Apply each pattern function to the DataFrame and sum the results
    for pattern in candlestick_patterns:
        pattern_func = getattr(talib, pattern)
        df['PATTERN_SUM'] += pattern_func(df['Open'], df['High'], df['Low'], df['Close'])

    # Clip the summed pattern values to [-100, 100] and rescale to [-1, 1].
    # Done on the whole column at once instead of one Python call per row.
    df['PATTERN_SUM'] = df['PATTERN_SUM'].clip(-100, 100) / 100

    # Statistic Functions

    # Beta: regression beta between the two input series. Here it is computed
    # between High and Low, so it is not a beta against a market benchmark.
    df['BETA'] = talib.BETA(df['High'], df['Low'], timeperiod=5)
    # Value range: [negative infinity, positive infinity]

    # Pearson's Correlation Coefficient (r): Measures correlation (High vs Low)
    df['CORREL'] = talib.CORREL(df['High'], df['Low'], timeperiod=30)
    # Value range: [-1, 1]

    # Linear Regression: Calculates linear regression
    df['LINEARREG'] = talib.LINEARREG(df['Close'], timeperiod=14)
    # Value range: [negative infinity, positive infinity]

    # Linear Regression Angle: Calculates linear regression angle
    df['LINEARREG_ANGLE'] = talib.LINEARREG_ANGLE(df['Close'], timeperiod=14)
    # Value range: [negative infinity, positive infinity]

    # Linear Regression Intercept: Calculates linear regression intercept
    df['LINEARREG_INTERCEPT'] = talib.LINEARREG_INTERCEPT(df['Close'], timeperiod=14)
    # Value range: [negative infinity, positive infinity]

    # Linear Regression Slope: Calculates linear regression slope
    df['LINEARREG_SLOPE'] = talib.LINEARREG_SLOPE(df['Close'], timeperiod=14)
    # Value range: [negative infinity, positive infinity]

    # Standard Deviation: Measures volatility
    df['STDDEV'] = talib.STDDEV(df['Close'], timeperiod=5, nbdev=1)
    # Value range: [0, positive infinity]

    # Time Series Forecast: Forecasts future values
    df['TSF'] = talib.TSF(df['Close'], timeperiod=14)
    # Value range: [negative infinity, positive infinity]

    # Variance: Measures volatility
    df['VAR'] = talib.VAR(df['Close'], timeperiod=5, nbdev=1)
    # Value range: [0, positive infinity]

    return df

def set_target(df):
    """Add next-day midpoint-price target columns to a price DataFrame.

    Three columns are written to ``df`` (which is modified in place):

    * ``DAILY_MIDPRICE``    - (High + Low) / 2 for the row.
    * ``NEXT_DAY_MIDPRICE`` - ``DAILY_MIDPRICE`` of the following row.
    * ``PRICE_DIRECTION``   - 1 if the next midpoint is higher, 0 if equal,
      -1 if lower, and ``<NA>`` when the direction is unknown.

    The direction is unknown for the last row (there is no following row)
    and for any row where either midpoint is NaN. Those rows are left as
    ``<NA>`` rather than being labelled -1, so that a missing target is
    never mistaken for a fall.

    Args:
        df: DataFrame with numeric 'High' and 'Low' columns.

    Returns:
        The same DataFrame object with the three columns added.
    """
    # Calculate the daily midpoint price as the average of the high and low prices
    df['DAILY_MIDPRICE'] = (df['High'] + df['Low']) / 2

    # Shift the daily midpoint price to get the target for the next day
    df['NEXT_DAY_MIDPRICE'] = df['DAILY_MIDPRICE'].shift(-1)

    # The sign of the change encodes the direction: +1 rise, 0 unchanged, -1 fall.
    # NaN differences stay missing; the nullable Int64 dtype keeps the labels
    # as integers while still allowing <NA>.
    midprice_change = df['NEXT_DAY_MIDPRICE'] - df['DAILY_MIDPRICE']
    df['PRICE_DIRECTION'] = np.sign(midprice_change).astype('Int64')

    return df


In [3]:
# Download two years of daily prices for every ticker listed in the symbol
# file, add indicator and target columns, and write one CSV per ticker.

# Define the path to the ticker symbol file
ticker_symbol_file = "./data/ticker-symbol.txt"

# Check if the ticker symbol file exists
if not os.path.isfile(ticker_symbol_file):
    print(f"Ticker symbol file '{ticker_symbol_file}' does not exist.")
else:
    # Ensure the data directory exists
    os.makedirs('data', exist_ok=True)

    # Read ticker symbols from file
    with open(ticker_symbol_file, 'r') as file:
        ticker_symbols = file.readlines()

    # Strip whitespace and drop blank lines so that an empty string is never
    # requested as a ticker (a trailing newline in the file is common).
    ticker_symbol_list = [
        ticker_symbol.strip()
        for ticker_symbol in ticker_symbols
        if ticker_symbol.strip()
    ]
    end_date = datetime.today()
    start_date = end_date - timedelta(days=2*365)

    for ticker_symbol in ticker_symbol_list:
        ticker = ticker_symbol
        try:
            df = yf.download(ticker, start=start_date, end=end_date)
        except Exception as e:
            print(f"Error fetching data for {ticker}: {e}")
            continue
        # A download can come back empty without raising (for example for an
        # unknown symbol). Skip it instead of computing indicators on no data
        # and writing an empty CSV.
        if df.empty:
            print(f"No data returned for {ticker}; skipping.")
            continue
        df = get_technical_indicator(df)
        df = set_target(df)
        df.to_csv(f'data/{ticker}.csv')


[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
