In [None]:
import pandas as pd
from datetime import datetime, timedelta
import yfinance as yf

# Function to download OHLCV data
def download_ohlcv_data(symbols, start_date, end_date, output_folder):
    """Download daily OHLCV history for each NSE symbol and store it as a CSV.

    Args:
        symbols: iterable of ticker strings without the exchange suffix.
        start_date: first date to request, passed straight to ``yf.download``.
        end_date: end date to request, passed straight to ``yf.download``.
        output_folder: existing directory that receives ``<symbol>_ohlcv.csv``.

    Returns:
        None. Progress and failures are reported with ``print``; a failure for
        one symbol never stops the remaining downloads.
    """
    for symbol in symbols:
        try:
            # Yahoo Finance identifies NSE listings by the ".NS" suffix.
            nse_symbol = symbol.strip() + ".NS"
            print(f"Downloading data for {nse_symbol}...")

            ohlcv = yf.download(
                nse_symbol,
                start=start_date,
                end=end_date,
                progress=False,
                multi_level_index=False,
            )

            if ohlcv.empty:
                print(f"No data found for {symbol}")
                continue

            # The file name uses the symbol exactly as given (not stripped),
            # which is what the later cells look up.
            csv_path = f"{output_folder}/{symbol}_ohlcv.csv"
            ohlcv.to_csv(csv_path)
            print(f"Data saved for {symbol} at {csv_path}")

        except Exception as exc:
            print(f"Error downloading data for {symbol}: {exc}")

# Main execution
if __name__ == "__main__":
    import os

    # Input: CSV with a 'Symbol' column; output: one OHLCV CSV per symbol.
    input_csv = "nse_symbols.csv"
    output_folder = "ohlcv_data"

    try:
        symbols_df = pd.read_csv(input_csv)
        symbols = symbols_df['Symbol'].tolist()
    except Exception as e:
        print(f"Error reading the input CSV file: {e}")
        # `exit()` is the interactive helper injected by `site` and would
        # report success; raise SystemExit with a non-zero status instead.
        raise SystemExit(1)

    # Take "now" once so both ends of the window come from the same instant.
    today = datetime.today()
    end_date = today.strftime('%Y-%m-%d')
    # Roughly ten years of history (365-day years, leap days ignored).
    start_date = (today - timedelta(days=10*365)).strftime('%Y-%m-%d')

    print(f"Fetching data from {start_date} to {end_date}...")

    # exist_ok avoids the check-then-create race of exists() + makedirs().
    os.makedirs(output_folder, exist_ok=True)

    download_ohlcv_data(symbols, start_date, end_date, output_folder)

    print("Process completed.")

In [None]:
#data sanity
import pandas as pd
import numpy as np

def sanity_check(ticker_name):
    """Validate and lightly repair one ticker's raw OHLCV CSV, then save it.

    Reads ``ohlcv_data/{ticker_name}_ohlcv.csv``, prints a report (missing
    values, duplicate rows, zero/negative rows, date range, correlation
    matrix) and writes ``cleaned_data/{ticker_name}_cleaned.csv``.

    Repairs applied, in order:
      1. rows are sorted by 'Date' (when that column exists) so that every
         forward-fill below copies from the previous trading day;
      2. missing values are forward-filled by at most one row;
      3. exact duplicate rows are dropped;
      4. zero/negative OHLCV values are blanked and forward-filled.

    Args:
        ticker_name: symbol used to build the input and output file names.

    Returns:
        None. If the input file or a required column is missing, a message is
        printed and the ticker is skipped.
    """
    import os  # local import: this cell only imports pandas and numpy

    file_name = f"ohlcv_data/{ticker_name}_ohlcv.csv"
    # A symbol whose download failed has no file; skip it instead of aborting
    # the whole batch with FileNotFoundError.
    if not os.path.isfile(file_name):
        print(f"\nSkipping {ticker_name}: {file_name} not found")
        return
    df = pd.read_csv(file_name)

    print(f"\nProcessing: {ticker_name}")

    # checking if columns are present
    required_cols = ['Open', 'High', 'Low', 'Close', 'Volume']
    missing_cols = [col for col in required_cols if col not in df.columns]
    if missing_cols:
        print(f"Missing columns: {missing_cols}")
        return

    # sorting by Date first: forward-filling unsorted rows would copy values
    # from a non-adjacent day
    if 'Date' in df.columns:
        df['Date'] = pd.to_datetime(df['Date'], errors='coerce')
        df.sort_values('Date', inplace=True)

    # forward fill (at most one consecutive missing row)
    missing_values = df.isnull().sum()
    print(f"Missing Values:\n{missing_values}")
    df.ffill(inplace=True, limit=1)

    # checking for duplicates
    duplicate_count = df.duplicated().sum()
    print(f"Duplicate Rows: {duplicate_count}")
    df.drop_duplicates(inplace=True)

    # checking for negative and zeroes
    invalid_rows = df[(df[required_cols] <= 0).any(axis=1)]
    print(f"Invalid (Zero/Negative) Values: {len(invalid_rows)} rows")

    for col in required_cols:
        # Two passes, each filling one row: zeros first, then anything still
        # non-positive (or still NaN) gets one more one-row forward-fill.
        df[col] = df[col].replace(0, np.nan).ffill(limit=1)
        df[col] = df[col].where(df[col] > 0, np.nan).ffill(limit=1)

    if 'Date' in df.columns:
        print(f"Date Range: {df['Date'].min()} to {df['Date'].max()}")

    # Printing Correlation Matrix
    print(f"Correlation Matrix for {ticker_name}:\n{df[required_cols].corr()}")

    # Save Cleaned Data (create the folder: nothing else in the notebook does)
    os.makedirs("cleaned_data", exist_ok=True)
    cleaned_file = f"cleaned_data/{ticker_name}_cleaned.csv"
    df.to_csv(cleaned_file, index=False)
    print(f"Cleaned data saved: {cleaned_file}")


# Driver: run the sanity check for every symbol listed in nse_symbols.csv.
input_csv = "nse_symbols.csv"
symbols_df = pd.read_csv(input_csv)
# The 'Symbol' column supplies the ticker names; each value is passed to
# sanity_check unchanged.
symbols = symbols_df['Symbol'].tolist()

for symbol in symbols:
    sanity_check(symbol)

print("\nData Sanity Check Completed for All Symbols!")

In [None]:
#feature engineering
import yfinance as yf
import talib
import pandas as pd
import numpy as np

def _shift_one(values):
    """Return `values` delayed by one bar, with NaN (not the last bar) in slot 0."""
    values = np.asarray(values, dtype=float)
    shifted = np.empty(len(values), dtype=float)
    shifted[:1] = np.nan
    shifted[1:] = values[:-1]
    return shifted


def _flag(up, down):
    """Map two boolean masks to +1 (up), -1 (down) or 0 (neither)."""
    return np.where(up, 1, np.where(down, -1, 0))


def _crossover(fast, slow):
    """+1 on the bar `fast` moves above `slow`, -1 when it moves below, else 0.

    `slow` may be a scalar (e.g. 0.0 for a zero-line cross). The previous bar
    comes from `_shift_one`, so row 0 never compares against the last row the
    way `np.roll` would.
    """
    fast = np.asarray(fast, dtype=float)
    slow = np.broadcast_to(np.asarray(slow, dtype=float), fast.shape)
    prev_fast, prev_slow = _shift_one(fast), _shift_one(slow)
    return _flag((fast > slow) & (prev_fast <= prev_slow),
                 (fast < slow) & (prev_fast >= prev_slow))


def _delta(values):
    """Bar-to-bar change, same length as the input (NaN in the first slot)."""
    return np.diff(values, prepend=np.nan)


def _trailing_mean(values, window):
    """Mean of the current and previous `window - 1` bars (NaN until full)."""
    return pd.Series(values).rolling(window).mean().to_numpy()


def get_indicators(ticker_name):
    """Compute TA-Lib indicators and derived signals for one cleaned ticker.

    Reads ``cleaned_data/{ticker_name}_cleaned.csv``, appends one column per
    indicator and writes ``data_with_indicators/{ticker_name}_indicators.csv``
    (the folder is created if needed).

    Differences from the earlier version of this function:
      * AROON_UP / AROON_DOWN are unpacked in TA-Lib's order (down, up); they
        were swapped before, which also inverted AROON_Crossover.
      * BB_width is (upper - lower) / middle; operator precedence previously
        made it upper - lower / middle.
      * DI_Crossover is now computed correctly and exported (a misplaced
        parenthesis had turned it into a tuple).
      * The DCPHASE-based crossover is exported as 'DCPHASE_Crossover'. It
        used to share the name/key 'PHASE_Crossover' with the phasor-based
        one and was silently overwritten.
      * Crossovers use a NaN-padded shift instead of np.roll, which wrapped
        the last bar into row 0.
      * Volatility_Spike and BB_squeeze compare against a trailing mean;
        np.convolve(mode='same') is centred and zero-padded, so it used
        future bars and distorted the most recent rows.

    Not computed here: candlestick patterns (talib.CDL*), CORREL, MACD /
    MACDFIX, STOCH / STOCHF / STOCHRSI.

    Args:
        ticker_name: symbol used to build the input and output file names.

    Returns:
        None.
    """
    import os  # local import: this cell does not import os at the top

    df = pd.read_csv("cleaned_data/" + ticker_name + "_cleaned.csv")

    # TA-Lib requires float64 arrays.
    close = df['Close'].astype(float).to_numpy()
    open_ = df['Open'].astype(float).to_numpy()
    high = df['High'].astype(float).to_numpy()
    low = df['Low'].astype(float).to_numpy()
    volume = df['Volume'].astype(float).to_numpy()

    # Insertion order of `feats` is the column order of the output file.
    feats = {}

    # Volume based indicators
    ad = talib.AD(high, low, close, volume)
    adosc = talib.ADOSC(high, low, close, volume, fastperiod=3, slowperiod=10)
    feats.update({
        'AD': ad,
        'AD_trend': _delta(ad),
        'AD_Crossover': _crossover(ad, 0.0),
        'ADOSC': adosc,
        'ADOSC_trend': _delta(adosc),
        'ADOSC_Crossover': _crossover(adosc, 0.0),
    })

    # Directional movement indicators
    adx = talib.ADX(high, low, close, timeperiod=5)
    plus_di = talib.PLUS_DI(high, low, close, timeperiod=5)
    minus_di = talib.MINUS_DI(high, low, close, timeperiod=5)
    adxr = talib.ADXR(high, low, close, timeperiod=5)
    minus_dm = talib.MINUS_DM(high, low, timeperiod=5)
    plus_dm = talib.PLUS_DM(high, low, timeperiod=5)
    feats.update({
        'ADX': adx,
        'ADX_trend': _delta(adx),
        'PLUS_DI': plus_di,
        'MINUS_DI': minus_di,
        'DI_Crossover': _crossover(plus_di, minus_di),
        'ADXR': adxr,
        'ADXR_trend': _delta(adxr),
        'MINUS_DM': minus_dm,
        'PLUS_DM': plus_dm,
        'DM_Crossover': _crossover(plus_dm, minus_dm),
    })

    # Momentum indicators
    apo = talib.APO(close, fastperiod=5, slowperiod=10, matype=0)
    # talib.AROON returns (aroondown, aroonup) in that order.
    aroon_down, aroon_up = talib.AROON(high, low, timeperiod=5)
    aroonosc = talib.AROONOSC(high, low, timeperiod=5)
    mom = talib.MOM(close, timeperiod=5)
    feats.update({
        'APO': apo,
        'APO_trend': _delta(apo),
        'APO_Crossover': _crossover(apo, 0.0),
        'AROON_UP': aroon_up,
        'AROON_DOWN': aroon_down,
        'AROONOSC': aroonosc,
        'AROON_Crossover': _crossover(aroon_up, aroon_down),
        'MOM': mom,
        'MOM_trend': _delta(mom),
        'MOM_Crossover': _crossover(mom, 0.0),
    })

    # Volatility indicators
    atr = talib.ATR(high, low, close, timeperiod=5)
    natr = talib.NATR(high, low, close, timeperiod=5)
    trange = talib.TRANGE(high, low, close)
    # One element shorter than the frame; prepend_nan re-aligns it below.
    close_step = np.diff(close)
    feats.update({
        'ATR': atr,
        'ATR_trend': _delta(atr),
        'ATR_breakout': _flag(close_step > atr[1:], close_step < -atr[1:]),
        'NATR': natr,
        'NATR_trend': _delta(natr),
        'High_Volatility': _flag(natr > 20, natr < 5),
        'TRANGE': trange,
        'TRANGE_trend': _delta(trange),
        'Volatility_Spike': np.where(trange > _trailing_mean(trange, 5) * 1.5, 1, 0),
    })

    # Price indicators
    avgprice = talib.AVGPRICE(open_, high, low, close)
    beta = talib.BETA(high, low, timeperiod=5)
    bop = talib.BOP(open_, high, low, close)
    roc = talib.ROC(close, timeperiod=5)
    feats.update({
        'AVGPRICE': avgprice,
        'Price_Deviation': close - avgprice,
        'AVGPRICE_trend': _delta(avgprice),
        'AVGPRICE_Crossover': _flag(close > avgprice, close < avgprice),
        'BETA': beta,
        'BETA_trend': _delta(beta),
        'High_Beta_Stock': _flag(beta > 1.5, beta < 0.8),
        'BOP': bop,
        'BOP_trend': _delta(bop),
        'BOP_Crossover': _flag(bop > 0, bop < 0),
        'MAX': talib.MAX(close, timeperiod=5),
        'MEDPRICE': talib.MEDPRICE(high, low),
        'MIDPOINT': talib.MIDPOINT(close, timeperiod=5),
        'MIDPRICE': talib.MIDPRICE(high, low, timeperiod=5),
        'MIN': talib.MIN(close, timeperiod=5),
        'ROC': roc,
        'ROC_trend': _delta(roc),
        'ROC_Crossover': _flag(roc > 0, roc < 0),
        'TYPPRICE': talib.TYPPRICE(high, low, close),
        'WCLPRICE': talib.WCLPRICE(high, low, close),
    })

    # Bands and moving averages
    bb_upper, bb_middle, bb_lower = talib.BBANDS(close, timeperiod=5, nbdevup=1, nbdevdn=1, matype=0)
    bb_width = (bb_upper - bb_lower) / bb_middle
    macd, macd_signal, macd_hist = talib.MACDEXT(
        close, fastperiod=5, slowperiod=10, signalperiod=9,
        fastmatype=0, slowmatype=0, signalmatype=0,
    )
    mama, fama = talib.MAMA(close, fastlimit=0.5, slowlimit=0.05)
    close_delta = _delta(close)
    feats.update({
        'BB_upper': bb_upper,
        'BB_middle': bb_middle,
        'BB_lower': bb_lower,
        'BB_width': bb_width,
        'BB_trend': _delta(bb_middle),
        'BB_breakout': _flag(close > bb_upper, close < bb_lower),
        'BB_squeeze': np.where(bb_width < _trailing_mean(bb_width, 20), 1, 0),
        'SMA': talib.SMA(close, timeperiod=5),
        'EMA': talib.EMA(close, timeperiod=5),
        'WMA': talib.WMA(close, timeperiod=5),
        'TEMA': talib.TEMA(close, timeperiod=5),
        'TRIMA': talib.TRIMA(close, timeperiod=5),
        'DEMA': talib.DEMA(close, timeperiod=5),
        'KAMA': talib.KAMA(close, timeperiod=5),
        'MACD': macd,
        'MACD_signal': macd_signal,
        'macd-hist': macd_hist,
        'MACD_Crossover': np.where(macd > macd_signal, 1, -1),
        # +1: price fell while the histogram is positive; -1: the opposite.
        'MACD_Divergence': _flag((close_delta < 0) & (macd_hist > 0),
                                 (close_delta > 0) & (macd_hist < 0)),
        'MAMA': mama,
        'FAMA': fama,
        'MAMA_Crossover': np.where(mama > fama, 1, -1),
        'MAMA_Spread': mama - fama,
        # One element shorter than the frame; prepend_nan re-aligns it below.
        'MAMA_Trend': np.sign(np.diff(mama)),
        'T3': talib.T3(close, timeperiod=5, vfactor=0.7),
        'TRIX': talib.TRIX(close, timeperiod=5),
    })

    # Hilbert transform indicators (cycle indicators)
    dcperiod = talib.HT_DCPERIOD(close)
    dcphase = talib.HT_DCPHASE(close)
    prev_dcphase = _shift_one(dcphase)
    in_phase, quadrature = talib.HT_PHASOR(close)
    sine, leadsine = talib.HT_SINE(close)
    feats.update({
        'DCPERIOD': dcperiod,
        'Cycle_change': np.sign(_delta(dcperiod)),
        'DCPHASE': dcphase,
        'DCPHASE_Crossover': _flag((dcphase < -270) & (prev_dcphase >= -270),
                                   (dcphase > -90) & (prev_dcphase <= -90)),
        'PHASE_trend': np.diff(dcphase, prepend=dcphase[0]),
        'IN_PHASE': in_phase,
        'QUADRATURE': quadrature,
        'PHASE_ANGLE': np.degrees(np.arctan2(quadrature, in_phase)),
        'PHASE_Crossover': _crossover(quadrature, in_phase),
        'CYCLE_Strength': np.sqrt(in_phase**2 + quadrature**2),
        'SINE': sine,
        'LEADSINE': leadsine,
        'SINE_Trend': np.sign(leadsine - sine),
        'SINE_Crossover': _crossover(leadsine, sine),
        'TRENDLINE': talib.HT_TRENDLINE(close),
        'TRENDMODE': talib.HT_TRENDMODE(close),
    })

    # Statistical, oscillator and miscellaneous indicators
    feats.update({
        'LINEARREG': talib.LINEARREG(close, timeperiod=14),
        'LINEARREG_SLOPE': talib.LINEARREG_SLOPE(close, timeperiod=14),
        'STDDEV': talib.STDDEV(close, timeperiod=5, nbdev=1),
        'VAR': talib.VAR(close, timeperiod=5, nbdev=1),
        'RSI': talib.RSI(close, timeperiod=5),
        'CMO': talib.CMO(close, timeperiod=5),
        'DX': talib.DX(high, low, close, timeperiod=5),
        'ULTOSC': talib.ULTOSC(high, low, close, timeperiod1=5, timeperiod2=10, timeperiod3=15),
        'PPO': talib.PPO(close, fastperiod=5, slowperiod=10, matype=0),
        'MFI': talib.MFI(high, low, close, volume, timeperiod=5),
        'OBV': talib.OBV(close, volume),
        'CCI': talib.CCI(high, low, close, timeperiod=5),
        'SAR': talib.SAR(high, low, acceleration=0.02, maximum=0.2),
        'WILLR': talib.WILLR(high, low, close, timeperiod=5),
        'SAREXT': talib.SAREXT(high, low, startvalue=0, offsetonreverse=0, accelerationinitlong=0.02,
                               accelerationlong=0.02, accelerationmaxlong=0.2, accelerationinitshort=0.02,
                               accelerationshort=0.02, accelerationmaxshort=0.2),
        'TSF': talib.TSF(close, timeperiod=5),
    })

    # Left-pad the few series that are shorter than the frame so every column
    # lines up with the price rows.
    n_rows = len(df)
    indicators_df = pd.DataFrame(
        {name: prepend_nan(values, n_rows) for name, values in feats.items()}
    )

    df = pd.concat([df, indicators_df], axis=1)

    os.makedirs("data_with_indicators", exist_ok=True)
    df.to_csv("data_with_indicators/" + ticker_name + "_indicators.csv", index=False)
    return

def remove_unwanted_features(ticker):
    """Drop constant columns from a ticker's indicator file, in place.

    A column is removed when it holds exactly one distinct non-null value
    (``nunique() == 1``); all-NaN columns are kept. The file
    ``data_with_indicators/{ticker}_indicators.csv`` is overwritten.

    Args:
        ticker: symbol used to build the file name.

    Returns:
        None.
    """
    path = "data_with_indicators/" + ticker + "_indicators.csv"
    df = pd.read_csv(path)

    constant_cols = [col for col in df.columns if df[col].nunique() == 1]
    df = df.drop(columns=constant_cols)

    # index=False: writing the RangeIndex would add an 'Unnamed: 0' column the
    # next time the file is read, and that column would be used as a feature.
    df.to_csv(path, index=False)
    return

def prepend_nan(indicator, target_length):
    """Left-pad `indicator` with NaN so that it has `target_length` entries.

    Args:
        indicator: sequence (list or array) of indicator values.
        target_length: desired number of entries.

    Returns:
        The original object, untouched, when it is already at least
        `target_length` long; otherwise a new NumPy array with the missing
        leading positions filled with NaN.
    """
    shortfall = target_length - len(indicator)
    if shortfall <= 0:
        return indicator
    return np.concatenate((np.full(shortfall, np.nan), indicator))

# Driver: build the indicator file for every symbol in nse_symbols.csv, then
# strip constant columns from each file.
input_csv = "nse_symbols.csv"
symbols_df = pd.read_csv(input_csv)
# The 'Symbol' column supplies the ticker names used in the file paths.
symbols = symbols_df['Symbol'].tolist()

for symbol in symbols:
    print(f"Processing: {symbol}")
    # Order matters: remove_unwanted_features rewrites the file that
    # get_indicators has just produced.
    get_indicators(symbol)
    remove_unwanted_features(symbol)

print("Completed.")

In [None]:
# Checklist:
# - Create two targets: one for classification and one for regression ... done
# - Build features that mirror the signals used in manual trading ... done
# - Normalize all indicators ... done
# - Run data sanity checks ... done

In [None]:
#adding targets
import pandas as pd
from pathlib import Path

def add_targets(
    ticker: str,
    in_folder: str = "data_with_indicators",
    out_folder: str = "data_with_targets"
) -> None:
    """Append the regression and classification targets to a ticker's file.

    Reads ``{in_folder}/{ticker}_indicators.csv`` and writes the same rows
    plus two columns to ``{out_folder}/{ticker}_indicators.csv``:

    * ``Target2`` - relative distance from today's Close to the High six rows
      ahead; the last six rows, which have no such row, are set to 0.
    * ``Target1`` - 1 when the highest High of the next five rows is at least
      4 % above today's Close, else 0; the last five rows are set to 0.

    Args:
        ticker: symbol used to build both file names.
        in_folder: directory holding the indicator CSV.
        out_folder: directory for the result (created if absent; its parent
            must already exist).
    """
    frame = pd.read_csv(f"{in_folder}/{ticker}_indicators.csv")
    close = frame["Close"]
    high = frame["High"]

    # Regression target: High six rows ahead, relative to the current Close.
    target2 = (high.shift(-6) - close) / close
    target2.iloc[-6:] = 0
    frame["Target2"] = target2

    # Classification target: a 5-row rolling max pulled back five rows gives,
    # for row t, the maximum High over rows t+1 .. t+5.
    best_high_ahead = high.rolling(window=5, min_periods=1).max().shift(-5)
    target1 = (best_high_ahead >= close * 1.04).astype(int)
    target1.iloc[-5:] = 0
    frame["Target1"] = target1

    Path(out_folder).mkdir(exist_ok=True)
    frame.to_csv(f"{out_folder}/{ticker}_indicators.csv", index=False)


def main() -> None:
    """Add targets for every symbol listed in ``nse_symbols.csv``.

    Each value of the 'Symbol' column is handed to ``add_targets`` with its
    default folders; progress is printed per symbol.
    """
    symbol_table = pd.read_csv("nse_symbols.csv")
    for sym in symbol_table["Symbol"]:
        print(f"Processing: {sym}")
        add_targets(sym)
    print("Completed!")

# Entry point: generate the target files when this cell/script runs as __main__.
if __name__ == "__main__":
    main()

In [None]:
'''#Normalizing all indicators
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler

def scale(ticker):
    df = pd.read_csv("data_with_indicators/" + ticker + "_indicators.csv")
    
    columns_to_scale = df.columns[(df.columns != 'Date') & ~df.isin([0, 1, -1]).all()]
    scaler = StandardScaler()
    df[columns_to_scale] = scaler.fit_transform(df[columns_to_scale])

    output_path = f"scaled_data/{ticker}_scaled.csv"
    df.to_csv(output_path, index=False)

input_csv = "nse_symbols.csv"
symbols_df = pd.read_csv(input_csv)
symbols = symbols_df['Symbol'].tolist()

for symbol in symbols:
    print(f"Processing: {symbol}")
    scale(symbol)

print("Completed!")'''

In [None]:
# train and test split
# feature elimination RFE library
# boruta package
# PCA

In [None]:
'''
import pandas as pd
import numpy as np
import glob
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, roc_auc_score

# Load and concatenate all stock indicator CSVs
all_data = []
for path in glob.glob("data_with_indicators/*.csv"):
    ticker = path.split("/")[-1].split("_")[0]
    df = pd.read_csv(path)
    df['Ticker'] = ticker
    all_data.append(df)

df_all = pd.concat(all_data).dropna().sort_values(["Date", "Ticker"])

# Define features and targets
exclude_cols = ["Date", "Ticker", "Target1", "Target2"]
feature_cols = [col for col in df_all.columns if col not in exclude_cols]
X = df_all[feature_cols]
y = df_all["Target1"]

# Train/test split based on time (to avoid leakage)
df_all["Date"] = pd.to_datetime(df_all["Date"])
cutoff_date = df_all["Date"].quantile(0.8)
train_idx = df_all["Date"] <= cutoff_date
test_idx = df_all["Date"] > cutoff_date

X_train = X[train_idx]
X_test = X[test_idx]
y_train = y[train_idx]
y_test = y[test_idx]

# Normalize using StandardScaler
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Train classifier
clf = RandomForestClassifier(n_estimators=100, random_state=42)
clf.fit(X_train_scaled, y_train)

# Evaluation
y_pred = clf.predict(X_test_scaled)
y_proba = clf.predict_proba(X_test_scaled)[:, 1]
print("Classification Report:\n", classification_report(y_test, y_pred))
print("ROC AUC Score:", roc_auc_score(y_test, y_proba))
'''

In [None]:
# LSTM Classifier and Regressor
import os, warnings, joblib, glob, math   
import numpy as np
import pandas as pd
from pathlib import Path

from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import RFECV
from sklearn.model_selection import StratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    roc_auc_score, confusion_matrix, roc_curve,
    mean_absolute_error, mean_squared_error, r2_score
)

import tensorflow as tf
from tensorflow.keras import layers, models
from tensorflow.keras.callbacks import EarlyStopping

import matplotlib.pyplot as plt                   

# Silences every warning for the rest of the session, including ones raised by
# pandas, scikit-learn and TensorFlow.
warnings.filterwarnings("ignore")

# Folder scanned for "*_indicators.csv" files by the training loop below.
DATA_DIR   = Path("data_with_targets")
# Number of consecutive rows in each LSTM input window (see make_seq).
LOOKBACK   = 20
# Share of the chronologically last rows/sequences held out for testing.
TEST_FRAC  = 0.15
# Keras fit() settings shared by the classifier and the regressor.
BATCH_SIZE = 256
EPOCHS     = 30
# Early-stopping patience, in epochs.
PATIENCE   = 5
# Probability cut-off at which a classifier output counts as an "up" signal.
THRESH     = 0.50          

# Output folders. "plots" receives the ROC and scatter figures; nothing in the
# visible code writes to "models" yet.
os.makedirs("models", exist_ok=True)
os.makedirs("plots",  exist_ok=True)          

# helper: build feature selector pipeline
def build_selector():
    """Create the unfitted preprocessing + feature-selection pipeline.

    Steps, in order: mean imputation ("imputer"), standardisation ("scaler")
    and recursive feature elimination with cross-validation ("rfecv"). The
    eliminator ranks features with a logistic regression, removes one feature
    per step and scores candidates by accuracy over 5 stratified folds, using
    all available cores.

    Returns:
        A new, unfitted ``sklearn.pipeline.Pipeline``.
    """
    eliminator = RFECV(
        estimator=LogisticRegression(max_iter=1000),
        step=1,
        cv=StratifiedKFold(n_splits=5),
        scoring="accuracy",
        n_jobs=-1,
    )
    steps = [
        ("imputer", SimpleImputer(strategy="mean")),
        ("scaler", StandardScaler()),
        ("rfecv", eliminator),
    ]
    return Pipeline(steps)

# helper: roll 2-D → 3-D sequences + aligned targets
def make_seq(X, y, L):
    """Turn a 2-D feature matrix into overlapping windows with aligned targets.

    For every index ``i`` from ``L`` to ``len(X) - 1`` the window is rows
    ``i-L .. i-1`` of ``X`` (row ``i`` itself is excluded) and the target is
    ``y[i]``.

    Args:
        X: 2-D array-like of features, one row per time step.
        y: 1-D array-like of targets, indexable by position.
        L: window length in rows.

    Returns:
        Tuple ``(windows, targets)``: a float32 array of shape
        ``(len(X) - L, L, n_features)`` and an array of the matching targets.
        Both are empty 1-D arrays when ``len(X) <= L``.
    """
    end_points = range(L, len(X))
    windows = [X[stop - L:stop] for stop in end_points]
    targets = [y[stop] for stop in end_points]
    return np.array(windows, dtype=np.float32), np.array(targets)


# Per-ticker pipeline: select features on the training rows, build LOOKBACK-row
# sequences, train an LSTM classifier (Target1) and an LSTM regressor (Target2),
# then report test metrics, plots and the signals above THRESH.
#
# NOTE(review): the targets look 5-6 rows ahead, so labels of the last training
# rows overlap the start of the test period, and the final rows of every file
# carry placeholder 0 targets that are scored as if real - consider purging both.
for csv_path in sorted(DATA_DIR.glob("*_indicators.csv")):
    # Strip only the trailing "_indicators" so tickers that themselves contain
    # an underscore are not truncated.
    ticker = csv_path.stem.rsplit("_", 1)[0]

    # Drop the Keras state left by the previous ticker's two models; without
    # this, memory grows with every model built in the loop.
    tf.keras.backend.clear_session()

    print(f"⏳  {ticker}: feature selection...", flush=True)
    df = pd.read_csv(csv_path).sort_values("Date").reset_index(drop=True)
    # Leftover index column from CSVs that were written without index=False.
    df = df.drop(columns=["Unnamed: 0"], errors="ignore")

    feature_cols = [c for c in df.columns if c not in ["Date", "Target1", "Target2"]]
    X_raw, y_cls_raw, y_reg_raw = df[feature_cols], df["Target1"].astype(int), df["Target2"]

    if y_cls_raw.sum() == 0:
        print(f"{ticker:<10} skipped (no +4 % events)")
        continue

    # Fit imputer/scaler/RFECV on the training rows only, then transform all rows.
    cutoff = int(len(df) * (1 - TEST_FRAC))
    if y_cls_raw.iloc[:cutoff].nunique() < 2:
        # The logistic regression inside RFECV cannot be fitted on one class.
        print(f"{ticker:<10} skipped (training rows contain a single class)")
        continue
    pipe   = build_selector()
    pipe.fit(X_raw.iloc[:cutoff], y_cls_raw.iloc[:cutoff])
    X_prep = pipe.transform(X_raw)

    X_seq, y_cls_seq = make_seq(X_prep, y_cls_raw.values, LOOKBACK)
    # make_seq pairs window i with target i, so the regression targets are just
    # the tail of the column - no need to build every window a second time.
    y_reg_seq = y_reg_raw.values[LOOKBACK:]

    split = int(len(X_seq) * (1 - TEST_FRAC))
    if split == 0 or split == len(X_seq):
        print(f"{ticker:<10} skipped (not enough rows for a train/test split)")
        continue
    X_tr, X_te          = X_seq[:split],       X_seq[split:]
    y_cls_tr, y_cls_te  = y_cls_seq[:split],   y_cls_seq[split:]
    y_reg_tr, y_reg_te  = y_reg_seq[:split],   y_reg_seq[split:]

    clf = models.Sequential([
        layers.Input(shape=(LOOKBACK, X_tr.shape[-1])),
        layers.LSTM(64, return_sequences=True),
        layers.Dropout(0.2),
        layers.LSTM(32),
        layers.Dropout(0.2),
        layers.Dense(1, activation="sigmoid")
    ])
    # Name the metric explicitly: the string "AUC" can be auto-renamed
    # ("auc_1", "auc_2", ...) for later models, after which the "val_auc"
    # monitor below would never be found and early stopping would not fire.
    clf.compile(optimizer="adam",
                loss="binary_crossentropy",
                metrics=[tf.keras.metrics.AUC(name="auc")])

    clf.fit(
        X_tr, y_cls_tr,
        validation_split=0.1,
        epochs=EPOCHS, batch_size=BATCH_SIZE,
        callbacks=[EarlyStopping("val_auc", patience=PATIENCE,
                                 mode="max", restore_best_weights=True)],
        verbose=0
    )

    reg = models.Sequential([
        layers.Input(shape=(LOOKBACK, X_tr.shape[-1])),
        layers.LSTM(64, return_sequences=True),
        layers.Dropout(0.2),
        layers.LSTM(32),
        layers.Dropout(0.2),
        layers.Dense(1)
    ])
    reg.compile(optimizer="adam", loss="mae")

    reg.fit(
        X_tr, y_reg_tr,
        validation_split=0.1,
        epochs=EPOCHS, batch_size=BATCH_SIZE,
        callbacks=[EarlyStopping("val_loss", patience=PATIENCE,
                                 restore_best_weights=True)],
        verbose=0
    )

    prob_up = clf.predict(X_te, verbose=0).ravel()
    reg_out = reg.predict(X_te, verbose=0).ravel()
    y_pred_cls = (prob_up >= THRESH).astype(int)

    acc  = accuracy_score(y_cls_te, y_pred_cls)
    prec = precision_score(y_cls_te, y_pred_cls, zero_division=0)
    rec  = recall_score(y_cls_te, y_pred_cls, zero_division=0)
    f1   = f1_score(y_cls_te, y_pred_cls, zero_division=0)
    # ROC-AUC is undefined (roc_auc_score raises) when the test split holds a
    # single class; report NaN instead of aborting the remaining tickers.
    has_both_classes = np.unique(y_cls_te).size == 2
    auc  = roc_auc_score(y_cls_te, prob_up) if has_both_classes else float("nan")

    print(f"\n📊  {ticker} — classifier metrics (test)")
    print(f"accuracy   : {acc: .3f}")
    print(f"precision  : {prec:.3f}")
    print(f"recall     : {rec: .3f}")
    print(f"F1-score   : {f1 : .3f}")
    print(f"AUC        : {auc:.3f}")

    if has_both_classes:
        fpr, tpr, _ = roc_curve(y_cls_te, prob_up)
        plt.figure()
        plt.plot(fpr, tpr, label=f"AUC={auc:.2f}")
        plt.plot([0,1], [0,1], "--", color="gray")
        plt.xlabel("False Positive Rate"); plt.ylabel("True Positive Rate")
        plt.title(f"{ticker} ROC Curve")
        plt.legend()
        roc_path = f"plots/{ticker}_roc.png"
        plt.savefig(roc_path, dpi=300, bbox_inches="tight")
        plt.close()
        print(f"ROC curve  → {roc_path}")
    else:
        print("ROC curve  skipped (test split contains a single class)")

    mae  = mean_absolute_error(y_reg_te, reg_out)
    rmse = math.sqrt(mean_squared_error(y_reg_te, reg_out))
    r2   = r2_score(y_reg_te, reg_out)

    print(f"\n📊  {ticker} — regressor metrics (test)")
    print(f"MAE  (ratio)      : {mae: .4f}")
    print(f"RMSE (ratio)      : {rmse:.4f}")
    print(f"R²                : {r2  : .3f}")

    plt.figure()
    plt.scatter(y_reg_te * 100, reg_out * 100, alpha=0.3)
    plt.xlabel("Actual % move"); plt.ylabel("Predicted % move")
    plt.title(f"{ticker} Predicted vs Actual (%)")
    plt.axline((0,0), slope=1, color="gray", linestyle="--")
    pv_path = f"plots/{ticker}_pred_vs_actual.png"
    plt.savefig(pv_path, dpi=300, bbox_inches="tight")
    plt.show()
    plt.close()
    print(f"Scatter     → {pv_path}")

    # Sequence k is paired with row LOOKBACK + k, so the test sequences start
    # at row LOOKBACK + split.
    test_dates = df["Date"].iloc[LOOKBACK + split:].reset_index(drop=True)
    mask = prob_up >= THRESH
    if mask.sum() == 0:
        print(f"{ticker:<10}  — no ≥{THRESH:.0%} up-prob signals in test set\n")
        continue

    signals = pd.DataFrame({
        "Date"            : test_dates[mask],
        "P(up≥4%)"        : prob_up[mask].round(3),
        "Pred_%Move(%)"   : (reg_out[mask] * 100).round(2),
        "Actual_%Move(%)" : (y_reg_te[mask] * 100).round(2),
        "Actual≥4%"       : (y_reg_te[mask] >= 0.04).astype(int)
    })

    print(f"\n{ticker} — Signals (≥{THRESH:.0%} prob) on last {TEST_FRAC:.0%} of data")
    print(signals.to_string(index=False))
    print("-" * 70, "\n")


In [None]:
'''import matplotlib.pyplot as plt

# Convert to percentage values for readability
actual_pct   = y_reg_te * 100
predicted_pct = reg_out * 100

# Convert to percentage values for readability
actual_pct   = y_reg_te * 100
predicted_pct = reg_out * 100

plt.figure(figsize=(10, 6))
plt.scatter(actual_pct, predicted_pct, alpha=0.5, edgecolor='k')
plt.plot([actual_pct.min(), actual_pct.max()],
         [actual_pct.min(), actual_pct.max()],
         color='red', linestyle='--', label='Ideal Prediction (y = x)')

plt.title("LSTM Regression: Predicted vs Actual % Movement")
plt.xlabel("Actual % Movement")
plt.ylabel("Predicted % Movement")
plt.grid(True)
plt.legend()
plt.tight_layout()
plt.show()


plt.figure(figsize=(10, 6))
plt.scatter(actual_pct, predicted_pct, alpha=0.5, edgecolor='k')
plt.plot([actual_pct.min(), actual_pct.max()],
         [actual_pct.min(), actual_pct.max()],
         color='red', linestyle='--', label='Ideal Prediction (y = x)')

plt.title("LSTM Regression: Predicted vs Actual % Movement")
plt.xlabel("Actual % Movement")
plt.ylabel("Predicted % Movement")
plt.grid(True)
plt.legend()
plt.tight_layout()
plt.show()'''

In [None]:
'''
# ------------------------------------------------------------
# LSTM CLASSIFIER (Target1) & LSTM REGRESSOR (Target2)
# ------------------------------------------------------------
from sklearn.model_selection import train_test_split
import tensorflow as tf
from tensorflow.keras import layers, models

X_all = np.array(X_all)
y_cls_all = np.array(y_cls_all)
y_reg_all = np.array(y_reg_all)

# Chronological split (keep ordering)
split_idx = int(X_all.shape[0] * 0.8)
X_train, X_val = X_all[:split_idx], X_all[split_idx:]
y_cls_train, y_cls_val = y_cls_all[:split_idx], y_cls_all[split_idx:]
y_reg_train, y_reg_val = y_reg_all[:split_idx], y_reg_all[split_idx:]

# ---- Classifier ----
cls_model = models.Sequential([
    layers.Input(shape=(X_all.shape[1], X_all.shape[2])),
    layers.LSTM(64, return_sequences=True),
    layers.Dropout(0.2),
    layers.LSTM(32),
    layers.Dropout(0.2),
    layers.Dense(1, activation='sigmoid')
])
cls_model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['AUC'])

cls_model.fit(
    X_train, y_cls_train,
    validation_data=(X_val, y_cls_val),
    epochs=30,
    batch_size=256,
    callbacks=[tf.keras.callbacks.EarlyStopping(monitor='val_auc', mode='max', patience=5, restore_best_weights=True)],
    verbose=1
)

# Evaluate classifier
cls_auc = cls_model.evaluate(X_val, y_cls_val, verbose=0)[1]
print(f'Classifier ROC-AUC: {cls_auc:.4f}')

# ---- Regressor ----
reg_model = models.Sequential([
    layers.Input(shape=(X_all.shape[1], X_all.shape[2])),
    layers.LSTM(64, return_sequences=True),
    layers.Dropout(0.2),
    layers.LSTM(32),
    layers.Dropout(0.2),
    layers.Dense(1)   # linear activation for regression
])
reg_model.compile(optimizer='adam', loss='mse', metrics=['mae'])

reg_model.fit(
    X_train, y_reg_train,
    validation_data=(X_val, y_reg_val),
    epochs=30,
    batch_size=256,
    callbacks=[tf.keras.callbacks.EarlyStopping(monitor='val_mae', mode='min', patience=5, restore_best_weights=True)],
    verbose=1
)

# Evaluate regressor
reg_loss, reg_mae = reg_model.evaluate(X_val, y_reg_val, verbose=0)
print(f'Regressor MAE: {reg_mae:.4f}')
'''