In [1]:
import pandas as pd

In [2]:
import json
from typing import Dict, Any

In [3]:
from data.preprocessing import Preprocessor
from data.download import download_stock_data
from data.feature_engineering import FeatureEngineer

In [4]:
def print_dict(obj: Dict[str, Any], indent=0) -> None:
    """Print a (possibly nested) dictionary, one key per line.

    Every key is written as ``key:``. When the value is itself a ``dict`` the
    key is printed on its own line and the nested dictionary is printed
    recursively one level deeper; any other value is printed on the same line
    as its key, separated by a single space.

    Args:
        obj: Dictionary to print.
        indent: Nesting depth of ``obj``; each level is rendered as one tab.

    Returns:
        None. The function only writes to standard output.
    """
    # Build the tab prefix outside the f-string: a backslash inside an
    # f-string replacement field is a SyntaxError on Python < 3.12.
    prefix: str = "\t" * indent
    for key, value in obj.items():
        s: str = f"{prefix}{key}:"
        if isinstance(value, dict):
            print(s)
            print_dict(obj=value, indent=indent + 1)
        else:
            print(f"{s} {value}")

### Download Data

In [5]:
# Download AAPL price history for the requested date window using the
# project's download helper; the result is the raw input for every later cell.
# NOTE(review): `interval="1D"` is forwarded as written — confirm the helper
# accepts this spelling (some market-data APIs expect lowercase "1d").
data: pd.DataFrame = download_stock_data(
    symbol="AAPL",
    start_date="2020-01-01",
    end_date="2024-12-31",
    interval="1D"
)
# Last expression of the cell: preview the first ten rows.
data.head(n=10)

[2025-09-21 21:13:47 | INFO] data/download.py : Starting to download stock data for 'AAPL'!
[*********************100%***********************]  1 of 1 completed
[2025-09-21 21:13:48 | INFO] data/download.py : Successfully downloaded 1257 rows of stock data for 'AAPL'.


Price,Close,High,Low,Open,Volume
Ticker,AAPL,AAPL,AAPL,AAPL,AAPL
Date,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
2020-01-02,72.538506,72.598884,71.292296,71.545882,135480400
2020-01-03,71.833275,72.59404,71.608669,71.765651,146322800
2020-01-06,72.405678,72.444321,70.703012,70.954188,118387200
2020-01-07,72.065147,72.671341,71.845369,72.415337,108872000
2020-01-08,73.224419,73.52631,71.768094,71.768094,132079200
2020-01-09,74.779755,74.972962,73.951366,74.202534,170108400
2020-01-10,74.948799,75.513947,74.446455,75.014012,140644800
2020-01-13,76.550034,76.576604,75.146842,75.265189,121532000
2020-01-14,75.516388,76.697392,75.393221,76.487276,161954400
2020-01-15,75.192749,76.197445,74.760438,75.315923,121923600


### Data Preprocessing

In [None]:
# Run the project's preprocessing step on the downloaded frame.
# preprocess_stock_data() is called for its effect on the preprocessor (its
# return value is discarded); the processed frame is then read from `.df`.
# NOTE(review): whether `data` itself is modified in place by Preprocessor is
# not visible from this notebook — confirm before re-running cells out of order.
preprocessor: Preprocessor = Preprocessor(df=data)
preprocessor.preprocess_stock_data()
cleaned_data: pd.DataFrame = preprocessor.df
print(f"Data Shape: {cleaned_data.shape}")
# Last expression of the cell: show the final ten rows.
# NOTE(review): the saved output for this cell lists January 2020 rows, which
# does not match `tail` — the output looks stale; re-run to refresh it.
cleaned_data.tail(n=10)

[2025-09-21 21:13:51 | INFO] data/preprocessing.py : Initiating Enhanced Preprocessing for Stock Data!
[2025-09-21 21:13:51 | INFO] data/preprocessing.py : Flattening level 0 columns for MultiIndex columns...
[2025-09-21 21:13:51 | INFO] data/preprocessing.py : Validating OHLC data consistency...
[2025-09-21 21:13:51 | INFO] data/preprocessing.py : OHLC validation complete. Removed 0 invalid rows.
[2025-09-21 21:13:51 | INFO] data/preprocessing.py : Validating volume data...
[2025-09-21 21:13:51 | INFO] data/preprocessing.py : Volume validation complete. Removed 0 invalid rows.
[2025-09-21 21:13:51 | INFO] data/preprocessing.py : Filling missing values for volume columns ['Volume'] with 0.0...
[2025-09-21 21:13:51 | INFO] data/preprocessing.py : Handling outliers with remove method...
[2025-09-21 21:13:51 | INFO] data/preprocessing.py : Data quality report: 1257 rows processed
[2025-09-21 21:13:51 | INFO] data/preprocessing.py : Finished Enhanced Preprocessing for Stock Data!


Data Shape: (1257, 5)


Price,Close,High,Low,Open,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2020-01-02,72.538506,72.598884,71.292296,71.545882,135480400
2020-01-03,71.833275,72.59404,71.608669,71.765651,146322800
2020-01-06,72.405678,72.444321,70.703012,70.954188,118387200
2020-01-07,72.065147,72.671341,71.845369,72.415337,108872000
2020-01-08,73.224419,73.52631,71.768094,71.768094,132079200
2020-01-09,74.779755,74.972962,73.951366,74.202534,170108400
2020-01-10,74.948799,75.513947,74.446455,75.014012,140644800
2020-01-13,76.550034,76.576604,75.146842,75.265189,121532000
2020-01-14,75.516388,76.697392,75.393221,76.487276,161954400
2020-01-15,75.192749,76.197445,74.760438,75.315923,121923600


In [7]:
# Ask the preprocessor for its data-quality report and print it with the
# notebook's print_dict helper, which handles nested dictionaries.
data_quality_report: Dict[str, Any] = preprocessor.get_data_quality_report()
print_dict(obj=data_quality_report)

total_rows: 1257
missing_values:
	Close: 0
	High: 0
	Low: 0
	Open: 0
	Volume: 0
data_types:
	Close: float64
	High: float64
	Low: float64
	Open: float64
	Volume: int64
date_range:
	start: 2020-01-02 00:00:00
	end: 2024-12-30 00:00:00
price_statistics:
	Open:
		count: 1257.0
		mean: 151.57182359079823
		std: 41.835806588201244
		min: 55.21508661799568
		25%: 126.08005806185443
		50%: 150.19179299707827
		75%: 176.09018854441808
		max: 257.27667867815336
	High:
		count: 1257.0
		mean: 153.25333280548412
		std: 42.03503568314091
		min: 55.316762499666766
		25%: 127.69459465788168
		50%: 152.11979733212814
		75%: 177.9470488771055
		max: 259.1799258751944
	Low:
		count: 1257.0
		mean: 150.0260162841853
		std: 41.671140723502205
		min: 51.47000835368413
		25%: 124.58626017036609
		50%: 148.3669218148619
		75%: 174.80939295881248
		max: 256.7186620602343
	Close:
		count: 1257.0
		mean: 151.728300645414
		std: 41.89638343865508
		min: 54.31694412231445
		25%: 126.60704040527344
		50%: 150.3664

### Feature Engineering

In [None]:
# Build the engineered feature set from the cleaned frame.
# generate_all_features() is called for its effect on the FeatureEngineer (its
# return value is discarded); the resulting frame is then read from `.df`.
# NOTE(review): whether `cleaned_data` is modified in place is not visible
# from this notebook — confirm.
feature_engineering: FeatureEngineer = FeatureEngineer(df=cleaned_data)
feature_engineering.generate_all_features()
features_df: pd.DataFrame = feature_engineering.df
print(f"Data Shape: {features_df.shape}")
# Last expression of the cell: show the final ten rows.
# NOTE(review): the saved output for this cell lists January 2020 rows, which
# does not match `tail` — the output looks stale; re-run to refresh it.
features_df.tail(n=10)

[2025-09-21 21:13:59 | INFO] data/feature_engineering.py : FeatureEngineer initialized successfully
[2025-09-21 21:13:59 | INFO] data/feature_engineering.py : Generating all features automatically
[2025-09-21 21:13:59 | INFO] data/feature_engineering.py : Adding SMA indicators for windows: [5, 10, 20, 50]
[2025-09-21 21:13:59 | INFO] data/feature_engineering.py : Adding EMA indicators for windows: [12, 26, 50]
[2025-09-21 21:13:59 | INFO] data/feature_engineering.py : Adding RSI indicator with window: 14
[2025-09-21 21:13:59 | INFO] data/feature_engineering.py : Adding MACD indicator: fast=12, slow=26, signal=9
[2025-09-21 21:13:59 | INFO] data/feature_engineering.py : Adding Bollinger Bands: window=20, std_dev=2.0
[2025-09-21 21:13:59 | INFO] data/feature_engineering.py : Adding returns for windows: [1, 5, 10, 20]
[2025-09-21 21:13:59 | INFO] data/feature_engineering.py : Adding volatility measures for windows: [5, 10, 20, 30]
[2025-09-21 21:13:59 | INFO] data/feature_engineering.py :

Data Shape: (1257, 118)


Price,Close,High,Low,Open,Volume,SMA_5,SMA_5_signal,Close_SMA_5_ratio,SMA_10,SMA_10_signal,...,close_percentile_20d,z_score_20d,body_size,upper_shadow,lower_shadow,doji,hammer,shooting_star,bullish_candle,bearish_candle
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2020-01-02,72.538506,72.598884,71.292296,71.545882,135480400,,0,,,0,...,,,0.992623,0.060378,0.253586,0,0,0,1,0
2020-01-03,71.833275,72.59404,71.608669,71.765651,146322800,,0,,,0,...,,,0.067623,0.760765,0.156982,1,0,0,1,0
2020-01-06,72.405678,72.444321,70.703012,70.954188,118387200,,0,,,0,...,,,1.45149,0.038643,0.251176,0,0,0,1,0
2020-01-07,72.065147,72.671341,71.845369,72.415337,108872000,,0,,,0,...,,,0.35019,0.256003,0.219778,0,0,0,0,1
2020-01-08,73.224419,73.52631,71.768094,71.768094,132079200,72.413405,1,1.0112,,0,...,,,1.456325,0.301892,0.0,0,0,0,1,0
2020-01-09,74.779755,74.972962,73.951366,74.202534,170108400,72.861655,1,1.026325,,0,...,,,0.57722,0.193208,0.251168,0,0,0,1,0
2020-01-10,74.948799,75.513947,74.446455,75.014012,140644800,73.48476,1,1.019923,,0,...,,,0.065213,0.499934,0.502344,1,0,0,0,1
2020-01-13,76.550034,76.576604,75.146842,75.265189,121532000,74.313631,1,1.030094,,0,...,,,1.284845,0.02657,0.118346,0,0,0,1,0
2020-01-14,75.516388,76.697392,75.393221,76.487276,161954400,75.003879,1,1.006833,,0,...,,,0.970888,0.210115,0.123167,0,0,0,0,1
2020-01-15,75.192749,76.197445,74.760438,75.315923,121923600,75.397545,0,0.997284,73.905475,1,...,,,0.123174,0.881522,0.432311,1,0,0,0,1


In [None]:
# Fetch the feature engineer's summary dictionary and print it with the
# notebook's print_dict helper, which handles nested dictionaries.
feature_summary: Dict[str, Any] = feature_engineering.get_feature_summary()
print_dict(obj=feature_summary)