In [1]:
import pandas as pd

In [2]:
import json
from typing import List, Dict, Any

In [3]:
from data.preprocessing import Preprocessor
from data.download import download_stock_data
from data.feature_analysis import FeatureAnalyzer
from data.feature_engineering import FeatureEngineer

In [4]:
def print_dict(obj: Dict[str, Any], indent: int = 0) -> None:
    """Print a (possibly nested) dictionary, one key per line.

    Values that are themselves dictionaries are printed recursively, with one
    additional tab character of indentation per nesting level. Any other
    value is printed on the same line as its key.

    Args:
        obj: Dictionary to print.
        indent: Number of tab characters prefixed to every line of this level.
    """
    # Build the prefix outside the f-string: a backslash inside an f-string
    # replacement field is a SyntaxError on Python versions before 3.12.
    prefix: str = "\t" * indent
    for key, value in obj.items():
        s: str = f"{prefix}{key}:"
        if isinstance(value, dict):
            # Nested mapping: print the key alone, then its content one level deeper.
            print(s)
            print_dict(obj=value, indent=indent + 1)
        else:
            print(f"{s} {value}")

### Download Data

In [5]:
# Download parameters grouped in one place so the query is easy to tweak.
download_params = dict(
    symbol="AAPL",
    start_date="2020-01-01",
    end_date="2024-12-31",
    interval="1D",
)
data: pd.DataFrame = download_stock_data(**download_params)
# Preview the first ten rows of the downloaded frame.
data.head(10)

[2025-09-21 23:02:55 | INFO] data/download.py : Starting to download stock data for 'AAPL'!
[*********************100%***********************]  1 of 1 completed
[2025-09-21 23:02:55 | INFO] data/download.py : Successfully downloaded 1257 rows of stock data for 'AAPL'.


Price,Close,High,Low,Open,Volume
Ticker,AAPL,AAPL,AAPL,AAPL,AAPL
Date,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
2020-01-02,72.538536,72.598914,71.292326,71.545912,135480400
2020-01-03,71.83329,72.594055,71.608685,71.765667,146322800
2020-01-06,72.405678,72.444321,70.703012,70.954188,118387200
2020-01-07,72.065155,72.671348,71.845377,72.415345,108872000
2020-01-08,73.224434,73.526326,71.768109,71.768109,132079200
2020-01-09,74.779755,74.972962,73.951366,74.202534,170108400
2020-01-10,74.948814,75.513962,74.44647,75.014028,140644800
2020-01-13,76.550072,76.576642,75.14688,75.265226,121532000
2020-01-14,75.516373,76.697376,75.393206,76.487261,161954400
2020-01-15,75.192757,76.197453,74.760446,75.315931,121923600


### Data Preprocessing

In [6]:
# Run the preprocessing pipeline on the downloaded frame; the processed frame
# is read back from the preprocessor's `df` attribute afterwards.
preprocessor: Preprocessor = Preprocessor(df=data)
preprocessor.preprocess_stock_data()
cleaned_data: pd.DataFrame = preprocessor.df

print("Data Shape: {}".format(cleaned_data.shape))
# Show the last ten rows of the processed frame.
cleaned_data.tail(10)

[2025-09-21 23:02:55 | INFO] data/preprocessing.py : Initiating Enhanced Preprocessing for Stock Data!
[2025-09-21 23:02:55 | INFO] data/preprocessing.py : Flattening level 0 columns for MultiIndex columns...
[2025-09-21 23:02:55 | INFO] data/preprocessing.py : Validating OHLC data consistency...
[2025-09-21 23:02:55 | INFO] data/preprocessing.py : OHLC validation complete. Removed 0 invalid rows.
[2025-09-21 23:02:55 | INFO] data/preprocessing.py : Validating volume data...
[2025-09-21 23:02:55 | INFO] data/preprocessing.py : Volume validation complete. Removed 0 invalid rows.
[2025-09-21 23:02:55 | INFO] data/preprocessing.py : Filling missing values for volume columns ['Volume'] with 0.0...
[2025-09-21 23:02:55 | INFO] data/preprocessing.py : Handling outliers with remove method...
[2025-09-21 23:02:55 | INFO] data/preprocessing.py : Data quality report: 1257 rows processed
[2025-09-21 23:02:55 | INFO] data/preprocessing.py : Finished Enhanced Preprocessing for Stock Data!


Data Shape: (1257, 5)


Price,Close,High,Low,Open,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2024-12-16,250.151962,250.490771,246.773955,247.112764,51694800
2024-12-17,252.583344,252.932112,248.896435,249.195377,51356400
2024-12-18,247.172562,253.38052,246.863661,251.268024,56774100
2024-12-19,248.906387,251.108576,246.215941,246.624495,60882300
2024-12-20,253.589767,254.097958,244.820894,247.162572,147495300
2024-12-23,254.367004,254.74565,252.553435,253.868773,40858800
2024-12-24,257.286682,257.296626,254.386957,254.586262,23234700
2024-12-26,258.103729,259.179926,256.718662,257.276679,27237100
2024-12-27,254.685867,257.784882,252.164818,256.917934,42355300
2024-12-30,251.307877,252.603281,249.863009,251.337769,35557500


In [7]:
# Fetch the preprocessor's quality report and print it as a tab-indented tree.
data_quality_report: Dict[str, Any] = preprocessor.get_data_quality_report()
print_dict(data_quality_report, indent=0)

total_rows: 1257
missing_values:
	Close: 0
	High: 0
	Low: 0
	Open: 0
	Volume: 0
data_types:
	Close: float64
	High: float64
	Low: float64
	Open: float64
	Volume: int64
date_range:
	start: 2020-01-02 00:00:00
	end: 2024-12-30 00:00:00
price_statistics:
	Open:
		count: 1257.0
		mean: 151.57182411473113
		std: 41.8358063907925
		min: 55.21508661799568
		25%: 126.08007238065763
		50%: 150.1918083587428
		75%: 176.09017287650624
		max: 257.27667867815336
	High:
		count: 1257.0
		mean: 153.2533333373624
		std: 42.03503547957453
		min: 55.316762499666766
		25%: 127.69461835058758
		50%: 152.11979733212814
		75%: 177.94701832041608
		max: 259.1799258751944
	Low:
		count: 1257.0
		mean: 150.02601680703992
		std: 41.671140540646334
		min: 51.47000835368413
		25%: 124.5862452682212
		50%: 148.3669218148619
		75%: 174.80937774220615
		max: 256.7186620602343
	Close:
		count: 1257.0
		mean: 151.7283011795323
		std: 41.89638325776185
		min: 54.31694412231445
		25%: 126.60704040527344
		50%: 150.366424

### Feature Engineering

In [None]:
# Value passed as `target_days` to the feature generator (presumably the
# prediction horizon in days — confirm); the target column name is derived from it.
# NOTE(review): the saved output of the feature-engineering cell lists a
# `target_1d` column, so the stored outputs look stale relative to this value —
# restart the kernel and re-run all cells after changing it.
target_days: int = 5
target_col: str = "target_{}d".format(target_days)

In [9]:
# Generate every engineered feature on top of the cleaned frame; the enriched
# frame is read back from the engineer's `df` attribute afterwards.
feature_engineering: FeatureEngineer = FeatureEngineer(df=cleaned_data)
feature_engineering.generate_all_features(target_days=target_days)
features_df: pd.DataFrame = feature_engineering.df

print("Data Shape: {}".format(features_df.shape))
# Show the last ten rows of the feature frame.
features_df.tail(10)

[2025-09-21 23:02:55 | INFO] data/feature_engineering.py : FeatureEngineer initialized successfully
[2025-09-21 23:02:55 | INFO] data/feature_engineering.py : Generating all features automatically
[2025-09-21 23:02:55 | INFO] data/feature_engineering.py : Adding SMA indicators for windows: [5, 10, 20, 50]
[2025-09-21 23:02:55 | INFO] data/feature_engineering.py : Adding EMA indicators for windows: [12, 26, 50]
[2025-09-21 23:02:55 | INFO] data/feature_engineering.py : Adding RSI indicator with window: 14
[2025-09-21 23:02:55 | INFO] data/feature_engineering.py : Adding MACD indicator: fast=12, slow=26, signal=9
[2025-09-21 23:02:55 | INFO] data/feature_engineering.py : Adding Bollinger Bands: window=20, std_dev=2.0
[2025-09-21 23:02:55 | INFO] data/feature_engineering.py : Adding returns for windows: [1, 5, 10, 20]
[2025-09-21 23:02:55 | INFO] data/feature_engineering.py : Adding volatility measures for windows: [5, 10, 20, 30]
[2025-09-21 23:02:55 | INFO] data/feature_engineering.py :

Data Shape: (1257, 119)


Price,Close,High,Low,Open,Volume,SMA_5,SMA_5_signal,Close_SMA_5_ratio,SMA_10,SMA_10_signal,...,z_score_20d,body_size,upper_shadow,lower_shadow,doji,hammer,shooting_star,bullish_candle,bearish_candle,target_1d
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2024-12-16,250.151962,250.490771,246.773955,247.112764,51694800,247.39975,1,1.011125,245.097922,1,...,1.518701,3.039199,0.338809,0.338809,0,0,0,1,0,1
2024-12-17,252.583344,252.932112,248.896435,249.195377,51356400,248.537711,1,1.016278,246.17709,1,...,1.634013,3.387967,0.348768,0.298942,0,0,0,1,0,1
2024-12-18,247.172562,253.38052,246.863661,251.268024,56774100,248.848605,0,0.993265,246.679308,1,...,0.870426,4.095462,2.112496,0.308901,0,0,0,0,1,0
2024-12-19,248.906387,251.108576,246.215941,246.624495,60882300,249.213309,0,0.998768,247.351918,1,...,1.012142,2.281893,2.202189,0.408553,0,0,0,1,0,1
2024-12-20,253.589767,254.097958,244.820894,247.162572,147495300,250.480804,1,1.012412,248.512796,1,...,1.547766,6.427196,0.50819,2.341678,0,0,0,1,0,1
2024-12-23,254.367004,254.74565,252.553435,253.868773,40858800,251.323813,1,1.012109,249.361781,1,...,1.558134,0.498231,0.378645,1.315338,0,0,0,1,0,1
2024-12-24,257.286682,257.296626,254.386957,254.586262,23234700,252.264481,1,1.019908,250.401096,1,...,1.818225,2.700421,0.009944,0.199305,0,0,0,1,0,1
2024-12-26,258.103729,259.179926,256.718662,257.276679,27237100,254.450714,1,1.014356,251.64966,1,...,1.752907,0.827051,1.076197,0.558017,0,0,0,1,0,1
2024-12-27,254.685867,257.784882,252.164818,256.917934,42355300,255.60661,0,0.996398,252.409959,1,...,1.157223,2.232066,0.866948,2.521049,0,0,0,0,1,0
2024-12-30,251.307877,252.603281,249.863009,251.337769,35557500,255.150232,0,0.984941,252.815518,0,...,0.528213,0.029893,1.265512,1.444868,1,0,0,0,1,0


In [10]:
# Fetch the engineer's feature summary and print it as a tab-indented tree.
feature_summary: Dict[str, Any] = feature_engineering.get_feature_summary()
print_dict(feature_summary, indent=0)

total_features: 119
feature_categories:
	Technical Indicators: ['SMA_5', 'SMA_5_signal', 'SMA_10', 'SMA_10_signal', 'SMA_20', 'SMA_20_signal', 'SMA_50', 'SMA_50_signal', 'EMA_12', 'EMA_12_signal', 'EMA_26', 'EMA_26_signal', 'EMA_50', 'EMA_50_signal', 'RSI_14', 'RSI_14_oversold', 'RSI_14_overbought', 'RSI_14_neutral', 'MACD_12_26', 'MACD_signal_9', 'MACD_histogram', 'MACD_bullish', 'MACD_bearish', 'BB_upper_20', 'BB_middle_20', 'BB_lower_20', 'BB_width_20', 'BB_position_20', 'BB_upper_touch_20', 'BB_lower_touch_20', 'BB_squeeze_20']
	Returns & Volatility: ['return_1d', 'log_return_1d', 'cumulative_return_1d', 'return_5d', 'log_return_5d', 'cumulative_return_5d', 'return_10d', 'log_return_10d', 'cumulative_return_10d', 'return_20d', 'log_return_20d', 'cumulative_return_20d', 'volatility_5d', 'volatility_annualized_5d', 'volatility_10d', 'volatility_annualized_10d', 'volatility_ratio_10d', 'volatility_20d', 'volatility_annualized_20d', 'volatility_ratio_20d', 'volatility_30d', 'volatility

### Feature Analysis

In [None]:
# Build the analyzer over the engineered features, using `target_col` as the
# target column, then run its full analysis once; later cells only read from
# `analysis_results`.
analyzer: FeatureAnalyzer = FeatureAnalyzer(df=features_df, target_column=target_col)
analysis_results: Dict[str, Any] = analyzer.comprehensive_feature_analysis()

[2025-09-21 23:02:55 | INFO] data/feature_analysis.py : FeatureAnalyzer initialized with 118 features
[2025-09-21 23:02:55 | INFO] data/feature_analysis.py : Performing comprehensive feature analysis...
[2025-09-21 23:02:55 | INFO] data/feature_analysis.py : Analyzing feature correlations...
[2025-09-21 23:02:55 | INFO] data/feature_analysis.py : Found 141 highly correlated pairs
[2025-09-21 23:02:55 | INFO] data/feature_analysis.py : Identified 43 redundant features
[2025-09-21 23:02:55 | INFO] data/feature_analysis.py : Analyzing feature importance using methods: ['random_forest', 'lasso', 'mutual_info']
[2025-09-21 23:02:55 | INFO] data/feature_analysis.py : Calculating Random Forest importance...
[2025-09-21 23:02:56 | INFO] data/feature_analysis.py : Calculating Lasso importance...
[2025-09-21 23:02:56 | INFO] data/feature_analysis.py : Calculating Mutual Information...
[2025-09-21 23:02:56 | INFO] data/feature_analysis.py : Feature importance analysis completed
[2025-09-21 23:02:

#### Results

In [12]:
# Correlation analysis
# NOTE(review): the saved output shows a feature correlating 1.0000 with the
# target, which looks like target leakage — verify how the target is built.
corr_results = analysis_results['correlation_analysis']  # type: ignore
print("\n📊 ANÁLISIS DE CORRELACIÓN:")
print(f"   • Total de features: {len(analyzer.feature_columns)}")
print(f"   • Pares altamente correlacionados: {len(corr_results['high_correlation_pairs'])}")
print(f"   • Features redundantes identificados: {len(corr_results['redundant_features'])}")

print("\n   Top 5 features por correlación con target:")
top_target_corr = corr_results['target_correlations'].head(5)
for rank, (feature, corr) in enumerate(top_target_corr.items(), start=1):
    print(f"   {rank}. {feature}: {corr:.4f}")

# Importance analysis
importance_results = analysis_results['importance_analysis']
print("\n🎯 ANÁLISIS DE IMPORTANCIA:")
print("   Top 5 features por importancia combinada:")
top_combined = importance_results['combined'].head(5)
for rank, (feature, score) in enumerate(top_combined.items(), start=1):
    print(f"   {rank}. {feature}: {score:.4f}")

# Temporal stability analysis
stability_results = analysis_results['temporal_stability']
print("\n⏰ ANÁLISIS DE ESTABILIDAD TEMPORAL:")
print("   Top 5 features más estables:")
most_stable = stability_results['most_stable_features'][:5]
for rank, feature in enumerate(most_stable, start=1):
    # Each feature's entry holds its score under the 'stability_score' key.
    stability_score = stability_results['stability_scores'][feature]['stability_score']
    print(f"   {rank}. {feature}: {stability_score:.4f}")


📊 ANÁLISIS DE CORRELACIÓN:
   • Total de features: 118
   • Pares altamente correlacionados: 141
   • Features redundantes identificados: 43

   Top 5 features por correlación con target:
   1. trend_up: 1.0000
   2. trend_down: 0.9920
   3. consecutive_down_days: 0.7497
   4. log_return_1d: 0.7032
   5. return_1d: 0.7027

🎯 ANÁLISIS DE IMPORTANCIA:
   Top 5 features por importancia combinada:
   1. volume_price_trend: 0.9267
   2. trend_up: 0.6621
   3. return_1d: 0.5798
   4. consecutive_up_days: 0.5733
   5. log_return_1d: 0.5664

⏰ ANÁLISIS DE ESTABILIDAD TEMPORAL:
   Top 5 features más estables:
   1. trend_up: 99999998.3664
   2. log_return_1d: 37.1806
   3. return_1d: 36.3301
   4. close_percentile_5d: 31.9702
   5. acceleration_5d: 23.0734


In [13]:
# Univariate selection
univariate_results = analysis_results['univariate_selection']
print("\n📈 SELECCIÓN UNIVARIADA:")
print(f"   Features seleccionadas: {len(univariate_results['selected_features'])}")
print("   Top 5 features:")
univariate_top = univariate_results['selected_features'][:5]
for rank, feature in enumerate(univariate_top, start=1):
    print(f"   {rank}. {feature}")

# Multivariate selection
multivariate_results = analysis_results['multivariate_selection']
print("\n📊 SELECCIÓN MULTIVARIADA:")
print(f"   Features seleccionadas: {len(multivariate_results['selected_features'])}")
print("   Top 5 features:")
multivariate_top = multivariate_results['selected_features'][:5]
for rank, feature in enumerate(multivariate_top, start=1):
    print(f"   {rank}. {feature}")


📈 SELECCIÓN UNIVARIADA:
   Features seleccionadas: 20
   Top 5 features:
   1. SMA_5_signal
   2. Close_EMA_12_ratio
   3. return_1d
   4. log_return_1d
   5. close_position

📊 SELECCIÓN MULTIVARIADA:
   Features seleccionadas: 20
   Top 5 features:
   1. return_1d
   2. log_return_1d
   3. cumulative_return_1d
   4. return_5d
   5. log_return_5d


In [14]:
# Consensus features recommended by the analysis.
recommendations = analysis_results['recommendations']
print("\n🎯 FEATURES CONSENSO:")
print(f"   Total de features consenso: {len(recommendations['consensus_features'])}")

# Per-category counts as reported by the analysis itself.
print("\n   Categorías de features:")
for category, count in recommendations['feature_categories'].items():
    print(f"   • {category}: {count} features")

# Sizes of the per-category consensus lists, printed in a fixed order.
print("\n   Features consenso por categoría:")
for label, key in (
    ("Indicadores Técnicos", 'technical_indicators'),
    ("Volatilidad", 'volatility_features'),
    ("Retornos", 'return_features'),
    ("Volumen", 'volume_features'),
):
    print(f"   • {label}: {len(recommendations[key])}")


🎯 FEATURES CONSENSO:
   Total de features consenso: 19

   Categorías de features:
   • Technical Indicators: 1 features
   • Volatility: 0 features
   • Returns: 2 features
   • Volume: 1 features

   Features consenso por categoría:
   • Indicadores Técnicos: 1
   • Volatilidad: 0
   • Retornos: 2
   • Volumen: 1


### Optimal Features Set

In [15]:
# Ask the analyzer for the optimal feature set, capped at 25 features.
optimal_features: List[str] = analyzer.get_optimal_feature_set(
    analysis_results,
    max_features=25,
)

print(f"\n🚀 FEATURES ÓPTIMAS SELECCIONADAS ({len(optimal_features)}):")
# Ranks are right-aligned to two digits.
for rank, feature in enumerate(optimal_features, start=1):
    print(f"   {rank:2d}. {feature}")

[2025-09-21 23:03:08 | INFO] data/feature_analysis.py : Selecting optimal feature set (max 25 features)
[2025-09-21 23:03:08 | INFO] data/feature_analysis.py : Selected 25 optimal features



🚀 FEATURES ÓPTIMAS SELECCIONADAS (25):
    1. Close_SMA_5_ratio
    2. return_1d
    3. log_return_1d
    4. close_position
    5. intraday_momentum
    6. acceleration_5d
    7. acceleration_10d
    8. acceleration_20d
    9. trend_up
   10. trend_down
   11. consecutive_up_days
   12. consecutive_down_days
   13. volume_price_trend
   14. close_percentile_5d
   15. z_score_5d
   16. close_percentile_10d
   17. z_score_10d
   18. bullish_candle
   19. bearish_candle
   20. close_percentile_20d
   21. SMA_5_signal
   22. BB_position_20
   23. z_score_20d
   24. gap_size
   25. Close_EMA_12_ratio


In [None]:
# Restrict the engineered frame to the selected features plus the target column.
# Indexing a DataFrame with a list of labels always yields a DataFrame, so the
# annotation is narrowed to `pd.DataFrame`. This removes the need for a
# `# type: ignore` on the correlation line and avoids evaluating `X | Y` on
# classes at runtime, which older interpreters do not support.
optimal_data: pd.DataFrame = features_df[optimal_features + [target_col]]
print("\n📊 ESTADÍSTICAS DE FEATURES ÓPTIMAS:")
print(f"   Shape: {optimal_data.shape}")
# Total count of missing cells across all selected columns.
print(f"   Valores faltantes: {optimal_data.isnull().sum().sum()}")

# Absolute correlation of each selected feature with the target, strongest first.
target_correlations: pd.Series = (
    optimal_data[optimal_features]
    .corrwith(optimal_data[target_col])
    .abs()
    .sort_values(ascending=False)
)

print("\n   Top 10 features por correlación con target:")
for rank, (feature, corr) in enumerate(target_correlations.head(10).items(), start=1):
    print(f"   {rank:2d}. {feature}: {corr:.4f}")


📊 ESTADÍSTICAS DE FEATURES ÓPTIMAS:
   Shape: (1257, 26)
   Valores faltantes: 129

   Top 10 features por correlación con target:
    1. trend_up: 1.0000
    2. trend_down: 0.9920
    3. consecutive_down_days: 0.7497
    4. log_return_1d: 0.7032
    5. return_1d: 0.7027
    6. close_percentile_5d: 0.6958
    7. consecutive_up_days: 0.6804
    8. bearish_candle: 0.6577
    9. bullish_candle: 0.6561
   10. close_position: 0.6414
