In [30]:
# ADDING FEATURES, PT.1
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
import sys
from funcs.api_funcs import get_rounds_arg, get_target_arg
import random
import os
from funcs.engineer_features_funcs import optimize_params, compute_mse_scores, compute_baseline_mse, compute_mse_with_added_feature, extract_tsfresh_features, evaluate_feature, compute_mse_with_dropped_feature, drop_features
# Define the target
target = 'EXPGS'  # TESTING, PRESET TARGET

np.random.seed(42)
random.seed(42)
os.environ['PYTHONHASHSEED'] = str(42)

def get_rounds_arg():
    return 1
# Define number of feature engineering rounds
engineering_rounds = get_rounds_arg()

# Load data
X_train_transformed = pd.read_csv('data/processed/X_train_transformed.csv', index_col='Date', parse_dates=True)
X_test_transformed = pd.read_csv('data/processed/X_test_transformed.csv', index_col='Date', parse_dates=True)
y_train_transformed = pd.read_csv('data/processed/y_train_transformed.csv', index_col='Date', parse_dates=True)
y_test_transformed = pd.read_csv('data/processed/y_test_transformed.csv', index_col='Date', parse_dates=True)

# Combine X and y dataframes for feature engineering
train_combined = pd.concat([X_train_transformed, y_train_transformed], axis=1)
test_combined = pd.concat([X_test_transformed, y_test_transformed], axis=1)

# Ensure the column names are preserved
base_features = list(X_train_transformed.columns)

# ADDING FEATURES, PT.2
import optuna
import numpy as np
import pandas as pd
from sklearn.metrics import mean_squared_error
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from sklearn.preprocessing import StandardScaler
import logging
import warnings

# Suppress Optuna logging
optuna.logging.set_verbosity(optuna.logging.WARNING)
# Suppress warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)
warnings.filterwarnings("ignore", category=FutureWarning)
warnings.filterwarnings("ignore", category=UserWarning)
warnings.filterwarnings("ignore", category=RuntimeWarning)

# Initial baseline MSE calculation
baseline_mse_scores, aggregated_baseline_mse, best_params = compute_mse_scores(
    X_train_transformed, X_test_transformed, y_train_transformed, y_test_transformed, base_features
)

print(f"Baseline MSE Scores:")
print(baseline_mse_scores)
print(f"Aggregated Baseline MSE Score: {aggregated_baseline_mse}\n")

initial_baseline_mse = aggregated_baseline_mse 

initial_mse_xgboost = baseline_mse_scores['XGBoost']
initial_mse_lightgbm = baseline_mse_scores['LightGBM']

# Save best params as dictionaries
xgboost_params = best_params['XGBoost']
lightgbm_params = best_params['LightGBM']

# Save to a Python file
with open('best_params.py', 'w') as f:
    f.write(f"xgboost_params = {xgboost_params}\n")
    f.write(f"lightgbm_params = {lightgbm_params}\n")

print("Best parameters saved to best_params.py.")

Baseline MSE Scores:
{'XGBoost': 0.0005103063958695216, 'LightGBM': 0.0005872492303105134}
Aggregated Baseline MSE Score: 0.001097555626180035

Best parameters saved to best_params.py.


In [32]:
# V.1.0.0.0 - FEATURE DROPPING - USING THE THREE AT-A-TIME METHOD
from joblib import Parallel, delayed

# Set a higher threshold for improvement (e.g., 0.05% of the initial baseline MSE)
threshold = 0.0002 * ((baseline_mse_scores['XGBoost'] + baseline_mse_scores['LightGBM']) / 2)

# Function to compute MSE scores after dropping a feature
def compute_mse_with_dropped_feature(X_train, X_test, y_train, y_test, base_features, drop_feature):
    X_train = X_train[[f for f in base_features if f != drop_feature]].dropna().values
    X_test = X_test[[f for f in base_features if f != drop_feature]].dropna().values
    y_train = y_train.dropna().values.ravel()
    y_test = y_test.dropna().values.ravel()

    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)

    mse_scores = {'XGBoost': [], 'LightGBM': []}

    models = {
        'XGBoost': XGBRegressor(**xgboost_params, verbosity=0),
        'LightGBM': LGBMRegressor(**lightgbm_params, verbosity=-1)
    }

    for model_name, model in models.items():
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        mse_scores[model_name].append(mean_squared_error(y_test, y_pred))

    mse_scores = {model: np.mean(scores) for model, scores in mse_scores.items()}
    aggregated_mse = sum(mse_scores.values())

    return mse_scores, aggregated_mse

# Function to drop features
def drop_features(train_combined, target, base_features,aggregated_baseline_mse,threshold):
    aggregated_mse_scores_dropped = []
    for feature in base_features:
        mse_scores, aggregated_mse = compute_mse_with_dropped_feature(X_train_transformed, X_test_transformed, y_train_transformed, y_test_transformed, base_features, feature)
        improvement = aggregated_baseline_mse - aggregated_mse
        improvement_status = "improved" if improvement > threshold else "worsened"
        aggregated_mse_scores_dropped.append((feature, aggregated_mse, improvement, improvement_status, mse_scores))


    # Sort and drop the least impactful features if they result in improvement
    aggregated_mse_scores_dropped.sort(key=lambda x: x[1])
    features_to_drop = [f for f in aggregated_mse_scores_dropped if f[2] > threshold]

    if not features_to_drop:
        print("No features were dropped as they did not improve the model.")
    else:
        for feature, _, improvement, _, _ in features_to_drop:
            base_features.remove(feature)
            print(f"Feature dropped: {feature}, Improvement: {improvement}")

    print("Feature Dropping Completed.")

# Example usage:
drop_features(train_combined, target, base_features,threshold,aggregated_baseline_mse)

# Final baseline MSE calculation
final_mse_scores, aggregated_final_mse, _ = compute_mse_scores(
    X_train_transformed, X_test_transformed, y_train_transformed, y_test_transformed, base_features
)
# Store final MSE scores for each model
final_mse_xgboost = final_mse_scores['XGBoost']
final_mse_lightgbm = final_mse_scores['LightGBM']

# Calculate improvements
improvement_xgboost = initial_mse_xgboost - final_mse_xgboost
improvement_lightgbm = initial_mse_lightgbm - final_mse_lightgbm

# Print the results
print("")
print("THRESHOLD OF 0.002:")
print(f"Initial MSE for XGBoost: {initial_mse_xgboost}")
print(f"Final MSE for XGBoost: {final_mse_xgboost}")
print(f"Improvement in MSE for XGBoost: {improvement_xgboost}\n")

print(f"Initial MSE for LightGBM: {initial_mse_lightgbm}")
print(f"Final MSE for LightGBM: {final_mse_lightgbm}")
print(f"Improvement in MSE for LightGBM: {improvement_lightgbm}")


No features were dropped as they did not improve the model.
Feature Dropping Completed.

THRESHOLD OF 0.002:
Initial MSE for XGBoost: 0.0005103063958695216
Final MSE for XGBoost: 0.000526697588164481
Improvement in MSE for XGBoost: -1.63911922949594e-05

Initial MSE for LightGBM: 0.0005872492303105134
Final MSE for LightGBM: 0.0006672955849259226
Improvement in MSE for LightGBM: -8.004635461540926e-05
