# Wind Power Forecasting: LSTM Direct Approach

This notebook performs time-series forecasting of wind turbine power generation using an LSTM model.
It follows these steps:
1. Setup and Configuration
2. Load Raw Data
3. Preprocess Data (including resampling)
4. Exploratory Data Analysis (EDA)
5. Prepare Data for Modeling (sequential split, scaling, sequence creation for the LSTM)
6. Build and Train LSTM Model
7. Evaluate Model Performance
8. Visualize Results
9. Conclusion

*(Note: Approach 2, the indirect RF + LSTM pipeline, is not covered in the sections below; if it is implemented, it would be added in subsequent sections.)*

## 1. Setup and Configuration

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import sys
from IPython.display import display, Markdown # For displaying styled dataframes

# --- Make the project code importable ---
# The imports further down use the package form (`from src import ...`), which
# only resolves when the directory that CONTAINS `src/` (the project root) is
# on sys.path. The `src` directory itself is still added as before, so anything
# that relies on it being directly importable keeps working.
# Assumes the notebook is run from the project root or the 'notebooks' directory.
if os.path.basename(os.getcwd()) == 'notebooks':
    project_root = os.path.abspath('..')  # Go up one level
else:
    project_root = os.path.abspath('.')  # Assume running from root

src_path = os.path.join(project_root, 'src')
for extra_path in (project_root, src_path):
    if extra_path not in sys.path:
        sys.path.append(extra_path)
        print(f"Added '{extra_path}' to sys.path")

# Change working directory to project root so relative file paths used by the
# rest of the notebook resolve consistently.
if os.getcwd() != project_root:
    os.chdir(project_root)
    print(f"Changed working directory to: {project_root}")

# --- Import refactored code modules ---
try:
    from src import config
    from src import data_preprocessing as dp
    from src import modeling as mdl
    from src import evaluation as evl
    from src import plotting as pl
except ImportError as e:
    print(f"Error importing modules: {e}")
    print("Ensure you have run 'pip install -r requirements.txt' and the src directory is correct.")
    # Re-raise: no later cell can run without these modules.
    raise

# Apply plot style
pl.setup_plot_style()

# Ensure the output directories exist before anything tries to write to them.
for output_dir in (config.RESULTS_DIR, config.MODEL_SAVE_DIR, config.IMAGE_DIR):
    os.makedirs(output_dir, exist_ok=True)

print(f"Configuration loaded. Target variable: {config.TARGET_COL}")
print(f"Sequence length: {config.SEQUENCE_LENGTH}")
print(f"Using raw data from: {config.RAW_DATA_FILE}")

## 2. Load Raw Data

In [None]:
# Load the raw dataset with the data_preprocessing module.
# Only the load itself sits inside the `try`: a failure while rendering the
# preview below should surface as-is rather than being reported as a
# data-loading error.
try:
    raw_df = dp.load_data(config.RAW_DATA_FILE)
except FileNotFoundError as e:
    print(e)
    print(f"\nPlease ensure the file '{config.RAW_DATA_FILE}' exists.")
    # Abort the run: nothing downstream can work without the raw data.
    raise SystemExit("Raw data file not found.") from e
except Exception as e:
    print(f"An error occurred loading data: {e}")
    raise
else:
    # Success path: show a quick preview and the column/dtype summary.
    print("\nRaw data sample:")
    display(raw_df.head())
    print("\nRaw data info:")
    raw_df.info()

## 3. Preprocess Data

In [None]:
# Run the preprocessing step on the raw data using the dedicated function.
# Only the preprocessing call is guarded, so an error while displaying the
# preview is not mislabelled as a preprocessing failure.
try:
    processed_df = dp.preprocess_data(
        raw_df,
        config.DATE_COL,
        config.TARGET_COL,
        config.FEATURE_COLS,
        config.RESAMPLE_FREQ,
    )
except Exception as e:
    print(f"An error occurred during preprocessing: {e}")
    raise
else:
    print("\nProcessed data sample:")
    display(processed_df.head())

## 4. Exploratory Data Analysis (EDA)

In [None]:
# --- Target variable over time ---
pl.plot_time_series(
    processed_df,
    config.TARGET_COL,
    save_path=os.path.join(config.IMAGE_DIR, 'target_timeseries.png'),
    ylabel=config.TARGET_COL,
    title=f'{config.TARGET_COL} Over Time ({config.RESAMPLE_FREQ} Resampled)',
)

# --- Target variable distribution ---
pl.plot_feature_distribution(
    processed_df,
    config.TARGET_COL,
    save_path=os.path.join(config.IMAGE_DIR, 'target_distribution.png'),
    title=f'Distribution of {config.TARGET_COL}',
)

# --- Wind speed distribution (only when that column is present) ---
if 'Wind Speed (m/s)' not in processed_df.columns:
    print("Skipping wind speed distribution plot - column not found.")
else:
    pl.plot_feature_distribution(
        processed_df,
        'Wind Speed (m/s)',
        save_path=os.path.join(config.IMAGE_DIR, 'windspeed_distribution.png'),
        title='Distribution of Wind Speed (m/s)',
    )

## 5. Prepare Data for Modeling

In [None]:
# --- Sequential train / validation / test split ---
# A ValueError from the splitter is reported and then re-raised.
try:
    train_df, val_df, test_df = dp.split_data_sequential(
        processed_df, config.TRAIN_SPLIT, config.VALIDATION_SPLIT
    )
except ValueError as split_error:
    print(f"Error splitting data: {split_error}")
    raise

# --- Scaling ---
# The returned scaler is kept because the evaluation cell passes it on.
train_scaled, val_scaled, test_scaled, scaler = dp.scale_data(train_df, val_df, test_df)

# --- Locate the target column in the scaled frame ---
# Its positional index is handed to create_sequences and to the evaluation step.
try:
    target_col_index = train_scaled.columns.get_loc(config.TARGET_COL)
except KeyError:
    print(f"Error: Target column '{config.TARGET_COL}' not found in scaled data columns: {train_scaled.columns}")
    raise
else:
    print(f"Index of target column '{config.TARGET_COL}' in scaled data: {target_col_index}")

# --- Build (X, y) sequence arrays for each split ---
X_train, y_train = dp.create_sequences(train_scaled, config.SEQUENCE_LENGTH, target_col_index)
X_val, y_val = dp.create_sequences(val_scaled, config.SEQUENCE_LENGTH, target_col_index)
X_test, y_test = dp.create_sequences(test_scaled, config.SEQUENCE_LENGTH, target_col_index)

# Report the shapes when every split produced at least one sequence;
# otherwise warn and carry on (later cells decide whether to train).
if all(sequences.shape[0] > 0 for sequences in (X_train, X_val, X_test)):
    print("Sequence shapes:")
    print(f"  X_train: {X_train.shape}, y_train: {y_train.shape}")
    print(f"  X_val: {X_val.shape}, y_val: {y_val.shape}")
    print(f"  X_test: {X_test.shape}, y_test: {y_test.shape}")
else:
    print("Warning: One or more sequence datasets (X_train, X_val, X_test) are empty.")
    print("This might be due to insufficient data length for the given sequence length.")
    # Optionally raise an error or stop execution
    # raise SystemExit("Cannot proceed with empty sequence datasets.")

## 6. Build and Train LSTM Model (Direct Approach)

In [None]:
# Reset the flag on every execution of this cell. Notebook variables persist
# between runs, so without this a True left over from an earlier run would let
# the evaluation cell use an outdated model after training was skipped.
model_ready = False

# Train only when both the training and validation sequence sets are non-empty.
if X_train.shape[0] == 0 or X_val.shape[0] == 0:
    print("Skipping model training due to empty training or validation sequences.")
else:
    # Define input shape for LSTM
    input_shape = (X_train.shape[1], X_train.shape[2])  # (sequence_length, n_features)
    print(f"Input shape for LSTM: {input_shape}")

    # Build the model
    lstm_model = mdl.build_lstm_model(
        input_shape,
        config.LSTM_UNITS_L1,
        config.LSTM_UNITS_L2,
        config.LEARNING_RATE,
    )

    # Train the model
    trained_model, history = mdl.train_model(
        lstm_model,
        X_train, y_train,
        X_val, y_val,
        epochs=config.EPOCHS,
        batch_size=config.BATCH_SIZE,
        model_checkpoint_path=config.BEST_LSTM_MODEL_FILE,
        early_stopping_patience=config.EARLY_STOPPING_PATIENCE,
    )

    # Plot training history
    pl.plot_learning_curves(
        history,
        title='LSTM Model Training History',
        save_path=os.path.join(config.IMAGE_DIR, 'lstm_learning_curves.png'),
    )

    # Set only after training and plotting completed without raising.
    model_ready = True
    print("Model training finished.")

if not model_ready:
    print("\nModel was not trained. Skipping evaluation and visualization.")

## 7. Evaluate Model Performance

In [None]:
# Reset the flag on every execution of this cell so a True left over from an
# earlier run cannot trigger plotting of outdated results in the next cell.
evaluation_done = False

# Evaluate only if the model was trained successfully. The membership test
# keeps this cell safe when the training cell has not been run at all.
if 'model_ready' in locals() and model_ready:
    # Evaluate on Training Data
    print("\nEvaluating on Training Set...")
    y_true_train_inv, y_pred_train_inv, metrics_train = evl.evaluate_model(
        trained_model, X_train, y_train, scaler, target_col_index, config.METRICS_TO_CALCULATE
    )

    # Evaluate on Validation Data
    print("\nEvaluating on Validation Set...")
    y_true_val_inv, y_pred_val_inv, metrics_val = evl.evaluate_model(
        trained_model, X_val, y_val, scaler, target_col_index, config.METRICS_TO_CALCULATE
    )

    # Evaluate on Test Data
    print("\nEvaluating on Test Set...")
    y_true_test_inv, y_pred_test_inv, metrics_test = evl.evaluate_model(
        trained_model, X_test, y_test, scaler, target_col_index, config.METRICS_TO_CALCULATE
    )

    # --- Display Metrics ---
    # One row per dataset; the metric dictionaries supply the columns.
    metrics_df = pd.DataFrame([
        {'Dataset': 'Train', **metrics_train},
        {'Dataset': 'Validation', **metrics_val},
        {'Dataset': 'Test', **metrics_test},
    ]).set_index('Dataset')

    # Format for better display
    metrics_df_display = (
        metrics_df.style
        .format("{:.4f}")
        .set_caption("LSTM Direct Approach Performance Metrics")
        .set_table_styles([
            {'selector': 'caption', 'props': [('font-size', '16px'), ('font-weight', 'bold')]}
        ])
    )

    print("\n--- Performance Metrics ---")
    display(metrics_df_display)  # Use display() in Jupyter for styled output

    # Save metrics to CSV. Best-effort: a failed write is reported but does
    # not stop the notebook, since the metrics are already shown above.
    metrics_output_path = config.METRICS_LSTM_DIRECT_FILE
    try:
        metrics_df.to_csv(metrics_output_path)
        print(f"\nMetrics saved to {metrics_output_path}")
    except Exception as e:
        print(f"Error saving metrics to {metrics_output_path}: {e}")

    evaluation_done = True

if not evaluation_done:
    print("\nModel evaluation was not performed. Skipping result visualization.")

## 8. Visualize Results

In [None]:
# Plot only when the evaluation cell ran to completion in this session.
if locals().get('evaluation_done', False):
    # Build the time index for each prediction series by skipping the first
    # SEQUENCE_LENGTH rows of each split; a split shorter than that gets an
    # empty index.
    # NOTE(review): assumes create_sequences yields one target per row after
    # the first SEQUENCE_LENGTH rows - confirm against its implementation.
    train_pred_index = (
        train_df.index[config.SEQUENCE_LENGTH:]
        if len(train_df) >= config.SEQUENCE_LENGTH
        else pd.Index([])
    )
    val_pred_index = (
        val_df.index[config.SEQUENCE_LENGTH:]
        if len(val_df) >= config.SEQUENCE_LENGTH
        else pd.Index([])
    )
    test_pred_index = (
        test_df.index[config.SEQUENCE_LENGTH:]
        if len(test_df) >= config.SEQUENCE_LENGTH
        else pd.Index([])
    )

    # --- Test set: actual vs predicted ---
    print("\nPlotting Actual vs Predicted for Test Set...")
    pl.plot_actual_vs_predicted(
        y_true_test_inv,
        y_pred_test_inv,
        test_pred_index,
        save_path=os.path.join(config.IMAGE_DIR, 'lstm_actual_vs_predicted_test.png'),
        ylabel=config.TARGET_COL,
        title='LSTM: Actual vs Predicted Power (Test Set)',
    )

    # --- Validation set: actual vs predicted ---
    print("\nPlotting Actual vs Predicted for Validation Set...")
    pl.plot_actual_vs_predicted(
        y_true_val_inv,
        y_pred_val_inv,
        val_pred_index,
        save_path=os.path.join(config.IMAGE_DIR, 'lstm_actual_vs_predicted_val.png'),
        ylabel=config.TARGET_COL,
        title='LSTM: Actual vs Predicted Power (Validation Set)',
    )

    # --- Optional: Plot for training set (might be crowded) ---
    # print("\nPlotting Actual vs Predicted for Training Set...")
    # pl.plot_actual_vs_predicted(
    #     y_true_train_inv,
    #     y_pred_train_inv,
    #     train_pred_index, # Use the adjusted index
    #     title='LSTM: Actual vs Predicted Power (Train Set)',
    #     ylabel=config.TARGET_COL,
    #     save_path=os.path.join(config.IMAGE_DIR, 'lstm_actual_vs_predicted_train.png')
    # )

## 9. Conclusion

The LSTM model (Direct Approach) shows the performance detailed above. Key findings and potential next steps include:

* **(Summarize key performance metrics from the test set)**
* **(Comment on the visual fit in the Actual vs Predicted plot)**
* Implementing and comparing with Approach 2 (Indirect RF+LSTM).
* More extensive hyperparameter tuning (e.g., units, sequence length, learning rate, batch size).
* Exploring different model architectures or features (e.g., adding weather forecasts if available).