# Step 4: Results Analysis and Reporting

This notebook covers the analysis of model results from Step 3, generation of visualizations, and preparation for integrating findings into the final LaTeX report.

## 1. Environment Setup and Library Imports

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import tikzplotlib # For LaTeX compatible plots
import os
from sklearn.metrics import roc_auc_score
import joblib # For loading saved models if needed for feature importance

# Project layout: the notebook is expected to run from the 'notebooks' subdirectory,
# so the project root is the parent of the current working directory.
# BASE_DIR keeps a trailing path separator (joined with an empty component).
BASE_DIR = os.path.join(os.path.split(os.getcwd())[0], '')
RESULTS_PATH = os.path.join(BASE_DIR, 'results')
FIGURES_PATH = os.path.join(BASE_DIR, 'reports', 'figures')
MODELS_PATH = os.path.join(BASE_DIR, 'models', 'saved_models')

# Figures are written by later cells; create the output directory up front.
os.makedirs(FIGURES_PATH, exist_ok=True)

# Echo the resolved locations so a wrong working directory is easy to spot.
print(
    f"Base Directory: {BASE_DIR}",
    f"Results Path: {RESULTS_PATH}",
    f"Figures Path: {FIGURES_PATH}",
    sep="\n",
)

## 2. Load and Review Model Results

In [None]:
# Load the artefacts written by Step 3. Each loader falls back to an empty
# DataFrame on failure so that later cells (which guard on `.empty`) skip
# gracefully instead of raising NameError on an undefined variable.

# Load model comparison metrics (e.g., RMSE, MAE, R2)
try:
    comparison_df = pd.read_csv(os.path.join(RESULTS_PATH, "model_comparison.csv"))
except FileNotFoundError:
    print(f"Error: model_comparison.csv not found in {RESULTS_PATH}. Please run Step 3 first.")
    comparison_df = pd.DataFrame() # Create empty df to avoid later errors
except (pd.errors.EmptyDataError, pd.errors.ParserError) as e:
    # A zero-byte or malformed file would otherwise abort the cell and leave
    # comparison_df undefined for every downstream cell.
    print(f"Error: model_comparison.csv in {RESULTS_PATH} could not be parsed ({e}). Please re-run Step 3.")
    comparison_df = pd.DataFrame()
else:
    print("--- Model Performance Comparison (Metrics) ---")
    print(comparison_df)

# Load model predictions
try:
    predictions_df = pd.read_csv(os.path.join(RESULTS_PATH, "model_predictions.csv"))
except FileNotFoundError:
    print(f"Error: model_predictions.csv not found in {RESULTS_PATH}. Please run Step 3 first.")
    predictions_df = pd.DataFrame() # Create empty df
except (pd.errors.EmptyDataError, pd.errors.ParserError) as e:
    print(f"Error: model_predictions.csv in {RESULTS_PATH} could not be parsed ({e}). Please re-run Step 3.")
    predictions_df = pd.DataFrame()
else:
    # Ensure Date column is parsed as datetime if it's not already
    if 'Date' in predictions_df.columns:
        predictions_df['Date'] = pd.to_datetime(predictions_df['Date'])
    print("\n--- Model Predictions (First 5 rows) ---")
    print(predictions_df.head())

### Review of Metrics
*(Markdown cell for observations)*

- Examine the `comparison_df` to identify top-performing models based on RMSE, MAE, R2.
- Note any significant differences in performance.

## 3. Generate Visualizations

### 3.1 Model Comparison Plot (RMSE)

In [None]:
# Bar chart of RMSE per model, ordered from lowest to highest RMSE, exported
# both as a PNG and as a TikZ file for the LaTeX report.
if comparison_df.empty or 'RMSE' not in comparison_df.columns or 'Model' not in comparison_df.columns:
    print("Skipping RMSE comparison plot: comparison_df is empty or missing 'RMSE'/'Model' columns.")
else:
    fig, ax = plt.subplots(figsize=(10, 6))

    # Ascending order puts the best-performing model on the left.
    sorted_comparison_df = comparison_df.sort_values(by='RMSE', ascending=True)
    sns.barplot(data=sorted_comparison_df, x='Model', y='RMSE', ax=ax)

    ax.set_title('Model Comparison: Root Mean Squared Error (RMSE)')
    ax.set_xlabel('Model')
    ax.set_ylabel('RMSE (mm)')
    # Slanted, right-aligned tick labels keep long model names readable.
    plt.setp(ax.get_xticklabels(), rotation=45, ha='right')
    fig.tight_layout()

    # Raster export.
    rmse_comparison_png_path = os.path.join(FIGURES_PATH, "model_comparison_rmse.png")
    fig.savefig(rmse_comparison_png_path, dpi=300)
    print(f"RMSE comparison plot saved to {rmse_comparison_png_path}")

    # TikZ export is best-effort: a failure is reported but must not stop the notebook.
    rmse_comparison_tex_path = os.path.join(FIGURES_PATH, "model_comparison_rmse.tex")
    try:
        tikzplotlib.save(rmse_comparison_tex_path)
    except Exception as e:
        print(f"Could not save TikZ plot for RMSE comparison: {e}")
    else:
        print(f"RMSE comparison plot saved for LaTeX: {rmse_comparison_tex_path}")
    plt.show()

### 3.2 Predictions vs Actual Plot (Best Model)

In [None]:
# Time-series plot of actual vs. predicted rainfall for the model with the lowest RMSE.
# Both 'RMSE' (for ranking) and 'Model' (for the name lookup) must be present in
# comparison_df; checking only 'RMSE' would raise KeyError on the 'Model' access below.
if (
    not comparison_df.empty
    and not predictions_df.empty
    and {'RMSE', 'Model'}.issubset(comparison_df.columns)
):
    # Determine the best model based on the lowest RMSE
    best_model_name = comparison_df.sort_values(by='RMSE').iloc[0]['Model']
    print(f"Best model based on RMSE: {best_model_name}")

    # Predictions for each model are stored in a '<Model>_pred' column.
    best_pred_column = f'{best_model_name}_pred'

    if {'Actual', 'Date', best_pred_column}.issubset(predictions_df.columns):
        actual_values = predictions_df['Actual']
        predicted_values = predictions_df[best_pred_column]
        dates = predictions_df['Date']

        plt.figure(figsize=(14, 7))
        plt.plot(dates, actual_values, label='Actual Rainfall', color='blue', marker='.', linestyle='-')
        plt.plot(dates, predicted_values, label=f'Predicted Rainfall ({best_model_name})', color='orange', linestyle='--')
        plt.title(f'Actual vs. Predicted Rainfall ({best_model_name})')
        plt.xlabel('Date')
        plt.ylabel('Precipitation (mm)')
        plt.legend()
        plt.grid(True)
        plt.tight_layout()

        # Save as PNG
        pred_actual_png_path = os.path.join(FIGURES_PATH, "predictions_vs_actual.png")
        plt.savefig(pred_actual_png_path, dpi=300)
        print(f"Predictions vs Actual plot saved to {pred_actual_png_path}")

        # Save as TikZ for LaTeX (best-effort: report the failure and carry on)
        pred_actual_tex_path = os.path.join(FIGURES_PATH, "predictions_vs_actual.tex")
        try:
            tikzplotlib.save(pred_actual_tex_path)
            print(f"Predictions vs Actual plot saved for LaTeX: {pred_actual_tex_path}")
        except Exception as e:
            print(f"Could not save TikZ plot for Predictions vs Actual: {e}")
        plt.show()
    else:
        print(f"Skipping Predictions vs Actual plot: Columns for best model '{best_model_name}' or 'Actual' or 'Date' not found in predictions_df.")
else:
    print("Skipping Predictions vs Actual plot: comparison_df or predictions_df is empty or 'RMSE'/'Model' columns missing.")

### 3.3 Feature Importance Plot (Example for a loaded model)

This requires loading a saved model (e.g., RandomForest or XGBoost) and its corresponding feature names. 
The `train_models.py` script would need to ensure feature names are available or saved alongside models.
For now, this is a placeholder. You might need to adapt `src/visualization/visualize.py` or add logic here.

In [None]:
# Placeholder for Feature Importance Plot
# TODO: The sketch below is intentionally commented out, so this cell currently only
# prints a notice. Enabling it requires a saved model file under MODELS_PATH and the
# list of feature names the model was trained on.
# NOTE(review): joblib.load unpickles the file - only load model files from a trusted source.
# NOTE(review): the bar/xtick calls assume len(feature_names) == len(importances);
# confirm the column filter below matches the training feature set before enabling.
# Example: Assuming 'RandomForest' was the best and its features are known
# best_model_name_for_fi = 'RandomForest' # Or XGBoost, etc.
# try:
#     model_path = os.path.join(MODELS_PATH, f"{best_model_name_for_fi.lower()}_model.pkl")
#     loaded_model = joblib.load(model_path)
#     print(f"Loaded model {best_model_name_for_fi} from {model_path}")
    
#     # You need the feature names used for training this model
#     # This might come from the 'engineered_features.csv' columns (excluding target/date)
#     # Or saved separately during training
#     engineered_df = pd.read_csv(os.path.join(BASE_DIR, 'data', 'processed', 'engineered_features.csv'))
#     feature_names = [col for col in engineered_df.columns if col not in ['Date', 'Precipitation_mm', 'Year', 'Week_Number']]
    
#     if hasattr(loaded_model, 'feature_importances_'):
#         importances = loaded_model.feature_importances_
#         sorted_indices = np.argsort(importances)[::-1]
        
#         plt.figure(figsize=(10, 8))
#         plt.title(f'Feature Importance for {best_model_name_for_fi}')
#         plt.bar(range(len(feature_names)), importances[sorted_indices], align='center')
#         plt.xticks(range(len(feature_names)), np.array(feature_names)[sorted_indices], rotation=90)
#         plt.tight_layout()
#         fi_png_path = os.path.join(FIGURES_PATH, "feature_importance.png")
#         plt.savefig(fi_png_path, dpi=300)
#         print(f"Feature importance plot saved to {fi_png_path}")
#         plt.show()
#     else:
#         print(f"Model {best_model_name_for_fi} does not have 'feature_importances_' attribute.")
# except FileNotFoundError:
#     print(f"Model file for {best_model_name_for_fi} not found. Skipping feature importance plot.")
# except Exception as e:
#     print(f"Error generating feature importance plot: {e}")
print("Feature importance plot generation is currently a placeholder.")

## 4. Analyze Results and Document Insights

### 4.1 Model Comparison Insights
*(Markdown cell for observations)*

- Which model performed best overall based on regression metrics (RMSE, MAE, R2)?
- Were there any surprising results?
- How do the models compare in terms of complexity vs. performance?

### 4.2 Classification Metrics (AUC-ROC Example)

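If the project includes a classification task (e.g., predicting rain/no-rain), calculate AUC-ROC. The cell below treats an observed value (the `Actual` column of `model_predictions.csv`) greater than 0.1 mm as a rain event and uses the best model's continuous rainfall prediction as the ranking score.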

In [None]:
# AUC-ROC for the best (lowest-RMSE) model, treating its continuous rainfall
# prediction as a ranking score for the binary "rain event" label.
# Both 'RMSE' and 'Model' must be present in comparison_df; the original guard
# checked only 'RMSE' and would raise KeyError on the 'Model' lookup.
rain_threshold_mm = 0.1  # observed precipitation above this counts as a rain event

if (
    not comparison_df.empty
    and not predictions_df.empty
    and {'RMSE', 'Model'}.issubset(comparison_df.columns)
):
    best_model_name = comparison_df.sort_values(by='RMSE').iloc[0]['Model']
    best_pred_column = f'{best_model_name}_pred'

    if 'Actual' in predictions_df.columns and best_pred_column in predictions_df.columns:
        # Score only rows where both the observation and the prediction exist:
        # `NaN > threshold` is False, so a missing observation would otherwise be
        # counted as "no rain", and NaN scores make roc_auc_score raise.
        scored_rows = predictions_df[['Actual', best_pred_column]].dropna()
        dropped_count = len(predictions_df) - len(scored_rows)
        if dropped_count:
            print(f"Ignoring {dropped_count} row(s) with missing 'Actual' or prediction values for AUC-ROC.")

        y_test_binary = (scored_rows['Actual'] > rain_threshold_mm).astype(int)

        # For regression models, their continuous predictions can be used as scores.
        # Higher predicted rainfall amount implies higher likelihood of 'rain' class.
        # Negative predictions (possible for models like ARIMA) are clipped at 0.
        y_pred_scores = np.maximum(0, scored_rows[best_pred_column])

        if y_test_binary.nunique() > 1: # AUC is undefined unless both classes occur
            try:
                auc_roc = roc_auc_score(y_test_binary, y_pred_scores)
                print(f"\nAUC-ROC for {best_model_name} (treating regression output as classification score): {auc_roc:.4f}")

                # Add to comparison_df if not already there from train_models.py
                if 'AUC_ROC' not in comparison_df.columns:
                    comparison_df['AUC_ROC'] = np.nan
                comparison_df.loc[comparison_df['Model'] == best_model_name, 'AUC_ROC'] = auc_roc
                print("Updated comparison_df with AUC-ROC for best model:")
                print(comparison_df)
            except ValueError as e:
                print(f"Could not calculate AUC-ROC for {best_model_name}: {e}")
        else:
            print(f"Skipping AUC-ROC for {best_model_name}: y_test_binary has only one class.")
    else:
        print(f"Could not calculate AUC-ROC: Prediction column for {best_model_name} or 'Actual' column not found.")
else:
    print("Skipping AUC-ROC calculation: comparison_df or predictions_df is empty or 'RMSE'/'Model' columns missing.")

### 4.3 Overall Summary and Recommendations
*(Markdown cell for final thoughts)*

- Summarize the key findings from the model training and evaluation.
- Discuss any limitations of the current models or analysis.
- Propose next steps or future improvements (e.g., trying more complex models, further feature engineering, hyperparameter optimization for other models).

## 5. Prepare for LaTeX Report Update

The generated figures (`.png` and `.tex` files) in `reports/figures/` can now be integrated into `reports/latex/rainfall_forecasting_report.tex`.

Example LaTeX code for including a figure:
```latex
\begin{figure}[h!]
    \centering
    % For PNG images
    % \includegraphics[width=0.8\textwidth]{../figures/model_comparison_rmse.png}
    % For TikZ/PGFPlots (preferred for quality)
    \input{../figures/model_comparison_rmse.tex} 
    \caption{Comparison of Model Performance Based on RMSE}
    \label{fig:model_comparison_rmse}
\end{figure}
```
Remember to update the main LaTeX file (`rainfall_forecasting_report.tex` or `expanded_report.tex`) with these figures and the textual analysis derived from this notebook.

## 6. Finalize and Save Notebook

Ensure all cells have been run and outputs are visible. Save the notebook.