In [2]:
# --- Notebook Setup and Imports ---
# Enable autoreloading of imported modules
# NOTE: re-running this cell in a live kernel makes %load_ext print
# "The autoreload extension is already loaded" (visible in this cell's saved
# output). The notice is informational; the extension stays active.
%load_ext autoreload
%autoreload 2

import sys
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import logging

# --- Configure Logging ---
# Single logging setup for the notebook. logging.basicConfig() does nothing
# when the root logger already has handlers, so re-running this cell neither
# stacks handlers nor changes the format chosen on the first run.
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

# --- Import Paths ---
# Project root is the parent of the current working directory; putting it on
# sys.path enables absolute imports from 'src'.
project_root = os.path.abspath(os.path.join(os.getcwd(), '..'))

# Directory expected to contain 'courselib': an 'AppliedML' folder that sits
# next to the project root (presumably a sibling checkout; verify locally).
courselib_parent_path = os.path.abspath(os.path.join(project_root, "..", "AppliedML"))

# Prepend each path once. The order matters: courselib_parent_path is inserted
# last, so it ends up ahead of project_root when both are newly added.
for _import_path in (project_root, courselib_parent_path):
    if _import_path not in sys.path:
        sys.path.insert(0, _import_path)

# --- Import Project-Specific Modules ---
from src.data.data_processor import AirQualityProcessor
from src.features.feature_engineer import LagFeatureEngineer
from src.models.train_model import ModelEvaluator
from src.models.model_selection import evaluate_lag_depth_effect
# NOTE(review): `plot_error_by_time_group` is imported from both
# src.visualization.analysis and src.visualization.plotting. The second import
# rebinds the name, so the version from `plotting` is the one in scope after
# this cell runs. Confirm which implementation is intended and drop or alias
# the other.
from src.visualization.analysis import (
    plot_acf_pacf, 
    plot_error_by_time_group
)
from src.visualization.plotting import (
    plot_time_series,
    plot_residuals,
    plot_coefficients,
    plot_predictions_vs_actual,
    plot_error_by_time_group,
    plot_lag_depth_results
)
from src.utils.config import TARGET_POLLUTANT, START_DATE, END_DATE, LAG_DEPTH # LAG_DEPTH for initial setup example

# --- Import External Libraries for Models ---
from courselib.models.linear_models import LinearRegression
from courselib.optimizers import GDOptimizer
from sklearn.linear_model import Ridge # Example scikit-learn model

# For better display of DataFrames in Jupyter
from IPython.display import display

logger.info("All necessary modules and libraries imported.")

2025-07-15 21:31:04,840 - INFO - All necessary modules and libraries imported.


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [3]:
# --- Configuration ---

# --- Pipeline Parameters ---
# Data selection, mirrored from src/utils/config.py so the notebook has its own
# clearly-prefixed names.
PIPELINE_TARGET_POLLUTANT = TARGET_POLLUTANT
PIPELINE_START_DATE = START_DATE
PIPELINE_END_DATE = END_DATE

# Lag depths covered by the comprehensive evaluation (example values).
PIPELINE_TEST_LAGS = [1, 2, 3, 6, 12, 24, 48]


def _courselib_lr_config(learning_rate):
    """Return the config entry for a courselib LinearRegression trained with GDOptimizer.

    Every call creates a new optimizer instance and a new ``fit_params`` dict,
    so entries never share mutable objects.
    """
    return {
        "model_class": LinearRegression,
        "is_courselib_model": True,
        "optimizer": GDOptimizer(learning_rate=learning_rate),
        "fit_params": {"num_epochs": 2000, "batch_size": 32},
    }


def _ridge_config(alpha):
    """Return the config entry for a scikit-learn Ridge model with the given ``alpha``."""
    return {
        "model_class": Ridge,
        "is_courselib_model": False,
        "init_params": {"alpha": alpha},
    }


# Model configurations for the comprehensive evaluation, keyed by display name.
# NOTE(review): the entry layout is presumably what src/models/model_selection.py
# and the training code expect; confirm against those modules.
PIPELINE_MODEL_CONFIGS = {
    "Courselib LR (GD 0.0001)": _courselib_lr_config(0.0001),
    "Courselib LR (GD 0.001)": _courselib_lr_config(0.001),
    "Scikit-learn Ridge (Alpha 1.0)": _ridge_config(1.0),
    "Scikit-learn Ridge (Alpha 0.1)": _ridge_config(0.1),
}

logger.info("Pipeline configuration loaded.")

2025-07-15 21:32:48,993 - INFO - Pipeline configuration loaded.


In [None]:
# --- Data Loading and Initial Exploration ---
# Builds the AirQualityProcessor for the configured pollutant and date range,
# pulls the target time series, and (when it is non-empty) shows a preview,
# summary statistics, the raw series plot and the ACF/PACF plots.

logger.info("--- Data Loading and Initial Exploration ---")

try:
    processor = AirQualityProcessor(
        target_pollutant=PIPELINE_TARGET_POLLUTANT,
        start_date=PIPELINE_START_DATE,
        end_date=PIPELINE_END_DATE
    )
    time_series = processor.get_target_time_series()

    if time_series.empty:
        # Nothing to explore; report it and skip the previews and plots.
        logger.error("Initial time series data is empty. Please check data source and dates.")
    else:
        logger.info(f"Loaded time series data for {PIPELINE_TARGET_POLLUTANT} "
                    f"from {PIPELINE_START_DATE} to {PIPELINE_END_DATE}. Shape: {time_series.shape}")

        print("\nFirst 5 rows of the time series:")
        display(time_series.head())
        print("\nTime series basic info:")
        time_series.info()
        print("\nTime series descriptive statistics:")
        display(time_series.describe())

        # Plot Initial Time Series
        plot_time_series(time_series, title=f"{PIPELINE_TARGET_POLLUTANT} Time Series",
                         ylabel=f"{PIPELINE_TARGET_POLLUTANT} (µg/m³)")

        # Plot Initial ACF/PACF Analysis
        plot_acf_pacf(time_series, lags=48)

except Exception:
    # Log the traceback, then re-raise. Swallowing the error would let later
    # cells run with `time_series` undefined, or still bound to the result of
    # an earlier successful run, which hides the real failure.
    logger.exception("An error occurred during data loading or initial visualization.")
    raise