### Dublin Air

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from statsmodels.tsa.arima.model import ARIMA
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.preprocessing import MinMaxScaler, StandardScaler
import warnings
warnings.filterwarnings('ignore')

# Plot styling shared by every figure drawn in this cell.
plt.style.use('seaborn-v0_8-whitegrid')
sns.set_palette("muted")
plt.rcParams['figure.figsize'] = (12, 6)

# Read the Dublin road-level measurements from the Kaggle input directory.
df = pd.read_csv('/kaggle/input/dublinair/airview_dublincity_roaddata_ugm3.csv')

# Quick overview of what was loaded.
print(f"Loaded data with {df.shape[0]} rows and {df.shape[1]} columns")
print("\nDataset Information:")
print(f"Memory usage: {df.memory_usage().sum() / 1024**2:.2f} MB")
print("\nFirst 5 rows:")
print(df.head())

# Pollutant columns are recognised purely by their unit suffix.
pollutant_columns = [
    col for col in df.columns
    if col.endswith(('_ugm3', '_mgm3'))
]
print(f"\nFound {len(pollutant_columns)} pollutant columns: {pollutant_columns}")

# Per-column count of missing readings.
missing_values = df[pollutant_columns].isna().sum()
print("\nMissing values in pollutant columns:")
print(missing_values)

def normalize_series(series, method='minmax'):
    """Scale a pandas Series with a freshly fitted scikit-learn scaler.

    Args:
        series: Series to scale; its index is kept on the result.
        method: ``'minmax'`` for ``MinMaxScaler``, ``'standard'`` for
            ``StandardScaler``. Any other value disables scaling.

    Returns:
        ``(scaled_series, fitted_scaler)``, or ``(series, None)`` with the
        input returned untouched when ``method`` is not recognised.
    """
    # Scalers expect a 2-D column, so reshape up front.
    column = series.values.reshape(-1, 1)

    if method not in ('minmax', 'standard'):
        return series, None

    scaler = MinMaxScaler() if method == 'minmax' else StandardScaler()
    scaled = scaler.fit_transform(column)
    return pd.Series(scaled.flatten(), index=series.index), scaler

def run_arima_analysis(data, column_name, order=(1,1,1), test_size=0.2, normalize_method=None):
    """Fit one ARIMA model to a column and score its forecast on a hold-out tail.

    The column is coerced to numeric with NaNs dropped, split by row position
    into a leading training part and a trailing test part, optionally
    normalized through ``normalize_series``, and forecast over the whole test
    span. Metrics are computed on the original scale: forecasts of a
    normalized model are inverse-transformed before scoring.

    Args:
        data: DataFrame holding the column.
        column_name: Name of the column to model.
        order: ARIMA ``(p, d, q)`` order.
        test_size: Fraction of the cleaned series held out for testing.
        normalize_method: Method name forwarded to ``normalize_series``, or
            ``None`` to model the raw values. A name ``normalize_series``
            does not recognise is reported and treated as ``None``.

    Returns:
        A dict with keys ``column``, ``order``, ``normalize_method``, ``mae``,
        ``mse``, ``rmse``, ``neg_log_likelihood``, ``model``, ``forecasts``
        and ``actual``; or ``None`` when fewer than 30 usable values remain
        or when fitting, scoring or plotting raises.
    """
    series = pd.to_numeric(data[column_name].dropna(), errors='coerce').dropna()

    if len(series) < 30:
        print(f"Not enough data points for {column_name}. Skipping.")
        return None

    original_series = series.copy()
    scaler = None

    if normalize_method:
        series, scaler = normalize_series(series, normalize_method)
        if scaler is None:
            # normalize_series hands the data back unchanged for unknown
            # methods; do not label the run as normalized in that case.
            print(f"Unknown normalization method '{normalize_method}' for {column_name}; using raw values")
            normalize_method = None
        else:
            print(f"Applied {normalize_method} normalization to {column_name}")

    # Positional split (.iloc) so the result never depends on index labels.
    train_size = int(len(series) * (1 - test_size))
    train = series.iloc[:train_size].values
    test = series.iloc[train_size:].values
    original_test = original_series.iloc[train_size:].values if scaler is not None else None

    print(f"\nAnalyzing {column_name} with ARIMA{order}")
    print(f"Training data size: {len(train)}")
    print(f"Test data size: {len(test)}")

    try:
        model_fit = ARIMA(train, order=order).fit()
        forecasts = np.asarray(model_fit.forecast(steps=len(test)))

        if scaler is not None:
            # Score on the original scale so runs are comparable across methods.
            forecasts_for_plot = scaler.inverse_transform(forecasts.reshape(-1, 1)).flatten()
            actual_for_plot = original_test
        else:
            forecasts_for_plot = forecasts
            actual_for_plot = test

        mae = mean_absolute_error(actual_for_plot, forecasts_for_plot)
        mse = mean_squared_error(actual_for_plot, forecasts_for_plot)
        rmse = np.sqrt(mse)
        neg_log_likelihood = -model_fit.llf

        print(f"Results for {column_name} - ARIMA{order}:")
        print(f"MAE: {mae:.4f}")
        print(f"MSE: {mse:.4f}")
        print(f"RMSE: {rmse:.4f}")
        print(f"Negative Log-Likelihood: {neg_log_likelihood:.4f}")

        fig = plt.figure(figsize=(12, 6))
        try:
            plt.plot(actual_for_plot, label='Actual', color='blue', marker='o', markersize=4, linestyle='-', linewidth=1)
            plt.plot(forecasts_for_plot, label='Forecast', color='red', marker='x', markersize=4, linestyle='--', linewidth=1)
            norm_text = f" ({normalize_method} normalized)" if normalize_method else ""
            plt.title(f'ARIMA{order} Forecast vs Actual for {column_name}{norm_text}')
            plt.xlabel('Test Sample Index')
            plt.ylabel('Value')
            plt.legend()
            plt.grid(True)
            plt.tight_layout()
            plt.savefig(f"{column_name.replace('/', '_')}_{normalize_method}_arima_forecast.png")
            plt.show()
        finally:
            # Callers run this in a loop; release the figure so open figures
            # do not accumulate on non-interactive backends.
            plt.close(fig)

        return {
            'column': column_name,
            'order': order,
            'normalize_method': normalize_method,
            'mae': mae,
            'mse': mse,
            'rmse': rmse,
            'neg_log_likelihood': neg_log_likelihood,
            'model': model_fit,
            'forecasts': forecasts_for_plot,
            'actual': actual_for_plot
        }

    except Exception as e:
        print(f"Error fitting ARIMA for {column_name}: {str(e)}")
        return None

# Run the default ARIMA once per pollutant and normalization setting.
# Keys are "<column>" for raw runs and "<column>_<method>" otherwise.
results = {}
for column in pollutant_columns:
    for norm_method in [None, 'minmax', 'standard']:
        norm_key = f"{column}_{norm_method}" if norm_method else column
        result = run_arima_analysis(df, column, normalize_method=norm_method)
        if result is not None:
            results[norm_key] = result

if results:
    # Build the table in a single constructor call: growing a frame with
    # pd.concat per row is quadratic, and starting from an empty object-dtype
    # frame makes the metric dtypes depend on deprecated concat behaviour.
    metrics_df = pd.DataFrame(
        [
            {
                'Pollutant': run['column'],
                'Normalization': run['normalize_method'] if run['normalize_method'] else 'None',
                'ARIMA_Order': str(run['order']),
                'MAE': run['mae'],
                'MSE': run['mse'],
                'RMSE': run['rmse'],
                'Neg_Log_Likelihood': run['neg_log_likelihood'],
            }
            for run in results.values()
        ],
        columns=[
            'Pollutant', 'Normalization', 'ARIMA_Order',
            'MAE', 'MSE', 'RMSE', 'Neg_Log_Likelihood',
        ],
    )

    print("\nMetrics Summary:")
    print(metrics_df)

    metrics_df.to_csv('arima_metrics_results.csv', index=False)
    print("Results saved to 'arima_metrics_results.csv'")

    # One bar chart per metric, arranged on a 2x2 grid.
    metric_panels = [
        ('RMSE', 'RMSE by Pollutant and Normalization'),
        ('MAE', 'MAE by Pollutant and Normalization'),
        ('MSE', 'MSE by Pollutant and Normalization'),
        ('Neg_Log_Likelihood', 'Negative Log-Likelihood by Pollutant and Normalization'),
    ]

    plt.figure(figsize=(14, 8))
    for panel_index, (metric_name, panel_title) in enumerate(metric_panels, start=1):
        plt.subplot(2, 2, panel_index)
        sns.barplot(x='Pollutant', y=metric_name, hue='Normalization', data=metrics_df)
        plt.title(panel_title)
        plt.xticks(rotation=45)

    plt.tight_layout()
    plt.savefig('arima_metrics_normalization_comparison.png')
    plt.show()
else:
    print("No successful ARIMA models to report.")

def find_best_arima_model(data, column_name, p_range=(0,2), d_range=(0,2), q_range=(0,2), normalize_method=None):
    """Grid-search ARIMA orders for one column, keeping the lowest test RMSE.

    The column is coerced to numeric with NaNs dropped and split 80/20 by
    position. Every ``(p, d, q)`` in the inclusive ranges is fitted on the
    training part and scored on the test part; when a scaler is in use the
    forecasts are inverse-transformed and scored against the raw test values.
    An RMSE heatmap over the grid and a forecast plot for the winner are
    saved and shown.

    Args:
        data: DataFrame holding the column.
        column_name: Name of the column to model.
        p_range, d_range, q_range: Inclusive ``(low, high)`` bounds.
        normalize_method: Method name passed to ``normalize_series`` or
            ``None`` for raw values.

    Returns:
        ``(best_result, grid_df)`` where ``best_result`` is a dict describing
        the winning fit and ``grid_df`` holds one row per successful order;
        ``(None, None)`` when fewer than 30 usable values remain or no order
        could be fitted.
    """
    cleaned = data[column_name].dropna()
    cleaned = pd.to_numeric(cleaned, errors='coerce')
    cleaned = cleaned.dropna()

    if len(cleaned) < 30:
        print(f"Not enough data points for {column_name}. Skipping.")
        return None, None

    raw_values = cleaned.copy()
    scaler = None
    if normalize_method:
        cleaned, scaler = normalize_series(cleaned, normalize_method)

    split_at = int(len(cleaned) * 0.8)
    train = cleaned[:split_at].values
    test = cleaned[split_at:].values
    raw_test = raw_values.iloc[split_at:].values if normalize_method else None

    print(f"\nFinding best ARIMA model for {column_name}")
    print(f"Training data size: {len(train)}")
    print(f"Test data size: {len(test)}")

    grid_results = []
    best_rmse = float('inf')
    best_model = None
    best_order = None
    best_result = None

    # Same visiting order as three nested loops: p outermost, q innermost.
    candidate_orders = [
        (p, d, q)
        for p in range(p_range[0], p_range[1] + 1)
        for d in range(d_range[0], d_range[1] + 1)
        for q in range(q_range[0], q_range[1] + 1)
    ]

    for order in candidate_orders:
        p, d, q = order
        try:
            model_fit = ARIMA(train, order=order).fit()
            forecasts = model_fit.forecast(steps=len(test))

            if scaler:
                # Compare on the original scale.
                predicted = scaler.inverse_transform(forecasts.reshape(-1, 1)).flatten()
                observed = raw_test
            else:
                predicted = forecasts
                observed = test

            mae = mean_absolute_error(observed, predicted)
            mse = mean_squared_error(observed, predicted)
            rmse = np.sqrt(mse)
            neg_log_likelihood = -model_fit.llf

            print(f"ARIMA{order} - RMSE: {rmse:.4f}, NegLogLike: {neg_log_likelihood:.4f}")

            grid_results.append({
                'p': p,
                'd': d,
                'q': q,
                'RMSE': rmse,
                'MAE': mae,
                'MSE': mse,
                'Neg_Log_Likelihood': neg_log_likelihood
            })

            if rmse < best_rmse:
                best_rmse = rmse
                best_model = model_fit
                best_order = order
                best_result = {
                    'column': column_name,
                    'order': order,
                    'normalize_method': normalize_method,
                    'mae': mae,
                    'mse': mse,
                    'rmse': rmse,
                    'neg_log_likelihood': neg_log_likelihood,
                    'model': model_fit,
                    'forecasts': predicted,
                    'actual': observed
                }
        except Exception as e:
            print(f"Error fitting ARIMA{order}: {str(e)}")

    grid_df = pd.DataFrame(grid_results)
    norm_text = f" ({normalize_method} normalized)" if normalize_method else ""
    file_stem = f"{column_name.replace('/', '_')}_{normalize_method}"

    if not grid_df.empty:
        plt.figure(figsize=(15, 10))
        rmse_grid = grid_df.pivot_table(
            index='p', columns=['d', 'q'], values='RMSE'
        )
        sns.heatmap(rmse_grid, annot=True, fmt='.2f', cmap='coolwarm')
        plt.title(f'RMSE for Different ARIMA Parameters - {column_name}{norm_text}')
        plt.tight_layout()
        plt.savefig(f"{file_stem}_parameter_grid.png")
        plt.show()

    if not best_model:
        print(f"No successful models for {column_name}")
        return None, None

    print(f"\nBest model for {column_name}: ARIMA{best_order}")
    print(f"MAE: {best_result['mae']:.4f}")
    print(f"MSE: {best_result['mse']:.4f}")
    print(f"RMSE: {best_result['rmse']:.4f}")
    print(f"Negative Log-Likelihood: {best_result['neg_log_likelihood']:.4f}")

    plt.figure(figsize=(12, 6))
    plt.plot(best_result['actual'], 'b-o', markersize=4, label='Actual')
    plt.plot(best_result['forecasts'], 'r--x', markersize=4, label='Forecast')
    plt.title(f"Best ARIMA{best_order} for {column_name}{norm_text}")
    plt.legend()
    plt.grid(True)
    plt.tight_layout()
    plt.savefig(f"{file_stem}_best_arima.png")
    plt.show()

    print("\nBest model summary:")
    print(best_model.summary())

    return best_result, grid_df

# Grid-search NO2 once per normalization setting. The two names assigned in
# the loop are rebound on every pass, so they end up holding the last run.
pollutant_to_optimize = 'NO2_ugm3'
normalization_settings = (None, 'minmax', 'standard')
for norm_method in normalization_settings:
    best_model_result, grid_results = find_best_arima_model(
        df,
        pollutant_to_optimize,
        normalize_method=norm_method,
    )

### Seoul Air

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from statsmodels.tsa.arima.model import ARIMA
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.preprocessing import MinMaxScaler, StandardScaler
import os
import glob
import warnings
warnings.filterwarnings('ignore')

plt.style.use('seaborn-v0_8-whitegrid')
sns.set_palette("muted")
plt.rcParams['figure.figsize'] = (12, 6)

def load_seoul_data():
    """Load the Seoul measurement summary CSV from the Kaggle input directory.

    The two known locations of ``Measurement_summary.csv`` are tried first.
    If neither exists, the dataset directory is searched recursively for CSV
    files. Should the first attempt raise, the search is repeated once and
    the first hit is loaded.

    Returns:
        The loaded DataFrame.

    Raises:
        ValueError: If both attempts fail; the underlying error is attached
            as ``__cause__``.
    """

    def find_csv_candidates():
        # glob order follows the filesystem, so sort for a deterministic pick
        # and put files named Measurement_summary.csv first.
        found = glob.glob('../input/air-pollution-in-seoul/**/*.csv', recursive=True)
        return sorted(
            found,
            key=lambda path: (os.path.basename(path) != 'Measurement_summary.csv', path),
        )

    try:
        base_path = '../input/air-pollution-in-seoul/AirPollutionSeoul'

        measurement_path = os.path.join(base_path, 'Original Data/Measurement_summary.csv')
        if not os.path.exists(measurement_path):
            measurement_path = os.path.join(base_path, 'Measurement_summary.csv')

        if not os.path.exists(measurement_path):
            csv_files = find_csv_candidates()
            if csv_files:
                measurement_path = csv_files[0]
                print(f"Using {measurement_path}")

        # If nothing was found the path is still missing and read_csv raises,
        # which sends control to the fallback below.
        df = pd.read_csv(measurement_path)

        print(f"Loaded measurement data with {df.shape[0]} rows and {df.shape[1]} columns")
        print(f"Available columns: {', '.join(df.columns)}")

        return df
    except Exception as e:
        print(f"Error loading Seoul data: {str(e)}")
        try:
            possible_paths = find_csv_candidates()
            if not possible_paths:
                raise ValueError("No CSV files found")
            print(f"Found these potential CSV files: {possible_paths}")
            df = pd.read_csv(possible_paths[0])
            print(f"Loaded {possible_paths[0]} with {df.shape[0]} rows")
            return df
        except Exception as e2:
            print(f"Secondary loading attempt failed: {str(e2)}")
            # Chain the cause so the traceback shows why loading failed.
            raise ValueError("Failed to load Seoul air quality data") from e2

def prepare_time_series(df, station_code, pollutant, normalize=None):
    """Extract one station/pollutant series, indexed by timestamp.

    Falls back to the first available station when ``station_code`` has no
    rows, and to the first recognised pollutant column when ``pollutant`` is
    not a column. Rows are sorted by the parsed ``'Measurement date'`` and
    NaN readings are dropped. With ``normalize`` set to ``'minmax'`` or
    ``'standard'`` the values are scaled with a newly fitted scaler.

    Args:
        df: Measurement DataFrame with ``'Station code'`` and
            ``'Measurement date'`` columns.
        station_code: Station to select.
        pollutant: Pollutant column to extract.
        normalize: ``'minmax'``, ``'standard'`` or ``None``.

    Returns:
        ``(series, original_series, scaler, pollutant)`` where ``pollutant``
        is the name actually used. On any error two empty Series, ``None``
        and the current pollutant name are returned instead.
    """
    known_pollutants = ['SO2', 'NO2', 'O3', 'CO', 'PM10', 'PM2.5']

    try:
        station_rows = df[df['Station code'] == station_code].copy()

        if station_rows.empty:
            available_stations = df['Station code'].unique()
            print(f"No data found for station {station_code}")
            print(f"Available stations: {available_stations}")

            station_code = available_stations[0]
            station_rows = df[df['Station code'] == station_code].copy()
            print(f"Using station {station_code} instead")

        if pollutant not in station_rows.columns:
            available_pollutants = [
                col for col in station_rows.columns if col in known_pollutants
            ]
            print(f"Pollutant '{pollutant}' not found")
            print(f"Available pollutants: {available_pollutants}")

            pollutant = available_pollutants[0]
            print(f"Using pollutant '{pollutant}' instead")

        # Order chronologically and index by the parsed timestamp.
        station_rows['Timestamp'] = pd.to_datetime(station_rows['Measurement date'])
        station_rows = station_rows.sort_values('Timestamp').set_index('Timestamp')

        series = station_rows[pollutant].copy().dropna()
        original_series = series.copy()

        scaler = None
        applied_message = None
        if normalize == 'minmax':
            scaler = MinMaxScaler()
            applied_message = "Applied Min-Max normalization (range: 0-1)"
        elif normalize == 'standard':
            scaler = StandardScaler()
            applied_message = "Applied Standard normalization (mean=0, std=1)"

        if scaler is not None:
            scaled = scaler.fit_transform(series.values.reshape(-1, 1))
            series = pd.Series(scaled.flatten(), index=series.index)
            print(applied_message)

        print(f"Prepared time series for station {station_code}, pollutant {pollutant}")
        print(f"Time series length: {len(series)}")
        if not series.empty:
            print(f"Time range: {series.index.min()} to {series.index.max()}")
            if normalize:
                print(f"Original value range: {original_series.min():.4f} to {original_series.max():.4f}")
                print(f"Normalized value range: {series.min():.4f} to {series.max():.4f}")
            else:
                print(f"Value range: {series.min():.4f} to {series.max():.4f}")

        return series, original_series, scaler, pollutant

    except Exception as e:
        print(f"Error preparing time series: {str(e)}")
        return pd.Series([]), pd.Series([]), None, pollutant

def analyze_time_series(series, station_code, pollutant):
    """Plot a series together with its ACF and PACF, then save and show it.

    Nothing is drawn when the series has fewer than 10 points. The figure is
    written to ``Station<code>_<pollutant>_time_series_analysis.png``.

    Args:
        series: Series to plot.
        station_code: Station identifier used in the title and file name.
        pollutant: Pollutant name used in labels and file name.
    """
    if len(series) < 10:
        print("Not enough data points for time series analysis")
        return

    fig = plt.figure(figsize=(15, 10))

    # Top panel: the series itself.
    raw_ax = fig.add_subplot(3, 1, 1)
    raw_ax.plot(series)
    raw_ax.set_title(f'Time Series - Station {station_code}, {pollutant}')
    raw_ax.set_ylabel(f'{pollutant} Value')
    raw_ax.grid(True)

    # Middle panel: autocorrelation up to 40 lags.
    acf_ax = fig.add_subplot(3, 1, 2)
    plot_acf(series, ax=acf_ax, lags=40)
    acf_ax.set_title('Autocorrelation Function')

    # Bottom panel: partial autocorrelation up to 40 lags.
    pacf_ax = fig.add_subplot(3, 1, 3)
    plot_pacf(series, ax=pacf_ax, lags=40)
    pacf_ax.set_title('Partial Autocorrelation Function')

    plt.tight_layout()
    plt.savefig(f"Station{station_code}_{pollutant}_time_series_analysis.png")
    plt.show()

def run_arima_analysis(series, original_series, scaler, station_code, pollutant, order=(1,1,1), test_size=0.2):
    """Fit one ARIMA model to a prepared series and score its hold-out forecast.

    The series is split by position into a leading training part and a
    trailing test part. When ``scaler`` is given, forecasts are
    inverse-transformed and scored against the matching tail of
    ``original_series``; otherwise they are scored against the test values
    directly.

    Args:
        series: Series to model (possibly normalized).
        original_series: Un-normalized counterpart of ``series``; required
            when ``scaler`` is given, may be ``None`` otherwise.
        scaler: Fitted scaler that produced ``series``, or ``None``.
        station_code: Station identifier used in messages and file names.
        pollutant: Pollutant name used in messages and file names.
        order: ARIMA ``(p, d, q)`` order.
        test_size: Fraction of the series held out for testing.

    Returns:
        A dict with keys ``station_code``, ``pollutant``, ``order``, ``mae``,
        ``mse``, ``rmse``, ``neg_log_likelihood``, ``model``, ``forecasts``
        and ``actual``; or ``None`` when the series has fewer than 30 points,
        the arguments are inconsistent, or fitting/scoring/plotting raises.
    """
    if len(series) < 30:
        print(f"Not enough data points for ARIMA analysis. Need at least 30, got {len(series)}")
        return None

    if scaler is not None and original_series is None:
        # Without the raw values there is nothing to score against.
        print(f"Error fitting ARIMA for Station {station_code}, {pollutant}: "
              "original_series is required when a scaler is supplied")
        return None

    train_size = int(len(series) * (1 - test_size))
    train = series.iloc[:train_size].values
    test = series.iloc[train_size:].values
    original_test = original_series.iloc[train_size:].values if original_series is not None else None

    print(f"\nAnalyzing Station {station_code}, {pollutant} with ARIMA{order}")
    print(f"Training data size: {len(train)}")
    print(f"Test data size: {len(test)}")

    try:
        model_fit = ARIMA(train, order=order).fit()
        forecasts = np.asarray(model_fit.forecast(steps=len(test)))

        if scaler is not None:
            predicted = scaler.inverse_transform(forecasts.reshape(-1, 1)).flatten()
            observed = original_test
        else:
            predicted = forecasts
            observed = test

        mae = mean_absolute_error(observed, predicted)
        mse = mean_squared_error(observed, predicted)
        rmse = np.sqrt(mse)
        if scaler is not None:
            print("Metrics calculated on original scale data")

        neg_log_likelihood = -model_fit.llf

        print(f"Results for Station {station_code}, {pollutant} - ARIMA{order}:")
        print(f"MAE: {mae:.4f}")
        print(f"MSE: {mse:.4f}")
        print(f"RMSE: {rmse:.4f}")
        print(f"Negative Log-Likelihood: {neg_log_likelihood:.4f}")

        if scaler is not None:
            plot_title = f'ARIMA{order} Forecast vs Actual for Station {station_code}, {pollutant} (Original Scale)'
            # Tag normalized runs with the scaler class so runs for the same
            # station and pollutant do not overwrite each other's PNG.
            norm_tag = f"_{type(scaler).__name__}"
        else:
            plot_title = f'ARIMA{order} Forecast vs Actual for Station {station_code}, {pollutant}'
            norm_tag = ""

        fig = plt.figure(figsize=(12, 6))
        try:
            plt.plot(observed, label='Actual', color='blue', marker='o', markersize=3, linestyle='-', linewidth=1)
            plt.plot(predicted, label='Forecast', color='red', marker='x', markersize=3, linestyle='--', linewidth=1)
            plt.title(plot_title)
            plt.ylabel(f'{pollutant} Value')
            plt.xlabel('Test Sample Index')
            plt.legend()
            plt.grid(True)
            plt.tight_layout()
            plt.savefig(f"Station{station_code}_{pollutant}{norm_tag}_arima_forecast.png")
            plt.show()
        finally:
            # Release the figure so repeated calls do not accumulate open
            # figures on non-interactive backends.
            plt.close(fig)

        return {
            'station_code': station_code,
            'pollutant': pollutant,
            'order': order,
            'mae': mae,
            'mse': mse,
            'rmse': rmse,
            'neg_log_likelihood': neg_log_likelihood,
            'model': model_fit,
            'forecasts': predicted,
            'actual': observed
        }
    except Exception as e:
        print(f"Error fitting ARIMA for Station {station_code}, {pollutant}: {str(e)}")
        return None

def find_best_arima_model(series, original_series, scaler, station_code, pollutant, p_range=(0,2), d_range=(0,2), q_range=(0,2)):
    """Grid-search ARIMA orders on a prepared series, keeping the lowest test RMSE.

    The series is split 80/20 by position. Every ``(p, d, q)`` in the
    inclusive ranges is fitted on the training part and scored on the test
    part; with a ``scaler`` the forecasts are inverse-transformed and scored
    against the tail of ``original_series``. An RMSE heatmap of the grid and
    a forecast plot for the winner are saved and shown.

    Args:
        series: Series to model (possibly normalized).
        original_series: Un-normalized counterpart of ``series``; required
            when ``scaler`` is given, may be ``None`` otherwise.
        scaler: Fitted scaler that produced ``series``, or ``None``.
        station_code: Station identifier used in messages and file names.
        pollutant: Pollutant name used in messages and file names.
        p_range, d_range, q_range: Inclusive ``(low, high)`` bounds.

    Returns:
        ``(best_result, grid_df)`` with the winning fit's result dict and a
        DataFrame holding one row per successfully fitted order, or
        ``(None, None)`` when the series has fewer than 30 points, the
        arguments are inconsistent, or no order could be fitted.
    """
    if len(series) < 30:
        print(f"Not enough data points for ARIMA parameter optimization. Need at least 30, got {len(series)}")
        return None, None

    if scaler is not None and original_series is None:
        # Every candidate would fail at scoring time; report it once instead.
        print(f"No successful models for Station {station_code}, {pollutant}: "
              "original_series is required when a scaler is supplied")
        return None, None

    train_size = int(len(series) * 0.8)
    train, test = series.iloc[:train_size].values, series.iloc[train_size:].values
    original_test = original_series.iloc[train_size:].values if original_series is not None else None

    print(f"\nFinding best ARIMA model for Station {station_code}, {pollutant}")
    print(f"Training data size: {len(train)}")
    print(f"Test data size: {len(test)}")

    grid_results = []

    best_rmse = float('inf')
    best_model = None
    best_order = None
    best_result = None

    for p in range(p_range[0], p_range[1] + 1):
        for d in range(d_range[0], d_range[1] + 1):
            for q in range(q_range[0], q_range[1] + 1):
                order = (p, d, q)
                try:
                    model_fit = ARIMA(train, order=order).fit()
                    forecasts = np.asarray(model_fit.forecast(steps=len(test)))

                    if scaler is not None:
                        # Score on the original scale.
                        predicted = scaler.inverse_transform(forecasts.reshape(-1, 1)).flatten()
                        observed = original_test
                    else:
                        predicted = forecasts
                        observed = test

                    mae = mean_absolute_error(observed, predicted)
                    mse = mean_squared_error(observed, predicted)
                    rmse = np.sqrt(mse)
                    neg_log_likelihood = -model_fit.llf

                    print(f"ARIMA{order} - RMSE: {rmse:.4f}, NegLogLike: {neg_log_likelihood:.4f}")

                    grid_results.append({
                        'p': p,
                        'd': d,
                        'q': q,
                        'RMSE': rmse,
                        'MAE': mae,
                        'MSE': mse,
                        'Neg_Log_Likelihood': neg_log_likelihood
                    })

                    if rmse < best_rmse:
                        best_rmse = rmse
                        best_model = model_fit
                        best_order = order
                        best_result = {
                            'station_code': station_code,
                            'pollutant': pollutant,
                            'order': order,
                            'mae': mae,
                            'mse': mse,
                            'rmse': rmse,
                            'neg_log_likelihood': neg_log_likelihood,
                            'model': model_fit,
                            'forecasts': predicted,
                            'actual': observed
                        }
                except Exception as e:
                    # A failing order is reported and skipped; the search goes on.
                    print(f"Error fitting ARIMA{order}: {str(e)}")

    grid_df = pd.DataFrame(grid_results)

    if not grid_df.empty:
        heatmap_fig = None
        try:
            pivot_df = grid_df.pivot_table(
                index='p', columns=['d', 'q'], values='RMSE'
            )
            heatmap_fig = plt.figure(figsize=(15, 10))
            sns.heatmap(pivot_df, annot=True, fmt='.2f', cmap='coolwarm')
            plt.title(f'RMSE for Different ARIMA Parameters - Station {station_code}, {pollutant}')
            plt.tight_layout()
            plt.savefig(f"Station{station_code}_{pollutant}_parameter_grid.png")
            plt.show()
        except Exception as e:
            # The heatmap is best-effort, but KeyboardInterrupt/SystemExit
            # must still propagate and the reason should be visible.
            print("Could not create heatmap visualization for parameter grid")
            print(f"Reason: {e}")
        finally:
            if heatmap_fig is not None:
                plt.close(heatmap_fig)

    if best_model is None:
        print(f"No successful models for Station {station_code}, {pollutant}")
        return None, None

    print(f"\nBest model for Station {station_code}, {pollutant}: ARIMA{best_order}")
    print(f"MAE: {best_result['mae']:.4f}")
    print(f"MSE: {best_result['mse']:.4f}")
    print(f"RMSE: {best_result['rmse']:.4f}")
    print(f"Negative Log-Likelihood: {best_result['neg_log_likelihood']:.4f}")

    best_fig = plt.figure(figsize=(12, 6))
    try:
        plt.plot(best_result['actual'], 'b-o', markersize=3, label='Actual')
        plt.plot(best_result['forecasts'], 'r--x', markersize=3, label='Forecast')
        plt.title(f"Best ARIMA{best_order} for Station {station_code}, {pollutant}")
        plt.xlabel('Test Sample Index')
        plt.ylabel(f'{pollutant} Value')
        plt.legend()
        plt.grid(True)
        plt.tight_layout()
        plt.savefig(f"Station{station_code}_{pollutant}_best_arima.png")
        plt.show()
    finally:
        plt.close(best_fig)

    return best_result, grid_df

def compare_pollutants_across_stations(results_df, metric='RMSE'):
    """Draw a station-by-pollutant heatmap of one metric, then save and show it.

    Args:
        results_df: Results table with ``'Station Code'`` and ``'Pollutant'``
            columns plus the metric column.
        metric: Name of the column to display; also used in the title and in
            the output file name ``<metric>_comparison_heatmap.png``.
    """
    if results_df.empty:
        print("No results to compare")
        return

    plt.figure(figsize=(14, 10))

    # Stations become rows, pollutants become columns.
    station_by_pollutant = pd.pivot_table(
        results_df,
        index='Station Code',
        columns='Pollutant',
        values=metric,
    )

    sns.heatmap(station_by_pollutant, annot=True, fmt='.2f', cmap='YlGnBu')
    plt.title(f'{metric} Comparison Across Stations and Pollutants')
    plt.tight_layout()
    plt.savefig(f"{metric}_comparison_heatmap.png")
    plt.show()

def main():
    """Run the Seoul ARIMA experiments and summarise the resulting metrics.

    For the first station and first recognised pollutant, the default ARIMA
    is evaluated with Min-Max scaling, standard scaling and no scaling, and a
    grid search is then run on the standard-scaled series. The default ARIMA
    is also run with standard scaling for the first two stations crossed with
    the second and third pollutants. All metrics are printed, written to
    ``seoul_arima_results.csv`` and plotted. Any exception is caught and
    reported on stdout.
    """

    def summary_row(station, pollutant_name, order_label, normalization, outcome):
        # One row of the results table built from an analysis result dict.
        return {
            'Station Code': station,
            'Pollutant': pollutant_name,
            'ARIMA Order': order_label,
            'Normalization': normalization,
            'MAE': outcome['mae'],
            'MSE': outcome['mse'],
            'RMSE': outcome['rmse'],
            'Neg_Log_Likelihood': outcome['neg_log_likelihood']
        }

    try:
        df = load_seoul_data()

        print("\nDataset Information:")
        print(df.info())
        print("\nMissing values per column:")
        print(df.isnull().sum())

        known_pollutants = ['SO2', 'NO2', 'O3', 'CO', 'PM10', 'PM2.5']
        unique_stations = df['Station code'].unique()
        available_pollutants = [col for col in df.columns if col in known_pollutants]

        print(f"\nFound {len(unique_stations)} stations: {unique_stations}")
        print(f"Found {len(available_pollutants)} pollutants: {available_pollutants}")

        all_results = []

        selected_station = unique_stations[0]
        selected_pollutant = available_pollutants[0]

        # Normalized runs: Min-Max first (with diagnostic plots), then
        # Z-score. The names bound here keep the Z-score values afterwards.
        normalized_runs = (
            ('minmax', 'MinMax', True),
            ('standard', 'Standard', False),
        )
        for method, label, with_diagnostics in normalized_runs:
            normalized_series, original_series, scaler, pollutant = prepare_time_series(
                df, selected_station, selected_pollutant, normalize=method
            )

            if with_diagnostics:
                analyze_time_series(normalized_series, selected_station, pollutant)

            result = run_arima_analysis(
                normalized_series, original_series, scaler,
                selected_station, pollutant
            )

            if result:
                all_results.append(
                    summary_row(selected_station, pollutant, str(result['order']), label, result)
                )

        # Without normalization
        original_series, _, _, pollutant = prepare_time_series(
            df, selected_station, selected_pollutant, normalize=None
        )

        result = run_arima_analysis(
            original_series, None, None,
            selected_station, pollutant
        )

        if result:
            all_results.append(
                summary_row(selected_station, pollutant, str(result['order']), 'None', result)
            )

        # Grid search on the standard-scaled series prepared above.
        best_result, grid_results = find_best_arima_model(
            normalized_series, original_series, scaler,
            selected_station, pollutant
        )

        if best_result:
            all_results.append(
                summary_row(
                    selected_station, pollutant,
                    f"Best-{best_result['order']}", 'Standard', best_result
                )
            )

        subset_stations = unique_stations[:2]
        subset_pollutants = available_pollutants[1:3]

        # Every station/pollutant pair except the one already analysed above.
        remaining_pairs = [
            (station, candidate)
            for station in subset_stations
            for candidate in subset_pollutants
            if not (station == selected_station and candidate == selected_pollutant)
        ]

        for station, candidate in remaining_pairs:
            normalized_series, original_series, scaler, pollutant_name = prepare_time_series(
                df, station, candidate, normalize='standard'
            )

            result = run_arima_analysis(
                normalized_series, original_series, scaler,
                station, pollutant_name
            )

            if result:
                all_results.append(
                    summary_row(station, pollutant_name, str(result['order']), 'Standard', result)
                )

        results_df = pd.DataFrame(all_results)

        if results_df.empty:
            print("No results to summarize")
        else:
            print("\nResults Summary:")
            print(results_df)

            results_df.to_csv('seoul_arima_results.csv', index=False)
            print("Results saved to 'seoul_arima_results.csv'")

            if len(results_df) > 1:
                compare_pollutants_across_stations(results_df, 'RMSE')

                plt.figure(figsize=(15, 8))
                sns.barplot(x='Pollutant', y='RMSE', hue='Normalization', data=results_df)
                plt.title('RMSE by Pollutant and Normalization Method')
                plt.xticks(rotation=45)
                plt.tight_layout()
                plt.savefig('normalization_comparison.png')
                plt.show()

    except Exception as e:
        print(f"Error in main execution: {str(e)}")


if __name__ == "__main__":
    main()

### Beijing Air

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from statsmodels.tsa.arima.model import ARIMA
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.preprocessing import MinMaxScaler, StandardScaler
import os
import glob
import warnings
# Suppress every warning for the rest of the run to keep the output readable.
warnings.filterwarnings('ignore')

# Global plotting defaults shared by all figures created below:
# white-grid style sheet, the "muted" seaborn palette, and a 12x6 default
# figure size.
plt.style.use('seaborn-v0_8-whitegrid')
sns.set_palette("muted")
plt.rcParams['figure.figsize'] = (12, 6)

def load_beijing_data(data_path):
    """Load the Beijing air-quality data into a single DataFrame.

    ``data_path`` may be either a directory of per-station CSV files or the
    path of one combined CSV file.  When it is neither an existing directory
    nor an existing file, a short list of conventional dataset locations is
    probed before giving up.

    Args:
        data_path: Directory containing ``*.csv`` files, or path to one CSV.

    Returns:
        A DataFrame holding every loaded row.  It always has a ``station``
        column and a ``datetime`` column (built from ``year``, ``month``,
        ``day`` and ``hour`` when not already present).

    Raises:
        ValueError: If no CSV file can be found, the file cannot be read, or
            any of the ``year``/``month``/``day``/``hour`` columns is missing.
    """
    if os.path.isdir(data_path):
        # Sorted so that the concatenation order (and therefore which station
        # appears first) does not depend on the file system's listing order.
        all_files = sorted(glob.glob(os.path.join(data_path, "*.csv")))

        if not all_files:
            raise ValueError(f"No CSV files found in directory: {data_path}")

        print(f"Found {len(all_files)} CSV files in directory")

        df_list = []
        for file in all_files:
            station_name = os.path.basename(file).replace('.csv', '')

            df = pd.read_csv(file)

            # Per-station files may omit the station id; fall back to the
            # file name so rows stay attributable after concatenation.
            if 'station' not in df.columns:
                df['station'] = station_name

            df_list.append(df)
            print(f"Loaded {station_name} with {df.shape[0]} rows and {df.shape[1]} columns")

        combined_df = pd.concat(df_list, ignore_index=True)

    else:
        if not os.path.exists(data_path):
            possible_paths = [
                '../input/beijing-multi-site-air-quality-data/PRSA_Data_20130301-20170228.csv',
                '../input/beijing-multi-site-air-quality-data/Beijing_Multi_Site_Air_Quality.csv',
                '../input/beijing-multi-site-air-quality-data/Beijing_AirQuality_Stations.csv',
                './PRSA_Data_20130301-20170228.csv',
                './Beijing_Multi_Site_Air_Quality.csv',
                './Beijing_AirQuality_Stations.csv'
            ]

            for p in possible_paths:
                if os.path.exists(p):
                    data_path = p
                    print(f"Found dataset at: {data_path}")
                    break

            if not os.path.exists(data_path):
                raise ValueError("Could not find dataset file. Please specify the correct path.")

        print(f"Loading single file: {data_path}")

        try:
            combined_df = pd.read_csv(data_path)
            print(f"Loaded combined file with {combined_df.shape[0]} rows and {combined_df.shape[1]} columns")
        except Exception as e:
            # Keep the original exception attached for debugging.
            raise ValueError(f"Error loading dataset: {str(e)}") from e

    required_columns = ['year', 'month', 'day', 'hour']
    missing_columns = [col for col in required_columns if col not in combined_df.columns]

    if missing_columns:
        raise ValueError(f"Dataset missing required columns: {missing_columns}")

    # Guarantee a 'station' column regardless of which other columns exist:
    # reuse a differently-cased column if there is one, otherwise label every
    # row with a single default station.
    if 'station' not in combined_df.columns:
        alias = next(
            (col for col in combined_df.columns if str(col).lower() == 'station'),
            None,
        )
        if alias is not None:
            combined_df.rename(columns={alias: 'station'}, inplace=True)
        else:
            print("Warning: No station identifier found. Using 'Beijing' as default station.")
            combined_df['station'] = 'Beijing'

    if 'datetime' not in combined_df.columns:
        combined_df['datetime'] = pd.to_datetime(combined_df[['year', 'month', 'day', 'hour']])

    print(f"Final dataset has {combined_df.shape[0]} rows and {combined_df.shape[1]} columns")
    print(f"Columns: {', '.join(combined_df.columns)}")

    stations = combined_df['station'].unique()
    print(f"Found {len(stations)} stations: {stations}")

    pollutant_columns = [col for col in combined_df.columns
                         if col in ['PM2.5', 'PM10', 'SO2', 'NO2', 'CO', 'O3']]
    print(f"Found {len(pollutant_columns)} pollutant columns: {pollutant_columns}")

    return combined_df

def normalize_series(series, method='minmax'):
    """Rescale ``series`` with the scaler selected by ``method``.

    Args:
        series: pandas Series of values to rescale.
        method: ``'minmax'`` for ``MinMaxScaler``, ``'standard'`` for
            ``StandardScaler``.  Any other value disables scaling.

    Returns:
        ``(scaled_series, fitted_scaler)``.  The scaled series keeps the
        original index.  For an unrecognised ``method`` the input series is
        returned unchanged together with ``None``.
    """
    # Scalers expect a 2-D column, so reshape up front.
    column = series.values.reshape(-1, 1)

    if method not in ('minmax', 'standard'):
        return series, None

    scaler = MinMaxScaler() if method == 'minmax' else StandardScaler()
    scaled = scaler.fit_transform(column).flatten()
    return pd.Series(scaled, index=series.index), scaler

def prepare_time_series(df, station, pollutant, normalize_method=None):
    """Extract one station's pollutant readings as a datetime-indexed series.

    Missing inputs are substituted rather than rejected: if ``station`` has no
    rows, the first station found in ``df`` is used; if ``pollutant`` is not a
    column, the first known pollutant column is used (or, failing that, the
    first numeric column that is not a calendar/row-number column).  The
    substituted pollutant name is returned; the substituted station is only
    printed, not returned.

    Args:
        df: DataFrame with ``station`` and ``datetime`` columns plus the
            pollutant columns.
        station: Station identifier to select.
        pollutant: Pollutant column name to extract.
        normalize_method: ``'minmax'``, ``'standard'`` or ``None`` (no scaling);
            forwarded to ``normalize_series``.

    Returns:
        ``(series, original_series, scaler, pollutant)`` where ``series`` is
        the (optionally normalized) NaN-free series sorted by datetime,
        ``original_series`` is the same data before normalization, and
        ``scaler`` is the fitted scaler or ``None``.  If anything fails, the
        error is printed and two empty float series, ``None`` and the
        pollutant name are returned.
    """
    known_pollutants = ['PM2.5', 'PM10', 'SO2', 'NO2', 'CO', 'O3']
    non_measurement_columns = ['year', 'month', 'day', 'hour', 'No']

    try:
        station_df = df[df['station'] == station].copy()

        if station_df.empty:
            available_stations = df['station'].unique()
            print(f"No data found for station '{station}'. Available stations: {available_stations}")
            station = available_stations[0]
            station_df = df[df['station'] == station].copy()
            print(f"Using station '{station}' instead")

        if pollutant not in station_df.columns:
            potential_pollutants = [col for col in station_df.columns
                                    if col in known_pollutants]

            if not potential_pollutants:
                numeric_cols = station_df.select_dtypes(include=['number']).columns.tolist()
                potential_pollutants = [col for col in numeric_cols
                                        if col not in non_measurement_columns]

            if not potential_pollutants:
                raise ValueError("No suitable pollutant columns found in the dataset")

            print(f"Pollutant '{pollutant}' not found. Available pollutants: {potential_pollutants}")
            pollutant = potential_pollutants[0]
            print(f"Using pollutant '{pollutant}' instead")

        station_df = station_df.sort_values('datetime')
        station_df.set_index('datetime', inplace=True)

        series = station_df[pollutant].copy()
        series = series.dropna()

        original_series = series.copy()
        scaler = None

        if normalize_method:
            # NOTE(review): the scaler is fitted on the whole series here,
            # before any train/test split done by the caller.
            series, scaler = normalize_series(series, normalize_method)
            print(f"Applied {normalize_method} normalization to {pollutant}")
            print(f"Original range: {original_series.min():.4f} to {original_series.max():.4f}")
            print(f"Normalized range: {series.min():.4f} to {series.max():.4f}")

        print(f"Prepared time series for {pollutant} at {station}")
        print(f"Time series length: {len(series)}")
        if not series.empty:
            print(f"Time range: {series.index.min()} to {series.index.max()}")

        return series, original_series, scaler, pollutant

    except Exception as e:
        print(f"Error preparing time series: {str(e)}")
        # Explicit dtype: a bare pd.Series([]) is object-dtype (and warns on
        # older pandas), which is the wrong type for numeric readings.
        return pd.Series(dtype=float), pd.Series(dtype=float), None, pollutant

def analyze_time_series(series, pollutant, station):
    """Plot a series together with its ACF and PACF and save the figure.

    Produces a three-panel figure (raw series, autocorrelation, partial
    autocorrelation), writes it to
    ``<pollutant>_<station>_time_series_analysis.png`` (spaces replaced by
    underscores) and shows it.

    Args:
        series: Time series to analyse.
        pollutant: Pollutant name, used in titles and the file name.
        station: Station name, used in titles and the file name.

    Returns:
        None.  Series with fewer than 10 points are skipped with a message.
    """
    if len(series) < 10:
        print("Not enough data points for time series analysis")
        return

    # The partial autocorrelation can only be computed for lags below half the
    # sample size, so cap the usual 40 lags for short series instead of
    # letting plot_pacf raise.
    max_lags = min(40, len(series) // 2 - 1)

    plt.figure(figsize=(15, 10))

    plt.subplot(3, 1, 1)
    plt.plot(series)
    plt.title(f'Time Series - {pollutant} at {station}')
    plt.ylabel(f'{pollutant} Concentration')
    plt.grid(True)

    ax2 = plt.subplot(3, 1, 2)
    plot_acf(series, ax=ax2, lags=max_lags)
    plt.title('Autocorrelation Function')

    ax3 = plt.subplot(3, 1, 3)
    plot_pacf(series, ax=ax3, lags=max_lags)
    plt.title('Partial Autocorrelation Function')

    plt.tight_layout()
    plt.savefig(f"{pollutant}_{station}_time_series_analysis.png".replace(' ', '_'))
    plt.show()

def run_arima_analysis(series, original_series, scaler, pollutant, station, order=(1,1,1), test_size=0.2):
    """Fit one ARIMA model on a chronological split and score its forecast.

    The leading ``1 - test_size`` fraction of ``series`` is used for fitting
    and the remainder for evaluation.  When a ``scaler`` is supplied, the
    forecast is mapped back through ``scaler.inverse_transform`` and compared
    with the matching tail of ``original_series``; otherwise the forecast is
    compared with the test slice of ``series`` directly.  A forecast-vs-actual
    plot is saved and shown.

    Args:
        series: Series the model is fitted on (possibly normalized).
        original_series: Un-normalized counterpart of ``series`` or ``None``.
        scaler: Fitted scaler used to produce ``series``, or ``None``.
        pollutant: Pollutant label for messages, titles and file names.
        station: Station label for messages, titles and file names.
        order: ARIMA ``(p, d, q)`` order.
        test_size: Fraction of the series held out for evaluation.

    Returns:
        A dict with the labels, order, scaler class name, MAE/MSE/RMSE,
        negative log-likelihood, fitted model, forecasts and actual values;
        ``None`` if the series has fewer than 30 points or fitting fails.
    """
    if len(series) < 30:
        print(f"Not enough data points for {pollutant} at {station}. Skipping.")
        return None

    split_at = int(len(series) * (1 - test_size))
    train = series.iloc[:split_at].values
    test = series.iloc[split_at:].values
    original_test = None
    if original_series is not None:
        original_test = original_series.iloc[split_at:].values

    print(f"\nAnalyzing {pollutant} at {station} with ARIMA{order}")
    print(f"Training data size: {len(train)}")
    print(f"Test data size: {len(test)}")

    try:
        model_fit = ARIMA(train, order=order).fit()
        forecasts = model_fit.forecast(steps=len(test))

        # Choose the scale on which errors are measured and plotted.
        if scaler is None:
            predicted, observed = forecasts, test
        else:
            predicted = scaler.inverse_transform(forecasts.reshape(-1, 1)).flatten()
            observed = original_test

        mae = mean_absolute_error(observed, predicted)
        mse = mean_squared_error(observed, predicted)
        rmse = np.sqrt(mse)
        if scaler is not None:
            print("Metrics calculated on original scale data")

        neg_log_likelihood = -model_fit.llf

        print(f"Results for {pollutant} at {station} - ARIMA{order}:")
        for label, value in (
            ("MAE", mae),
            ("MSE", mse),
            ("RMSE", rmse),
            ("Negative Log-Likelihood", neg_log_likelihood),
        ):
            print(f"{label}: {value:.4f}")

        norm_text = f" ({scaler.__class__.__name__})" if scaler else ""

        plt.figure(figsize=(12, 6))
        plt.plot(observed, label='Actual', color='blue', marker='o', markersize=3, linestyle='-', linewidth=1)
        plt.plot(predicted, label='Forecast', color='red', marker='x', markersize=3, linestyle='--', linewidth=1)
        plt.title(f'ARIMA{order} Forecast vs Actual for {pollutant} at {station}{norm_text}')
        plt.xlabel('Test Sample Index')
        plt.ylabel(f'{pollutant} Concentration')
        plt.legend()
        plt.grid(True)
        plt.tight_layout()
        plt.savefig(f"{pollutant}_{station}_arima_forecast{norm_text}.png".replace(' ', '_'))
        plt.show()

        summary = {
            'pollutant': pollutant,
            'station': station,
            'order': order,
            'normalize_method': scaler.__class__.__name__ if scaler else 'None',
            'mae': mae,
            'mse': mse,
            'rmse': rmse,
            'neg_log_likelihood': neg_log_likelihood,
            'model': model_fit,
            'forecasts': predicted,
            'actual': observed
        }
        return summary

    except Exception as e:
        print(f"Error fitting ARIMA for {pollutant} at {station}: {str(e)}")
        return None

def find_best_arima_model(series, original_series, scaler, pollutant, station, p_range=(0,2), d_range=(0,2), q_range=(0,2)):
    """Grid-search ARIMA orders and return the one with the lowest test RMSE.

    The first 80% of ``series`` is used for fitting and the rest for
    evaluation.  Every ``(p, d, q)`` combination within the inclusive ranges
    is fitted; combinations that fail to fit are reported and skipped.  When a
    ``scaler`` is given, forecasts are inverse-transformed and scored against
    the tail of ``original_series``.  A heatmap of RMSE per order and a
    forecast plot for the winning model are saved and shown.

    Args:
        series: Series the models are fitted on (possibly normalized).
        original_series: Un-normalized counterpart of ``series`` or ``None``.
        scaler: Fitted scaler used to produce ``series``, or ``None``.
        pollutant: Pollutant label for messages, titles and file names.
        station: Station label for messages, titles and file names.
        p_range: Inclusive ``(min, max)`` for the AR order.
        d_range: Inclusive ``(min, max)`` for the differencing order.
        q_range: Inclusive ``(min, max)`` for the MA order.

    Returns:
        ``(best_result, grid_df)``: the result dict of the best model and a
        DataFrame with one row of metrics per successfully fitted order.
        ``(None, None)`` if the series has fewer than 30 points or no order
        could be fitted.
    """
    if len(series) < 30:
        print(f"Not enough data points for {pollutant} at {station}. Skipping.")
        return None, None

    train_size = int(len(series) * 0.8)
    train, test = series.iloc[:train_size].values, series.iloc[train_size:].values
    original_test = original_series.iloc[train_size:].values if original_series is not None else None

    print(f"\nFinding best ARIMA model for {pollutant} at {station}")
    print(f"Training data size: {len(train)}")
    print(f"Test data size: {len(test)}")

    scaler_name = scaler.__class__.__name__ if scaler is not None else 'None'
    norm_text = f" ({scaler_name})" if scaler is not None else ""

    grid_results = []
    best_rmse = float('inf')
    best_result = None

    for p in range(p_range[0], p_range[1] + 1):
        for d in range(d_range[0], d_range[1] + 1):
            for q in range(q_range[0], q_range[1] + 1):
                order = (p, d, q)
                try:
                    model_fit = ARIMA(train, order=order).fit()
                    forecasts = model_fit.forecast(steps=len(test))

                    if scaler is not None:
                        # Score on the original scale so RMSEs are comparable
                        # across normalization methods.
                        forecasts_for_save = scaler.inverse_transform(
                            forecasts.reshape(-1, 1)
                        ).flatten()
                        actual_for_save = original_test
                    else:
                        forecasts_for_save = forecasts
                        actual_for_save = test

                    mae = mean_absolute_error(actual_for_save, forecasts_for_save)
                    mse = mean_squared_error(actual_for_save, forecasts_for_save)
                    rmse = np.sqrt(mse)
                    neg_log_likelihood = -model_fit.llf

                    print(f"ARIMA{order} - RMSE: {rmse:.4f}, NegLogLike: {neg_log_likelihood:.4f}")

                    grid_results.append({
                        'p': p,
                        'd': d,
                        'q': q,
                        'RMSE': rmse,
                        'MAE': mae,
                        'MSE': mse,
                        'Neg_Log_Likelihood': neg_log_likelihood
                    })

                    if rmse < best_rmse:
                        best_rmse = rmse
                        best_result = {
                            'pollutant': pollutant,
                            'station': station,
                            'order': order,
                            'normalize_method': scaler_name,
                            'mae': mae,
                            'mse': mse,
                            'rmse': rmse,
                            'neg_log_likelihood': neg_log_likelihood,
                            'model': model_fit,
                            'forecasts': forecasts_for_save,
                            'actual': actual_for_save
                        }

                except Exception as e:
                    # Some orders legitimately fail to fit; report and move on.
                    print(f"Error fitting ARIMA{order}: {str(e)}")
                    continue

    grid_df = pd.DataFrame(grid_results)

    # The heatmap is a nice-to-have: a plotting failure must not lose the
    # search results, but only ordinary errors are tolerated here.
    if len(grid_df) > 1:
        try:
            pivot_df = grid_df.pivot_table(
                index='p', columns=['d', 'q'], values='RMSE'
            )
            plt.figure(figsize=(15, 10))
            sns.heatmap(pivot_df, annot=True, fmt='.2f', cmap='coolwarm')
            plt.title(f'RMSE for Different ARIMA Parameters - {pollutant} at {station}{norm_text}')
            plt.tight_layout()
            plt.savefig(f"{pollutant}_{station}_parameter_grid{norm_text}.png".replace(' ', '_'))
            plt.show()
        except Exception as e:
            print(f"Could not create heatmap visualization: {str(e)}")

    if best_result is None:
        print(f"No successful models for {pollutant} at {station}")
        return None, None

    best_order = best_result['order']
    print(f"\nBest model for {pollutant} at {station}: ARIMA{best_order}")
    print(f"MAE: {best_result['mae']:.4f}")
    print(f"MSE: {best_result['mse']:.4f}")
    print(f"RMSE: {best_result['rmse']:.4f}")
    print(f"Negative Log-Likelihood: {best_result['neg_log_likelihood']:.4f}")

    plt.figure(figsize=(12, 6))
    plt.plot(best_result['actual'], 'b-o', markersize=3, label='Actual')
    plt.plot(best_result['forecasts'], 'r--x', markersize=3, label='Forecast')
    plt.title(f"Best ARIMA{best_order} for {pollutant} at {station}{norm_text}")
    plt.xlabel('Test Sample Index')
    plt.ylabel(f'{pollutant} Concentration')
    plt.legend()
    plt.grid(True)
    plt.tight_layout()
    plt.savefig(f"{pollutant}_{station}_best_arima{norm_text}.png".replace(' ', '_'))
    plt.show()

    return best_result, grid_df

def compare_results(results_df, metric='RMSE'):
    """Visualise one metric from the results table as a heatmap and bar chart.

    The heatmap layout depends on which columns are present: stations as rows
    when a ``Station`` column exists (with pollutant/normalization or just
    pollutant as columns), otherwise pollutants as rows and normalization
    methods as columns.  When a ``Normalization`` column exists, a grouped bar
    chart per pollutant is drawn as well.  Both figures are saved as PNG files
    named after ``metric`` and shown.

    Args:
        results_df: DataFrame of per-model results.
        metric: Name of the column to visualise.

    Returns:
        None.  An empty ``results_df`` only prints a message.
    """
    if results_df.empty:
        print("No results to compare")
        return

    plt.figure(figsize=(14, 10))

    available = results_df.columns
    has_normalization = 'Normalization' in available

    # Decide the pivot layout once, then build the table with a single call.
    if 'Station' in available:
        row_key = 'Station'
        col_key = ['Pollutant', 'Normalization'] if has_normalization else 'Pollutant'
    else:
        row_key = 'Pollutant'
        col_key = 'Normalization'

    pivot_df = results_df.pivot_table(index=row_key, columns=col_key, values=metric)

    sns.heatmap(pivot_df, annot=True, fmt='.2f', cmap='YlGnBu')
    plt.title(f'{metric} Comparison')
    plt.tight_layout()
    plt.savefig(f"{metric}_comparison_heatmap.png")
    plt.show()

    if not has_normalization:
        return

    plt.figure(figsize=(14, 7))
    sns.barplot(x='Pollutant', y=metric, hue='Normalization', data=results_df)
    plt.title(f'{metric} by Pollutant and Normalization Method')
    plt.xticks(rotation=45)
    plt.tight_layout()
    plt.savefig(f"{metric}_normalization_comparison.png")
    plt.show()

def main():
    """Run the Beijing ARIMA experiments end to end.

    Loads the dataset, then for the first station and PM2.5 fits a default
    ARIMA model and a grid-searched best model under each normalization
    setting (none, min-max, standard).  Afterwards a default ARIMA model with
    standard normalization is fitted for the second and third pollutants at
    the remaining station among the first two.  All metrics are collected,
    printed, written to ``beijing_arima_results.csv`` and compared visually.
    Any exception is caught and reported instead of propagating.
    """
    def as_row(res, pollutant_label, station_label, order_label):
        # One summary-table row; key order defines the CSV column order.
        return {
            'Pollutant': pollutant_label,
            'Station': station_label,
            'ARIMA_Order': order_label,
            'Normalization': res['normalize_method'],
            'MAE': res['mae'],
            'MSE': res['mse'],
            'RMSE': res['rmse'],
            'Neg_Log_Likelihood': res['neg_log_likelihood']
        }

    try:
        df = load_beijing_data('/kaggle/input/beijing-multisite-airquality-data-set')

        print("\nDataset Information:")
        print(df.info())
        print("\nMissing values per column:")
        print(df.isnull().sum())

        pollutants = ['PM2.5', 'PM10', 'SO2', 'NO2', 'CO', 'O3']
        stations = df['station'].unique()
        print(f"\nFound {len(stations)} stations: {stations}")

        all_results = []

        selected_pollutant = 'PM2.5'
        selected_station = stations[0]

        # Part 1: one station/pollutant under every normalization setting.
        for normalize_method in [None, 'minmax', 'standard']:
            norm_name = normalize_method if normalize_method else 'none'
            banner = '=' * 50
            print(f"\n{banner}")
            print(f"Analyzing with {norm_name} normalization")
            print(f"{banner}")

            series, original_series, scaler, pollutant = prepare_time_series(
                df, selected_station, selected_pollutant, normalize_method
            )

            # Exploratory plots are only needed once, on the raw data.
            if normalize_method is None:
                analyze_time_series(series, pollutant, selected_station)

            result = run_arima_analysis(
                series, original_series, scaler,
                pollutant, selected_station
            )
            if result:
                all_results.append(
                    as_row(result, pollutant, selected_station, str(result['order']))
                )

            best_result, grid_results = find_best_arima_model(
                series, original_series, scaler,
                pollutant, selected_station
            )
            if best_result:
                all_results.append(
                    as_row(best_result, pollutant, selected_station,
                           f"Best-{best_result['order']}")
                )

        # Part 2: a small cross-section of other stations and pollutants.
        subset_stations = stations[:2]
        subset_pollutants = pollutants[1:3]

        for station in subset_stations:
            if station == selected_station:
                continue

            for pollutant in subset_pollutants:
                series, original_series, scaler, pollutant_name = prepare_time_series(
                    df, station, pollutant, normalize_method='standard'
                )

                result = run_arima_analysis(
                    series, original_series, scaler,
                    pollutant_name, station
                )
                if result:
                    all_results.append(
                        as_row(result, pollutant_name, station, str(result['order']))
                    )

        results_df = pd.DataFrame(all_results)

        if results_df.empty:
            print("No results to summarize")
        else:
            print("\nResults Summary:")
            print(results_df)

            results_df.to_csv('beijing_arima_results.csv', index=False)
            print("Results saved to 'beijing_arima_results.csv'")

            compare_results(results_df, 'RMSE')
            compare_results(results_df, 'MAE')

    except Exception as e:
        print(f"Error in main execution: {str(e)}")

# Run the Beijing analysis only when this code is executed directly (as a
# script or notebook cell), not when it is imported as a module.
if __name__ == "__main__":
    main()