# Step 3-2 ARIMAX

For ARIMA-X, the features we should add are exogenous variables.

Not all the features from the previous step, Feature Selection, can be used (for example, `cumulative_co2`).

* ARIMA-X already handles temporal dynamics, so `time lags` features are excluded

* Any `cumulative` features are excluded because they increase monotonically and are therefore not stationary

The selected features are:

`primary_energy_consumption`, `gdp`, `population`, `electricity_demand`

**workflow of ARIMAX**:

select exogenous variables -> check stationarity of exogenous variables -> difference exogenous variables to match ARIMA's parameter d ->

train ARIMAX on train data -> fit once -> forecast test period

### Necessary imports

In [31]:
# Necessary imports 
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.metrics import mean_squared_error, mean_absolute_error
from statsmodels.tsa.arima.model import ARIMA
from statsmodels.tsa.stattools import adfuller

import warnings
import os

warnings.filterwarnings("ignore")

### Config

In [15]:
# Column being forecast and the countries modelled in this notebook
TARGET_VARIABLES = 'co2'
SELECTED_COUNTRIES = ['United States', 'China', 'India']

# Number of trailing rows per country held out as the test set
TEST_SIZE = 9

# ARIMA optimal orders, one (p, d, q) tuple per country;
# the middle value d is the differencing order applied to the exogenous checks
ARIMA_ORDERS = {
    'United States': (0, 1, 0),
    'China': (0, 2, 0),
    'India': (1, 1, 1),
}

# Exogenous variables
# NOTE: 'electricity_demand' is intentionally left out because the available
# data for China and India only start in year 2000
EXOG_VARIABLES = [
    'gdp',
    'primary_energy_consumption',
    'population',
]

# Directory created up front (presumably where this step's results are written)
save_dir = 'data/03_02_results'
os.makedirs(save_dir, exist_ok=True)

### Data load

In [16]:
def load_data(save_dir='data'):
    """
    Read the prepared CSV files found in `save_dir` into DataFrames.

    Returns a dict keyed by dataset name. A file that does not exist is
    reported with a printed message and simply left out of the dict.
    """
    # (dataset name, file name) pairs, read in this order
    sources = (
        ('all_data_df', 'all_data_df.csv'),
        ('g20_lag_df', 'g20_lag_df.csv'),
        ('lag_three_sel_1969_df', 'lag_three_sel_1969_df.csv'),
    )

    dfs = {}
    for name, filename in sources:
        filepath = os.path.join(save_dir, filename)

        # Missing files are reported, not raised
        if not os.path.exists(filepath):
            print(f"{filepath} not found")
            continue

        dfs[name] = pd.read_csv(filepath)
        print(f"Loaded {name}: {dfs[name].shape}")

    return dfs

In [17]:
# Load every available dataset from the default 'data' directory
data = load_data()

all_data_df = data['all_data_df']
lag_three_sel_1969_df = data['lag_three_sel_1969_df']
g20_lag_df = data['g20_lag_df']

# Restrict the G20 lag table to 1969 <= year < 2023
from_1969 = g20_lag_df['year'] >= 1969
before_2023 = g20_lag_df['year'] < 2023
g20_lag_1969_df = g20_lag_df[from_1969 & before_2023].copy()

Loaded all_data_df: (55529, 200)
Loaded g20_lag_df: (3744, 992)
Loaded lag_three_sel_1969_df: (162, 992)


In [18]:
# Check whether each exogenous variable is a column of the dataset
dataset_columns = lag_three_sel_1969_df.columns
for feature in EXOG_VARIABLES:
    message = (
        f"    {feature} in the dataset"
        if feature in dataset_columns
        else f"    {feature} not in the dataset"
    )
    print(message)

    gdp in the dataset
    primary_energy_consumption in the dataset
    population in the dataset


### Data preparation

In [19]:
# train_test_split
def tts_by_year(df, test_size=9):
    """
    Split `df` into train and test sets, country by country.

    Each country's rows are sorted by 'year'; the last `test_size` rows go to
    the test set and the rows before them go to the train set. The per-country
    pieces are concatenated in order of first appearance of each country, and
    both returned frames get a fresh 0..n-1 index.

    Returns (train_df, test_df).
    """
    train_parts = []
    test_parts = []

    for country in df['country'].unique():
        ordered = df[df['country'] == country].sort_values('year')

        # Position of the first test row within this country's sorted rows
        cutoff = len(ordered) - test_size
        train_parts.append(ordered.iloc[:cutoff])
        test_parts.append(ordered.iloc[cutoff:])

    return (
        pd.concat(train_parts, ignore_index=True),
        pd.concat(test_parts, ignore_index=True),
    )

In [24]:
# Keep only the identifier columns, the target and the exogenous variables
cols_to_keep = ['country', 'year', TARGET_VARIABLES, *EXOG_VARIABLES]

# Copy the column subset, then drop every row that has a missing value
selected_data = lag_three_sel_1969_df[cols_to_keep].copy().dropna()

first_year = selected_data['year'].min()
last_year = selected_data['year'].max()
print(f"Data shape: {selected_data.shape}")
print(f"Year: {first_year} - {last_year}")

Data shape: (162, 6)
Year: 1969 - 2022


In [25]:
# Train/test split: the last TEST_SIZE rows of each country form the test set
train_df, test_df = tts_by_year(selected_data, TEST_SIZE)

train_years = train_df['year']
test_years = test_df['year']

print(f"Train shape: {train_df.shape}")
print(f"Test shape: {test_df.shape}")
print(f"Train year: {train_years.min()} - {train_years.max()}")
print(f"Test year: {test_years.min()} - {test_years.max()}")

Train shape: (135, 6)
Test shape: (27, 6)
Train year: 1969 - 2013
Test year: 2014 - 2022


### Helper

In [26]:
def mase(y_actual, y_pred, period=1):
    """
    Mean absolute scaled error of `y_pred` against `y_actual`.

    The forecast MAE is divided by the MAE of a naive forecast that predicts
    each value of `y_actual` with the value `period` steps earlier. Note that
    the naive baseline is computed on `y_actual` itself.

    Returns np.nan when `y_actual` is too short to build the naive forecast.
    When the naive MAE is 0, returns 0 if the forecast MAE is also 0 and
    np.inf otherwise.
    """
    forecast_error = mean_absolute_error(y_actual, y_pred)

    # A non-positive period falls back to a lag of one step
    lag = period if period > 0 else 1
    earlier_values = y_actual[:-lag]
    later_values = y_actual[lag:]

    if len(earlier_values) == 0:
        return np.nan

    naive_error = mean_absolute_error(later_values, earlier_values)

    if naive_error != 0:
        return forecast_error / naive_error

    # Naive forecast is perfect: avoid dividing by zero
    return 0 if forecast_error == 0 else np.inf

In [37]:
def adf_test(series, name='Series', country=''):
    """
    Run the augmented Dickey-Fuller test on `series` and print a short report.

    p_value < 0.05: Reject Null hypothesis -> stationary

    Missing values are removed first (dropna for a pandas Series, a NaN mask
    for anything else). `name` and `country` are only used in the printed
    header.

    Returns a dict with keys 'adf_stat', 'p_value', 'critical_values' and
    'is_stationary'.
    """
    # Remove missing values before running the test
    if isinstance(series, pd.Series):
        cleaned = series.dropna()
    else:
        cleaned = series[~np.isnan(series)]

    outcome = adfuller(cleaned, autolag='AIC')
    adf_stat, p_value = outcome[0], outcome[1]
    critical_values = outcome[4]

    # Stationary when the null hypothesis is rejected at the 5% level
    is_stationary = bool(p_value < 0.05)

    print(f"\n{name} ({country}):")
    print(f"ADF Statistics: {adf_stat:.4f}")
    print(f"p-value: {p_value:.4f}")
    print("Critical Value:")
    for level, threshold in critical_values.items():
        print(f"    {level}: {threshold:.4f}")

    return {
        'adf_stat': adf_stat,
        'p_value': p_value,
        'critical_values': critical_values,
        'is_stationary': is_stationary
    }

In [28]:
def check_stationarity(df, features, country):
    """
    Run `adf_test` on every column of `df` listed in `features`.

    Prints a header with the upper-cased country name, then one ADF report
    per feature. Returns a dict mapping each feature name to the result
    returned by `adf_test`.
    """
    print(f"\nFor {country.upper()}")

    # Each column is passed to the test as a plain array of values
    return {
        feature: adf_test(df[feature].values, name=feature, country=country)
        for feature in features
    }

In [29]:
def difference_series(series, order):
    """
    Difference `series` `order` times, dropping the NaN values after each pass.

    With order 0 a copy of the input is returned unchanged.
    """
    differenced = series.copy()
    for _ in range(order):
        # Each pass leaves a leading NaN, which dropna removes
        stepped = differenced.diff()
        differenced = stepped.dropna()
    return differenced

### Check Stationarity for each country

In [38]:
# Per-country ADF results: always 'original' and 'd', plus 'differenced'
# when the country's ARIMA order uses d > 0
stationarity_results = {}

for country in SELECTED_COUNTRIES:
    country_train = train_df[train_df['country'] == country].sort_values('year')

    print(f"\nOriginal features for {country.upper()}")
    original_results = check_stationarity(country_train, EXOG_VARIABLES, country)

    # Differencing order d is the middle element of the (p, d, q) tuple
    d = ARIMA_ORDERS[country][1]

    country_summary = {'original': original_results}

    if d > 0:
        differenced_results = {}
        for feature in EXOG_VARIABLES:
            # Difference the feature d times, matching the ARIMA order
            diff_data = np.diff(country_train[feature].values, n=d)
            differenced_results[feature] = adf_test(
                diff_data, name=f"{feature} d={d}", country=country
            )
        country_summary['differenced'] = differenced_results

    country_summary['d'] = d
    stationarity_results[country] = country_summary


Original features for UNITED STATES

For UNITED STATES

gdp (United States):
ADF Statistics: 0.3374
p-value: 0.9790
Critical Value:
    1%: -3.5925
    5%: -2.9315
    10%: -2.6041

primary_energy_consumption (United States):
ADF Statistics: -1.6422
p-value: 0.4611
Critical Value:
    1%: -3.5886
    5%: -2.9299
    10%: -2.6032

population (United States):
ADF Statistics: 5.3776
p-value: 1.0000
Critical Value:
    1%: -3.5966
    5%: -2.9333
    10%: -2.6050

gdp d=1 (United States):
ADF Statistics: -4.4699
p-value: 0.0002
Critical Value:
    1%: -3.5925
    5%: -2.9315
    10%: -2.6041

primary_energy_consumption d=1 (United States):
ADF Statistics: -5.4121
p-value: 0.0000
Critical Value:
    1%: -3.5925
    5%: -2.9315
    10%: -2.6041

population d=1 (United States):
ADF Statistics: -0.8223
p-value: 0.8125
Critical Value:
    1%: -3.6209
    5%: -2.9435
    10%: -2.6104

Original features for CHINA

For CHINA

gdp (China):
ADF Statistics: 14.5504
p-value: 1.0000
Critical Value:
  

In [39]:
# Print a compact stationarity summary for each country
for country in SELECTED_COUNTRIES:
    d = stationarity_results[country]['d']
    print(f"\n{country.upper()} - ARIMA order{ARIMA_ORDERS[country]}")

    # (header line, result key) pairs to report, in print order
    if d > 0:
        sections = [
            ("\nBefore differencing:", 'original'),
            (f"\nDifferenced features d = {d}:", 'differenced'),
        ]
    else:
        sections = [(" No differencing needed:", 'original')]

    for header, key in sections:
        print(header)
        for feature, result in stationarity_results[country][key].items():
            stat = "Stationary" if result['is_stationary'] else "Not Stationary"
            print(f"    {feature}: {stat}, p-value = {result['p_value']:.4f}")


UNITED STATES - ARIMA order(0, 1, 0)

Before differencing:
    gdp: Not Stationary, p-value = 0.9790
    primary_energy_consumption: Not Stationary, p-value = 0.4611
    population: Not Stationary, p-value = 1.0000

Differenced features d = 1:
    gdp: Stationary, p-value = 0.0002
    primary_energy_consumption: Stationary, p-value = 0.0000
    population: Not Stationary, p-value = 0.8125

CHINA - ARIMA order(0, 2, 0)

Before differencing:
    gdp: Not Stationary, p-value = 1.0000
    primary_energy_consumption: Not Stationary, p-value = 0.9982
    population: Stationary, p-value = 0.0111

Differenced features d = 2:
    gdp: Stationary, p-value = 0.0001
    primary_energy_consumption: Stationary, p-value = 0.0000
    population: Not Stationary, p-value = 0.0963

INDIA - ARIMA order(1, 1, 1)

Before differencing:
    gdp: Not Stationary, p-value = 1.0000
    primary_energy_consumption: Not Stationary, p-value = 1.0000
    population: Stationary, p-value = 0.0001

Differenced features 

We can see that, after differencing, population is not stationary for any of the countries, since it increases monotonically.

None of the features were stationary with the optimal ARIMA differencing order for India. For India, only ARIMA is usable.