# Recursive Framework

### Imports and Loads

In [None]:
# Necessary imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split, ShuffleSplit
from sklearn.metrics import mean_squared_error
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM, RepeatVector, TimeDistributed, Dropout, BatchNormalization
from tensorflow.keras.layers import Bidirectional, Conv1D, MaxPooling1D, Flatten
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
import time
import warnings
import pickle
import os
warnings.filterwarnings("ignore")

# Data load
# Both CSV files are read straight from the OWID GitHub repositories named in
# the URLs, so executing this cell requires network access.
df_co2 = pd.read_csv("https://raw.githubusercontent.com/owid/co2-data/master/owid-co2-data.csv")
df_energy = pd.read_csv("https://raw.githubusercontent.com/owid/energy-data/refs/heads/master/owid-energy-data.csv")

### Pre-processing the dataframe

In [2]:
# Column names that occur in both the CO2 and the energy dataframe; printed so
# the shared columns can be inspected before the two frames are merged.
common_columns = set(df_co2.columns).intersection(set(df_energy.columns))
print(common_columns)

{'energy_per_capita', 'population', 'iso_code', 'primary_energy_consumption', 'gdp', 'year', 'energy_per_gdp', 'country'}


In [3]:
# G20 countries
# NOTE(review): 'Europe' stands in for the G20's European Union seat here —
# presumably the dataset's continent-level aggregate; confirm it is the
# intended entity.
g20_countries = [
    'United States', 'China', 'Japan', 'Germany', 
    'United Kingdom', 'France', 'Italy', 'Canada',
    'Brazil', 'Russia', 'India', 'Australia', 
    'Mexico', 'Indonesia', 'Turkey', 'Saudi Arabia',
    'South Africa', 'Argentina', 'South Korea', 'Europe'
]

# Shared columns without iso_code (raises ValueError if iso_code is not shared)
common_cols = list(common_columns)
common_cols.remove('iso_code')

# Both frames are joined on (country, year)
merge_keys = ['country', 'year']

# Partition the column names:
#   remain_common_cols    - shared columns other than the merge keys and iso_code
#   df_*_unique_cols      - columns that exist in only one of the two frames
#   overlap_cols          - shared columns not covered by common_cols; since
#                           common_cols is "shared minus iso_code", this
#                           evaluates to just ['iso_code']
remain_common_cols = [col for col in common_cols if col not in merge_keys]
df_co2_unique_cols = list(set(df_co2.columns) - set(df_energy.columns) - set(merge_keys))
df_energy_unique_cols = list(set(df_energy.columns) - set(df_co2.columns) - set(merge_keys))
overlap_cols = list(set(df_co2.columns).intersection(set(df_energy.columns)) - set(common_cols) - set(merge_keys))

# Keep one row per (country, year) so the merges below cannot multiply rows
df_co2_clean = df_co2.drop_duplicates(merge_keys)
df_energy_clean = df_energy.drop_duplicates(merge_keys)

# Shared columns are taken from the CO2 frame only; the energy frame
# contributes just the columns that are unique to it
col_co2 = remain_common_cols + df_co2_unique_cols
col_energy = df_energy_unique_cols

# Outer join keeps (country, year) pairs that appear in either frame
merged_df = pd.merge(
    df_co2_clean[merge_keys + col_co2], 
    df_energy_clean[merge_keys + col_energy],
    on=merge_keys, 
    how='outer'
)

# Overlapping cols not in common cols: take the CO2 value and fall back to the
# energy value where the CO2 one is missing
if overlap_cols:
    for col in overlap_cols:
        # Temporary, suffixed copies of the column from each source
        co2_data = df_co2_clean[merge_keys + [col]].rename(columns={col: f"{col}_co2"})
        energy_data = df_energy_clean[merge_keys + [col]].rename(columns={col: f"{col}_energy"})

        merged_df = merged_df.merge(co2_data, on=merge_keys, how='left')
        merged_df = merged_df.merge(energy_data, on=merge_keys, how='left')

        merged_df[col] = merged_df[f"{col}_co2"].combine_first(merged_df[f"{col}_energy"])

        # Drop temp cols
        merged_df = merged_df.drop([f"{col}_co2", f"{col}_energy"], axis=1)

# NOTE(review): iso_code is the only column produced by the loop above and it
# is dropped again here, so the loop's result is discarded — confirm whether
# iso_code is needed downstream.
df = merged_df.drop(['iso_code'], axis=1).copy()
# Restrict the merged table to the entities listed in g20_countries
g20_df = df[df['country'].isin(g20_countries)].copy()

In [4]:
g20_df['country'].unique()

array(['Argentina', 'Australia', 'Brazil', 'Canada', 'China', 'Europe',
       'France', 'Germany', 'India', 'Indonesia', 'Italy', 'Japan',
       'Mexico', 'Russia', 'Saudi Arabia', 'South Africa', 'South Korea',
       'Turkey', 'United Kingdom', 'United States'], dtype=object)

In [5]:
# Time lag feature
def time_lag_feature(df, feature_columns, periods=(1, 2, 3, 4)):
    """Return a copy of ``df`` with lagged versions of the selected columns.

    For every column in ``feature_columns`` that exists in ``df`` and every
    lag ``k`` in ``periods`` a new column ``"<column>_lag<k>"`` is added that
    holds the value found ``k`` rows earlier.

    * When ``df`` has a ``country`` column, the shift is done inside each
      country so values never leak from one country into the next.
    * When ``df`` has both ``country`` and ``year``, rows are first sorted by
      ``['country', 'year']``; the returned frame keeps that order.
    * When ``df`` has no ``country`` column, the whole frame is treated as a
      single series and shifted in its existing row order.

    The lag is positional (``k`` rows back), so it equals ``k`` years only if
    each country has one row per consecutive year.

    Args:
        df: Input dataframe; it is not modified.
        feature_columns: Names of the columns to lag. Names missing from
            ``df`` are ignored.
        periods: Iterable of row offsets to generate.

    Returns:
        A new dataframe containing the original columns plus the lag columns.
    """
    dup_df = df.copy()
    has_country = 'country' in dup_df.columns

    if has_country and 'year' in dup_df.columns:
        dup_df = dup_df.sort_values(['country', 'year'])

    for col in feature_columns:
        if col not in dup_df.columns:
            continue
        for lag in periods:
            if has_country:
                # Group-wise shift keeps each country's history separate
                shifted = dup_df.groupby('country')[col].shift(lag)
            else:
                shifted = dup_df[col].shift(lag)
            dup_df[f"{col}_lag{lag}"] = shifted

    return dup_df

# Columns for which lagged copies (1 to 4 rows back) are generated
lag_features = [
    'co2',
    'gdp',
    'population',
    'primary_energy_consumption',
    'fossil_fuel_consumption',
    'renewables_consumption',
]

# Work on the key columns plus the lagged features only
lag_df = time_lag_feature(
    g20_df[['country', 'year', *lag_features]].copy(),
    lag_features,
    [1, 2, 3, 4],
)

In [6]:
lag_df.tail(3)

Unnamed: 0,country,year,co2,gdp,population,primary_energy_consumption,fossil_fuel_consumption,renewables_consumption,co2_lag1,co2_lag2,...,primary_energy_consumption_lag3,primary_energy_consumption_lag4,fossil_fuel_consumption_lag1,fossil_fuel_consumption_lag2,fossil_fuel_consumption_lag3,fossil_fuel_consumption_lag4,renewables_consumption_lag1,renewables_consumption_lag2,renewables_consumption_lag3,renewables_consumption_lag4
51601,United States,2022,5078.871,19493170000000.0,341534041.0,26504.305,21479.428,2993.056,5032.213,4714.628,...,26578.494,26768.986,21170.129,19936.998,21948.143,22212.854,2726.014,2590.245,2475.22,2399.24
51602,United States,2023,4911.391,,343477332.0,26189.199,21102.201,3052.564,5078.871,5032.213,...,24622.646,26578.494,21479.428,21170.129,19936.998,21948.143,2993.056,2726.014,2590.245,2475.22
51603,United States,2024,,,,,,,4911.391,5078.871,...,25956.828,24622.646,21102.201,21479.428,21170.129,19936.998,3052.564,2993.056,2726.014,2590.245


In [7]:
# Function for RMSE
def rmse(pred, actual):
    """Return the root of the mean squared difference between ``pred`` and ``actual``.

    Both arguments must support ``-`` and the result must offer ``.mean()``
    (e.g. NumPy arrays).
    """
    residual = pred - actual
    mean_squared_error_value = (residual ** 2).mean()
    return np.sqrt(mean_squared_error_value)

In [8]:
# Function to split sequence for train
def split_sequence(sequences, n_steps_in, n_steps_out):
    """Cut a dataframe into sliding (input, output) windows.

    Every window starts one row after the previous one; its first
    ``n_steps_in`` rows form the input and the following ``n_steps_out`` rows
    form the output. Only windows that fit completely inside ``sequences``
    are produced.

    Args:
        sequences: Dataframe (positional ``.iloc`` indexing is used).
        n_steps_in: Number of rows per input window.
        n_steps_out: Number of rows per output window.

    Returns:
        Tuple ``(X, y)`` of NumPy arrays built from the stacked windows.
    """
    total_rows = len(sequences)
    span = n_steps_in + n_steps_out
    # Start offsets whose complete input + output span still fits in the data
    starts = [s for s in range(total_rows) if s + span <= total_rows]

    inputs = [sequences.iloc[s:s + n_steps_in, :].values for s in starts]
    targets = [sequences.iloc[s + n_steps_in:s + span, :].values for s in starts]
    return np.array(inputs), np.array(targets)

In [9]:
# Function to reshape seq for pred
def reshape_sequence(sequence, n_steps_in):
    """Stack every full-length sliding input window of a 2-D array.

    Args:
        sequence: 2-D array indexed as ``sequence[rows, :]``.
        n_steps_in: Number of rows per window.

    Returns:
        NumPy array of all windows of ``n_steps_in`` consecutive rows, in
        order of their start row.
    """
    total_rows = len(sequence)
    # Only start offsets that leave room for a complete window are kept
    windows = [
        sequence[start:start + n_steps_in, :]
        for start in range(total_rows)
        if start + n_steps_in <= total_rows
    ]
    return np.array(windows)

### Modified function to add validation

In [10]:
# Function to create shuffled train/test sets with validation set
def shuffled_test_train_val(n_inp, n_out, df, test_size=0.2, val_size=0.1):
    """Window ``df`` and split the windows randomly into train / val / test.

    Each window covers ``n_inp + n_out`` consecutive rows. The windows are
    first split into (train + val) and test with ``test_size``; the first part
    is then split again so that the validation share is ``val_size`` of all
    windows. Both splits use ``ShuffleSplit`` with ``random_state=42``.

    Args:
        n_inp: Rows per window used as model input.
        n_out: Rows per window used as model output.
        df: Dataframe to window (positional ``.iloc`` slicing is used).
        test_size: Fraction of all windows placed in the test set.
        val_size: Fraction of all windows placed in the validation set.

    Returns:
        ``(train, val, test, X_train, y_train, X_val, y_val, X_test, y_test)``
        where the first three hold the full windows and each ``X_*`` / ``y_*``
        pair holds the first ``n_inp`` rows / the remaining rows of them.
    """
    from sklearn.model_selection import ShuffleSplit

    # Batches
    window = n_inp + n_out
    n_rows = len(df)
    batches = [
        df.iloc[start:start + window].values
        for start in range(n_rows)
        if start + window <= n_rows
    ]

    # First split for train val and test
    outer_split = ShuffleSplit(n_splits=1, test_size=test_size, random_state=42)
    train_val_ind, test_ind = next(outer_split.split(batches))
    print(f"Train & Val: {len(train_val_ind)} samples, Test: {len(test_ind)} samples.")

    # Second split for train and val; val_size is rescaled because it is now
    # taken from the (1 - test_size) remainder
    train_val_batches = [batches[i] for i in train_val_ind]
    inner_split = ShuffleSplit(n_splits=1, test_size=val_size/(1-test_size), random_state=42)
    train_ind, val_ind = next(inner_split.split(train_val_batches))
    print(f"Train: {len(train_ind)} samples, Val: {len(val_ind)} samples.")

    def gather(pool, indices):
        # Stack the selected windows into one array
        return np.array([pool[i] for i in indices])

    def halves(windows):
        # Input rows first, output rows second
        return windows[:, :n_inp, :], windows[:, n_inp:, :]

    # Final Split
    train = gather(train_val_batches, train_ind)
    val = gather(train_val_batches, val_ind)
    test = gather(batches, test_ind)

    X_train, y_train = halves(train)
    X_val, y_val = halves(val)
    X_test, y_test = halves(test)

    return train, val, test, X_train, y_train, X_val, y_val, X_test, y_test

## 2. The second experiment
### Which ML model to choose

In [None]:
# 1. LSTM model
def lstm_model(n_steps_in, n_features, n_steps_out, hidden=16):
    """Build and compile a single-layer LSTM network.

    The network takes inputs of shape ``(n_steps_in, n_features)``, passes
    them through one ReLU-activated LSTM layer with ``hidden`` units and ends
    in a dense layer with ``n_steps_out`` outputs. It is compiled with the
    Adam optimizer and MSE loss.
    """
    net = Sequential()
    net.add(LSTM(hidden, activation='relu', input_shape=(n_steps_in, n_features)))
    net.add(Dense(n_steps_out))
    net.compile(optimizer='adam', loss='mse')
    return net

In [None]:
# 2. Bi-directional LSTM model
def bilstm_model(n_steps_in, n_features, n_steps_out, hidden=16):
    """Build and compile a bidirectional LSTM network.

    A ReLU-activated LSTM with ``hidden`` units is wrapped in
    ``Bidirectional`` for inputs of shape ``(n_steps_in, n_features)`` and
    followed by a dense layer with ``n_steps_out`` outputs. It is compiled
    with the Adam optimizer and MSE loss.
    """
    recurrent = Bidirectional(
        LSTM(hidden, activation='relu'),
        input_shape=(n_steps_in, n_features),
    )
    head = Dense(n_steps_out)

    net = Sequential()
    net.add(recurrent)
    net.add(head)
    net.compile(optimizer='adam', loss='mse')
    return net

In [None]:
# 3. CNN model
def cnn_model(n_steps_in, n_features, n_steps_out, hidden=16):
    """Build and compile a 1-D convolutional network.

    Layers: ``Conv1D`` (64 filters, kernel size 2, ReLU) on inputs of shape
    ``(n_steps_in, n_features)``, a ``MaxPooling1D`` (pool size 2) that is
    only inserted when ``n_steps_in > 3``, ``Flatten``, a ReLU dense layer
    with ``hidden`` units and a dense output layer with ``n_steps_out``
    units. It is compiled with the Adam optimizer and MSE loss.
    """
    stack = [
        Conv1D(filters=64, kernel_size=2, activation='relu', input_shape=(n_steps_in, n_features))
    ]

    # Pooling is skipped for very short input windows
    if n_steps_in > 3:
        stack.append(MaxPooling1D(pool_size=2))

    stack.append(Flatten())
    stack.append(Dense(hidden, activation='relu'))
    stack.append(Dense(n_steps_out))

    net = Sequential(stack)
    net.compile(optimizer='adam', loss='mse')
    return net

In [None]:
# 4. ED LSTM model
def edlstm_model(n_steps_in, n_features, n_steps_out, hidden=16):
    """Build and compile an encoder-decoder LSTM network.

    The encoder LSTM reads inputs of shape ``(n_steps_in, n_features)``; its
    output is repeated ``n_steps_out`` times and decoded by a second LSTM that
    returns the full sequence. A time-distributed ``Dense(1)`` then yields one
    value per output step, so targets need a trailing axis of size 1. It is
    compiled with the Adam optimizer and MSE loss.
    """
    net = Sequential()
    # Encoder
    net.add(LSTM(hidden, activation='relu', input_shape=(n_steps_in, n_features)))
    net.add(RepeatVector(n_steps_out))
    # Decoder
    net.add(LSTM(hidden, activation='relu', return_sequences=True))
    net.add(TimeDistributed(Dense(1)))
    net.compile(optimizer='adam', loss='mse')
    return net

In [None]:
# Registry that maps a display name to the builder function of that
# architecture. Each value is called as builder(n_steps_in, n_features,
# n_steps_out) and returns a compiled model.
model_types = {
    'LSTM': lstm_model,
    'Bi-LSTM': bilstm_model,
    'CNN': cnn_model,
    'ED-LSTM': edlstm_model
}

In [None]:
# Finding the best model
def find_best_model(feature_data, target_feature, n_steps_in=5, n_steps_out=1, models_to_test=None, epochs=20):
    """Train every candidate architecture and keep the one with the lowest validation RMSE.

    The data is min-max scaled column by column (``country`` and ``year`` are
    left untouched), windowed and split by ``shuffled_test_train_val``, and
    each model named in ``models_to_test`` is fitted to predict
    ``target_feature`` for the next ``n_steps_out`` rows from all numeric
    columns of the previous ``n_steps_in`` rows.

    Args:
        feature_data: Dataframe with the model inputs; must contain
            ``target_feature`` as a numeric column.
        target_feature: Name of the column to predict.
        n_steps_in: Rows per input window.
        n_steps_out: Rows per prediction window.
        models_to_test: Keys of ``model_types`` to evaluate; all of them when
            ``None``.
        epochs: Maximum number of training epochs per model.

    Returns:
        ``None`` when ``feature_data`` has fewer than
        ``n_steps_in + n_steps_out + 10`` rows, otherwise a dict with the keys
        ``best_model``, ``best_model_name``, ``best_rmse``, ``model_results``
        (validation RMSE per model name) and ``target_feature``. RMSE values
        are measured on the scaled data.
    """
    if models_to_test is None:
        models_to_test = list(model_types.keys())

    print(f"\nTesting models for {target_feature}")
    print(f"Data shape: {feature_data.shape}")

    if len(feature_data) < n_steps_in + n_steps_out + 10:
        print(f"Not enough data for {target_feature}")
        return None

    # NOTE(review): the scaler is fitted on the whole series before the split,
    # so validation/test ranges influence the training data. 'year' is left
    # unscaled but is still part of the numeric model input — confirm both are
    # intended.
    scaler = MinMaxScaler()
    scaled_data = feature_data.copy()
    for col in scaled_data.columns:
        if col not in ['country', 'year']:
            scaled_data[col] = scaler.fit_transform(scaled_data[col].values.reshape(-1, 1)).ravel()

    model_data = scaled_data.select_dtypes(include=np.number)

    # Only the train and validation parts are needed for model selection
    _, _, _, X_train, y_train, X_val, y_val, _, _ = shuffled_test_train_val(
        n_steps_in, n_steps_out, model_data)

    target_ind = list(model_data.columns).index(target_feature)

    y_train_target = y_train[:, :, target_ind].reshape(-1, n_steps_out)
    y_val_target = y_val[:, :, target_ind].reshape(-1, n_steps_out)

    n_features = X_train.shape[2]
    best_model = None
    best_rmse = float('inf')
    best_model_name = None
    model_results = {}

    for model_name in models_to_test:
        print(f"Test {model_name}")

        model = model_types[model_name](n_steps_in, n_features, n_steps_out)

        # The encoder-decoder model emits (samples, n_steps_out, 1), so its
        # targets need a trailing axis and its predictions lose it again
        sequence_output = model_name == 'ED-LSTM'
        if sequence_output:
            y_fit = y_train_target.reshape(y_train_target.shape[0], y_train_target.shape[1], 1)
            y_val_fit = y_val_target.reshape(y_val_target.shape[0], y_val_target.shape[1], 1)
        else:
            y_fit = y_train_target
            y_val_fit = y_val_target

        # Keras spells the keyword `batch_size`; `batchsize` raises TypeError
        model.fit(
            X_train, y_fit,
            epochs=epochs, batch_size=32, verbose=0, validation_data=(X_val, y_val_fit),
            callbacks=[EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)]
        )

        val_pred = model.predict(X_val)
        if sequence_output:
            val_pred = np.squeeze(val_pred, axis=2)

        val_rmse = rmse(val_pred, y_val_target)
        model_results[model_name] = val_rmse

        print(f"{model_name} - validation RMSE: {val_rmse:.4f}")

        if val_rmse < best_rmse:
            best_rmse = val_rmse
            best_model = model
            best_model_name = model_name

    return {
        'best_model': best_model,
        'best_model_name': best_model_name,
        'best_rmse': best_rmse,
        'model_results': model_results,
        'target_feature': target_feature
    }

In [None]:
# Using the best model for each feature
def run_best_model_for_feature(country, target_features=None, start_year=1960, n_steps_in=5, n_steps_out=1):
    """Run the model search for each target feature of one country.

    Rows of ``g20_df`` for ``country`` from ``start_year`` onwards are
    selected, reduced to ``year`` plus all ``float64`` / ``int64`` columns and
    stripped of rows containing any missing value. ``find_best_model`` is then
    called once per target feature on that table.

    Args:
        country: Country name as it appears in ``g20_df['country']``.
        target_features: Columns to model; defaults to ``['co2', 'gdp',
            'primary_energy_consumption']``. Names that are not columns of the
            data are ignored.
        start_year: First year to include.
        n_steps_in: Rows per input window.
        n_steps_out: Rows per prediction window.

    Returns:
        Dict mapping each successfully modelled feature to the result dict of
        ``find_best_model``. Features for which no model could be trained
        (too little data) are reported and left out.
    """
    print(f"Running Best Model for {country.upper()}")

    in_scope = (g20_df['country'] == country) & (g20_df['year'] >= start_year)
    country_data = g20_df[in_scope].copy()

    if target_features is None:
        target_features = ['co2', 'gdp', 'primary_energy_consumption']

    avail_features = [f for f in target_features if f in country_data.columns]

    # The input table does not depend on the target, so build it once
    numerical_features = [f for f in country_data.columns
                          if f not in ['country', 'year'] and country_data[f].dtype in ['float64', 'int64']]
    feature_data = country_data[['year'] + numerical_features].dropna()

    best_models = {}
    for feature in avail_features:
        print(f"Find best model for {feature}")

        result = find_best_model(feature_data, feature, n_steps_in, n_steps_out)
        if result is None:
            # find_best_model returns None when there are too few rows
            print(f"Skipping {feature}: no model could be trained")
            continue

        best_models[feature] = result
        print(f"Best model for {feature}: {result['best_model_name']} (RMSE: {result['best_rmse']:.4f})")

    return best_models

In [None]:
# Save the best model
def save_model(best_models, country, filepath=r'C:\Users\sodjs\RL/'):
    """Persist the selected model of every feature and a pickled summary.

    Each model is written to ``<filepath>/<country>_<feature>_model.h5`` via
    its ``save`` method, and a dict describing all saved models is pickled to
    ``<filepath>/<country>_model_result.pk1``.

    Args:
        best_models: Mapping ``feature -> result dict`` with the keys
            ``best_model``, ``best_model_name``, ``best_rmse`` and
            ``target_feature``. Entries that are ``None`` or carry no model
            are skipped.
        country: Country name used as the file name prefix.
        filepath: Output directory; created if missing. An empty string means
            the current working directory.

    Returns:
        Dict mapping each saved feature to ``model_path``, ``model_name``,
        ``rmse`` and ``target_feature``.
    """
    # The default is a raw string: in a normal literal "\U" starts a unicode
    # escape and makes the file fail to compile.
    if filepath:
        os.makedirs(filepath, exist_ok=True)

    models = {}
    for feature, model_result in best_models.items():
        if model_result is None or model_result.get('best_model') is None:
            print(f"No trained model for {feature}; nothing saved")
            continue

        # os.path.join works with or without a trailing separator on filepath
        model_path = os.path.join(filepath, f"{country}_{feature}_model.h5")
        model_result['best_model'].save(model_path)

        models[feature] = {
            'model_path': model_path,
            'model_name': model_result['best_model_name'],
            'rmse': model_result['best_rmse'],
            'target_feature': model_result['target_feature']
        }

    # NOTE(review): the ".pk1" extension looks like a typo for ".pkl"; it is
    # kept because code that loads the file may rely on this exact name.
    with open(os.path.join(filepath, f"{country}_model_result.pk1"), 'wb') as f:
        pickle.dump(models, f)

    print(f"Saved model for {country}")
    return models
    return models