# Step 3-1 Feature Selection

In [51]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

import os
import warnings

warnings.filterwarnings("ignore")

In [52]:
# Column every candidate feature is correlated against.
TARGET_VARIABLES = 'co2'
# Index of the last `<feature>_lag<k>` column handled by calculate_pct_change.
MAX_LAGS = 4
# Columns that are never treated as candidate features.
COLS_TO_EXCLUDE = ['co2', 'country', 'year', 'iso_code']

SELECTED_COUNTRIES = ['United States', 'China', 'India']

# NOTE: despite the name, this is the "G7 + 3" group used by the reports below:
# the three selected countries followed by the remaining G7 members.
G7_COUNTRIES = SELECTED_COUNTRIES + [
    'Canada',
    'France',
    'Germany',
    'Italy',
    'Japan',
    'United Kingdom',
]

# The 19 individual countries of the G20.
G20_COUNTRIES = [
    'United States',
    'China',
    'Japan',
    'Germany',
    'United Kingdom',
    'France',
    'Italy',
    'Canada',
    'Brazil',
    'Russia',
    'India',
    'Australia',
    'Mexico',
    'Indonesia',
    'Turkey',
    'Saudi Arabia',
    'South Africa',
    'Argentina',
    'South Korea',
]

In [53]:
def load_data(save_dir='data'):
    """Read the prepared CSV datasets found in ``save_dir``.

    Args:
        save_dir: Directory that holds the CSV files.

    Returns:
        dict mapping dataset name -> DataFrame. A dataset whose file does not
        exist is reported on stdout and left out of the dict, so callers that
        index the result directly get a ``KeyError`` for it.
    """
    csv_names = {
        'all_data_df': 'all_data_df.csv',
        'g20_lag_df': 'g20_lag_df.csv',
        'lag_three_sel_1969_df': 'lag_three_sel_1969_df.csv',
    }

    dfs = {}
    for name, csv_name in csv_names.items():
        filepath = os.path.join(save_dir, csv_name)
        if not os.path.exists(filepath):
            print(f"{filepath} not found")
            continue
        frame = pd.read_csv(filepath)
        dfs[name] = frame
        print(f"Loaded {name}: {frame.shape}")

    return dfs

In [54]:
# Load every prepared dataset; a missing CSV surfaces here as a KeyError.
data = load_data()
all_data_df = data['all_data_df']
lag_three_sel_1969_df = data['lag_three_sel_1969_df']
g20_lag_df = data['g20_lag_df']

# Keep only the G20 rows with 1969 <= year < 2023.
g20_lag_1969_df = g20_lag_df.loc[
    (g20_lag_df['year'] >= 1969) & (g20_lag_df['year'] < 2023)
].copy()

Loaded all_data_df: (55529, 200)
Loaded g20_lag_df: (3744, 992)
Loaded lag_three_sel_1969_df: (162, 992)


In [55]:
def find_feature_cols(df, exclude_cols):
    """List the numeric columns of ``df`` that are not named in ``exclude_cols``.

    Args:
        df: DataFrame to inspect.
        exclude_cols: Collection of column names to leave out.

    Returns:
        list of column names, in the order they appear in ``df``.
    """
    feature_cols = []
    for name in df.select_dtypes(include=[np.number]).columns.tolist():
        if name in exclude_cols:
            continue
        feature_cols.append(name)
    return feature_cols

In [56]:
# Candidate feature columns for the 3-country table and for the G20 table.
feature_cols = find_feature_cols(
    df=lag_three_sel_1969_df,
    exclude_cols=COLS_TO_EXCLUDE,
)
g20_feature_cols = find_feature_cols(
    df=g20_lag_1969_df,
    exclude_cols=COLS_TO_EXCLUDE,
)

In [57]:
def calculate_pct_change(df, features, max_lags=MAX_LAGS):
    """Add percentage-change columns for each feature and its lagged copies.

    The frame is sorted by ``country`` and ``year`` and re-indexed from 0. For
    every name in ``features`` that is a column of the frame, the following
    columns are added when their inputs exist:

    * ``<feature>_pct_change``: change of the feature relative to
      ``<feature>_lag1``, in percent.
    * ``<feature>_lag<k>_pct_change`` for ``k = 1 .. max_lags - 1``: change of
      ``<feature>_lag<k>`` relative to ``<feature>_lag<k+1>``, in percent.
    * ``<feature>_lag<max_lags>_pct_change``: the previous row's
      ``<feature>_lag<max_lags - 1>_pct_change`` within the same country. Only
      the first row of each country is set to 0; any other missing value stays
      NaN instead of being reported as a 0 % change.
      NOTE(review): taking the previous row assumes each country's rows are
      consecutive years - confirm against the input data.

    Args:
        df: Input frame with ``country`` and ``year`` columns. It is not modified.
        features: Iterable of base feature column names.
        max_lags: Index of the last lag column per feature.

    Returns:
        tuple ``(frame, pct_change_cols)``: the sorted frame with the new
        columns, where every +/-inf value in the frame has been replaced by
        NaN, and the list of added column names in creation order.
    """
    # sort_values returns a new frame, so the caller's df is never modified.
    df_copy = df.sort_values(['country', 'year']).reset_index(drop=True)
    pct_change_cols = []

    # After the sort, the first occurrence of a country is its earliest row.
    is_first_row = ~df_copy['country'].duplicated()

    for feature in features:
        if feature not in df_copy.columns:
            continue

        # Pct change on current values
        lag1_col = f"{feature}_lag1"
        if lag1_col in df_copy.columns:
            df_copy[f"{feature}_pct_change"] = (
                (df_copy[feature] - df_copy[lag1_col]) / df_copy[lag1_col] * 100
            )
            pct_change_cols.append(f"{feature}_pct_change")

        # Pct change on lagged values
        for lag in range(1, max_lags):
            lag_col = f"{feature}_lag{lag}"
            prev_lag_col = f"{feature}_lag{lag+1}"

            if lag_col in df_copy.columns and prev_lag_col in df_copy.columns:
                df_copy[f"{lag_col}_pct_change"] = (
                    (df_copy[lag_col] - df_copy[prev_lag_col]) / df_copy[prev_lag_col] * 100
                )
                pct_change_cols.append(f"{lag_col}_pct_change")

        # The last lag has no older column to compare with, so reuse the
        # previous row's value of the second-to-last lag's pct change.
        last_lag_col = f"{feature}_lag{max_lags}"
        prev_pct_col = f"{feature}_lag{max_lags-1}_pct_change"

        if last_lag_col in df_copy.columns and prev_pct_col in df_copy.columns:
            shifted = df_copy.groupby('country')[prev_pct_col].shift(1)
            # Zero only the first row of each country, where the shift has
            # nothing to read; real gaps in the data must remain NaN.
            df_copy[f"{last_lag_col}_pct_change"] = shifted.mask(is_first_row, 0)
            pct_change_cols.append(f"{last_lag_col}_pct_change")

    # Division by zero yields +/-inf; clean the whole frame once, after all
    # columns exist, instead of rescanning it for every feature.
    df_copy = df_copy.replace([np.inf, -np.inf], np.nan)

    return df_copy, pct_change_cols

In [58]:
# Percentage-change features for the 3-country table and for the G20 table.
df_pct_change, pct_change_cols = calculate_pct_change(
    lag_three_sel_1969_df, feature_cols, max_lags=MAX_LAGS
)
g20_df_pct_change, g20_pct_change_cols = calculate_pct_change(
    g20_lag_1969_df, g20_feature_cols, max_lags=MAX_LAGS
)

# Row subsets for each country group; .copy() detaches them from the source frames.
df_selected = df_pct_change.loc[
    df_pct_change['country'].isin(SELECTED_COUNTRIES)
].copy()
g7_df_selected = g20_df_pct_change.loc[
    g20_df_pct_change['country'].isin(G7_COUNTRIES)
].copy()
g20_df_selected = g20_df_pct_change.loc[
    g20_df_pct_change['country'].isin(G20_COUNTRIES)
].copy()

In [59]:
def calculate_correlations(df, target, pct_change_cols, countries):
    """Correlate the target with each percentage-change column, per country.

    Args:
        df: Frame with a ``country`` column, the target column and the
            percentage-change columns.
        target: Name of the target column.
        pct_change_cols: Candidate column names; names absent from ``df`` are
            ignored.
        countries: Country names to process.

    Returns:
        dict mapping country -> Series of correlations with ``target``, indexed
        by column name, NaN entries removed, sorted from highest to lowest.
        A country is skipped (with a message on stdout) when the target column
        is missing or none of the candidate columns exist.
    """
    results = {}

    for name in countries:
        subset = df.loc[df['country'] == name]
        available = [col for col in pct_change_cols if col in subset.columns]

        if target not in subset.columns or not available:
            print(f"Error of {name}")
            continue

        # Full correlation matrix of target + candidates; keep the target's column only.
        against_target = subset[[target, *available]].corr()[target]
        results[name] = (
            against_target
            .drop(target)
            .dropna()
            .sort_values(ascending=False)
        )

    return results

In [60]:
# Per-country correlations with the target for each of the three country groups.
country_correlations = calculate_correlations(
    df=df_selected,
    target=TARGET_VARIABLES,
    pct_change_cols=pct_change_cols,
    countries=SELECTED_COUNTRIES,
)
g7_correlations = calculate_correlations(
    df=g7_df_selected,
    target=TARGET_VARIABLES,
    pct_change_cols=g20_pct_change_cols,
    countries=G7_COUNTRIES,
)
g20_correlations = calculate_correlations(
    df=g20_df_selected,
    target=TARGET_VARIABLES,
    pct_change_cols=g20_pct_change_cols,
    countries=G20_COUNTRIES,
)

In [61]:
# Number of features with a usable (non-NaN) correlation, per selected country
for country in country_correlations:
    corr = country_correlations[country]
    print(f"{country}: {len(corr)} features")

United States: 985 features
China: 970 features
India: 945 features


In [62]:
# Number of features with a usable (non-NaN) correlation, per country of the G7 + 3 group
for country in g7_correlations:
    corr = g7_correlations[country]
    print(f"{country}: {len(corr)} features")

United States: 985 features
China: 970 features
India: 945 features
Canada: 967 features
France: 985 features
Germany: 985 features
Italy: 977 features
Japan: 955 features
United Kingdom: 970 features


In [63]:
# Number of features with a usable (non-NaN) correlation, per G20 country
for country in g20_correlations:
    corr = g20_correlations[country]
    print(f"{country}: {len(corr)} features")

United States: 985 features
China: 970 features
Japan: 955 features
Germany: 985 features
United Kingdom: 970 features
France: 985 features
Italy: 977 features
Canada: 967 features
Brazil: 970 features
Russia: 970 features
India: 945 features
Australia: 920 features
Mexico: 945 features
Indonesia: 915 features
Turkey: 940 features
Saudi Arabia: 715 features
South Africa: 940 features
Argentina: 940 features
South Korea: 910 features


In [64]:
# Output folder for every report written by this notebook.
save_dir = 'data/03_01_results'
os.makedirs(save_dir, exist_ok=True)

# One Markdown report holding a correlation table per selected country.
md_filepath = os.path.join(save_dir, 'correlation_by_3_countries.md')
with open(md_filepath, 'w') as f:
    f.write(
        "# Correlation Analysis with Percentage Change Normalised Features for 3 Countries\n\n"
        "---\n\n"
    )

    for country in SELECTED_COUNTRIES:
        f.writelines([
            f"## {country}\n\n",
            f"### Correlation with {TARGET_VARIABLES}\n\n",
        ])

        corr_df = country_correlations[country].to_frame(name='Correlation')
        corr_df.index.name = 'Feature'
        f.writelines([corr_df.to_markdown(), "\n\n---\n\n"])

In [65]:
# One Markdown report holding a correlation table per country of the G7 + 3 group.
md_filepath = os.path.join(save_dir, 'correlation_by_g7_and_3.md')
with open(md_filepath, 'w') as f:
    f.write(
        "# Correlation Analysis with Percentage Change Normalised Features for G7 + 3 Countries\n\n"
        "---\n\n"
    )

    for country in G7_COUNTRIES:
        f.writelines([
            f"## {country}\n\n",
            f"### Correlation with {TARGET_VARIABLES}\n\n",
        ])

        corr_df = g7_correlations[country].to_frame(name='Correlation')
        corr_df.index.name = 'Feature'
        f.writelines([corr_df.to_markdown(), "\n\n---\n\n"])

In [66]:
# One Markdown report holding a correlation table per G20 country.
md_filepath = os.path.join(save_dir, 'correlation_by_g20_countries.md')
with open(md_filepath, 'w') as f:
    f.write(
        "# Correlation Analysis with Percentage Change Normalised Features for G20 Countries\n\n"
        "---\n\n"
    )

    for country in G20_COUNTRIES:
        f.writelines([
            f"## {country}\n\n",
            f"### Correlation with {TARGET_VARIABLES}\n\n",
        ])

        corr_df = g20_correlations[country].to_frame(name='Correlation')
        corr_df.index.name = 'Feature'
        f.writelines([corr_df.to_markdown(), "\n\n---\n\n"])

In [67]:
def create_overall_ranking(country_correlations, country_columns=None):
    """Rank features by their mean absolute correlation across countries.

    Args:
        country_correlations: Mapping of country name -> Series of correlations
            indexed by feature name.
        country_columns: Optional mapping of output column name -> country name
            selecting which per-country values are shown. Defaults to
            ``US_Corr``, ``China_Corr`` and ``India_Corr``.

    Returns:
        DataFrame with columns ``Feature``, ``Mean_Abs_Corr`` and one column per
        entry of ``country_columns``, sorted by ``Mean_Abs_Corr`` from highest
        to lowest. The per-country columns hold absolute correlations and are
        NaN where a country has no value for the feature. The mean is taken
        over every country that has a value for the feature. Empty input gives
        an empty frame with the same columns.
    """
    if country_columns is None:
        country_columns = {
            'US_Corr': 'United States',
            'China_Corr': 'China',
            'India_Corr': 'India',
        }

    # feature -> {country: |correlation|}, features kept in first-seen order.
    all_features = {}
    for country, corr in country_correlations.items():
        for feature, value in corr.items():
            all_features.setdefault(feature, {})[country] = abs(value)

    # Calculate mean absolute correlation
    ranking_data = []
    for feature, country_values in all_features.items():
        row = {
            'Feature': feature,
            'Mean_Abs_Corr': np.mean(list(country_values.values())),
        }
        for column, country in country_columns.items():
            row[column] = country_values.get(country, np.nan)
        ranking_data.append(row)

    # Naming the columns explicitly keeps the sort key present even when
    # there are no rows, so empty input no longer raises KeyError.
    ranking_df = pd.DataFrame(
        ranking_data,
        columns=['Feature', 'Mean_Abs_Corr', *country_columns],
    )
    return ranking_df.sort_values('Mean_Abs_Corr', ascending=False)

In [68]:
# Rank features over the 3 selected countries and preview the 40 highest-ranked rows.
ranking_3_df = create_overall_ranking(country_correlations=country_correlations)
ranking_3_df.head(n=40)

Unnamed: 0,Feature,Mean_Abs_Corr,US_Corr,China_Corr,India_Corr
984,cumulative_luc_co2_pct_change,0.780654,0.786938,0.86641,0.688616
983,cumulative_luc_co2_lag1_pct_change,0.763779,0.741438,0.868791,0.681108
980,cumulative_luc_co2_lag2_pct_change,0.745644,0.703326,0.868895,0.664712
977,cumulative_oil_co2_lag3_pct_change,0.724537,0.661291,0.666847,0.845474
975,cumulative_oil_co2_lag2_pct_change,0.717563,0.644813,0.652289,0.855587
974,cumulative_oil_co2_lag1_pct_change,0.710328,0.63218,0.640786,0.858017
970,cumulative_luc_co2_lag3_pct_change,0.708239,0.596091,0.8723,0.656327
973,cumulative_oil_co2_pct_change,0.706955,0.626816,0.631062,0.862986
880,population_lag3_pct_change,0.694447,0.295118,0.814994,0.973229
762,population_lag2_pct_change,0.654926,0.174106,0.813517,0.977155


In [69]:
def create_overall_g7_ranking(g7_correlations, country_columns=None):
    """Rank features by their mean absolute correlation across the G7 + 3 group.

    Args:
        g7_correlations: Mapping of country name -> Series of correlations
            indexed by feature name.
        country_columns: Optional mapping of output column name -> country name
            selecting which per-country values are shown. Defaults to the nine
            columns ``US_Corr`` ... ``UK_Corr`` listed below.

    Returns:
        DataFrame with columns ``Feature``, ``Mean_Abs_Corr`` and one column per
        entry of ``country_columns``, sorted by ``Mean_Abs_Corr`` from highest
        to lowest. The per-country columns hold absolute correlations and are
        NaN where a country has no value for the feature. The mean is taken
        over every country that has a value for the feature. Empty input gives
        an empty frame with the same columns.
    """
    if country_columns is None:
        country_columns = {
            'US_Corr': 'United States',
            'China_Corr': 'China',
            'India_Corr': 'India',
            'Canada_Corr': 'Canada',
            'France_Corr': 'France',
            'Germany_Corr': 'Germany',
            'Italy_Corr': 'Italy',
            'Japan_Corr': 'Japan',
            'UK_Corr': 'United Kingdom',
        }

    # feature -> {country: |correlation|}, features kept in first-seen order.
    all_features = {}
    for country, corr in g7_correlations.items():
        for feature, value in corr.items():
            all_features.setdefault(feature, {})[country] = abs(value)

    # Calculate mean absolute correlation
    ranking_data = []
    for feature, country_values in all_features.items():
        row = {
            'Feature': feature,
            'Mean_Abs_Corr': np.mean(list(country_values.values())),
        }
        for column, country in country_columns.items():
            row[column] = country_values.get(country, np.nan)
        ranking_data.append(row)

    # Naming the columns explicitly keeps the sort key present even when
    # there are no rows, so empty input no longer raises KeyError.
    ranking_df = pd.DataFrame(
        ranking_data,
        columns=['Feature', 'Mean_Abs_Corr', *country_columns],
    )
    return ranking_df.sort_values('Mean_Abs_Corr', ascending=False)

In [70]:
# Rank features over the G7 + 3 group and preview the 40 highest-ranked rows.
ranking_g7_df = create_overall_g7_ranking(g7_correlations=g7_correlations)
ranking_g7_df.head(n=40)

Unnamed: 0,Feature,Mean_Abs_Corr,US_Corr,China_Corr,India_Corr,Canada_Corr,France_Corr,Germany_Corr,Italy_Corr,Japan_Corr,UK_Corr
977,cumulative_oil_co2_lag3_pct_change,0.726445,0.661291,0.666847,0.845474,0.872545,0.843773,0.689195,0.504449,0.782703,0.671723
975,cumulative_oil_co2_lag2_pct_change,0.720574,0.644813,0.652289,0.855587,0.863579,0.831349,0.683576,0.497797,0.776436,0.679744
974,cumulative_oil_co2_lag1_pct_change,0.714839,0.63218,0.640786,0.858017,0.853315,0.820108,0.677775,0.493927,0.770049,0.687391
951,cumulative_co2_lag3_pct_change,0.713912,0.545295,0.469085,0.737206,0.873429,0.91475,0.91267,0.35449,0.740058,0.878226
936,cumulative_co2_pct_change,0.711347,0.516656,0.590088,0.53729,0.870113,0.933854,0.918392,0.35058,0.755631,0.929517
973,cumulative_oil_co2_pct_change,0.710347,0.626816,0.631062,0.862986,0.844174,0.804631,0.675806,0.492111,0.767241,0.6883
949,cumulative_co2_lag2_pct_change,0.709826,0.538411,0.482149,0.649232,0.878444,0.925621,0.91153,0.353316,0.749533,0.900196
940,cumulative_co2_lag1_pct_change,0.709311,0.526344,0.533897,0.589802,0.875997,0.930305,0.910011,0.35204,0.754296,0.91111
972,cumulative_cement_co2_lag3_pct_change,0.660828,0.626336,0.761688,0.097306,0.869289,0.838221,0.742356,0.439427,0.848557,0.724273
918,cumulative_co2_including_luc_lag2_pct_change,0.652411,0.442887,0.190635,0.963058,0.776958,0.927897,0.91989,0.053658,0.70312,0.893597


In [71]:
def create_overall_g20_ranking(g20_correlations, country_columns=None):
    """Rank features by their mean absolute correlation across the G20 countries.

    ``Mean_Abs_Corr`` averages over every country in ``g20_correlations`` that
    has a value for the feature, while by default only three countries are
    shown as columns. Pass ``country_columns`` to display others.

    Args:
        g20_correlations: Mapping of country name -> Series of correlations
            indexed by feature name.
        country_columns: Optional mapping of output column name -> country name
            selecting which per-country values are shown. Defaults to
            ``US_Corr``, ``China_Corr`` and ``India_Corr``.

    Returns:
        DataFrame with columns ``Feature``, ``Mean_Abs_Corr`` and one column per
        entry of ``country_columns``, sorted by ``Mean_Abs_Corr`` from highest
        to lowest. The per-country columns hold absolute correlations and are
        NaN where a country has no value for the feature. Empty input gives an
        empty frame with the same columns.
    """
    if country_columns is None:
        country_columns = {
            'US_Corr': 'United States',
            'China_Corr': 'China',
            'India_Corr': 'India',
        }

    # feature -> {country: |correlation|}, features kept in first-seen order.
    all_features = {}
    for country, corr in g20_correlations.items():
        for feature, value in corr.items():
            all_features.setdefault(feature, {})[country] = abs(value)

    # Calculate mean absolute correlation
    ranking_data = []
    for feature, country_values in all_features.items():
        row = {
            'Feature': feature,
            'Mean_Abs_Corr': np.mean(list(country_values.values())),
        }
        for column, country in country_columns.items():
            row[column] = country_values.get(country, np.nan)
        ranking_data.append(row)

    # Naming the columns explicitly keeps the sort key present even when
    # there are no rows, so empty input no longer raises KeyError.
    ranking_df = pd.DataFrame(
        ranking_data,
        columns=['Feature', 'Mean_Abs_Corr', *country_columns],
    )
    return ranking_df.sort_values('Mean_Abs_Corr', ascending=False)

In [72]:
# Rank features over the G20 countries and preview the 40 highest-ranked rows.
ranking_g20_df = create_overall_g20_ranking(g20_correlations=g20_correlations)
ranking_g20_df.head(n=40)

Unnamed: 0,Feature,Mean_Abs_Corr,US_Corr,China_Corr,India_Corr
973,cumulative_oil_co2_pct_change,0.722374,0.626816,0.631062,0.862986
974,cumulative_oil_co2_lag1_pct_change,0.718888,0.63218,0.640786,0.858017
951,cumulative_co2_lag3_pct_change,0.715253,0.545295,0.469085,0.737206
936,cumulative_co2_pct_change,0.714848,0.516656,0.590088,0.53729
975,cumulative_oil_co2_lag2_pct_change,0.713767,0.644813,0.652289,0.855587
972,cumulative_cement_co2_lag3_pct_change,0.712606,0.626336,0.761688,0.097306
949,cumulative_co2_lag2_pct_change,0.712479,0.538411,0.482149,0.649232
940,cumulative_co2_lag1_pct_change,0.711284,0.526344,0.533897,0.589802
977,cumulative_oil_co2_lag3_pct_change,0.707767,0.661291,0.666847,0.845474
971,cumulative_cement_co2_lag2_pct_change,0.70689,0.607438,0.775187,0.046597


In [73]:
# Save the 3-country ranking as a Markdown table under a short header.
ranking_filepath = os.path.join(save_dir, 'overall_3_countries_ranking.md')
with open(ranking_filepath, 'w') as f:
    f.write(
        "# Overall Feature Ranking\n\n"
        "Features ranked by mean absolute correlation across 3 Countries\n\n"
    )
    f.write(ranking_3_df.to_markdown(index=False))

In [74]:
# Save the G7 + 3 ranking as a Markdown table under a short header.
ranking_filepath = os.path.join(save_dir, 'overall_g7_countries_ranking.md')
with open(ranking_filepath, 'w') as f:
    f.write(
        "# Overall Feature Ranking\n\n"
        "Features ranked by mean absolute correlation across 3 Countries and G7\n\n"
    )
    f.write(ranking_g7_df.to_markdown(index=False))

In [75]:
# Save the G20 ranking as a Markdown table under a short header.
ranking_filepath = os.path.join(save_dir, 'overall_g20_countries_ranking.md')
with open(ranking_filepath, 'w') as f:
    f.write(
        "# Overall Feature Ranking\n\n"
        "Features ranked by mean absolute correlation across G20 Countries\n\n"
    )
    f.write(ranking_g20_df.to_markdown(index=False))

Best 7 features from the selected 3 countries:

cumulative_luc_co2, cumulative_oil_co2, population, cumulative_co2, electricity_demand, share_of_temperature_change_from_ghg, share_global_cumulative_cement_co2



Best 6 features from G7 + selected 3 countries:

cumulative_oil_co2, cumulative_co2, cumulative_cement_co2, cumulative_co2_including_luc, cumulative_gas_co2, cumulative_luc_co2

Best 5 features from G20 countries:

cumulative_oil_co2, cumulative_co2, cumulative_cement_co2, population, cumulative_gas_co2