# Step 3-1 Feature Selection

### Necessary imports

In [None]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

import os
import warnings

warnings.filterwarnings("ignore")

### Config

In [None]:
TARGET_VARIABLES = 'co2'
MAX_LAGS = 4
COLS_TO_EXCLUDE = ['co2', 'country', 'year', 'iso_code']
MIN_DATA_COVERAGE = 0.80
SELECTED_COUNTRIES = ['United States', 'China', 'India']
G7_COUNTRIES = ['United States', 'China', 'India', 'Canada', 'France', 'Germany', 'Italy', 'Japan', 'United Kingdom']
G20_COUNTRIES = [
    'United States', 'China', 'Japan', 'Germany', 
    'United Kingdom', 'France', 'Italy', 'Canada',
    'Brazil', 'Russia', 'India', 'Australia', 
    'Mexico', 'Indonesia', 'Turkey', 'Saudi Arabia',
    'South Africa', 'Argentina', 'South Korea'
]

### Data load

In [None]:
def load_data(save_dir='data'):
    data_files = {
        'all_data_df': os.path.join(save_dir, 'all_data_df.csv'),
        'lag_df': os.path.join(save_dir, 'lag_df.csv'),
        'g20_lag_df': os.path.join(save_dir, 'g20_lag_df.csv'),
        'lag_three_sel_1969_df': os.path.join(save_dir, 'lag_three_sel_1969_df.csv')
    }

    dfs = {}
    for name, filepath in data_files.items():
        if os.path.exists(filepath):
            dfs[name] = pd.read_csv(filepath)
            print(f"Loaded {name}: {dfs[name].shape}")
        else:
            print(f"{filepath} not found")
    
    return dfs

In [None]:
data = load_data()
all_data_df = data['all_data_df']

lag_df = data['lag_df']
all_lag_1969_df = lag_df[lag_df['year'] >= 1969].copy()
all_lag_1969_df = all_lag_1969_df[all_lag_1969_df['year'] < 2023]

g20_lag_df = data['g20_lag_df']
g20_lag_1969_df = g20_lag_df[g20_lag_df['year'] >= 1969].copy()
g20_lag_1969_df = g20_lag_1969_df[g20_lag_1969_df['year'] < 2023]

lag_three_sel_1969_df = data['lag_three_sel_1969_df']

In [None]:
# Config of names of all countries in the dataset
ALL_COUNTRIES = []

for country in all_lag_1969_df['country'].unique():
    ALL_COUNTRIES.append(country)

['ASEAN (Ember)',
 'Afghanistan',
 'Africa',
 'Africa (EI)',
 'Africa (EIA)',
 'Africa (Ember)',
 'Africa (GCP)',
 'Africa (Shift)',
 'Albania',
 'Algeria',
 'American Samoa',
 'Andorra',
 'Angola',
 'Anguilla',
 'Antarctica',
 'Antigua and Barbuda',
 'Argentina',
 'Armenia',
 'Aruba',
 'Asia',
 'Asia & Oceania (EIA)',
 'Asia (Ember)',
 'Asia (GCP)',
 'Asia (excl. China and India)',
 'Asia Pacific (EI)',
 'Asia and Oceania (Shift)',
 'Australia',
 'Australia and New Zealand (EIA)',
 'Austria',
 'Azerbaijan',
 'Bahamas',
 'Bahrain',
 'Bangladesh',
 'Barbados',
 'Belarus',
 'Belgium',
 'Belize',
 'Benin',
 'Bermuda',
 'Bhutan',
 'Bolivia',
 'Bonaire Sint Eustatius and Saba',
 'Bosnia and Herzegovina',
 'Botswana',
 'Brazil',
 'British Virgin Islands',
 'Brunei',
 'Bulgaria',
 'Burkina Faso',
 'Burundi',
 'CIS (EI)',
 'Cambodia',
 'Cameroon',
 'Canada',
 'Cape Verde',
 'Cayman Islands',
 'Central & South America (EIA)',
 'Central African Republic',
 'Central America (EI)',
 'Central Ameri

### Filtering features with data coverage

In [None]:
def calculate_data_coverage(df, countries, exclude_cols):
    numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()
    feature_cols = [col for col in numeric_cols if col not in exclude_cols]

    coverage_results = {}

    for country in countries:
        country_data = df[df['country'] == country]
        total_rows = len(country_data)

        coverage = {}
        for feature in feature_cols:
            non_missing = country_data[feature].notna().sum()
            coverage_pct = non_missing / total_rows
            coverage[feature] = {
                'non_missing': non_missing,
                'total': total_rows,
                'coverage_pct': coverage_pct
            }

        coverage_results[country] = coverage

    return coverage_results

In [None]:
def filter_by_coverage(coverage_results, min_coverage=0.80):
    all_features = set()
    for country_coverage in coverage_results.values():
        all_features.update(country_coverage.keys())

    valid_features = []
    dropped_features = []

    for feature in all_features:
        meets_threshold = True

        for country, coverage in coverage_results.items():
            if feature in coverage:
                coverage_pct = coverage[feature]['coverage_pct']
                if coverage_pct < min_coverage:
                    meets_threshold = False
                    break
        
        if meets_threshold:
            valid_features.append(feature)
        else:
            dropped_features.append(feature)
                
    return sorted(valid_features), sorted(dropped_features)

In [None]:
coverage_3_countries = calculate_data_coverage(
    lag_three_sel_1969_df, 
    SELECTED_COUNTRIES, 
    COLS_TO_EXCLUDE
)

valid_features_3, dropped_features_3 = filter_by_coverage(
    coverage_3_countries, 
    MIN_DATA_COVERAGE
)

print(f"Total features before filtering: {len(valid_features_3) + len(dropped_features_3)}")
print(f"Valid features (>= {MIN_DATA_COVERAGE*100:.0f}% coverage): {len(valid_features_3)}")
print(f"Dropped features (< {MIN_DATA_COVERAGE*100:.0f}% coverage): {len(dropped_features_3)}")

In [None]:
coverage_g7 = calculate_data_coverage(
    g20_lag_1969_df, 
    G7_COUNTRIES, 
    COLS_TO_EXCLUDE
)

valid_features_g7, dropped_features_g7 = filter_by_coverage(
    coverage_g7, 
    MIN_DATA_COVERAGE
)

print(f"Total features before filtering: {len(valid_features_g7) + len(dropped_features_g7)}")
print(f"Valid features (>= {MIN_DATA_COVERAGE*100:.0f}% coverage): {len(valid_features_g7)}")
print(f"Dropped features (< {MIN_DATA_COVERAGE*100:.0f}% coverage): {len(dropped_features_g7)}")

In [None]:
coverage_g20 = calculate_data_coverage(
    g20_lag_1969_df, 
    G20_COUNTRIES, 
    COLS_TO_EXCLUDE
)

valid_features_g20, dropped_features_g20 = filter_by_coverage(
    coverage_g20, 
    MIN_DATA_COVERAGE
)

print(f"Total features before filtering: {len(valid_features_g20) + len(dropped_features_g20)}")
print(f"Valid features (>= {MIN_DATA_COVERAGE*100:.0f}% coverage): {len(valid_features_g20)}")
print(f"Dropped features (< {MIN_DATA_COVERAGE*100:.0f}% coverage): {len(dropped_features_g20)}")


In [None]:
coverage_all = calculate_data_coverage(
    all_lag_1969_df, 
    G20_COUNTRIES, 
    COLS_TO_EXCLUDE
)

valid_features_all, dropped_features_all = filter_by_coverage(
    coverage_all, 
    MIN_DATA_COVERAGE
)

print(f"Total features before filtering: {len(valid_features_all) + len(dropped_features_all)}")
print(f"Valid features (>= {MIN_DATA_COVERAGE*100:.0f}% coverage): {len(valid_features_all)}")
print(f"Dropped features (< {MIN_DATA_COVERAGE*100:.0f}% coverage): {len(dropped_features_all)}")

### Filtering countries with data coverage

In [32]:
def filter_countries_by_coverage(df, countries, valid_features, min_coverage=0.80):
    valid_countries = []
    dropped_countries = []
    country_stats = {}

    for country in countries:
        country_data = df[df['country'] == country]
        total_rows = len(country_data)

        if total_rows == 0:
            dropped_countries.append(country)
            continue

        total_possible = len(valid_features) * total_rows

        non_missing_total = 0
        for feature in valid_features:
            if feature in country_data.columns:
                non_missing = country_data[feature].notna().sum()
                non_missing_total += non_missing

        # Calculating data coverage in pct
        data_avail = non_missing_total / total_possible if total_possible > 0 else 0

        country_stats[country] = {
            'total_possible_points': total_possible,
            'non_missing_points': non_missing_total,
            'data_availability': data_avail
        }

        # Filter
        if data_avail >= min_coverage:
            valid_countries.append(country)
        else:
            dropped_countries.append(country)

    return sorted(valid_countries), sorted(dropped_countries), country_stats

In [33]:
valid_g20_countries, dropped_g20_countries, g20_country_stats = filter_countries_by_coverage(
    g20_lag_1969_df,
    G20_COUNTRIES,
    valid_features_g20,
    MIN_DATA_COVERAGE
)

print(f"Valid countries (>= {MIN_DATA_COVERAGE*100:.0f}% data): {len(valid_g20_countries)}")
print(f"Dropped countries (< {MIN_DATA_COVERAGE*100:.0f}% data): {len(dropped_g20_countries)}")
print(f"\nValid G20 countries: {valid_g20_countries}")
print(f"\nDropped G20 countries: {dropped_g20_countries}")

Valid countries (>= 80% data): 19
Dropped countries (< 80% data): 0

Valid G20 countries: ['Argentina', 'Australia', 'Brazil', 'Canada', 'China', 'France', 'Germany', 'India', 'Indonesia', 'Italy', 'Japan', 'Mexico', 'Russia', 'Saudi Arabia', 'South Africa', 'South Korea', 'Turkey', 'United Kingdom', 'United States']

Dropped G20 countries: []


In [36]:
valid_all_countries, dropped_all_countries, all_country_stats = filter_countries_by_coverage(
    all_lag_1969_df,
    ALL_COUNTRIES,
    valid_features_all,
    MIN_DATA_COVERAGE
)

print(f"Valid countries (>= {MIN_DATA_COVERAGE*100:.0f}% data): {len(valid_features_all)}")
print(f"Dropped countries (< {MIN_DATA_COVERAGE*100:.0f}% data): {len(dropped_all_countries)}")
print(f"\nValid G20 countries: {valid_all_countries}")
print(f"\nDropped G20 countries: {dropped_all_countries}")

Valid countries (>= 80% data): 229
Dropped countries (< 80% data): 148

Valid G20 countries: ['Afghanistan', 'Africa', 'Albania', 'Algeria', 'Angola', 'Antigua and Barbuda', 'Argentina', 'Armenia', 'Asia', 'Asia (excl. China and India)', 'Australia', 'Austria', 'Azerbaijan', 'Bahamas', 'Bahrain', 'Bangladesh', 'Barbados', 'Belarus', 'Belgium', 'Belize', 'Benin', 'Bhutan', 'Bolivia', 'Bosnia and Herzegovina', 'Botswana', 'Brazil', 'Brunei', 'Bulgaria', 'Burkina Faso', 'Burundi', 'Cambodia', 'Cameroon', 'Canada', 'Cape Verde', 'Central African Republic', 'Chad', 'Chile', 'China', 'Colombia', 'Comoros', 'Congo', 'Cook Islands', 'Costa Rica', "Cote d'Ivoire", 'Croatia', 'Cuba', 'Cyprus', 'Czechia', 'Democratic Republic of Congo', 'Denmark', 'Djibouti', 'Dominica', 'Dominican Republic', 'Ecuador', 'Egypt', 'El Salvador', 'Equatorial Guinea', 'Estonia', 'Eswatini', 'Ethiopia', 'Europe', 'Europe (excl. EU-27)', 'Europe (excl. EU-28)', 'European Union (27)', 'European Union (28)', 'Fiji', 'Fin

In [None]:
g20_filtered_df = g20_lag_1969_df[g20_lag_1969_df['country'].isin(valid_g20_countries)].copy()
all_filtered_df = all_lag_1969_df[all_lag_1969_df['country'].isin(valid_all_countries)].copy()

### Interpolation Forward
Forward interpolating method is conducted to deal with its missing data

In [None]:
def interpolate_forward(df, valid_features, countries):
    df_interpolated = df.copy()

    for country in countries:
        country_mask = df_interpolated['country'] == country

        for feature in valid_features:
            if feature in df_interpolated.columns:
                # Forward fill missing values for the corresponding country and feature
                df_interpolated.loc[country_mask, feature] = df_interpolated.loc[country_mask, feature].ffill()

    return df_interpolated

In [None]:
three_1969_interpolated_df = interpolate_forward(lag_three_sel_1969_df, valid_features_3, SELECTED_COUNTRIES)
g7_1969_interpolated_df = interpolate_forward(g20_lag_1969_df, valid_features_g7, G7_COUNTRIES)
g20_1969_interpolated_df = interpolate_forward(g20_filtered_df, valid_features_g20, valid_g20_countries)
all_1969_interpolated_df = interpolate_forward(all_filtered_df, valid_features_all, valid_all_countries)

### Percentage Change Normalisation

In [None]:
def calculate_pct_change(df, features, max_lags=MAX_LAGS):
    df_copy = df.copy()
    df_copy = df_copy.sort_values(['country', 'year']).reset_index(drop=True)
    pct_change_cols = []

    for feature in features:
        if feature not in df_copy.columns:
            continue

        # Pct change on current values
        lag1_col = f"{feature}_lag1"
        if lag1_col in df_copy.columns:
            df_copy[f"{feature}_pct_change"] = ((df_copy[feature] - df_copy[lag1_col]) / df_copy[lag1_col] * 100)
            pct_change_cols.append(f"{feature}_pct_change")

        # Pct change on lagged values
        for lag in range(1, max_lags):
            lag_col = f"{feature}_lag{lag}"
            prev_lag_col = f"{feature}_lag{lag+1}"
            
            if lag_col in df_copy.columns and prev_lag_col in df_copy.columns:
                df_copy[f"{lag_col}_pct_change"] = ((df_copy[lag_col] - df_copy[prev_lag_col]) / df_copy[prev_lag_col] * 100)
                pct_change_cols.append(f"{lag_col}_pct_change")

        # Lag4 for the first row = 0, then shift lag3_pct by country
        last_lag_col = f"{feature}_lag{max_lags}"
        lag3_pct_col = f"{feature}_lag{max_lags-1}_pct_change"

        if last_lag_col in df_copy.columns and lag3_pct_col in df_copy.columns:
            df_copy[f"{last_lag_col}_pct_change"] = df_copy.groupby('country')[lag3_pct_col].shift(1).fillna(0)
            pct_change_cols.append(f"{last_lag_col}_pct_change")
            
        df_copy = df_copy.replace([np.inf, -np.inf], np.nan)
    
    return df_copy, pct_change_cols

In [None]:
df_pct_change, pct_change_cols = calculate_pct_change(lag_three_sel_1969_df, valid_features_3, MAX_LAGS)
g7_df_pct_change, g7_pct_change_cols = calculate_pct_change(g20_lag_1969_df, valid_features_g7, MAX_LAGS)
g20_df_pct_change, g20_pct_change_cols = calculate_pct_change(g20_lag_1969_df, valid_features_g20, MAX_LAGS)

df_selected = df_pct_change[df_pct_change['country'].isin(SELECTED_COUNTRIES)].copy()
g7_df_selected = g7_df_pct_change[g7_df_pct_change['country'].isin(G7_COUNTRIES)].copy()
g20_df_selected = g20_df_pct_change[g20_df_pct_change['country'].isin(G20_COUNTRIES)].copy()

In [None]:
def calculate_correlations(df, target, pct_change_cols, countries):
    country_corr = {}

    for country in countries:
        country_data = df[df['country'] == country].copy()

        corr_cols = [target] + [col for col in pct_change_cols if col in country_data.columns]

        if target in country_data.columns and len(corr_cols) > 1:
            # Calculate correlations
            correlations = country_data[corr_cols].corr()[target].drop(target)
            correlations = correlations.dropna().sort_values(ascending=False)
            country_corr[country] = correlations
        else:
            print(f"Error of {country}")

    return country_corr

In [None]:
country_correlations = calculate_correlations(df_selected, TARGET_VARIABLES, pct_change_cols, SELECTED_COUNTRIES)
g7_correlations = calculate_correlations(g7_df_selected, TARGET_VARIABLES, g7_pct_change_cols, G7_COUNTRIES)
g20_correlations = calculate_correlations(g20_df_selected, TARGET_VARIABLES, g20_pct_change_cols, G20_COUNTRIES)

In [None]:
# Summary for 3 countries
for country, corr in country_correlations.items():
    print(f"{country}: {len(corr)} features")

In [None]:
# Summary for 3 countries + G7
for country, corr in g7_correlations.items():
    print(f"{country}: {len(corr)} features")

In [None]:
# Summary for G20
for country, corr in g20_correlations.items():
    print(f"{country}: {len(corr)} features")

In [None]:
save_dir = 'data/03_01_results'
os.makedirs(save_dir, exist_ok=True)

md_filepath = os.path.join(save_dir, 'correlation_by_3_countries.md')
with open(md_filepath, 'w') as f:
    f.write("# Correlation Analysis with Percentage Change Normalised Features for 3 Countries\n\n")
    f.write("---\n\n")
    
    for country in SELECTED_COUNTRIES:
        f.write(f"## {country}\n\n")
        f.write(f"### Correlation with {TARGET_VARIABLES}\n\n")
        
        corr_df = country_correlations[country].to_frame(name='Correlation')
        corr_df.index.name = 'Feature'
        f.write(corr_df.to_markdown())
        f.write("\n\n---\n\n")

In [None]:
md_filepath = os.path.join(save_dir, 'correlation_by_g7_and_3.md')
with open(md_filepath, 'w') as f:
    f.write("# Correlation Analysis with Percentage Change Normalised Features for G7 + 3 Countries\n\n")
    f.write("---\n\n")
    
    for country in G7_COUNTRIES:
        f.write(f"## {country}\n\n")
        f.write(f"### Correlation with {TARGET_VARIABLES}\n\n")
        
        corr_df = g7_correlations[country].to_frame(name='Correlation')
        corr_df.index.name = 'Feature'
        f.write(corr_df.to_markdown())
        f.write("\n\n---\n\n")

In [None]:
md_filepath = os.path.join(save_dir, 'correlation_by_g20_countries.md')
with open(md_filepath, 'w') as f:
    f.write("# Correlation Analysis with Percentage Change Normalised Features for G20 Countries\n\n")
    f.write("---\n\n")
    
    for country in G20_COUNTRIES:
        f.write(f"## {country}\n\n")
        f.write(f"### Correlation with {TARGET_VARIABLES}\n\n")
        
        corr_df = g20_correlations[country].to_frame(name='Correlation')
        corr_df.index.name = 'Feature'
        f.write(corr_df.to_markdown())
        f.write("\n\n---\n\n")

In [None]:
def create_overall_ranking(country_correlations):
    all_features = {}
    
    for country, corr in country_correlations.items():
        for feature, value in corr.items():
            if feature not in all_features:
                all_features[feature] = {}
            all_features[feature][country] = abs(value)
    
    # Calculate mean absolute correlation
    ranking_data = []
    for feature, country_values in all_features.items():
        mean_abs_corr = np.mean(list(country_values.values()))
        ranking_data.append({
            'Feature': feature,
            'Mean_Abs_Corr': mean_abs_corr,
            'US_Corr': country_values.get('United States', np.nan),
            'China_Corr': country_values.get('China', np.nan),
            'India_Corr': country_values.get('India', np.nan)
        })
    
    ranking_df = pd.DataFrame(ranking_data)
    ranking_df = ranking_df.sort_values('Mean_Abs_Corr', ascending=False)
    
    return ranking_df

In [None]:
ranking_3_df = create_overall_ranking(country_correlations)
ranking_3_df.head(40)

In [None]:
def create_overall_g7_ranking(g7_correlations):
    all_features = {}
    
    for country, corr in g7_correlations.items():
        for feature, value in corr.items():
            if feature not in all_features:
                all_features[feature] = {}
            all_features[feature][country] = abs(value)
    
    # Calculate mean absolute correlation
    ranking_data = []
    for feature, country_values in all_features.items():
        mean_abs_corr = np.mean(list(country_values.values()))
        ranking_data.append({
            'Feature': feature,
            'Mean_Abs_Corr': mean_abs_corr,
            'US_Corr': country_values.get('United States', np.nan),
            'China_Corr': country_values.get('China', np.nan),
            'India_Corr': country_values.get('India', np.nan),
            'Canada_Corr': country_values.get('Canada', np.nan),
            'France_Corr': country_values.get('France', np.nan),
            'Germany_Corr': country_values.get('Germany', np.nan),
            'Italy_Corr': country_values.get('Italy', np.nan),
            'Japan_Corr': country_values.get('Japan', np.nan),
            'UK_Corr': country_values.get('United Kingdom', np.nan)
        })
    
    ranking_df = pd.DataFrame(ranking_data)
    ranking_df = ranking_df.sort_values('Mean_Abs_Corr', ascending=False)
    
    return ranking_df

In [None]:
ranking_g7_df = create_overall_g7_ranking(g7_correlations)
ranking_g7_df.head(40)

In [None]:
def create_overall_g20_ranking(g20_correlations):
    all_features = {}
    
    for country, corr in g20_correlations.items():
        for feature, value in corr.items():
            if feature not in all_features:
                all_features[feature] = {}
            all_features[feature][country] = abs(value)
    
    # Calculate mean absolute correlation
    ranking_data = []
    for feature, country_values in all_features.items():
        mean_abs_corr = np.mean(list(country_values.values()))
        ranking_data.append({
            'Feature': feature,
            'Mean_Abs_Corr': mean_abs_corr,
            'US_Corr': country_values.get('United States', np.nan),
            'China_Corr': country_values.get('China', np.nan),
            'India_Corr': country_values.get('India', np.nan)
        })
    
    ranking_df = pd.DataFrame(ranking_data)
    ranking_df = ranking_df.sort_values('Mean_Abs_Corr', ascending=False)
    
    return ranking_df

In [None]:
ranking_g20_df = create_overall_g20_ranking(g20_correlations)
ranking_g20_df.head(40)

In [None]:
ranking_filepath = os.path.join(save_dir, 'overall_3_countries_ranking.md')
with open(ranking_filepath, 'w') as f:
    f.write("# Overall Feature Ranking\n\n")
    f.write("Features ranked by mean absolute correlation across 3 Countries\n\n")
    f.write(ranking_3_df.to_markdown(index=False))

In [None]:
ranking_filepath = os.path.join(save_dir, 'overall_g7_countries_ranking.md')
with open(ranking_filepath, 'w') as f:
    f.write("# Overall Feature Ranking\n\n")
    f.write("Features ranked by mean absolute correlation across 3 Countries and G7\n\n")
    f.write(ranking_g7_df.to_markdown(index=False))

In [None]:
ranking_filepath = os.path.join(save_dir, 'overall_g20_countries_ranking.md')
with open(ranking_filepath, 'w') as f:
    f.write("# Overall Feature Ranking\n\n")
    f.write("Features ranked by mean absolute correlation across G20 Countries\n\n")
    f.write(ranking_g20_df.to_markdown(index=False))

Best 6 features from the selected 3 countries:

cumulative_luc_co2, cumulative_oil_co2, population, cumulative_co2, share_of_temperature_change_from_ghg, share_global_cumulative_cement_co2



Best 6 features from G7 + selected 3 countries:

cumulative_oil_co2, cumulative_co2, cumulative_cement_co2, cumulative_co2_including_luc, cumulative_gas_co2, cumulative_luc_co2

Best 5 features from G20 countries:

cumulative_oil_co2, cumulative_oil_co2, cumulative_cement_co2, population, cumulative_gas_co2