# Step 3-1 Feature Selection

### Necessary imports

In [31]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

import os
import warnings

warnings.filterwarnings("ignore")

### Config

In [32]:
TARGET_VARIABLES = 'co2'
MAX_LAGS = 4
COLS_TO_EXCLUDE = ['co2', 'country', 'year', 'iso_code']
MIN_DATA_COVERAGE = 0.80
TEST_SIZE = 9
SELECTED_COUNTRIES = ['United States', 'China', 'India']
G7_COUNTRIES = ['United States', 'China', 'India', 'Canada', 'France', 'Germany', 'Italy', 'Japan', 'United Kingdom']
G20_COUNTRIES = [
    'United States', 'China', 'Japan', 'Germany', 
    'United Kingdom', 'France', 'Italy', 'Canada',
    'Brazil', 'Russia', 'India', 'Australia', 
    'Mexico', 'Indonesia', 'Turkey', 'Saudi Arabia',
    'South Africa', 'Argentina', 'South Korea'
]

### Data load

In [33]:
def load_data(save_dir='data'):
    data_files = {
        'all_data_df': os.path.join(save_dir, 'all_data_df.csv'),
        'lag_df': os.path.join(save_dir, 'lag_df.csv'),
        'g20_lag_df': os.path.join(save_dir, 'g20_lag_df.csv'),
        'lag_three_sel_1969_df': os.path.join(save_dir, 'lag_three_sel_1969_df.csv')
    }

    dfs = {}
    for name, filepath in data_files.items():
        if os.path.exists(filepath):
            dfs[name] = pd.read_csv(filepath)
            print(f"Loaded {name}: {dfs[name].shape}")
        else:
            print(f"{filepath} not found")
    
    return dfs

In [34]:
data = load_data()
all_data_df = data['all_data_df']

lag_df = data['lag_df']
all_lag_1969_df = lag_df[lag_df['year'] >= 1969].copy()
all_lag_1969_df = all_lag_1969_df[all_lag_1969_df['year'] < 2023]

g20_lag_df = data['g20_lag_df']
g20_lag_1969_df = g20_lag_df[g20_lag_df['year'] >= 1969].copy()
g20_lag_1969_df = g20_lag_1969_df[g20_lag_1969_df['year'] < 2023]

lag_three_sel_1969_df = data['lag_three_sel_1969_df']

Loaded all_data_df: (55529, 200)
Loaded lag_df: (55529, 992)
Loaded g20_lag_df: (3744, 992)
Loaded lag_three_sel_1969_df: (162, 992)


In [35]:
# Config of names of all countries in the dataset
ALL_COUNTRIES = []

for country in all_lag_1969_df['country'].unique():
    ALL_COUNTRIES.append(country)

### Filtering features with data coverage

In [36]:
def calculate_data_coverage(df, countries, exclude_cols):
    numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()
    feature_cols = [col for col in numeric_cols if col not in exclude_cols]

    coverage_results = {}

    for country in countries:
        country_data = df[df['country'] == country]
        total_rows = len(country_data)

        coverage = {}
        for feature in feature_cols:
            non_missing = country_data[feature].notna().sum()
            coverage_pct = non_missing / total_rows
            coverage[feature] = {
                'non_missing': non_missing,
                'total': total_rows,
                'coverage_pct': coverage_pct
            }

        coverage_results[country] = coverage

    return coverage_results

In [37]:
def filter_by_coverage(coverage_results, min_coverage=0.80):
    all_features = set()
    for country_coverage in coverage_results.values():
        all_features.update(country_coverage.keys())

    valid_features = []
    dropped_features = []

    for feature in all_features:
        meets_threshold = True

        for country, coverage in coverage_results.items():
            if feature in coverage:
                coverage_pct = coverage[feature]['coverage_pct']
                if coverage_pct < min_coverage:
                    meets_threshold = False
                    break
        
        if meets_threshold:
            valid_features.append(feature)
        else:
            dropped_features.append(feature)
                
    return sorted(valid_features), sorted(dropped_features)

In [38]:
coverage_3_countries = calculate_data_coverage(
    lag_three_sel_1969_df, 
    SELECTED_COUNTRIES, 
    COLS_TO_EXCLUDE
)

valid_features_3, dropped_features_3 = filter_by_coverage(
    coverage_3_countries, 
    MIN_DATA_COVERAGE
)

print(f"Total features before filtering: {len(valid_features_3) + len(dropped_features_3)}")
print(f"Valid features (>= {MIN_DATA_COVERAGE*100:.0f}% coverage): {len(valid_features_3)}")
print(f"Dropped features (< {MIN_DATA_COVERAGE*100:.0f}% coverage): {len(dropped_features_3)}")

Total features before filtering: 989
Valid features (>= 80% coverage): 744
Dropped features (< 80% coverage): 245


In [39]:
coverage_g7 = calculate_data_coverage(
    g20_lag_1969_df, 
    G7_COUNTRIES, 
    COLS_TO_EXCLUDE
)

valid_features_g7, dropped_features_g7 = filter_by_coverage(
    coverage_g7, 
    MIN_DATA_COVERAGE
)

print(f"Total features before filtering: {len(valid_features_g7) + len(dropped_features_g7)}")
print(f"Valid features (>= {MIN_DATA_COVERAGE*100:.0f}% coverage): {len(valid_features_g7)}")
print(f"Dropped features (< {MIN_DATA_COVERAGE*100:.0f}% coverage): {len(dropped_features_g7)}")

Total features before filtering: 989
Valid features (>= 80% coverage): 714
Dropped features (< 80% coverage): 275


In [40]:
coverage_g20 = calculate_data_coverage(
    g20_lag_1969_df, 
    G20_COUNTRIES, 
    COLS_TO_EXCLUDE
)

valid_features_g20, dropped_features_g20 = filter_by_coverage(
    coverage_g20, 
    MIN_DATA_COVERAGE
)

print(f"Total features before filtering: {len(valid_features_g20) + len(dropped_features_g20)}")
print(f"Valid features (>= {MIN_DATA_COVERAGE*100:.0f}% coverage): {len(valid_features_g20)}")
print(f"Dropped features (< {MIN_DATA_COVERAGE*100:.0f}% coverage): {len(dropped_features_g20)}")


Total features before filtering: 989
Valid features (>= 80% coverage): 229
Dropped features (< 80% coverage): 760


In [41]:
coverage_all = calculate_data_coverage(
    all_lag_1969_df, 
    G20_COUNTRIES, 
    COLS_TO_EXCLUDE
)

valid_features_all, dropped_features_all = filter_by_coverage(
    coverage_all, 
    MIN_DATA_COVERAGE
)

print(f"Total features before filtering: {len(valid_features_all) + len(dropped_features_all)}")
print(f"Valid features (>= {MIN_DATA_COVERAGE*100:.0f}% coverage): {len(valid_features_all)}")
print(f"Dropped features (< {MIN_DATA_COVERAGE*100:.0f}% coverage): {len(dropped_features_all)}")

Total features before filtering: 989
Valid features (>= 80% coverage): 229
Dropped features (< 80% coverage): 760


### Filtering countries with data coverage

In [42]:
def filter_countries_by_coverage(df, countries, valid_features, min_coverage=0.80):
    valid_countries = []
    dropped_countries = []
    country_stats = {}

    for country in countries:
        country_data = df[df['country'] == country]
        total_rows = len(country_data)

        if total_rows == 0:
            dropped_countries.append(country)
            continue

        total_possible = len(valid_features) * total_rows

        non_missing_total = 0
        for feature in valid_features:
            if feature in country_data.columns:
                non_missing = country_data[feature].notna().sum()
                non_missing_total += non_missing

        # Calculating data coverage in pct
        data_avail = non_missing_total / total_possible if total_possible > 0 else 0

        country_stats[country] = {
            'total_possible_points': total_possible,
            'non_missing_points': non_missing_total,
            'data_availability': data_avail
        }

        # Filter
        if data_avail >= min_coverage:
            valid_countries.append(country)
        else:
            dropped_countries.append(country)

    return sorted(valid_countries), sorted(dropped_countries), country_stats

In [43]:
valid_g20_countries, dropped_g20_countries, g20_country_stats = filter_countries_by_coverage(
    g20_lag_1969_df,
    G20_COUNTRIES,
    valid_features_g20,
    MIN_DATA_COVERAGE
)

print(f"Valid countries (>= {MIN_DATA_COVERAGE*100:.0f}% data): {len(valid_g20_countries)}")
print(f"Dropped countries (< {MIN_DATA_COVERAGE*100:.0f}% data): {len(dropped_g20_countries)}")
print(f"\nValid G20 countries: {valid_g20_countries}")
print(f"\nDropped G20 countries: {dropped_g20_countries}")

Valid countries (>= 80% data): 19
Dropped countries (< 80% data): 0

Valid G20 countries: ['Argentina', 'Australia', 'Brazil', 'Canada', 'China', 'France', 'Germany', 'India', 'Indonesia', 'Italy', 'Japan', 'Mexico', 'Russia', 'Saudi Arabia', 'South Africa', 'South Korea', 'Turkey', 'United Kingdom', 'United States']

Dropped G20 countries: []


In [44]:
valid_all_countries, dropped_all_countries, all_country_stats = filter_countries_by_coverage(
    all_lag_1969_df,
    ALL_COUNTRIES,
    valid_features_all,
    MIN_DATA_COVERAGE
)

print(f"Valid countries (>= {MIN_DATA_COVERAGE*100:.0f}% data): {len(valid_features_all)}")
print(f"Dropped countries (< {MIN_DATA_COVERAGE*100:.0f}% data): {len(dropped_all_countries)}")
print(f"\nValid G20 countries: {valid_all_countries}")
print(f"\nDropped G20 countries: {dropped_all_countries}")

Valid countries (>= 80% data): 229
Dropped countries (< 80% data): 148

Valid G20 countries: ['Afghanistan', 'Africa', 'Albania', 'Algeria', 'Angola', 'Antigua and Barbuda', 'Argentina', 'Armenia', 'Asia', 'Asia (excl. China and India)', 'Australia', 'Austria', 'Azerbaijan', 'Bahamas', 'Bahrain', 'Bangladesh', 'Barbados', 'Belarus', 'Belgium', 'Belize', 'Benin', 'Bhutan', 'Bolivia', 'Bosnia and Herzegovina', 'Botswana', 'Brazil', 'Brunei', 'Bulgaria', 'Burkina Faso', 'Burundi', 'Cambodia', 'Cameroon', 'Canada', 'Cape Verde', 'Central African Republic', 'Chad', 'Chile', 'China', 'Colombia', 'Comoros', 'Congo', 'Cook Islands', 'Costa Rica', "Cote d'Ivoire", 'Croatia', 'Cuba', 'Cyprus', 'Czechia', 'Democratic Republic of Congo', 'Denmark', 'Djibouti', 'Dominica', 'Dominican Republic', 'Ecuador', 'Egypt', 'El Salvador', 'Equatorial Guinea', 'Estonia', 'Eswatini', 'Ethiopia', 'Europe', 'Europe (excl. EU-27)', 'Europe (excl. EU-28)', 'European Union (27)', 'European Union (28)', 'Fiji', 'Fin

In [45]:
g20_filtered_df = g20_lag_1969_df[g20_lag_1969_df['country'].isin(valid_g20_countries)].copy()
all_filtered_df = all_lag_1969_df[all_lag_1969_df['country'].isin(valid_all_countries)].copy()

### Interpolation Forward
Forward interpolating method is conducted to deal with its missing data

In [46]:
def interpolate_forward(df, valid_features, countries):
    df_interpolated = df.copy()

    for country in countries:
        country_mask = df_interpolated['country'] == country

        for feature in valid_features:
            if feature in df_interpolated.columns:
                # Forward fill missing values for the corresponding country and feature
                df_interpolated.loc[country_mask, feature] = df_interpolated.loc[country_mask, feature].ffill()

    return df_interpolated

In [47]:
three_interpolated_df = interpolate_forward(lag_three_sel_1969_df, valid_features_3, SELECTED_COUNTRIES)
g7_interpolated_df = interpolate_forward(g20_lag_1969_df, valid_features_g7, G7_COUNTRIES)
g20_interpolated_df = interpolate_forward(g20_filtered_df, valid_features_g20, valid_g20_countries)
all_interpolated_df = interpolate_forward(all_filtered_df, valid_features_all, valid_all_countries)

save_dir = 'data'
three_interpolated_df.to_csv(os.path.join(save_dir, "three_interpolated_df.csv"), index=False)
g7_interpolated_df.to_csv(os.path.join(save_dir, "g7_interpolated_df.csv"), index=False)
g20_interpolated_df.to_csv(os.path.join(save_dir, "g20_interpolated_df.csv"), index=False)
all_interpolated_df.to_csv(os.path.join(save_dir, "all_interpolated_df.csv"), index=False)

### Data Split

In [48]:
def tts_by_year(df, test_size=9):
    train_data = {}
    test_data = {}

    for country in df['country'].unique():
        country_data = df[df['country'] == country].sort_values('year')

        split_idx = len(country_data) - test_size
        train_data[country] = country_data.iloc[:split_idx]
        test_data[country] = country_data.iloc[split_idx:]

    train_df = pd.concat(train_data.values(), ignore_index=True)
    test_df = pd.concat(test_data.values(), ignore_index=True)

    return train_df, test_df

In [49]:
three_train_df, three_test_df = tts_by_year(three_interpolated_df, test_size=TEST_SIZE)
g7_train_df, g7_test_df = tts_by_year(g7_interpolated_df, test_size=TEST_SIZE)
g20_train_df, g20_test_df = tts_by_year(g20_interpolated_df, test_size=TEST_SIZE)
all_train_df, all_test_df = tts_by_year(all_interpolated_df, test_size=TEST_SIZE)

### Percentage Change Normalisation

In [50]:
def calculate_pct_change(df, features, max_lags=MAX_LAGS):
    df_copy = df.copy()
    df_copy = df_copy.sort_values(['country', 'year']).reset_index(drop=True)
    pct_change_cols = []

    for feature in features:
        if feature not in df_copy.columns:
            continue

        # Pct change on current values
        lag1_col = f"{feature}_lag1"
        if lag1_col in df_copy.columns:
            df_copy[f"{feature}_pct_change"] = ((df_copy[feature] - df_copy[lag1_col]) / df_copy[lag1_col] * 100)
            pct_change_cols.append(f"{feature}_pct_change")

        # Pct change on lagged values
        for lag in range(1, max_lags):
            lag_col = f"{feature}_lag{lag}"
            prev_lag_col = f"{feature}_lag{lag+1}"
            
            if lag_col in df_copy.columns and prev_lag_col in df_copy.columns:
                df_copy[f"{lag_col}_pct_change"] = ((df_copy[lag_col] - df_copy[prev_lag_col]) / df_copy[prev_lag_col] * 100)
                pct_change_cols.append(f"{lag_col}_pct_change")

        # Lag4 for the first row = 0, then shift lag3_pct by country
        last_lag_col = f"{feature}_lag{max_lags}"
        lag3_pct_col = f"{feature}_lag{max_lags-1}_pct_change"

        if last_lag_col in df_copy.columns and lag3_pct_col in df_copy.columns:
            df_copy[f"{last_lag_col}_pct_change"] = df_copy.groupby('country')[lag3_pct_col].shift(1).fillna(0)
            pct_change_cols.append(f"{last_lag_col}_pct_change")
            
        df_copy = df_copy.replace([np.inf, -np.inf], np.nan)
    
    return df_copy, pct_change_cols

In [51]:
three_df_pct_change, three_pct_change_cols = calculate_pct_change(three_train_df, valid_features_3, MAX_LAGS)
g7_df_pct_change, g7_pct_change_cols = calculate_pct_change(g7_train_df, valid_features_g7, MAX_LAGS)
g20_df_pct_change, g20_pct_change_cols = calculate_pct_change(g20_train_df, valid_features_g20, MAX_LAGS)
all_df_pct_change, all_pct_change_cols = calculate_pct_change(all_train_df, valid_features_all, MAX_LAGS)

three_df_selected = three_df_pct_change[three_df_pct_change['country'].isin(SELECTED_COUNTRIES)].copy()
g7_df_selected = g7_df_pct_change[g7_df_pct_change['country'].isin(G7_COUNTRIES)].copy()
g20_df_selected = g20_df_pct_change[g20_df_pct_change['country'].isin(valid_g20_countries)].copy()
all_df_selected = all_df_pct_change[all_df_pct_change['country'].isin(valid_all_countries)].copy()

### Calcaulte Correlation

In [52]:
def calculate_correlations(df, target, pct_change_cols, countries):
    country_corr = {}

    for country in countries:
        country_data = df[df['country'] == country].copy()

        corr_cols = [target] + [col for col in pct_change_cols if col in country_data.columns]

        if target in country_data.columns and len(corr_cols) > 1:
            # Calculate correlations
            correlations = country_data[corr_cols].corr()[target].drop(target)
            correlations = correlations.dropna().sort_values(ascending=False)
            country_corr[country] = correlations
        else:
            print(f"Error of {country}")

    return country_corr

In [53]:
three_correlations = calculate_correlations(three_df_selected, TARGET_VARIABLES, three_pct_change_cols, SELECTED_COUNTRIES)
g7_correlations = calculate_correlations(g7_df_selected, TARGET_VARIABLES, g7_pct_change_cols, G7_COUNTRIES)
g20_correlations = calculate_correlations(g20_df_selected, TARGET_VARIABLES, g20_pct_change_cols, valid_g20_countries)
all_correlations = calculate_correlations(all_df_selected, TARGET_VARIABLES, all_pct_change_cols, valid_all_countries)

### Summary for pct_change only

In [54]:
def summary_pct_change_only(correlations, countries):
    all_features = {}

    for country, corr in correlations.items():
        for feature, value in corr.items():
            # Including only its native pct_value, not lags
            if feature.endswith('_pct_change') and '_lag' not in feature:
                if feature not in all_features:
                    all_features[feature] = {}
                all_features[feature][country] = value

    # Summary
    summary_data = []
    for feature, country_values in all_features.items():
        row = {'Feature': feature}
        # Mean abs corr - leftmost column
        abs_corr_values = [abs(v) for v in country_values.values()]
        row['Overall_Abs_Corr'] = np.mean(abs_corr_values)

        # Individual country corr
        for country in countries:
            row[f'{country}_Corr'] = country_values.get(country, np.nan)

        summary_data.append(row)

    # DataFrame and sort by Overall_Abs_Corr
    summary_df = pd.DataFrame(summary_data)
    summary_df = summary_df.sort_values('Overall_Abs_Corr', ascending=False).reset_index(drop=True)

    return summary_df

### Summary for pct_change and its lags

In [55]:
def summary_pct_change_with_lags(correlations, countries):
    all_features = {}

    for country, corr in correlations.items():
        for feature, value in corr.items():
            # Any pct_change columns (with or without _lag)
            if feature.endswith('_pct_change'):
                if feature not in all_features:
                    all_features[feature] = {}
                all_features[feature][country] = value

    # Group and calculate average of the features with its lags
    base_features = {}

    for feature, country_values in all_features.items():
        if '_lag' in feature:
            # 'feature_lag1_pct_change' -> 'feature_pct_change'
            base_name = feature.split('_lag')[0] + '_pct_change'
        else:
            base_name = feature

        if base_name not in base_features:
            base_features[base_name] = {country: [] for country in countries}

        for country in countries:
            if country in country_values:
                base_features[base_name][country].append(country_values[country])

    # Summary
    summary_data = []
    for feature, country_values in base_features.items():
        row = {'Feature': feature}

        country_mean_abs_corrs = []
        for country in countries:
            if country_values[country]:
                abs_corr_values = [abs(corr) for corr in country_values[country]]
                mean_abs_corr = np.mean(abs_corr_values)
                row[f'{country}_Corr'] = mean_abs_corr
                country_mean_abs_corrs.append(mean_abs_corr)
            else:
                row[f'{country}_Corr'] = np.nan

        # Leftmost is Overall
        if country_mean_abs_corrs:
            row['Overall_Abs_Corr'] = np.mean(country_mean_abs_corrs)
        else:
            row['Overall_Abs_Corr'] = np.nan
        
        summary_data.append(row)

    # DataFrame and sort by Overall_Abs_Corr
    summary_df = pd.DataFrame(summary_data)

    cols = ['Feature', 'Overall_Abs_Corr'] + [f'{country}_Corr' for country in countries]
    summary_df = summary_df[cols]

    summary_df = summary_df.sort_values('Overall_Abs_Corr', ascending=False).reset_index(drop=True)

    return summary_df

In [56]:
# Three countries
three_summary_no_lags = summary_pct_change_only(three_correlations, SELECTED_COUNTRIES)
print(three_summary_no_lags.head())

three_summary_with_lags = summary_pct_change_with_lags(three_correlations, SELECTED_COUNTRIES)
print(three_summary_with_lags.head())

                                      Feature  Overall_Abs_Corr  \
0               cumulative_luc_co2_pct_change          0.755616   
1               cumulative_oil_co2_pct_change          0.721510   
2                       population_pct_change          0.657313   
3  share_global_cumulative_oil_co2_pct_change          0.647764   
4     cumulative_co2_including_luc_pct_change          0.602702   

   United States_Corr  China_Corr  India_Corr  
0           -0.827236   -0.807815   -0.631796  
1           -0.737343   -0.608473   -0.818714  
2           -0.247386   -0.757744   -0.966807  
3            0.852783   -0.539500    0.551010  
4           -0.630464    0.214794    0.962849  
                                      Feature  Overall_Abs_Corr  \
0               cumulative_luc_co2_pct_change          0.730085   
1               cumulative_oil_co2_pct_change          0.691325   
2  share_global_cumulative_oil_co2_pct_change          0.666460   
3                       population_pct_ch

In [57]:
# G7 + three countries
g7_summary_no_lags = summary_pct_change_only(g7_correlations, G7_COUNTRIES)
g7_summary_with_lags = summary_pct_change_with_lags(g7_correlations, G7_COUNTRIES)

# G20
g20_summary_no_lags = summary_pct_change_only(g20_correlations, valid_g20_countries)
g20_summary_with_lags = summary_pct_change_with_lags(g20_correlations, valid_g20_countries)

# All valid countries
all_summary_no_lags = summary_pct_change_only(all_correlations, valid_all_countries)
all_summary_with_lags = summary_pct_change_with_lags(all_correlations, valid_all_countries)

### Exclude features in the summary

In [58]:
vars_to_excl = [
    'cumulative_',
    'temperature_',
    '_including_luc'
]

def exclude_vars(summary_df):
    filtered_df = summary_df.copy()

    # Filter out vars to excl
    for var in vars_to_excl:
        filtered_df = filtered_df[
            ~(filtered_df['Feature'].str.contains(var, case=False, na=False))
        ]

    return filtered_df

In [59]:
three_summary_no_lags = exclude_vars(three_summary_no_lags)
print(three_summary_no_lags.head())

three_summary_with_lags = exclude_vars(three_summary_with_lags)
print(three_summary_with_lags.head())

                               Feature  Overall_Abs_Corr  United States_Corr  \
2                population_pct_change          0.657313           -0.247386   
12       solar_share_energy_pct_change          0.498002           -0.279586   
19  biofuel_cons_per_capita_pct_change          0.377764            0.417736   
20      biofuel_consumption_pct_change          0.374478            0.413879   
21    solar_cons_change_twh_pct_change          0.357080            0.172489   

    China_Corr  India_Corr  
2    -0.757744   -0.966807  
12    0.691372    0.523047  
19   -0.562209    0.153348  
20   -0.561442    0.148114  
21    0.650542    0.248208  
                               Feature  Overall_Abs_Corr  United States_Corr  \
3                population_pct_change          0.624549            0.251616   
10       solar_share_energy_pct_change          0.507432            0.381350   
19      biofuel_consumption_pct_change          0.359677            0.281350   
20  biofuel_cons_per_capi

In [60]:
g7_summary_no_lags = exclude_vars(g7_summary_no_lags)
g7_summary_with_lags = exclude_vars(g7_summary_with_lags)

g20_summary_no_lags = exclude_vars(g20_summary_no_lags)
g20_summary_with_lags = exclude_vars(g20_summary_with_lags)

all_summary_no_lags = exclude_vars(all_summary_no_lags)
all_summary_with_lags = exclude_vars(all_summary_with_lags)

### Save summaries as md

In [61]:
def summary_to_md(summary, filepath, title):
    with open(filepath, 'w') as f:
        f.write(f"# {title}\n\n")
        f.write(f"Number of features: {len(summary)}\n\n")
        f.write("## Top 40 Features\n\n")
        f.write(summary.head(40).to_markdown(index=False))

In [62]:
save_dir = 'data/03_01_results'
os.makedirs(save_dir, exist_ok=True)

# Three countries
summary_to_md(three_summary_no_lags, os.path.join(save_dir, 'three_summary_no_lags.md'),
              '3 Countries Summary - pct change only')
summary_to_md(three_summary_with_lags, os.path.join(save_dir, 'three_summary_with_lags.md'),
              '3 Countries Summary - pct change with lags')

# G7 + three countries (EU excluded)
summary_to_md(g7_summary_no_lags, os.path.join(save_dir, 'g7_summary_no_lags.md'),
              'G7 + 3 Countries (EU excluded) Summary - pct change only')
summary_to_md(g7_summary_no_lags, os.path.join(save_dir, 'g7_summary_with_lags.md'),
              'G7 + 3 Countries (EU excluded) Summary - pct change with lags')

# G20 countries (EU excluded)
summary_to_md(g20_summary_no_lags, os.path.join(save_dir, 'g20_summary_no_lags.md'),
              'G20 Countries (EU excluded) Summary - pct change only')
summary_to_md(g20_summary_with_lags, os.path.join(save_dir, 'g20_summary_with_lags.md'),
              'G20 Countries (EU excluded) Summary - pct change with lags')

# All valid countries
summary_to_md(all_summary_no_lags, os.path.join(save_dir, 'all_summary_no_lags.md'),
              'All Valid Countries Summary - pct change only')
summary_to_md(all_summary_no_lags, os.path.join(save_dir, 'all_summary_with_lags.md'),
              'All Valid Countries Summary - pct change with lags')