# Data Preparation

In [1]:
# Necessary imports
import pandas as pd
import numpy as np
import gc
import pickle
import os
import warnings
warnings.filterwarnings("ignore")

# Data load: Our World in Data CO2 and energy tables, read directly from GitHub
CO2_DATA_URL = "https://raw.githubusercontent.com/owid/co2-data/master/owid-co2-data.csv"
ENERGY_DATA_URL = "https://raw.githubusercontent.com/owid/energy-data/refs/heads/master/owid-energy-data.csv"

df_co2 = pd.read_csv(CO2_DATA_URL)
df_energy = pd.read_csv(ENERGY_DATA_URL)

### Data merge

In [2]:
# Column names that appear in both source tables
common_columns = set(df_co2.columns) & set(df_energy.columns)
print(common_columns)

{'primary_energy_consumption', 'iso_code', 'year', 'country', 'population', 'energy_per_gdp', 'energy_per_capita', 'gdp'}


In [3]:
# G20 countries (country-level members only)
g20_countries = [
    'United States', 'China', 'Japan', 'Germany',
    'United Kingdom', 'France', 'Italy', 'Canada',
    'Brazil', 'Russia', 'India', 'Australia',
    'Mexico', 'Indonesia', 'Turkey', 'Saudi Arabia',
    'South Africa', 'Argentina', 'South Korea'
]

# Merge keys
merge_keys = ['country', 'year']

# Column groups are derived from the frames' own column order rather than from
# list(set(...)): iterating a set of strings is not order-stable between
# kernel sessions, which would reshuffle the merged columns on every run.
co2_columns = set(df_co2.columns)
energy_columns = set(df_energy.columns)

# Shared columns without iso_code (iso_code is not kept in the merged result)
common_cols = [col for col in df_co2.columns if col in common_columns and col != 'iso_code']
remain_common_cols = [col for col in common_cols if col not in merge_keys]

# Columns that exist in only one of the two frames
df_co2_unique_cols = [
    col for col in df_co2.columns
    if col not in energy_columns and col not in merge_keys
]
df_energy_unique_cols = [
    col for col in df_energy.columns
    if col not in co2_columns and col not in merge_keys
]

# Shared columns that were excluded from common_cols. By construction this is
# only iso_code, which is dropped from the output, so it is never merged.
overlap_cols = [
    col for col in df_co2.columns
    if col in energy_columns and col not in common_cols and col not in merge_keys
]

# One row per (country, year) in each source
df_co2_clean = df_co2.drop_duplicates(merge_keys)
df_energy_clean = df_energy.drop_duplicates(merge_keys)

col_co2 = remain_common_cols + df_co2_unique_cols
col_energy = df_energy_unique_cols

# Outer merge. The shared columns are carried over from BOTH sides (the energy
# copies get a temporary suffix) so they can be coalesced below; taking them
# from df_co2 alone would leave them empty on rows that only df_energy has.
energy_suffix = '__energy'
merged_df = pd.merge(
    df_co2_clean[merge_keys + col_co2],
    df_energy_clean[merge_keys + col_energy + remain_common_cols],
    on=merge_keys,
    how='outer',
    suffixes=('', energy_suffix)
)

# Shared columns appear once: CO2 value first, energy value as fallback
for col in remain_common_cols:
    merged_df[col] = merged_df[col].combine_first(merged_df[f"{col}{energy_suffix}"])
merged_df = merged_df.drop(columns=[f"{col}{energy_suffix}" for col in remain_common_cols])

# Both inputs were de-duplicated on the keys, so the merge must not fan out
assert not merged_df.duplicated(merge_keys).any(), "duplicate (country, year) rows after merge"

df = merged_df
g20_df = df[df['country'].isin(g20_countries)].copy()

# Clean up (only the names are released; df keeps the merged frame alive)
del df_co2, df_energy, merged_df, df_co2_clean, df_energy_clean
gc.collect()

0

In [4]:
# Distinct country names that survived the G20 filter
pd.unique(g20_df['country'])

array(['Argentina', 'Australia', 'Brazil', 'Canada', 'China', 'France',
       'Germany', 'India', 'Indonesia', 'Italy', 'Japan', 'Mexico',
       'Russia', 'Saudi Arabia', 'South Africa', 'South Korea', 'Turkey',
       'United Kingdom', 'United States'], dtype=object)

### Time lag

In [5]:
# Time lag feature
def time_lag_feature(df, periods=(1, 2, 3, 4)):
    """Return a copy of ``df`` with per-country lagged copies of each numeric column.

    For every numeric column ``col`` other than ``year`` and every ``lag`` in
    ``periods``, a column named ``f"{col}_lag{lag}"`` is added that holds the
    value of ``col`` found ``lag`` rows earlier within the same country.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``country`` column. When a ``year`` column is present
        the rows are sorted by ``['country', 'year']`` before lagging.
    periods : iterable of int
        Row offsets to lag by. A value that is repeated is used once.

    Returns
    -------
    pandas.DataFrame
        The (sorted) copy followed by the lag columns, ordered column by
        column and, within a column, in the order given by ``periods``.
        ``df`` itself is not modified.

    Raises
    ------
    KeyError
        If ``df`` has no ``country`` column.

    Notes
    -----
    Lags count rows, not calendar years: if a country is missing a year, the
    lagged value comes from the previous available row.
    """
    if 'country' not in df.columns:
        raise KeyError("time_lag_feature requires a 'country' column")

    dup_df = df.copy()

    # All numerical columns except year
    feature_columns = [
        col for col in df.select_dtypes(include=[np.number]).columns
        if col != 'year'
    ]

    if 'year' in dup_df.columns:
        dup_df = dup_df.sort_values(['country', 'year'])

    # De-duplicate while keeping the caller's order so no column is emitted twice
    lags = list(dict.fromkeys(periods))
    if not feature_columns or not lags:
        return dup_df

    # One vectorised shift per lag instead of one .loc assignment per
    # country x column x lag, which is slow and fragments the frame.
    grouped = dup_df.groupby('country')[feature_columns]
    lagged = pd.concat(
        [grouped.shift(lag).add_suffix(f"_lag{lag}") for lag in lags],
        axis=1
    )

    # Column-major order: col_lag1, col_lag2, ... for each feature in turn
    lag_names = [f"{col}_lag{lag}" for col in feature_columns for lag in lags]
    lagged = lagged[lag_names]

    # A pre-existing column with a lag name is replaced rather than duplicated
    clashing = [name for name in lag_names if name in dup_df.columns]
    if clashing:
        dup_df = dup_df.drop(columns=clashing)

    return pd.concat([dup_df, lagged], axis=1)

# Numeric feature names (everything numeric except the year key)
numerical_features = [
    name for name in g20_df.select_dtypes(include=[np.number]).columns.tolist()
    if name != 'year'
]

# Keys plus numeric features, then add lags of 1 to 4 rows per country
lag_input_columns = ['country', 'year'] + numerical_features
lag_df = time_lag_feature(g20_df[lag_input_columns].copy(), [1, 2, 3, 4])

In [6]:
# Preview the last three rows of the lagged frame
lag_df.iloc[-3:]

Unnamed: 0,country,year,primary_energy_consumption,population,energy_per_gdp,energy_per_capita,gdp,temperature_change_from_ghg,land_use_change_co2_per_capita,other_co2_per_capita,...,hydro_cons_change_pct_lag3,hydro_cons_change_pct_lag4,solar_share_energy_lag1,solar_share_energy_lag2,solar_share_energy_lag3,solar_share_energy_lag4,hydro_consumption_lag1,hydro_consumption_lag2,hydro_consumption_lag3,hydro_consumption_lag4
51601,United States,2022,26504.305,341534041.0,1.36,78347.914,19493170000000.0,0.285,0.299,0.073,...,-1.398,-2.457,1.674,1.408,1.071,0.932,651.455,742.663,752.485,765.977
51602,United States,2023,26189.199,343477332.0,,77027.836,,0.288,0.337,0.072,...,-0.941,-1.398,2.038,1.674,1.408,1.071,655.104,651.455,742.663,752.485
51603,United States,2024,,,,,,,,,...,-11.959,-0.941,2.386,2.038,1.674,1.408,613.883,655.104,651.455,742.663


### Simple analysis of the G20 country data

In [7]:
# Filter data from a start year (1965 by default)
def filter_from_1965(df, start_year=1965):
    """Return a copy of the rows of ``df`` whose ``year`` is at least ``start_year``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``year`` column.
    start_year : int, default 1965
        First year to keep (inclusive).

    Returns
    -------
    pandas.DataFrame
        An independent copy; modifying it does not affect ``df``.
    """
    return df[df['year'] >= start_year].copy()

In [8]:
# Simple analysis of each feature
def df_ini_analysis(df):
    """Summarise how well each numeric feature of ``df`` is populated.

    Every numeric column except ``year`` gets one row with its share of
    non-null values over all rows of ``df``, the first and last year with data,
    the length of that year range and the number of countries with data.
    The ten best-covered features are printed.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with ``country`` and ``year`` columns plus the features.

    Returns
    -------
    pandas.DataFrame
        One row per feature, sorted by ``Coverage_(%)`` in descending order.
        Features without any data carry ``'No data'`` in the year columns.
        The frame has the full set of columns even when there are no numeric
        features to report.
    """
    columns = [
        'Feature', 'Coverage_(%)', 'Non_null_values', 'Total_possible_values',
        'Start_year', 'End_year', 'Years_span', 'Countries_with_data'
    ]

    # exclude non-numeric
    numeric_cols = [
        col for col in df.select_dtypes(include=[np.number]).columns
        if col != 'year'
    ]
    total_values = len(df)

    stats = []

    for col in numeric_cols:
        has_value = df[col].notna()
        non_null_values = int(has_value.sum())
        # A frame without rows has no coverage to speak of; avoid dividing by zero
        coverage = (non_null_values / total_values) * 100 if total_values else 0.0

        if non_null_values > 0:
            years = df.loc[has_value, 'year']
            start_year = years.min()
            end_year = years.max()
            years_span = end_year - start_year + 1
            countries_with_data = df.loc[has_value, 'country'].nunique()
        else:
            start_year = 'No data'
            end_year = 'No data'
            years_span = 0
            countries_with_data = 0

        stats.append({
            'Feature': col,
            'Coverage_(%)': coverage,
            'Non_null_values': non_null_values,
            'Total_possible_values': total_values,
            'Start_year': start_year,
            'End_year': end_year,
            'Years_span': years_span,
            'Countries_with_data': countries_with_data
        })

    # Explicit columns keep the sort below valid when stats is empty
    avail_df = pd.DataFrame(stats, columns=columns)
    avail_df = avail_df.sort_values('Coverage_(%)', ascending=False)

    print("Top 10 features by coverage")
    print(avail_df.head(10).to_string(index=False))

    return avail_df

In [9]:
# Coverage of every numeric feature over all G20 rows. The function prints the
# top 10, and the returned full table is displayed as the cell output.
df_ini_analysis(g20_df)

Top 10 features by coverage
                    Feature  Coverage_(%)  Non_null_values  Total_possible_values  Start_year  End_year  Years_span  Countries_with_data
                 population     96.552646             3613                   3742        1750      2023         274                   19
           share_global_co2     89.898450             3364                   3742        1750      2023         274                   19
                        co2     89.898450             3364                   3742        1750      2023         274                   19
share_global_cumulative_co2     89.898450             3364                   3742        1750      2023         274                   19
             cumulative_co2     89.898450             3364                   3742        1750      2023         274                   19
             co2_growth_abs     89.176911             3337                   3742        1751      2023         273                   19
             

Unnamed: 0,Feature,Coverage_(%),Non_null_values,Total_possible_values,Start_year,End_year,Years_span,Countries_with_data
1,population,96.552646,3613,3742,1750,2023,274,19
18,share_global_co2,89.898450,3364,3742,1750,2023,274,19
60,co2,89.898450,3364,3742,1750,2023,274,19
62,share_global_cumulative_co2,89.898450,3364,3742,1750,2023,274,19
54,cumulative_co2,89.898450,3364,3742,1750,2023,274,19
...,...,...,...,...,...,...,...,...
22,share_global_other_co2,11.811865,442,3742,1990,2023,34,13
152,biofuel_cons_change_pct,10.929984,409,3742,1971,2023,53,18
130,other_renewables_elec_per_capita_exc_biofuel,5.852485,219,3742,1990,2024,35,7
192,other_renewable_exc_biofuel_electricity,5.852485,219,3742,1990,2024,35,7


### Simple analysis of the G20 countries from each feature's start year

In [10]:
# Same analysis, but coverage is measured from each feature's own start year
def df_analysis_start(df, min_year=None):
    """Summarise feature coverage relative to each feature's own year range.

    For every numeric column except ``year`` the number of possible values is
    ``countries with data * (last year with data - first year with data + 1)``,
    so a feature is not penalised for the years before it was first recorded.
    The ten best-covered features are printed.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with ``country`` and ``year`` columns plus the features.
    min_year : int, optional
        When given, only rows with ``year >= min_year`` are analysed and the
        printed header says so. By default no year filter is applied.

    Returns
    -------
    pandas.DataFrame
        One row per feature, sorted by ``Coverage_from_start_(%)`` in
        descending order. Features without any data carry ``'No data'`` in the
        year columns. The frame has the full set of columns even when there
        are no numeric features to report.
    """
    columns = [
        'Feature', 'Coverage_from_start_(%)', 'Non_null_values',
        'Total_possible_from_start', 'Original_start_year', 'End_year',
        'Years_span', 'Countries_with_data'
    ]

    if min_year is not None:
        df = df[df['year'] >= min_year]

    # exclude non-numeric
    numeric_cols = [
        col for col in df.select_dtypes(include=[np.number]).columns
        if col != 'year'
    ]

    stats = []

    for col in numeric_cols:
        has_value = df[col].notna()
        non_null_values = int(has_value.sum())

        if non_null_values == 0:
            stats.append({
                'Feature': col,
                'Coverage_from_start_(%)': 0.0,
                'Non_null_values': 0,
                'Total_possible_from_start': 0,
                'Original_start_year': 'No data',
                'End_year': 'No data',
                'Years_span': 0,
                'Countries_with_data': 0
            })
            continue

        # Only the two key columns are needed, not a copy of the whole wide frame
        years = df.loc[has_value, 'year']
        start_year = years.min()
        end_year = years.max()
        years_span = end_year - start_year + 1

        # Total possible values, counted from this feature's first year with data
        countries_with_data = df.loc[has_value, 'country'].nunique()
        total_possible_from_start = countries_with_data * years_span
        if total_possible_from_start:
            coverage_from_start = (non_null_values / total_possible_from_start) * 100
        else:
            # Reached only when every row with data has a missing country
            coverage_from_start = 0.0

        stats.append({
            'Feature': col,
            'Coverage_from_start_(%)': coverage_from_start,
            'Non_null_values': non_null_values,
            'Total_possible_from_start': total_possible_from_start,
            'Original_start_year': start_year,
            'End_year': end_year,
            'Years_span': years_span,
            'Countries_with_data': countries_with_data
        })

    # Explicit columns keep the sort below valid when stats is empty
    start_year_df = pd.DataFrame(stats, columns=columns)
    start_year_df = start_year_df.sort_values('Coverage_from_start_(%)', ascending=False)

    # The header names a year floor only when one was actually applied
    header = "Top 10 features by coverage from start year"
    if min_year is not None:
        header += f" ({min_year}+)"
    print(header)
    print(start_year_df.head(10).to_string(index=False))

    return start_year_df

In [11]:
# Coverage of each feature measured from its own first year with data. The
# function prints the top 10, and the returned full table is the cell output.
df_analysis_start(g20_df)

Top 10 features by coverage from start year (1965+)
                        Feature  Coverage_from_start_(%)  Non_null_values  Total_possible_from_start  Original_start_year  End_year  Years_span  Countries_with_data
    temperature_change_from_ghg                    100.0             3287                       3287                 1851      2023         173                   19
                      trade_co2                    100.0              627                        627                 1990      2022          33                   19
            land_use_change_co2                    100.0             3306                       3306                 1850      2023         174                   19
                trade_co2_share                    100.0              627                        627                 1990      2022          33                   19
     consumption_co2_per_capita                    100.0              627                        627                 1990  

Unnamed: 0,Feature,Coverage_from_start_(%),Non_null_values,Total_possible_from_start,Original_start_year,End_year,Years_span,Countries_with_data
5,temperature_change_from_ghg,100.000000,3287,3287,1851,2023,173,19
21,trade_co2,100.000000,627,627,1990,2022,33,19
15,land_use_change_co2,100.000000,3306,3306,1850,2023,174,19
17,trade_co2_share,100.000000,627,627,1990,2022,33,19
10,consumption_co2_per_capita,100.000000,627,627,1990,2022,33,19
...,...,...,...,...,...,...,...,...
36,cumulative_other_co2,35.769231,558,1560,1904,2023,120,13
53,other_industry_co2,35.769231,558,1560,1904,2023,120,13
48,flaring_co2,25.009604,1302,5206,1750,2023,274,19
51,cumulative_flaring_co2,25.009604,1302,5206,1750,2023,274,19


### Save Dataframes and analysis

In [None]:
# Create markdown README
def create_readme(analysis_df, save_dir='data_export'):
    """Write the coverage table of well-covered features as a markdown file.

    Features whose ``Coverage_(%)`` is at least 60 are listed, in the row order
    of ``analysis_df``, in ``<save_dir>/00_results/feature_g20_analysis.md``.
    The ``00_results`` directory is created when it does not exist yet.

    Parameters
    ----------
    analysis_df : pandas.DataFrame
        Frame with the columns ``Feature``, ``Coverage_(%)``, ``Start_year``,
        ``End_year``, ``Years_span`` and ``Countries_with_data``.
    save_dir : str, default 'data_export'
        Export root; the report goes into its ``00_results`` sub-directory.
    """
    header_lines = [
        "# G20 Countries Feature Analysis",
        "    ",
        "### Feature Analysis sorted by Coverage and over 60%",
        "",
        "| Feature | Coverage (%) | Start Year | End Year | Years Span | Countries with Data |",
        "|---------|--------------|------------|----------|------------|---------------------|",
    ]

    # Features with at least 60% coverage
    acpt_coverage = analysis_df[analysis_df['Coverage_(%)'] >= 60.0]
    table_lines = [
        f"| {rec['Feature']} | {rec['Coverage_(%)']:.1f} | {rec['Start_year']} | {rec['End_year']} | {rec['Years_span']} | {rec['Countries_with_data']} |"
        for rec in acpt_coverage.to_dict('records')
    ]
    readme_content = "\n".join(header_lines + table_lines)

    # Save README; open() does not create missing parent directories
    results_dir = os.path.join(save_dir, '00_results')
    os.makedirs(results_dir, exist_ok=True)
    readme_path = os.path.join(results_dir, 'feature_g20_analysis.md')
    with open(readme_path, 'w', encoding='utf-8') as f:
        f.write(readme_content)

In [21]:
# Create start year README
def create_start_year_readme(analysis_df, save_dir):
    """Write the start-year coverage table of well-covered features as markdown.

    Features whose ``Coverage_from_start_(%)`` is at least 60 are listed, in
    the row order of ``analysis_df``, in
    ``<save_dir>/00_results/feature_g20_start_year_analysis.md``.
    The ``00_results`` directory is created when it does not exist yet.

    Parameters
    ----------
    analysis_df : pandas.DataFrame
        Frame with the columns ``Feature``, ``Coverage_from_start_(%)``,
        ``Original_start_year``, ``End_year``, ``Years_span`` and
        ``Countries_with_data``.
    save_dir : str
        Export root; the report goes into its ``00_results`` sub-directory.
    """
    header_lines = [
        "# G20 Countries Feature Analysis from Start Year",
        "    ",
        "### Feature Analysis sorted by Coverage from Start Year and over 60%",
        "",
        "| Feature | Coverage from Start (%) | Original Start Year | End Year | Years Span | Countries with Data |",
        "|---------|-------------------------|---------------------|----------|------------|---------------------|",
    ]

    # Features with at least 60% coverage from start
    acpt_coverage = analysis_df[analysis_df['Coverage_from_start_(%)'] >= 60.0]
    table_lines = [
        f"| {rec['Feature']} | {rec['Coverage_from_start_(%)']:.1f} | {rec['Original_start_year']} | {rec['End_year']} | {rec['Years_span']} | {rec['Countries_with_data']} |"
        for rec in acpt_coverage.to_dict('records')
    ]
    readme_content = "\n".join(header_lines + table_lines)

    # Save README; open() does not create missing parent directories
    results_dir = os.path.join(save_dir, '00_results')
    os.makedirs(results_dir, exist_ok=True)
    readme_path = os.path.join(results_dir, 'feature_g20_start_year_analysis.md')
    with open(readme_path, 'w', encoding='utf-8') as f:
        f.write(readme_content)

In [None]:
# Saving dataframes and analysis
def save_dfs_analysis(g20_df, lag_df, save_dir='data_export'):
    """Export the data frames and the coverage reports below ``save_dir``.

    Writes ``g20_df`` and ``lag_df``, plus their 1965-onwards subsets, as both
    pickle and CSV files into ``save_dir``. Then runs both coverage analyses on
    ``g20_df`` and writes their markdown reports into ``<save_dir>/00_results``.

    Parameters
    ----------
    g20_df : pandas.DataFrame
        Merged G20 data.
    lag_df : pandas.DataFrame
        G20 data with lag features.
    save_dir : str, default 'data_export'
        Export root directory; created when missing.
    """
    # Create dir
    if not os.path.isdir(save_dir):
        os.makedirs(save_dir, exist_ok=True)
        print(f"Directory created: {save_dir}")

    # The report helpers write into <save_dir>/00_results, so it must exist
    os.makedirs(os.path.join(save_dir, '00_results'), exist_ok=True)

    # Full frames plus their subsets from 1965 onwards, keyed by file stem
    frames = {
        'g20_df': g20_df,
        'lag_df': lag_df,
        'g20_df_1965': filter_from_1965(g20_df),
        'lag_df_1965': filter_from_1965(lag_df),
    }

    # Save dataframes as pickle
    print("Saving dfs as pickle")
    for stem, frame in frames.items():
        frame.to_pickle(os.path.join(save_dir, f"{stem}.pkl"))

    # Save dataframes as csv
    print("Saving dfs as csv")
    for stem, frame in frames.items():
        frame.to_csv(os.path.join(save_dir, f"{stem}.csv"), index=False)

    # Create and save analyses
    all_analysis_df = df_ini_analysis(g20_df)
    start_year_analysis_df = df_analysis_start(g20_df)

    # Pass the export root itself: the helpers append '00_results' on their own,
    # so handing them '<root>/00_results' would nest the directory twice.
    create_readme(all_analysis_df, save_dir)
    create_start_year_readme(start_year_analysis_df, save_dir)

In [15]:
# Write the pickle/CSV exports and the two markdown coverage reports, using the
# function's default save_dir of 'data_export'.
save_dfs_analysis(g20_df, lag_df)

Saving dfs as pickle
Saving dfs as csv
Top 10 features by coverage
                    Feature  Coverage_(%)  Non_null_values  Total_possible_values  Start_year  End_year  Years_span  Countries_with_data
                 population     96.552646             3613                   3742        1750      2023         274                   19
           share_global_co2     89.898450             3364                   3742        1750      2023         274                   19
                        co2     89.898450             3364                   3742        1750      2023         274                   19
share_global_cumulative_co2     89.898450             3364                   3742        1750      2023         274                   19
             cumulative_co2     89.898450             3364                   3742        1750      2023         274                   19
             co2_growth_abs     89.176911             3337                   3742        1751      2023        

PermissionError: [Errno 13] Permission denied: 'data_export\\00_results\\feature_g20_analysis.md'