# Data Preprocessing

In [1]:
# Import dependencies

import pandas as pd
import numpy as np

from itertools import groupby

In [2]:
# Read in dataset
file_path = 'data/world_energy_consumption.csv'
energy_df = pd.read_csv(file_path)
energy_df.head()

Unnamed: 0,iso_code,country,year,coal_prod_change_pct,coal_prod_change_twh,gas_prod_change_pct,gas_prod_change_twh,oil_prod_change_pct,oil_prod_change_twh,energy_cons_change_pct,...,solar_elec_per_capita,solar_energy_per_capita,gdp,wind_share_elec,wind_cons_change_pct,wind_share_energy,wind_cons_change_twh,wind_consumption,wind_elec_per_capita,wind_energy_per_capita
0,AFG,Afghanistan,1900,,,,,,,,...,,,,,,,,,,
1,AFG,Afghanistan,1901,,0.0,,,,,,...,,,,,,,,,,
2,AFG,Afghanistan,1902,,0.0,,,,,,...,,,,,,,,,,
3,AFG,Afghanistan,1903,,0.0,,,,,,...,,,,,,,,,,
4,AFG,Afghanistan,1904,,0.0,,,,,,...,,,,,,,,,,


In [3]:
# Get length
energy_df_len = len(energy_df)
energy_df_len

17432

In [4]:
# Get general coverage information
energy_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17432 entries, 0 to 17431
Columns: 122 entries, iso_code to wind_energy_per_capita
dtypes: float64(119), int64(1), object(2)
memory usage: 16.2+ MB


In [5]:
# Categorize/group columns
energy_cols = list(energy_df.columns)
energy_cols.sort()
column_groups = { j: list(i) for j, i in groupby(energy_cols, lambda a: a.split('_')[0]) }

column_groups

{'biofuel': ['biofuel_cons_change_pct',
  'biofuel_cons_change_twh',
  'biofuel_cons_per_capita',
  'biofuel_consumption',
  'biofuel_elec_per_capita',
  'biofuel_electricity',
  'biofuel_share_elec',
  'biofuel_share_energy'],
 'carbon': ['carbon_intensity_elec'],
 'coal': ['coal_cons_change_pct',
  'coal_cons_change_twh',
  'coal_cons_per_capita',
  'coal_consumption',
  'coal_elec_per_capita',
  'coal_electricity',
  'coal_prod_change_pct',
  'coal_prod_change_twh',
  'coal_prod_per_capita',
  'coal_production',
  'coal_share_elec',
  'coal_share_energy'],
 'country': ['country'],
 'electricity': ['electricity_generation'],
 'energy': ['energy_cons_change_pct',
  'energy_cons_change_twh',
  'energy_per_capita',
  'energy_per_gdp'],
 'fossil': ['fossil_cons_change_pct',
  'fossil_cons_change_twh',
  'fossil_cons_per_capita',
  'fossil_electricity',
  'fossil_energy_per_capita',
  'fossil_fuel_consumption',
  'fossil_share_elec',
  'fossil_share_energy'],
 'gas': ['gas_cons_change_pct

In [6]:
# Show columns categories by number of values
sorted_groups = sorted(column_groups.items(), key=lambda x: len(x[1]))

curr_length = 0
for x in sorted_groups:
    if (tmp_len := len(x[1])) != curr_length:
        curr_length = tmp_len
        print(f'\n{tmp_len})')
    print(x[0])


1)
carbon
country
electricity
gdp
iso
per
population
primary
year

4)
energy

8)
biofuel
fossil
hydro
low
nuclear
renewables
solar
wind

9)
other

12)
coal
gas
oil


In [7]:
# Define types
energy_types = ['biofuel', 'fossil', 'hydro', 'low_carbon', 'nuclear', 
                'renewables', 'solar', 'wind', 'coal', 'gas', 'oil', 
                'other_renewable']

measurement_postfixes = ['cons_change_pct', 'cons_change_twh', 'cons_per_capita', 
                         'consumption', 'energy_per_capita', 'electricity',
                        'share_elec', 'share_energy']

energy_cols = [[p for p in [x for x in 
                            sorted(column_groups[t.split('_')[0]], key=lambda a: a.split('_')[-1])
                           if not any(z in x for z in ['prod', 'exc'])]
               ] for t in energy_types]

In [8]:
energy_cols

[['biofuel_cons_per_capita',
  'biofuel_elec_per_capita',
  'biofuel_consumption',
  'biofuel_share_elec',
  'biofuel_electricity',
  'biofuel_share_energy',
  'biofuel_cons_change_pct',
  'biofuel_cons_change_twh'],
 ['fossil_cons_per_capita',
  'fossil_energy_per_capita',
  'fossil_fuel_consumption',
  'fossil_share_elec',
  'fossil_electricity',
  'fossil_share_energy',
  'fossil_cons_change_pct',
  'fossil_cons_change_twh'],
 ['hydro_elec_per_capita',
  'hydro_energy_per_capita',
  'hydro_consumption',
  'hydro_share_elec',
  'hydro_electricity',
  'hydro_share_energy',
  'hydro_cons_change_pct',
  'hydro_cons_change_twh'],
 ['low_carbon_elec_per_capita',
  'low_carbon_energy_per_capita',
  'low_carbon_consumption',
  'low_carbon_share_elec',
  'low_carbon_electricity',
  'low_carbon_share_energy',
  'low_carbon_cons_change_pct',
  'low_carbon_cons_change_twh'],
 ['nuclear_elec_per_capita',
  'nuclear_energy_per_capita',
  'nuclear_consumption',
  'nuclear_share_elec',
  'nuclear_e

## Inspect data coverage

In [9]:
null_vals_dict = { t : [energy_df[x].isna().sum() for x in energy_cols[idx]] 
                  for idx, t in enumerate(energy_types) }


In [13]:
null_vals_dict

{'biofuel': [11806, 13243, 11806, 13226, 13183, 13148, 16913, 11923],
 'fossil': [12673, 13148, 13148, 12376, 12333, 13148, 13231, 13231],
 'hydro': [11933, 13142, 13142, 11356, 11313, 13148, 13768, 13225],
 'low_carbon': [11933, 13142, 13142, 11391, 11348, 13148, 13597, 13225],
 'nuclear': [11933, 13142, 13142, 11356, 11313, 13148, 15910, 13225],
 'renewables': [11933, 13142, 13142, 11391, 11348, 13148, 13604, 13225],
 'solar': [11933, 13142, 13142, 11356, 11313, 13148, 16107, 13225],
 'wind': [11933, 13142, 13142, 11356, 11313, 13148, 15889, 13225],
 'coal': [13142, 12673, 12262, 12376, 12333, 13148, 13670, 13225],
 'gas': [12673, 13142, 12262, 12376, 12333, 13148, 13728, 13225],
 'oil': [12673, 13148, 12248, 12376, 12333, 13148, 13231, 13231],
 'other_renewable': [11933, 13142, 13142, 11391, 11348, 13148, 15106, 13225]}

In [11]:
pd.DataFrame.from_dict(null_vals_dict, orient='index', columns=measurement_postfixes)

Unnamed: 0,cons_change_pct,cons_change_twh,cons_per_capita,consumption,energy_per_capita,electricity,share_elec,share_energy
biofuel,11806,13243,11806,13226,13183,13148,16913,11923
fossil,12673,13148,13148,12376,12333,13148,13231,13231
hydro,11933,13142,13142,11356,11313,13148,13768,13225
low_carbon,11933,13142,13142,11391,11348,13148,13597,13225
nuclear,11933,13142,13142,11356,11313,13148,15910,13225
renewables,11933,13142,13142,11391,11348,13148,13604,13225
solar,11933,13142,13142,11356,11313,13148,16107,13225
wind,11933,13142,13142,11356,11313,13148,15889,13225
coal,13142,12673,12262,12376,12333,13148,13670,13225
gas,12673,13142,12262,12376,12333,13148,13728,13225


In [18]:
percent_null_dict = { k : [(x / len(energy_df)) for x in v] for k, v in null_vals_dict.items() }

In [19]:
pd.DataFrame.from_dict(percent_null_dict, orient='index', columns=measurement_postfixes)

Unnamed: 0,cons_change_pct,cons_change_twh,cons_per_capita,consumption,energy_per_capita,electricity,share_elec,share_energy
biofuel,0.67726,0.759695,0.67726,0.75872,0.756253,0.754245,0.970227,0.683972
fossil,0.726996,0.754245,0.754245,0.709959,0.707492,0.754245,0.759006,0.759006
hydro,0.684546,0.753901,0.753901,0.651446,0.648979,0.754245,0.789812,0.758662
low_carbon,0.684546,0.753901,0.753901,0.653453,0.650987,0.754245,0.780002,0.758662
nuclear,0.684546,0.753901,0.753901,0.651446,0.648979,0.754245,0.912689,0.758662
renewables,0.684546,0.753901,0.753901,0.653453,0.650987,0.754245,0.780404,0.758662
solar,0.684546,0.753901,0.753901,0.651446,0.648979,0.754245,0.92399,0.758662
wind,0.684546,0.753901,0.753901,0.651446,0.648979,0.754245,0.911485,0.758662
coal,0.753901,0.726996,0.703419,0.709959,0.707492,0.754245,0.78419,0.758662
gas,0.726996,0.753901,0.703419,0.709959,0.707492,0.754245,0.787517,0.758662


In [None]:
len(energy_df)

In [9]:
agg_null_vals = {x:energy_df[x].isna().sum() for x in energy_df.columns }

# pd.DataFrame.from_dict(agg_null_vals,orient='index')
agg_null_vals

# for c in energy_df.columns:
#     print(c, energy_df[c].isna().sum())

{'iso_code': 1802,
 'country': 0,
 'year': 0,
 'coal_prod_change_pct': 9987,
 'coal_prod_change_twh': 7038,
 'gas_prod_change_pct': 12570,
 'gas_prod_change_twh': 9539,
 'oil_prod_change_pct': 10911,
 'oil_prod_change_twh': 8867,
 'energy_cons_change_pct': 7590,
 'energy_cons_change_twh': 7540,
 'biofuel_share_elec': 13226,
 'biofuel_elec_per_capita': 13243,
 'biofuel_cons_change_pct': 16913,
 'biofuel_share_energy': 13148,
 'biofuel_cons_change_twh': 11923,
 'biofuel_consumption': 11806,
 'biofuel_cons_per_capita': 11806,
 'carbon_intensity_elec': 16844,
 'coal_share_elec': 12376,
 'coal_cons_change_pct': 13670,
 'coal_share_energy': 13148,
 'coal_cons_change_twh': 13225,
 'coal_consumption': 12262,
 'coal_elec_per_capita': 12673,
 'coal_cons_per_capita': 13142,
 'coal_production': 6803,
 'coal_prod_per_capita': 7779,
 'electricity_generation': 11313,
 'biofuel_electricity': 13183,
 'coal_electricity': 12333,
 'fossil_electricity': 12333,
 'gas_electricity': 12333,
 'hydro_electricity