# Exploring Global Energy Use with Machine Learning Models

In [156]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA, TruncatedSVD
from sklearn.cluster import KMeans, AgglomerativeClustering
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.metrics import silhouette_score, calinski_harabasz_score
from scipy.cluster.hierarchy import dendrogram, linkage
from scipy.spatial.distance import pdist

In [150]:
# Load the Our World in Data energy dataset straight from the project's GitHub repo.
df = pd.read_csv(
    filepath_or_buffer='https://raw.githubusercontent.com/owid/energy-data/master/owid-energy-data.csv',
)

# Climate impact (how energy is being used): identifiers, economy/population
# and the fossil vs. renewable split.
climate_variables = [
    'country',
    'year',
    'gdp',
    'population',
    'primary_energy_consumption',
    'energy_per_capita',
    'fossil_share_energy',
    'renewables_share_energy',
]

# Small dataset, e.g. for quickly testing models: the climate columns
# without total primary energy consumption.
minimal_variables = [
    column for column in climate_variables
    if column != 'primary_energy_consumption'
]

# Energy transition (how energy is produced): the climate columns plus the
# individual low-carbon sources and total electricity generation.
transition_variables = [
    *climate_variables,
    'nuclear_share_energy',
    'solar_share_energy',
    'wind_share_energy',
    'hydro_share_energy',
    'electricity_generation',
]

# Broad overview: every identifier, economic, consumption and per-source
# share column used anywhere in the analysis.
core_variables = [
    'country',
    'year',
    'iso_code',
    'gdp',
    'population',
    'primary_energy_consumption',
    'energy_per_capita',
    'energy_per_gdp',
    'fossil_share_energy',
    'renewables_share_energy',
    'nuclear_share_energy',
    'coal_share_energy',
    'oil_share_energy',
    'gas_share_energy',
    'hydro_share_energy',
    'solar_share_energy',
    'wind_share_energy',
    'electricity_generation',
    'per_capita_electricity',
]

In [152]:
def create_analysis_dataset(df, variables, year=None,
                            min_completeness=0.8, remove_outliers=True):
    """Build a complete-case, country-level subset of ``df`` for modelling.

    The selected columns are filtered to a single year (when one is given),
    aggregate/regional rows are removed, sparsely populated rows are dropped,
    and finally every row that still contains a missing value is discarded.

    Args:
        df: Source DataFrame containing at least the columns in ``variables``.
        variables: Column names to keep. ``'country'``, ``'year'`` and
            ``'iso_code'`` are treated as identifiers; all other columns are
            treated as features for the completeness check.
        year: Keep only rows whose ``'year'`` column equals this value.
            ``None`` (the default) keeps every year.
        min_completeness: Minimum fraction (0-1) of non-missing feature
            values a row must have to survive the completeness filter.
        remove_outliers: Currently unused; accepted only so existing callers
            keep working. No outlier removal is performed.

    Returns:
        A new DataFrame with the requested columns and no missing values.

    Raises:
        KeyError: If a name in ``variables`` is not a column of ``df``, or if
            ``year`` is given but ``'year'`` is not among ``variables``.
    """
    variables = list(variables)
    analysis_df = df[variables]

    # Comparing against None matches no rows, so only filter when a year
    # was actually requested.
    if year is not None:
        analysis_df = analysis_df[analysis_df['year'] == year]

    if 'country' in variables:
        # Aggregates and residual groupings that appear in the 'country'
        # column but are not individual countries.
        regions_to_exclude = [
            'World', 'Europe', 'Asia', 'Africa', 'North America', 'South America',
            'European Union', 'OECD', 'G20', 'G7', 'High-income countries',
            'Upper-middle-income countries', 'Lower-middle-income countries',
            'Low-income countries', 'Asia Pacific', 'Middle East', 'Europe (other)',
            'CIS', 'Other Caribbean', 'Other Middle East', 'Other Africa',
            'Other Asia Pacific', 'Other Europe', 'Other South America']

        analysis_df = analysis_df[~analysis_df['country'].isin(regions_to_exclude)]

    identifier_cols = ('country', 'year', 'iso_code')
    numeric_cols = [col for col in variables if col not in identifier_cols]

    # With no feature columns the row-wise mean is NaN, which would fail the
    # threshold test and silently drop every row; skip the filter instead.
    if numeric_cols:
        completeness_by_row = analysis_df[numeric_cols].notna().mean(axis=1)
        analysis_df = analysis_df[completeness_by_row >= min_completeness]

    # NOTE(review): dropping every row with any missing value means the
    # completeness threshold above has no additional effect for
    # min_completeness <= 1. Kept as-is to preserve existing results.
    return analysis_df.dropna()

In [154]:
# Single reference year used for every analysis dataset.
ANALYSIS_YEAR = 2021

# One cleaned dataset per variable group. Each entry below is
# (dataset name, column list, minimum row completeness); the narrower the
# column set, the stricter the completeness requirement.
datasets = {
    name: create_analysis_dataset(
        df, columns, year=ANALYSIS_YEAR, min_completeness=threshold)
    for name, columns, threshold in (
        ('core', core_variables, 0.7),
        ('transition', transition_variables, 0.8),
        ('climate', climate_variables, 0.9),
        ('minimal', minimal_variables, 0.95),
    )
}


In [147]:
# Convenience aliases for the individual cleaned datasets.
core_df, transition_df, climate_df, minimal_df = (
    datasets[key] for key in ('core', 'transition', 'climate', 'minimal')
)