In [2]:
import os
import pandas as pd
import numpy as np
import requests
from dotenv import load_dotenv
from datetime import datetime
import time

# Load environment variables
load_dotenv()

# Define the indicators we want to collect
INDICATORS = {
    'EG.ELC.RNEW.ZS': 'Renewable electricity output',
    'EG.FEC.RNEW.ZS': 'Renewable energy consumption',
    'EG.ELC.ACCS.ZS': 'Access to electricity',
    'NY.GDP.PCAP.CD': 'GDP per capita',
    'EN.ATM.CO2E.PC': 'CO2 emissions per capita'
}

def fetch_world_bank_data(indicator, start_year=2010, end_year=2022):
    """
    Fetch data for a specific indicator from World Bank API
    """
    print(f"\nFetching data for {INDICATORS[indicator]}...")
    
    url = f"http://api.worldbank.org/v2/country/all/indicator/{indicator}"
    params = {
        'format': 'json',
        'date': f"{start_year}:{end_year}",
        'per_page': 1000
    }
    
    try:
        response = requests.get(url, params=params)
        response.raise_for_status()
        
        data = response.json()
        if len(data) < 2 or not data[1]:
            print(f"No data available for {indicator}")
            return None
            
        # Convert to DataFrame
        df = pd.json_normalize(data[1])
        
        # Clean and reshape the data
        cleaned_df = df[['country.value', 'date', 'value']]
        cleaned_df.columns = ['country', 'year', indicator]
        
        # Convert value column to numeric
        cleaned_df[indicator] = pd.to_numeric(cleaned_df[indicator], errors='coerce')
        
        print(f"✓ Successfully collected {len(cleaned_df)} records")
        return cleaned_df
        
    except Exception as e:
        print(f"Error fetching {indicator}: {str(e)}")
        return None

def collect_all_data():
    """
    Collect data for all indicators and merge them
    """
    print("Starting data collection...")
    
    # Start with the first indicator
    first_indicator = list(INDICATORS.keys())[0]
    final_df = fetch_world_bank_data(first_indicator)
    
    if final_df is None:
        print("Failed to fetch initial data")
        return None
    
    # Fetch and merge remaining indicators
    for indicator in list(INDICATORS.keys())[1:]:
        df = fetch_world_bank_data(indicator)
        if df is not None:
            final_df = final_df.merge(
                df,
                on=['country', 'year'],
                how='outer'
            )
        time.sleep(1)  # Be nice to the API
    
    return final_df

# Collect the data
renewable_data = collect_all_data()

# Basic data cleaning and saving
if renewable_data is not None:
    print("\nProcessing and saving data...")
    
    # Get list of available columns for dropna
    available_indicators = [col for col in INDICATORS.keys() 
                          if col in renewable_data.columns]
    
    # Remove rows where all indicator values are NaN
    renewable_data = renewable_data.dropna(
        how='all', 
        subset=available_indicators
    )
    
    # Create the data directory if it doesn't exist
    os.makedirs('data/raw', exist_ok=True)
    
    # Save the data
    output_path = 'data/raw/renewable_energy_data.csv'
    renewable_data.to_csv(output_path, index=False)
    print(f"Data saved to {output_path}")
    
    # Show basic information
    print("\nDataset Overview:")
    print(f"Shape: {renewable_data.shape}")
    print("\nColumns in dataset:")
    print(renewable_data.columns.tolist())
    print("\nSample of the data:")
    print(renewable_data.head())
    print("\nMissing values:")
    print(renewable_data.isnull().sum())
    
    # Basic statistics
    print("\nBasic statistics for numerical columns:")
    print(renewable_data.describe())
else:
    print("Failed to collect data")

Starting data collection...

Fetching data for Renewable electricity output...


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cleaned_df[indicator] = pd.to_numeric(cleaned_df[indicator], errors='coerce')


✓ Successfully collected 1000 records

Fetching data for Renewable energy consumption...


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cleaned_df[indicator] = pd.to_numeric(cleaned_df[indicator], errors='coerce')


✓ Successfully collected 1000 records

Fetching data for Access to electricity...


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cleaned_df[indicator] = pd.to_numeric(cleaned_df[indicator], errors='coerce')


✓ Successfully collected 1000 records

Fetching data for GDP per capita...


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cleaned_df[indicator] = pd.to_numeric(cleaned_df[indicator], errors='coerce')


✓ Successfully collected 1000 records

Fetching data for CO2 emissions per capita...
No data available for EN.ATM.CO2E.PC

Processing and saving data...
Data saved to data/raw/renewable_energy_data.csv

Dataset Overview:
Shape: (987, 6)

Columns in dataset:
['country', 'year', 'EG.ELC.RNEW.ZS', 'EG.FEC.RNEW.ZS', 'EG.ELC.ACCS.ZS', 'NY.GDP.PCAP.CD']

Sample of the data:
       country  year  EG.ELC.RNEW.ZS  EG.FEC.RNEW.ZS  EG.ELC.ACCS.ZS  \
0  Afghanistan  2010       85.986547            15.2            42.7   
1  Afghanistan  2011       82.487562            12.6            43.2   
2  Afghanistan  2012       85.909980            15.4            69.1   
3  Afghanistan  2013       78.636408            16.9            68.0   
4  Afghanistan  2014       85.323549            19.1            89.5   

   NY.GDP.PCAP.CD  
0      562.499222  
1      608.738850  
2      653.417475  
3      638.733181  
4      626.512929  

Missing values:
country             0
year                0
EG.ELC.RNEW.ZS 

In [3]:
# Test with just two indicators first
TEST_INDICATORS = {
    'EG.ELC.RNEW.ZS': 'Renewable electricity output',
    'NY.GDP.PCAP.CD': 'GDP per capita'
}

def test_data_collection():
    """
    Test data collection with just two indicators
    """
    print("Testing data collection with minimal indicators...")
    
    dfs = []
    for indicator in TEST_INDICATORS.keys():
        url = f"http://api.worldbank.org/v2/country/all/indicator/{indicator}"
        params = {
            'format': 'json',
            'date': '2010:2022',
            'per_page': 1000
        }
        
        try:
            response = requests.get(url, params=params)
            data = response.json()
            
            if len(data) > 1 and data[1]:
                df = pd.json_normalize(data[1])
                cleaned_df = df[['country.value', 'date', 'value']]
                cleaned_df.columns = ['country', 'year', indicator]
                cleaned_df[indicator] = pd.to_numeric(cleaned_df[indicator], errors='coerce')
                dfs.append(cleaned_df)
                print(f"✓ Successfully collected {TEST_INDICATORS[indicator]}")
            
        except Exception as e:
            print(f"Error with {indicator}: {str(e)}")
    
    if dfs:
        final_df = dfs[0]
        for df in dfs[1:]:
            final_df = final_df.merge(df, on=['country', 'year'], how='outer')
        
        print("\nTest collection successful!")
        print(final_df.head())
        return final_df
    
    return None

# Run the test
test_df = test_data_collection()

Testing data collection with minimal indicators...


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cleaned_df[indicator] = pd.to_numeric(cleaned_df[indicator], errors='coerce')


✓ Successfully collected Renewable electricity output
✓ Successfully collected GDP per capita

Test collection successful!
       country  year  EG.ELC.RNEW.ZS  NY.GDP.PCAP.CD
0  Afghanistan  2010       85.986547      562.499222
1  Afghanistan  2011       82.487562      608.738850
2  Afghanistan  2012       85.909980      653.417475
3  Afghanistan  2013       78.636408      638.733181
4  Afghanistan  2014       85.323549      626.512929


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cleaned_df[indicator] = pd.to_numeric(cleaned_df[indicator], errors='coerce')


In [5]:
import pandas as pd
import numpy as np
import requests
from dotenv import load_dotenv
import time
from typing import Optional, Dict, List

class RenewableDataCollector:
    def __init__(self):
        self.INDICATORS = {
            'EG.ELC.RNEW.ZS': 'Renewable electricity output',
            'EG.FEC.RNEW.ZS': 'Renewable energy consumption',
            'EG.ELC.ACCS.ZS': 'Access to electricity',
            'NY.GDP.PCAP.CD': 'GDP per capita',
            'NY.GDP.MKTP.KD.ZG': 'GDP growth',
            'EN.ATM.CO2E.PC': 'CO2 emissions per capita'
        }
        
        self.YEARS = range(2010, 2023)
    
    def fetch_indicator_data(self, indicator: str, retries: int = 3) -> Optional[pd.DataFrame]:
        """
        Fetch data for a specific indicator with error handling and retries
        """
        url = f"http://api.worldbank.org/v2/country/all/indicator/{indicator}"
        params = {
            'format': 'json',
            'date': f"{min(self.YEARS)}:{max(self.YEARS)}",
            'per_page': 1000
        }
        
        for attempt in range(retries):
            try:
                response = requests.get(url, params=params)
                response.raise_for_status()
                
                data = response.json()
                if len(data) < 2 or not data[1]:
                    print(f"No data available for {indicator}")
                    return None
                
                df = pd.json_normalize(data[1])
                df = df[['country.value', 'date', 'value']]
                df.columns = ['country', 'year', indicator]
                df[indicator] = pd.to_numeric(df[indicator], errors='coerce')
                
                # Remove outliers
                Q1 = df[indicator].quantile(0.25)
                Q3 = df[indicator].quantile(0.75)
                IQR = Q3 - Q1
                df = df[
                    (df[indicator] >= Q1 - 3 * IQR) & 
                    (df[indicator] <= Q3 + 3 * IQR)
                ]
                
                print(f"✓ Successfully collected {len(df)} records for {self.INDICATORS[indicator]}")
                return df
                
            except Exception as e:
                print(f"Attempt {attempt + 1} failed for {indicator}: {str(e)}")
                if attempt < retries - 1:
                    time.sleep(2)  # Wait before retry
                else:
                    print(f"Failed to fetch {indicator} after {retries} attempts")
                    return None
    
    def collect_and_process_data(self) -> Optional[pd.DataFrame]:
        """
        Collect and process data for all indicators
        """
        print("Starting data collection...")
        
        dataframes = []
        for indicator in self.INDICATORS.keys():
            df = self.fetch_indicator_data(indicator)
            if df is not None:
                dataframes.append(df)
                time.sleep(1)  # Be nice to the API
        
        if not dataframes:
            print("No data collected")
            return None
        
        # Merge all dataframes
        final_df = dataframes[0]
        for df in dataframes[1:]:
            final_df = final_df.merge(
                df,
                on=['country', 'year'],
                how='outer'
            )
        
        # Clean and process
        final_df['year'] = pd.to_numeric(final_df['year'])
        final_df = final_df[final_df['year'].isin(self.YEARS)]
        
        # Fill missing values with forward/backward fill for each country
        for col in final_df.columns:
            if col not in ['country', 'year']:
                final_df[col] = final_df.groupby('country')[col].transform(
                    lambda x: x.fillna(method='ffill').fillna(method='bfill')
                )
        
        return final_df
    
    def save_data(self, df: pd.DataFrame, output_path: str = 'data/raw/renewable_energy_data.csv'):
        """
        Save data with quality report
        """
        # Create quality report
        quality_report = pd.DataFrame({
            'column': df.columns,
            'missing_values': df.isnull().sum(),
            'missing_percentage': (df.isnull().sum() / len(df)) * 100,
            'unique_values': df.nunique(),
            'mean': df.select_dtypes(include=[np.number]).mean(),
            'std': df.select_dtypes(include=[np.number]).std()
        })
        
        # Save data and report
        df.to_csv(output_path, index=False)
        quality_report.to_csv('data/raw/data_quality_report.csv')
        
        print(f"\nData saved to {output_path}")
        print("\nData Quality Summary:")
        print(quality_report)
        
        return quality_report

# Run the collector
collector = RenewableDataCollector()
data = collector.collect_and_process_data()
if data is not None:
    quality_report = collector.save_data(data)

Starting data collection...
✓ Successfully collected 455 records for Renewable electricity output
✓ Successfully collected 874 records for Renewable energy consumption
✓ Successfully collected 974 records for Access to electricity
✓ Successfully collected 914 records for GDP per capita
✓ Successfully collected 958 records for GDP growth
No data available for EN.ATM.CO2E.PC

Data saved to data/raw/renewable_energy_data.csv

Data Quality Summary:
                              column  missing_values  missing_percentage  \
EG.ELC.ACCS.ZS               country              13            1.317123   
EG.ELC.RNEW.ZS                  year               0            0.000000   
EG.FEC.RNEW.ZS        EG.ELC.RNEW.ZS               0            0.000000   
NY.GDP.MKTP.KD.ZG     EG.FEC.RNEW.ZS              12            1.215805   
NY.GDP.PCAP.CD        EG.ELC.ACCS.ZS              51            5.167173   
country               NY.GDP.PCAP.CD               0            0.000000   
year               

  lambda x: x.fillna(method='ffill').fillna(method='bfill')
  lambda x: x.fillna(method='ffill').fillna(method='bfill')
  lambda x: x.fillna(method='ffill').fillna(method='bfill')
  lambda x: x.fillna(method='ffill').fillna(method='bfill')
  lambda x: x.fillna(method='ffill').fillna(method='bfill')


In [6]:
import pandas as pd
import numpy as np
import requests
import time
from typing import Dict, List, Optional
from pathlib import Path

class EnhancedRenewableCollector:
    def __init__(self):
        # Core indicators focused on crucial metrics
        self.INDICATORS = {
            # Renewable Energy Metrics
            'EG.ELC.RNEW.ZS': 'Renewable electricity output (% of total)',
            'EG.FEC.RNEW.ZS': 'Renewable energy consumption (% of total)',
            
            # Technology Adoption
            'EG.ELC.ACCS.ZS': 'Access to electricity (% of population)',
            'EG.USE.ELEC.KH.PC': 'Electric power consumption (kWh per capita)',
            
            # Economic Indicators
            'NY.GDP.PCAP.CD': 'GDP per capita (current US$)',
            'NY.GDP.MKTP.KD.ZG': 'GDP growth (annual %)',
            
            # Investment & Development
            'GB.XPD.RSDV.GD.ZS': 'R&D expenditure (% of GDP)',
            'IE.PPI.ENGY.CD': 'Investment in energy projects (current US$)'
        }
        
        self.years = range(2010, 2023)
        self.base_url = "http://api.worldbank.org/v2"
        
    def create_directory_structure(self):
        """Create necessary directories for data storage"""
        paths = ['data/raw', 'data/processed', 'data/interim']
        for path in paths:
            Path(path).mkdir(parents=True, exist_ok=True)
    
    def fetch_indicator_data(self, indicator: str, retries: int = 3) -> Optional[pd.DataFrame]:
        """
        Fetch data for a specific indicator with enhanced error handling
        """
        print(f"\nFetching {self.INDICATORS[indicator]}...")
        
        url = f"{self.base_url}/country/all/indicator/{indicator}"
        params = {
            'format': 'json',
            'date': f"{min(self.years)}:{max(self.years)}",
            'per_page': 1000
        }
        
        for attempt in range(retries):
            try:
                response = requests.get(url, params=params)
                response.raise_for_status()
                
                data = response.json()
                if len(data) < 2 or not data[1]:
                    print(f"No data available for {indicator}")
                    return None
                
                # Create DataFrame and clean data
                df = pd.json_normalize(data[1])
                df = df[['country.value', 'date', 'value']]
                df.columns = ['country', 'year', indicator]
                
                # Convert to numeric and handle errors
                df[indicator] = pd.to_numeric(df[indicator], errors='coerce')
                
                # Remove obvious outliers using IQR method
                Q1 = df[indicator].quantile(0.25)
                Q3 = df[indicator].quantile(0.75)
                IQR = Q3 - Q1
                df = df[
                    (df[indicator] >= Q1 - 3 * IQR) & 
                    (df[indicator] <= Q3 + 3 * IQR)
                ]
                
                # Basic statistics for quality check
                stats = df[indicator].describe()
                print(f"Data collected: {len(df)} records")
                print(f"Value range: {stats['min']:.2f} to {stats['max']:.2f}")
                print(f"Mean value: {stats['mean']:.2f}")
                
                return df
                
            except Exception as e:
                print(f"Attempt {attempt + 1} failed: {str(e)}")
                if attempt < retries - 1:
                    time.sleep(2)
                else:
                    print(f"Failed to fetch {indicator} after {retries} attempts")
                    return None
    
    def clean_and_transform(self, df: pd.DataFrame) -> pd.DataFrame:
        """
        Clean and transform the collected data
        """
        print("\nCleaning and transforming data...")
        
        # Convert year to numeric
        df['year'] = pd.to_numeric(df['year'])
        
        # Filter relevant years
        df = df[df['year'].isin(self.years)]
        
        # Handle missing values by country
        for col in df.columns:
            if col not in ['country', 'year']:
                # Forward fill within each country group
                df[col] = df.groupby('country')[col].transform(
                    lambda x: x.fillna(method='ffill').fillna(method='bfill')
                )
        
        # Calculate additional metrics
        if 'EG.ELC.RNEW.ZS' in df.columns and 'EG.FEC.RNEW.ZS' in df.columns:
            df['renewable_adoption_score'] = (
                df['EG.ELC.RNEW.ZS'] * 0.6 + 
                df['EG.FEC.RNEW.ZS'] * 0.4
            )
        
        return df
    
    def generate_analysis_report(self, df: pd.DataFrame) -> Dict:
        """
        Generate basic analysis report
        """
        report = {
            'total_countries': df['country'].nunique(),
            'time_range': f"{df['year'].min()} - {df['year'].max()}",
            'data_completeness': {},
            'key_statistics': {}
        }
        
        # Calculate completeness for each indicator
        for col in df.columns:
            if col not in ['country', 'year']:
                completeness = (df[col].count() / len(df)) * 100
                report['data_completeness'][col] = f"{completeness:.1f}%"
        
        # Calculate key statistics
        numeric_cols = df.select_dtypes(include=[np.number]).columns
        for col in numeric_cols:
            if col != 'year':
                report['key_statistics'][col] = {
                    'mean': df[col].mean(),
                    'median': df[col].median(),
                    'std': df[col].std()
                }
        
        return report
    
    def collect_and_process(self) -> pd.DataFrame:
        """
        Main method to collect and process all data
        """
        self.create_directory_structure()
        print("Starting enhanced data collection...")
        
        # Collect data for each indicator
        dataframes = []
        for indicator in self.INDICATORS.keys():
            df = self.fetch_indicator_data(indicator)
            if df is not None:
                dataframes.append(df)
                time.sleep(1)
        
        if not dataframes:
            raise Exception("No data collected")
        
        # Merge all dataframes
        print("\nMerging datasets...")
        final_df = dataframes[0]
        for df in dataframes[1:]:
            final_df = final_df.merge(
                df,
                on=['country', 'year'],
                how='outer'
            )
        
        # Clean and transform data
        final_df = self.clean_and_transform(final_df)
        
        # Generate and save report
        report = self.generate_analysis_report(final_df)
        
        # Save data and report
        final_df.to_csv('data/processed/renewable_energy_data.csv', index=False)
        pd.DataFrame.from_dict(report, orient='index').to_csv('data/processed/data_report.csv')
        
        print("\nData collection completed!")
        print(f"Total countries: {report['total_countries']}")
        print(f"Time range: {report['time_range']}")
        print("\nData completeness:")
        for indicator, completeness in report['data_completeness'].items():
            print(f"{self.INDICATORS.get(indicator, indicator)}: {completeness}")
        
        return final_df

# Usage example
if __name__ == "__main__":
    collector = EnhancedRenewableCollector()
    data = collector.collect_and_process()

Starting enhanced data collection...

Fetching Renewable electricity output (% of total)...
Data collected: 455 records
Value range: 0.00 to 100.00
Mean value: 27.99

Fetching Renewable energy consumption (% of total)...
Data collected: 874 records
Value range: 0.00 to 84.90
Mean value: 26.91

Fetching Access to electricity (% of population)...
Data collected: 974 records
Value range: 26.18 to 100.00
Mean value: 85.19

Fetching Electric power consumption (kWh per capita)...
Data collected: 290 records
Value range: 89.04 to 10726.93
Mean value: 2620.14

Fetching GDP per capita (current US$)...
Data collected: 914 records
Value range: 352.60 to 46165.62
Mean value: 10021.05

Fetching GDP growth (annual %)...
Data collected: 958 records
Value range: -7.84 to 14.36
Mean value: 3.15

Fetching R&D expenditure (% of GDP)...
Data collected: 442 records
Value range: 0.03 to 3.43
Mean value: 1.40

Fetching Investment in energy projects (current US$)...
Data collected: 269 records
Value range: 50

  lambda x: x.fillna(method='ffill').fillna(method='bfill')
  lambda x: x.fillna(method='ffill').fillna(method='bfill')
  lambda x: x.fillna(method='ffill').fillna(method='bfill')
  lambda x: x.fillna(method='ffill').fillna(method='bfill')
  lambda x: x.fillna(method='ffill').fillna(method='bfill')
  lambda x: x.fillna(method='ffill').fillna(method='bfill')
  lambda x: x.fillna(method='ffill').fillna(method='bfill')
  lambda x: x.fillna(method='ffill').fillna(method='bfill')


In [8]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import os

class RenewableEnergyVisualizer:
    def __init__(self, data_path='data/raw/renewable_energy_data.csv'):
        # Create output directory if it doesn't exist
        os.makedirs('outputs', exist_ok=True)
        
        # Load and prepare data
        self.df = pd.read_csv(data_path)
        self.latest_year = self.df['year'].max()
        
        # Set default plotly template
        self.template = 'plotly_white'
        
    def create_global_trends(self):
        """Create global renewable energy adoption trends"""
        # Calculate global averages by year
        global_trends = self.df.groupby('year')[
            ['EG.ELC.RNEW.ZS', 'EG.FEC.RNEW.ZS']
        ].mean().reset_index()
        
        fig = go.Figure()
        
        fig.add_trace(go.Scatter(
            x=global_trends['year'],
            y=global_trends['EG.ELC.RNEW.ZS'],
            name='Renewable Electricity Output',
            mode='lines+markers',
            line=dict(color='#1f77b4')
        ))
        
        fig.add_trace(go.Scatter(
            x=global_trends['year'],
            y=global_trends['EG.FEC.RNEW.ZS'],
            name='Renewable Energy Consumption',
            mode='lines+markers',
            line=dict(color='#2ca02c')
        ))
        
        fig.update_layout(
            template=self.template,
            title='Global Renewable Energy Trends',
            xaxis_title='Year',
            yaxis_title='Percentage (%)',
            hovermode='x unified',
            legend=dict(
                orientation="h",
                yanchor="bottom",
                y=1.02,
                xanchor="right",
                x=1
            )
        )
        
        return fig
    
    def create_country_rankings(self):
        """Create top countries ranking visualization"""
        latest_data = self.df[self.df['year'] == self.latest_year]
        
        fig = make_subplots(
            rows=1, cols=2,
            subplot_titles=(
                'Top 15 Countries by Renewable Electricity Output',
                'Top 15 Countries by Renewable Energy Consumption'
            )
        )
        
        # Renewable Electricity Output
        top_electricity = latest_data.nlargest(15, 'EG.ELC.RNEW.ZS')
        fig.add_trace(
            go.Bar(
                x=top_electricity['country'],
                y=top_electricity['EG.ELC.RNEW.ZS'],
                name='Electricity Output',
                marker_color='#1f77b4'
            ),
            row=1, col=1
        )
        
        # Renewable Energy Consumption
        top_consumption = latest_data.nlargest(15, 'EG.FEC.RNEW.ZS')
        fig.add_trace(
            go.Bar(
                x=top_consumption['country'],
                y=top_consumption['EG.FEC.RNEW.ZS'],
                name='Energy Consumption',
                marker_color='#2ca02c'
            ),
            row=1, col=2
        )
        
        fig.update_layout(
            template=self.template,
            height=600,
            showlegend=False,
            title_text=f"Top Countries in Renewable Energy ({self.latest_year})"
        )
        
        # Update axes
        fig.update_xaxes(tickangle=45)
        
        return fig
    
    def create_gdp_renewable_relationship(self):
        """Create GDP vs Renewable Energy scatter plot"""
        latest_data = self.df[self.df['year'] == self.latest_year].copy()
        
        fig = px.scatter(
            latest_data,
            x='NY.GDP.PCAP.CD',
            y='EG.ELC.RNEW.ZS',
            hover_data=['country'],
            size='EG.FEC.RNEW.ZS',
            color='EG.FEC.RNEW.ZS',
            title=f'GDP per Capita vs Renewable Energy Output ({self.latest_year})',
            template=self.template
        )
        
        fig.update_layout(
            xaxis_title='GDP per Capita (USD)',
            yaxis_title='Renewable Electricity Output (%)',
            coloraxis_colorbar_title='Renewable Energy\nConsumption (%)'
        )
        
        return fig
    
    def create_regional_comparison(self):
        """Create regional comparison visualization"""
        regions = {
            'Europe': ['Germany', 'France', 'Spain', 'Italy', 'United Kingdom'],
            'North America': ['United States', 'Canada', 'Mexico'],
            'Asia': ['China', 'Japan', 'India', 'South Korea'],
            'Africa': ['South Africa', 'Egypt', 'Morocco', 'Kenya']
        }
        
        fig = go.Figure()
        
        for region, countries in regions.items():
            region_data = self.df[self.df['country'].isin(countries)]
            avg_data = region_data.groupby('year')['EG.ELC.RNEW.ZS'].mean()
            
            fig.add_trace(go.Scatter(
                x=avg_data.index,
                y=avg_data.values,
                name=region,
                mode='lines+markers'
            ))
        
        fig.update_layout(
            template=self.template,
            title='Regional Renewable Energy Adoption',
            xaxis_title='Year',
            yaxis_title='Average Renewable Electricity Output (%)',
            hovermode='x unified',
            legend=dict(
                orientation="h",
                yanchor="bottom",
                y=1.02,
                xanchor="right",
                x=1
            )
        )
        
        return fig

    def generate_dashboard(self):
        """Generate all visualizations and save them"""
        print("Generating visualizations...")
        
        # Create visualizations
        trends_fig = self.create_global_trends()
        rankings_fig = self.create_country_rankings()
        gdp_fig = self.create_gdp_renewable_relationship()
        regional_fig = self.create_regional_comparison()
        
        # Save visualizations
        trends_fig.write_html("outputs/global_trends.html")
        rankings_fig.write_html("outputs/country_rankings.html")
        gdp_fig.write_html("outputs/gdp_relationship.html")
        regional_fig.write_html("outputs/regional_comparison.html")
        
        print("Visualizations have been generated and saved to 'outputs' directory!")
        
        return {
            'trends': trends_fig,
            'rankings': rankings_fig,
            'gdp': gdp_fig,
            'regional': regional_fig
        }

# Usage example
if __name__ == "__main__":
    try:
        visualizer = RenewableEnergyVisualizer()
        dashboard = visualizer.generate_dashboard()
        print("Dashboard generated successfully!")
    except Exception as e:
        print(f"Error generating dashboard: {str(e)}")

Generating visualizations...
Visualizations have been generated and saved to 'outputs' directory!
Dashboard generated successfully!


In [9]:
import pandas as pd
import plotly.graph_objects as go

class RegionalAnalyzer:
    def __init__(self, data_path='data/raw/renewable_energy_data.csv'):
        self.df = pd.read_csv(data_path)
        
    def print_available_countries(self):
        """Print all available countries in the dataset"""
        print("Available countries:")
        for country in sorted(self.df['country'].unique()):
            print(f"- {country}")
    
    def create_regional_comparison(self):
        """Create regional comparison with verified country names"""
        # First, let's verify available countries
        available_countries = set(self.df['country'].unique())
        
        # Define regions with verified country names
        regions = {
            'Europe': [
                'Germany', 'France', 'Spain', 'Italy', 'United Kingdom',
                'Norway', 'Sweden', 'Denmark', 'Finland', 'Netherlands'
            ],
            'North America': [
                'United States', 'Canada', 'Mexico'
            ],
            'Asia': [
                'China', 'Japan', 'India', 'Korea, Rep.',
                'Indonesia', 'Thailand', 'Malaysia'
            ],
            'Africa': [
                'South Africa', 'Egypt, Arab Rep.', 'Morocco', 
                'Kenya', 'Nigeria', 'Ethiopia'
            ],
            'South America': [
                'Brazil', 'Argentina', 'Chile', 'Colombia', 
                'Peru', 'Uruguay'
            ]
        }
        
        # Verify and adjust country lists
        verified_regions = {}
        for region, countries in regions.items():
            verified_countries = [c for c in countries if c in available_countries]
            if verified_countries:  # Only include regions with available data
                verified_regions[region] = verified_countries
        
        # Create visualization
        fig = go.Figure()
        
        for region, countries in verified_regions.items():
            # Get data for countries in this region
            region_data = self.df[self.df['country'].isin(countries)]
            
            # Calculate average renewable energy percentage by year
            avg_data = region_data.groupby('year')['EG.ELC.RNEW.ZS'].mean().reset_index()
            
            # Add trace for this region
            fig.add_trace(go.Scatter(
                x=avg_data['year'],
                y=avg_data['EG.ELC.RNEW.ZS'],
                name=region,
                mode='lines+markers',
                hovertemplate=
                '<b>%{x}</b><br>' +
                '%{y:.1f}%<br>' +
                '<extra>' + region + '</extra>'
            ))
        
        # Update layout
        fig.update_layout(
            title={
                'text': 'Regional Renewable Energy Adoption Trends',
                'y':0.95,
                'x':0.5,
                'xanchor': 'center',
                'yanchor': 'top'
            },
            xaxis_title='Year',
            yaxis_title='Average Renewable Electricity Output (%)',
            hovermode='x unified',
            template='plotly_white',
            legend=dict(
                orientation="h",
                yanchor="bottom",
                y=1.02,
                xanchor="right",
                x=1
            ),
            margin=dict(t=100)
        )
        
        # Add range slider
        fig.update_xaxes(rangeslider_visible=True)
        
        return fig

# Usage
if __name__ == "__main__":
    analyzer = RegionalAnalyzer()
    
    # Print available countries for verification
    analyzer.print_available_countries()
    
    # Create and save the visualization
    fig = analyzer.create_regional_comparison()
    fig.write_html("outputs/regional_comparison.html")
    
    print("Regional comparison visualization has been generated!")

Available countries:
- Afghanistan
- Africa Eastern and Southern
- Africa Western and Central
- Albania
- Algeria
- American Samoa
- Andorra
- Angola
- Antigua and Barbuda
- Arab World
- Argentina
- Armenia
- Aruba
- Australia
- Austria
- Azerbaijan
- Bahamas, The
- Bahrain
- Bangladesh
- Barbados
- Belarus
- Belgium
- Belize
- Benin
- Bermuda
- Bhutan
- Bolivia
- Bosnia and Herzegovina
- Botswana
- Brazil
- British Virgin Islands
- Caribbean small states
- Central Europe and the Baltics
- Early-demographic dividend
- East Asia & Pacific
- East Asia & Pacific (IDA & IBRD countries)
- East Asia & Pacific (excluding high income)
- Euro area
- Europe & Central Asia
- Europe & Central Asia (IDA & IBRD countries)
- Europe & Central Asia (excluding high income)
- European Union
- Fragile and conflict affected situations
- Heavily indebted poor countries (HIPC)
- High income
- IBRD only
- IDA & IBRD total
- IDA blend
- IDA only
- IDA total
- Late-demographic dividend
- Latin America & Caribbe