# Human Development Index (HDI) Dataset by the UN
This is, I think, a fairly comprehensive UN dataset on the Human Development Index (HDI). You can find more about it here: https://hdr.undp.org/data-center/human-development-index#/indicies/HDI. \
I have also included in the repository the metadata file (HDR25_Composite_indices_metadata), in which you can find all the indicators and identifiers.\
Some of the indicators are quite interesting. For example, Gender Inequality has its own category, with these indicators:\
GII Rank	gii_rank\
Gender Inequality Index (value)	gii\
Maternal Mortality Ratio (deaths per 100,000 live births)	mmr\
Adolescent Birth Rate (births per 1,000 women ages 15-19)	abr\
Population with at least some secondary education, female (% ages 25 and older)	se_f\
Population with at least some secondary education, male (% ages 25 and older)	se_m\
Share of seats in parliament, female (% held by women)	pr_f\
Share of seats in parliament, male (% held by men)	pr_m\
Labour force participation rate, female (% ages 15 and older)	lfpr_f\
Labour force participation rate, male (% ages 15 and older)	lfpr_m\
The short code after each indicator name (e.g. `gii`, `mmr`) is the identifier used for that indicator in data queries.

In [13]:
import pandas as pd

class HDIData:
    """
    Load a wide-format HDI time-series CSV and query it in long format.

    The CSV is expected to hold one row per country and one column per
    ``<metric>_<year>`` pair (for example ``hdi_2020``).  On construction the
    file is read, column names are normalised (stripped, lower-cased, spaces
    replaced by underscores) and the selected year columns are melted into
    ``long_df`` with the columns ``iso3, country, region, value, metric, year``.

    Attributes:
    -----------
    df : pd.DataFrame
        The wide table as read from disk, with normalised column names.
    long_df : pd.DataFrame
        One row per (country, metric, year) combination.
    metric_prefixes : tuple of str, or None
        Column-name prefixes selected for reshaping.  ``None`` selects every
        column whose name ends in ``_<digits>``.
    """

    # Identifier columns carried through the reshape unchanged.
    ID_COLUMNS = ('iso3', 'country', 'region')

    # Prefixes reshaped by default.
    DEFAULT_METRIC_PREFIXES = (
        'hdi_', 'le_', 'eys_', 'mys_', 'gnipc_', 'gdi_', 'gii_', 'co2_prod_',
    )

    # Class-level fallback so the attribute exists on every instance.
    metric_prefixes = DEFAULT_METRIC_PREFIXES

    def __init__(self, filepath: str, encoding: str = "ISO-8859-1",
                 metric_prefixes=DEFAULT_METRIC_PREFIXES):
        """
        Initialize the HDIData object by loading a CSV file into a DataFrame.

        Parameters:
        -----------
        filepath : str
            Path to the CSV file.
        encoding : str
            File encoding passed to ``pd.read_csv`` (default: ISO-8859-1).
        metric_prefixes : str | iterable of str | None
            Column-name prefixes to reshape, e.g. ``('hdi_', 'mmr_')``.
            Pass ``None`` to reshape every column that ends in ``_<digits>``.
        """
        if metric_prefixes is not None:
            if isinstance(metric_prefixes, str):
                metric_prefixes = (metric_prefixes,)
            # Column names are lower-cased below, so compare in lower case.
            metric_prefixes = tuple(p.lower() for p in metric_prefixes)
        self.metric_prefixes = metric_prefixes

        self.df = pd.read_csv(filepath, encoding=encoding)

        # Standardize column names
        self.df.columns = self.df.columns.str.strip().str.lower().str.replace(' ', '_')

        # Convert wide-format year columns into long format
        self.long_df = self._reshape_long(self.df)

    def _reshape_long(self, df):
        """
        Convert wide-format ``<metric>_<year>`` columns into long format.

        Only columns that match ``self.metric_prefixes`` (or any column when
        it is ``None``) AND end in ``_<digits>`` are melted.  A column that
        matches a prefix but has no numeric suffix is skipped, because its
        suffix cannot be converted to a year.

        Returns:
        --------
        pd.DataFrame
            Columns ``iso3, country, region, value, metric, year``; empty
            (with those columns) when nothing can be reshaped.
        """
        id_vars = list(self.ID_COLUMNS)
        prefixes = self.metric_prefixes

        value_vars = []
        for col in df.columns:
            if not isinstance(col, str) or col in id_vars:
                continue
            stem, _, suffix = col.rpartition('_')
            has_year = bool(stem) and suffix.isascii() and suffix.isdigit()
            if has_year and (prefixes is None or col.startswith(prefixes)):
                value_vars.append(col)

        # Nothing to melt: return an empty frame with the usual columns
        # rather than failing in the string split below.
        if not value_vars or df.empty:
            empty = pd.DataFrame(columns=id_vars + ['value', 'metric', 'year'])
            empty['year'] = empty['year'].astype(int)
            return empty

        # Melt the wide columns into long format
        long_df = df.melt(id_vars=id_vars, value_vars=value_vars,
                          var_name='metric_year', value_name='value')

        # Split on the LAST underscore so metrics that themselves contain
        # underscores (e.g. 'gii_rank_2023') keep their full name.
        parts = long_df['metric_year'].str.rsplit('_', n=1, expand=True)
        long_df['metric'] = parts[0]
        long_df['year'] = parts[1].astype(int)

        return long_df.drop(columns='metric_year')

    def get_data(self, countries=None, years=None, metric=None):
        """
        Retrieve data for a given metric, list of countries, and/or years.

        Parameters:
        -----------
        countries : str | iterable of str | None
            Country name(s), matched case-insensitively.
        years : int | tuple | list-like | None
            A tuple is an inclusive range ``(start, end)`` (only the first two
            items are read); any other list-like (list, set, range, array) is
            a collection of individual years; a scalar is a single year.
        metric : str | None
            Metric identifier, e.g. ``'hdi'``; lower-cased and spaces are
            replaced by underscores before matching.

        Returns:
        --------
        pd.DataFrame
            A new, re-indexed frame; ``long_df`` itself is left untouched.
        """
        # Each filter below builds a new frame, so no up-front copy of the
        # whole long table is needed.
        df = self.long_df

        # Filter by countries
        if countries is not None:
            if isinstance(countries, str):
                countries = [countries]
            wanted = [c.lower() for c in countries]
            df = df[df['country'].str.lower().isin(wanted)]

        # Filter by years
        if years is not None:
            if isinstance(years, tuple):  # inclusive range
                start, end = years[0], years[1]
                df = df[(df['year'] >= start) & (df['year'] <= end)]
            elif pd.api.types.is_list_like(years):
                df = df[df['year'].isin(list(years))]
            else:  # single year
                df = df[df['year'] == years]

        # Filter by metric
        if metric is not None:
            metric = metric.lower().replace(' ', '_')
            df = df[df['metric'] == metric]

        return df.reset_index(drop=True)


In [24]:
# Load the HDI time-series CSV (the constructor also reshapes it to long format)
hdi = HDIData("HDR25_Composite_indices_complete_time_series.csv")

# Get HDI for Norway, 2010-2020 (a tuple of years is treated as an inclusive range)
norway_hdi = hdi.get_data(countries="Norway", years=(2010, 2020), metric="HDI")

# Gender Inequality Index (gii) for Nicaragua and Sweden, 2000-2020.
# NOTE: despite its name, `life_exp` holds GII values, not life expectancy.
life_exp = hdi.get_data(countries=["Nicaragua", "Sweden"], years=(2000, 2020), metric="gii")


In [25]:
# Print the rows stored in `life_exp` (the printed output shows metric "gii")
print(life_exp)

   iso3    country region  value metric  year
0   NIC  Nicaragua    LAC  0.569    gii  2000
1   SWE     Sweden    NaN  0.059    gii  2000
2   NIC  Nicaragua    LAC  0.573    gii  2001
3   SWE     Sweden    NaN  0.056    gii  2001
4   NIC  Nicaragua    LAC  0.526    gii  2002
5   SWE     Sweden    NaN  0.053    gii  2002
6   NIC  Nicaragua    LAC  0.521    gii  2003
7   SWE     Sweden    NaN  0.048    gii  2003
8   NIC  Nicaragua    LAC  0.524    gii  2004
9   SWE     Sweden    NaN  0.047    gii  2004
10  NIC  Nicaragua    LAC  0.521    gii  2005
11  SWE     Sweden    NaN  0.047    gii  2005
12  NIC  Nicaragua    LAC  0.528    gii  2006
13  SWE     Sweden    NaN  0.046    gii  2006
14  NIC  Nicaragua    LAC  0.515    gii  2007
15  SWE     Sweden    NaN  0.046    gii  2007
16  NIC  Nicaragua    LAC  0.506    gii  2008
17  SWE     Sweden    NaN  0.048    gii  2008
18  NIC  Nicaragua    LAC  0.488    gii  2009
19  SWE     Sweden    NaN  0.049    gii  2009
20  NIC  Nicaragua    LAC  0.478  

# Environmental Performance Index (EPI) - Yale
First, a quick look through the metadata file, which lists basically all the indicators. You can also look at the epi2024variables2024-12-11.csv file instead. The website has a nicer description, but for looking up indicator IDs I think this file is easier to use.
Website: https://epi.yale.edu/

In [None]:
import pandas as pd

class IndicatorCSVReader:
    """
    Read an indicator metadata CSV and filter it by type, variable or category.

    Column names are normalised on load (stripped, lower-cased, spaces
    replaced by underscores); all filters compare case-insensitively.
    """

    def __init__(self, filepath: str, encoding=None):
        """
        Initialize the reader for indicator metadata CSV files.

        Parameters:
        -----------
        filepath : str
            Path to the CSV file.
        encoding : str | None
            File encoding passed to ``pd.read_csv``.  ``None`` (the default)
            keeps pandas' own default encoding.
        """
        self.df = pd.read_csv(filepath, encoding=encoding)

        # Standardize column names
        self.df.columns = self.df.columns.str.strip().str.lower().str.replace(' ', '_')

    @staticmethod
    def _lowered(values):
        """Return *values* (a string or an iterable of strings) as a lower-cased list."""
        if isinstance(values, str):
            values = [values]
        return [v.lower() for v in values]

    def get_data(self, types=None, variables=None, categories=None):
        """
        Filter indicator metadata by type, variable, or category.

        Parameters:
        -----------
        types : str | list[str] | None
            Filter by the ``type`` column.
        variables : str | list[str] | None
            Filter by the ``variable`` column (long name) or, when the table
            has one, the ``abbreviation`` column (short id such as 'BDH').
        categories : str | list[str] | None
            Filter by the ``issuecategory`` or ``policyobjective`` column.

        Returns:
        --------
        pd.DataFrame
            Filtered metadata as a new, re-indexed frame.  Filters are
            combined with AND; with no filters every row is returned.
        """
        # Each filter below builds a new frame, so the stored table is never
        # modified and does not need to be copied first.
        df = self.df

        # Filter by Type
        if types is not None:
            wanted = self._lowered(types)
            df = df[df['type'].str.lower().isin(wanted)]

        # Filter by Variable: accept either the long name or the abbreviation,
        # so that a short id like 'BDH' finds its row.
        if variables is not None:
            wanted = self._lowered(variables)
            hit = df['variable'].str.lower().isin(wanted)
            if 'abbreviation' in df.columns:
                hit = hit | df['abbreviation'].str.lower().isin(wanted)
            df = df[hit]

        # Filter by IssueCategory or PolicyObjective
        if categories is not None:
            wanted = self._lowered(categories)
            df = df[
                df['issuecategory'].str.lower().isin(wanted) |
                df['policyobjective'].str.lower().isin(wanted)
            ]

        return df.reset_index(drop=True)


In [56]:
reader = IndicatorCSVReader('epi2024variables2024-12-11.csv')

# No filters are passed here, so this returns every row of the metadata table
# (use types='EPI' to keep only the EPI rows)
df_epi = reader.get_data()
print(df_epi)

# Get all indicators under "Ecosystem Vitality"
df_eco = reader.get_data(categories='Ecosystem Vitality')
print(df_eco)

# Get a specific variable
# NOTE(review): in the printed table 'BDH' appears in the `abbreviation` column while
# `variable` holds long names — confirm this lookup matches the column you intend.
df_bdh = reader.get_data(variables='BDH')
print(df_bdh)


               type abbreviation  \
0               EPI          EPI   
1   PolicyObjective          ECO   
2     IssueCategory          BDH   
3         Indicator          MKP   
4         Indicator          MHP   
..              ...          ...   
68        Indicator          LUF   
69        Indicator          GTI   
70        Indicator          GTP   
71        Indicator          GHN   
72        Indicator          CBP   

                                             variable  \
0                     Environmental Performance Index   
1                                  Ecosystem Vitality   
2                              Biodiversity & Habitat   
3                               Marine KBA Protection   
4                           Marine Habitat Protection   
..                                                ...   
68         Net carbon fluxes due to land cover change   
69    GHG growth rate adjusted by emissions intensity   
70   GHG growth rate adjusted by per capita emissions 

Next, we retrieve a df depending on the country, variable, and years:

In [46]:
import pandas as pd
import os

def load_indicators(variables, countries=None, years=None, folder_path='P5_Indicator'):
    """
    Load environmental/social indicators from CSV files and filter by country/year.

    For every variable the file ``<folder_path>/<VAR>_ind_na.csv`` is read,
    its ``<var>.ind.<year>`` columns are melted into long format, and the
    per-variable tables are stacked before the filters are applied.

    Parameters:
    -----------
    variables : str | list[str]
        Variable abbreviations, e.g., 'BCA' or ['BCA', 'BER'].
    countries : str | list[str] | None
        Country or list of countries to filter (case-insensitive).
    years : int | tuple(int, int) | list-like of int | None
        A tuple is an inclusive range (start, end); any other list-like
        (list, set, range, array) is a collection of individual years;
        a scalar is a single year.
    folder_path : str
        Path to the folder containing CSV files.

    Returns:
    --------
    pd.DataFrame
        Long-format DataFrame with columns: iso, country, value, year, variable.
        Empty (with those columns) when ``variables`` is empty.

    Raises:
    -------
    FileNotFoundError
        If the CSV file for one of the variables does not exist.
    """
    if isinstance(variables, str):
        variables = [variables]

    frames = []

    for var in variables:
        filename = os.path.join(folder_path, f"{var}_ind_na.csv")
        if not os.path.exists(filename):
            raise FileNotFoundError(f"File {filename} not found.")

        df = pd.read_csv(filename)

        # Standardize columns
        df.columns = df.columns.str.strip().str.lower().str.replace(' ', '_')

        # Identify year columns for this variable (names were lower-cased above)
        prefix = var.lower() + '.ind.'
        year_cols = [col for col in df.columns if col.startswith(prefix)]

        # Melt wide -> long
        long_df = df.melt(id_vars=['iso', 'country'], value_vars=year_cols,
                          var_name='metric_year', value_name='value')

        # The year is the last dot-separated piece of the column name
        long_df['year'] = long_df['metric_year'].str.split('.').str[-1].astype(int)
        long_df['variable'] = var

        frames.append(long_df.drop(columns='metric_year'))

    # pd.concat refuses an empty list, so give "no variables" a defined result.
    if not frames:
        return pd.DataFrame(columns=['iso', 'country', 'value', 'year', 'variable'])

    # Combine multiple variables
    result = pd.concat(frames, ignore_index=True)

    # Filter countries
    if countries is not None:
        if isinstance(countries, str):
            countries = [countries]
        wanted = [c.lower() for c in countries]
        result = result[result['country'].str.lower().isin(wanted)]

    # Filter years
    if years is not None:
        if isinstance(years, tuple):  # inclusive range
            start, end = years[0], years[1]
            result = result[(result['year'] >= start) & (result['year'] <= end)]
        elif pd.api.types.is_list_like(years):
            result = result[result['year'].isin(list(years))]
        else:  # single year
            result = result[result['year'] == years]

    return result.reset_index(drop=True)


In [47]:
# Load BCA indicator for Afghanistan and Albania for 2000-2005
# (a tuple of years is treated as an inclusive range)
df = load_indicators(variables='BCA', countries=['Afghanistan','Albania'], years=(2000,2005))
print(df.head())

# Load multiple variables for all countries in 2010 (a single int selects exactly that year)
df2 = load_indicators(variables=['BCA','BER'], years=2010)
print(df2.head())


   iso      country  value  year variable
0  AFG  Afghanistan   42.1  2000      BCA
1  ALB      Albania   30.0  2000      BCA
2  AFG  Afghanistan   41.9  2001      BCA
3  ALB      Albania    0.0  2001      BCA
4  AFG  Afghanistan   41.6  2002      BCA
   iso      country  value  year variable
0  AFG  Afghanistan   15.6  2010      BCA
1  ALB      Albania   42.3  2010      BCA
2  DZA      Algeria   15.0  2010      BCA
3  AND      Andorra    NaN  2010      BCA
4  AGO       Angola   12.1  2010      BCA
