## Setup

In [None]:
import pandas as pd
from pathlib import Path
import pandas as pd
import time
from typing import List, Dict, Optional, Tuple

import json
import re
import unicodedata
from glob import glob
import numpy as np
from glob import glob


from datetime import datetime
def normalize_text(text):
    """Return ``text`` as a trimmed, lowercase string without combining marks.

    The input is converted with ``str()``, decomposed with Unicode NFKD, and
    every combining character (accents, diacritics) is dropped before the
    result is stripped of surrounding whitespace and lowercased.
    """
    nfkd_form = unicodedata.normalize('NFKD', str(text))
    # combining() returns 0 for base characters; anything else is a mark.
    base_chars = filter(lambda ch: unicodedata.combining(ch) == 0, nfkd_form)
    return ''.join(base_chars).strip().lower()

def gather_survey_year_pip_data(
    country_list: pd.DataFrame,
    pip_data: pd.DataFrame,
    poverty_line
) -> pd.DataFrame:
    """
    Look up PIP poverty rate and poverty gap for each country's survey year.

    For every row of ``country_list`` the PIP record for the same country and
    year is used when it exists; otherwise the values come from
    ``get_interpolated_poverty_estimate`` (interpolation between neighbouring
    years, or the nearest year).

    Args:
        country_list: DataFrame with columns 'country_code' and 'survey_year'
            (the year must be convertible with ``int``).
        pip_data: DataFrame with columns 'country_code', 'reporting_year',
            'headcount' and 'poverty_gap'.
        poverty_line: Accepted for call-site compatibility but not used here;
            ``pip_data`` is used as given, without any filtering.

    Returns:
        A new DataFrame (same index as ``country_list``) with columns
        'country_code', 'survey_year', 'wb_poverty_rate_survey_year',
        'interpolation_method_rate_survey_year' and
        'wb_poverty_gap_index_survey_year'. Countries absent from ``pip_data``
        keep NaN values and the method 'no data available'.
    """
    # Work on a copy so the caller's frame is never modified.
    result = country_list[['country_code', 'survey_year']].copy()
    result['wb_poverty_rate_survey_year'] = np.nan
    result['interpolation_method_rate_survey_year'] = ''
    # Create the gap column up front so it exists even when no country has
    # PIP data (otherwise it would only appear on the first successful lookup).
    result['wb_poverty_gap_index_survey_year'] = np.nan

    # Group pip data by country for efficient lookup
    pip_by_country = pip_data.groupby('country_code')

    for idx, row in result.iterrows():
        country_code = row['country_code']
        target_year = int(row['survey_year'])  # survey_year may arrive as a string

        if country_code not in pip_by_country.groups:
            result.loc[idx, 'interpolation_method_rate_survey_year'] = 'no data available'
            continue

        country_data = pip_by_country.get_group(country_code).copy()
        country_data = country_data.sort_values('reporting_year')

        exact_match = country_data[country_data['reporting_year'] == target_year]
        if not exact_match.empty:
            poverty_rate = exact_match.iloc[0]['headcount']
            poverty_gap = exact_match.iloc[0]['poverty_gap']
            estimate_type = 'exact year'
        else:
            # No exact match - interpolate or fall back to the nearest year
            poverty_rate, poverty_gap, estimate_type = (
                get_interpolated_poverty_estimate(country_data, target_year)
            )

        result.loc[idx, 'wb_poverty_rate_survey_year'] = poverty_rate
        result.loc[idx, 'wb_poverty_gap_index_survey_year'] = poverty_gap
        result.loc[idx, 'interpolation_method_rate_survey_year'] = estimate_type

    return result

def get_interpolated_poverty_estimate(
    country_data: pd.DataFrame, target_year: int
) -> Tuple[Optional[float], Optional[float], str]:
    """
    Estimate poverty rate and poverty gap for a year without an exact record.

    Rows lacking 'reporting_year' or 'headcount' are ignored. If usable years
    exist on both sides of ``target_year`` the closest earlier and later years
    are linearly interpolated; otherwise the values of the nearest year are
    returned (no extrapolation).

    Args:
        country_data: PIP data for a single country with columns
            'reporting_year', 'headcount' and 'poverty_gap'. Not modified.
        target_year: Year for which to estimate the values.

    Returns:
        Tuple of (poverty_rate, poverty_gap, estimate_type). The first two are
        ``None`` and estimate_type is 'no data available' when no usable rows
        remain.
    """
    # Filter into a new frame instead of dropping rows in place, so the
    # caller's DataFrame is left untouched.
    usable = country_data.dropna(subset=['reporting_year', 'headcount'])
    if usable.empty:
        return None, None, 'no data available'

    target_year = int(target_year)
    years = usable['reporting_year'].astype(int).values
    rates = usable['headcount'].values
    gaps = usable['poverty_gap'].values

    is_earlier = years < target_year
    is_later = years > target_year

    # Case 1: years on both sides -> linear interpolation
    if is_earlier.any() and is_later.any():
        year_before = years[is_earlier].max()
        year_after = years[is_later].min()

        # argmax on a boolean mask gives the position of the first row for
        # that year, i.e. the same row a filtered ``.iloc[0]`` would select.
        pos_before = int(np.argmax(years == year_before))
        pos_after = int(np.argmax(years == year_after))

        weight = (target_year - year_before) / (year_after - year_before)
        interpolated_rate = rates[pos_before] + weight * (rates[pos_after] - rates[pos_before])
        interpolated_gap = gaps[pos_before] + weight * (gaps[pos_after] - gaps[pos_before])

        estimate_type = f'interpolated using {year_before} and {year_after}'
        return interpolated_rate, interpolated_gap, estimate_type

    # Case 2: only one side available - use the nearest year instead of
    # extrapolating (ties resolve to the first row in the given order).
    nearest_idx = np.argmin(np.abs(years - target_year))
    nearest_year = years[nearest_idx]

    estimate_type = f'from nearest year: {nearest_year}'
    return rates[nearest_idx], gaps[nearest_idx], estimate_type

# E: don't need it in this code
# Lookup from underscore-separated directory-style keys to lowercase country
# names (presumably survey folder names -> PIP-style names; verify before use).
survey_directory_to_country_name_map = dict(
    burkina_faso='burkina faso',
    cote_divoire="côte d'ivoire",
    south_africa='south africa',
    south_sudan='south sudan',
    yemen='yemen, rep.',
)



# All countries

In [None]:
# Load the two PIP extracts (one file per poverty-line vintage); the
# reporting year is parsed as an integer in both.
pip_data_povertyline_2017, pip_data_povertyline_2021 = [
    pd.read_csv(csv_path, dtype={'reporting_year': int})
    for csv_path in (
        'data/eop/compiled_country_data/pip_2017_20251024.csv',
        'data/eop/compiled_country_data/pip_2021_20251024.csv',
    )
]


In [None]:
def get_latest_value_with_year(df, value_col):
    """Return, per country, the most recent non-missing ``value_col`` and its year.

    Args:
        df: DataFrame with columns 'country_code', 'reporting_year' and
            ``value_col``.
        value_col: Name of the column whose latest value is wanted.

    Returns:
        DataFrame with one row per 'country_code' and the columns
        'latest_<value_col>' and 'year'. Both are NaN for a country whose
        ``value_col`` is missing in every year.
    """
    def latest_func(group):
        # Only rows that actually carry a value are candidates; otherwise a
        # NaN in the newest year would hide usable data from earlier years.
        observed = group[group[value_col].notna()]
        if observed.empty:
            return pd.Series({
                f'latest_{value_col}': np.nan,
                'year': np.nan
            })
        idx = observed['reporting_year'].idxmax()
        return pd.Series({
            f'latest_{value_col}': observed.loc[idx, value_col],
            'year': observed.loc[idx, 'reporting_year']
        })
    return df.groupby('country_code').apply(latest_func).reset_index()

# Latest headcount and poverty gap per country, for each PIP extract.
latest_poverty_rate_povertyline_2017, latest_poverty_gap_index_povertyline_2017 = (
    get_latest_value_with_year(pip_data_povertyline_2017, value_col)
    for value_col in ('headcount', 'poverty_gap')
)
latest_poverty_rate_povertyline_2021, latest_poverty_gap_index_povertyline_2021 = (
    get_latest_value_with_year(pip_data_povertyline_2021, value_col)
    for value_col in ('headcount', 'poverty_gap')
)

# Start the country superset from the 2017-line headcount table and attach
# one country name per code, taken from the same PIP extract.
_country_names_2017 = pip_data_povertyline_2017[['country_code', 'country_name']].drop_duplicates(
    subset='country_code'
)
superset = latest_poverty_rate_povertyline_2017.merge(
    _country_names_2017, on='country_code', how='left'
)
superset = superset.rename(columns={'country_name': 'country'})
# Names are normalized (lowercase, no accents) for later name-based merges.
superset['country'] = superset['country'].apply(normalize_text)


In [None]:

# Give each "latest value" table unique, descriptive column names (renamed in
# place), then attach all four tables to the country list by country code.
_latest_tables = (
    (latest_poverty_rate_povertyline_2017, {
        'latest_headcount': 'wb_poverty_rate_povertyline_2017_most_recent',
        'year': 'wb_poverty_rate_povertyline_2017_most_recent_year',
    }),
    (latest_poverty_gap_index_povertyline_2017, {
        'latest_poverty_gap': 'wb_poverty_gap_index_povertyline_2017_most_recent',
        'year': 'wb_poverty_gap_index_povertyline_2017_most_recent_year',
    }),
    (latest_poverty_rate_povertyline_2021, {
        'latest_headcount': 'wb_poverty_rate_povertyline_2021_most_recent',
        'year': 'wb_poverty_rate_povertyline_2021_most_recent_year',
    }),
    (latest_poverty_gap_index_povertyline_2021, {
        'latest_poverty_gap': 'wb_poverty_gap_index_povertyline_2021_most_recent',
        'year': 'wb_poverty_gap_index_povertyline_2021_most_recent_year',
    }),
)
for _latest_table, _new_names in _latest_tables:
    _latest_table.rename(columns=_new_names, inplace=True)

# Keep only the identifying columns, then left-merge each table in turn.
superset = superset[['country_code', 'country']]
for _latest_table, _ in _latest_tables:
    superset = superset.merge(_latest_table, on='country_code', how='left')

In [None]:
# Total population: the first 4 lines of the World Bank CSV are skipped, and
# the 2023 column is exposed under an explicit name.
population_data = pd.read_csv(
    'data/eop/compiled_country_data/world_bank_data_download_20250708/API_SP.POP.TOTL_DS2_en_csv_v2_38144.csv',
    skiprows=4
).rename(columns={'Country Code': 'country_code', '2023': 'total_population_2023'})

_population_2023 = population_data[['country_code', 'total_population_2023']]
superset = pd.merge(superset, _population_2023, on='country_code', how='left')

# Data for survey countries

In [None]:
# Survey metadata is read entirely as strings; its own 'country' column is
# dropped so the merge below relies on 'country_code' only.
survey_countries = pd.read_csv(
    'data/eop/compiled_country_data/survey_countries.csv',
    dtype=str
)
survey_countries = survey_countries.drop(columns='country')
# Keep only rows flagged as in use ('using' holds the literal text 'True').
_in_use = survey_countries['using'] == 'True'
survey_countries = survey_countries.loc[_in_use].drop(columns='using')

accumulated_data = superset.copy()  # checkpoint
# E: somalia is not in PIP
accumulated_data = pd.merge(accumulated_data, survey_countries, on='country_code', how='left')

In [None]:
# Add poverty rates and gaps
# Add poverty rates and gaps for the survey year, once per PIP extract.
# Only countries that have a survey year are looked up.
_surveyed_countries = accumulated_data[accumulated_data.survey_year.notna()]

survey_year_poverty_data_povertyline_2017 = (
    gather_survey_year_pip_data(_surveyed_countries, pip_data_povertyline_2017, 2.15)
    .drop(columns='survey_year')
    .rename(columns={
        'wb_poverty_rate_survey_year': 'wb_poverty_rate_povertyline_2017_survey_year',
        'wb_poverty_gap_index_survey_year': 'wb_poverty_gap_index_povertyline_2017_survey_year',
        'interpolation_method_rate_survey_year': 'interpolation_method_rate_povertyline_2017_survey_year',
    })
)

survey_year_poverty_data_povertyline_2021 = (
    gather_survey_year_pip_data(_surveyed_countries, pip_data_povertyline_2021, 3.0)
    .drop(columns='survey_year')
    .rename(columns={
        'wb_poverty_rate_survey_year': 'wb_poverty_rate_povertyline_2021_survey_year',
        'wb_poverty_gap_index_survey_year': 'wb_poverty_gap_index_povertyline_2021_survey_year',
        'interpolation_method_rate_survey_year': 'interpolation_method_rate_povertyline_2021_survey_year',
    })
)

# Attach both result tables to the running dataset.
for _survey_year_table in (
    survey_year_poverty_data_povertyline_2017,
    survey_year_poverty_data_povertyline_2021,
):
    accumulated_data = accumulated_data.merge(
        _survey_year_table, on='country_code', how='left'
    )

In [None]:
# Population in each country's survey year: reload the population table,
# pair every country with its survey year, and read the matching year column.
population_data = (
    pd.read_csv(
        'data/eop/compiled_country_data/world_bank_data_download_20250708/API_SP.POP.TOTL_DS2_en_csv_v2_38144.csv',
        skiprows=4
    )
    .dropna(axis=1, how='all')  # E: drop columns that are all NaN
    .rename(columns={'Country Code': 'country_code'})
    .merge(
        accumulated_data[['country_code', 'survey_year']],
        on='country_code', how='inner'  # E: Taiwan is lost here
    )
)

# survey_year doubles as the column label of the year to read; countries
# without a survey year are skipped.
population_list = [
    {
        'country_code': row.country_code,
        'total_population_survey_year': row[row.survey_year],
    }
    for _, row in population_data.iterrows()
    if not pd.isna(row.survey_year)
]
population = pd.DataFrame(population_list)

accumulated_data = pd.merge(accumulated_data, population, on='country_code', how='left')

In [None]:
# Multi-series World Bank export; it is filtered by 'Series Code' further down.
composite_wb_data = pd.read_csv(
    filepath_or_buffer='data/eop/compiled_country_data/world_bank_data_download_20250627/ba42da22-4cc7-4d46-86ef-7fc5dcd2ac5e_Data.csv',
)


def rename_year_columns(col):
    """Shorten a 'YYYY [YRYYYY]' column label to 'YYYY'.

    Labels that do not have exactly that shape are returned unchanged.
    """
    found = re.match(r'^(\d{4}) \[YR\d{4}\]$', col)
    return col if found is None else found.group(1)

# Shorten the year headers, switch to the snake_case identifiers used in the
# rest of the notebook, and drop the series description column.
composite_wb_data.columns = list(map(rename_year_columns, composite_wb_data.columns))
composite_wb_data = composite_wb_data.rename(columns={
    'Country Name': 'country',
    'Country Code': 'country_code',
    'Series Code': 'series_code'
})
composite_wb_data = composite_wb_data.drop(columns='Series Name')

# Year columns in ascending order; entries that cannot be parsed as numbers
# become NaN.
year_cols = sorted(col for col in composite_wb_data.columns if re.match(r'^\d{4}$', col))
composite_wb_data[year_cols] = composite_wb_data[year_cols].apply(pd.to_numeric, errors='coerce')

In [None]:
# ODA rows (series DT.ODA.ALLD.KD), rescaled by the ratio of US CPI in 2023
# to US CPI in 2021.
oda_raw = composite_wb_data[composite_wb_data['series_code'] == 'DT.ODA.ALLD.KD'].copy()

cpi = pd.read_csv(
    'data/eop/compiled_country_data/world_bank_data_download_20251117/API_FP.CPI.TOTL_DS2_en_csv_v2_216045.csv',
    skiprows=4,
)
_usa_cpi = cpi.loc[cpi['Country Code'] == 'USA']
us_cpi_2021 = _usa_cpi['2021'].values[0]
us_cpi_2023 = _usa_cpi['2023'].values[0]

oda_raw[year_cols] = oda_raw[year_cols] * (us_cpi_2023 / us_cpi_2021)

# For each country keep the latest year with a value (year_cols is ascending,
# so the last reported entry is the most recent one).
oda_list = []
for _, row in oda_raw.iterrows():
    reported_years = [year for year in year_cols if pd.notna(row[year])]
    if reported_years:
        latest_year = reported_years[-1]
        latest_value = row[latest_year] / 1e9  # Units of billions of USD
    else:
        latest_year = None
        latest_value = np.nan
    oda_list.append({
        'country_code': row.country_code,
        'ODA_most_recent_year': latest_year,
        'ODA_most_recent': latest_value,
    })
oda = pd.DataFrame(oda_list)
accumulated_data = pd.merge(accumulated_data, oda, on='country_code', how='left')

In [None]:
# GDP (series NY.GDP.MKTP.KD) in each country's survey year, rescaled by the
# ratio of US CPI in 2023 to US CPI in 2015 and expressed in billions.
gdp_raw = pd.read_csv(
    'data/eop/compiled_country_data/world_bank_data_download_20250628/API_NY.GDP.MKTP.KD_DS2_en_csv_v2_127117.csv',
    skiprows=4,
).rename(columns={'Country Code': 'country_code'})

cpi = pd.read_csv(
    'data/eop/compiled_country_data/world_bank_data_download_20251117/API_FP.CPI.TOTL_DS2_en_csv_v2_216045.csv',
    skiprows=4,
)
_usa_cpi = cpi.loc[cpi['Country Code'] == 'USA']
us_cpi_2015 = _usa_cpi['2015'].values[0]
us_cpi_2023 = _usa_cpi['2023'].values[0]

# One row per surveyed country, carrying its survey year next to the GDP series.
_surveyed = accumulated_data.loc[
    accumulated_data.survey_year.notna(), ['country_code', 'survey_year']
]
gdp_raw = _surveyed.merge(gdp_raw, on='country_code', how='left')

# survey_year is also the label of the GDP column to read for that row.
gdp_list = [
    {
        'country_code': row.country_code,
        'GDP_survey_year': row[row.survey_year] * (us_cpi_2023 / us_cpi_2015) / 1e9,
    }
    for _, row in gdp_raw.iterrows()
]
gdp = pd.DataFrame(gdp_list)
accumulated_data = pd.merge(accumulated_data, gdp, on='country_code', how='left')

In [None]:
# Government revenue (% of GDP) in each country's survey year, from the IMF
# export. This table is matched by normalized country name, not by code.
revenue_data_raw = pd.read_csv(
    'data/eop/compiled_country_data/imf_data_download_20250714/imf-dm-export-20250714.csv'
)
# The column headed 'Revenue (% of GDP)' is the one used as the country name.
revenue_data_raw.rename(columns={'Revenue (% of GDP)': 'country'}, inplace=True)
revenue_data_raw.country = revenue_data_raw.country.str.lower()

revenue_data_raw.dropna(subset=['country'], inplace=True)
# Harmonise IMF spellings with the names used in accumulated_data.
# Assign the result back instead of calling replace(..., inplace=True) on the
# column: that chained in-place call does not update the DataFrame when
# pandas Copy-on-Write is active.
revenue_data_raw['country'] = revenue_data_raw['country'].replace({
    'congo, dem. rep. of the': 'congo, dem. rep.',
    'south sudan, republic of': 'south sudan',
    'yemen': 'yemen, rep.'
})
revenue_data_raw.country = revenue_data_raw.country.apply(normalize_text)

# One row per surveyed country, carrying its survey year next to the series.
revenue_data_raw = accumulated_data.loc[
    accumulated_data.survey_year.notna(), ['country', 'survey_year']
].merge(revenue_data_raw, on='country', how='left')

# survey_year is also the label of the year column to read for that row.
revenue_list = []
for _, row in revenue_data_raw.iterrows():
    result = {
        'country': row.country,
        'government_revenue_percentage_survey_year': row[row.survey_year]
    }
    revenue_list.append(result)
revenue = pd.DataFrame(revenue_list)

accumulated_data = accumulated_data.merge(revenue, on='country', how='left')

In [None]:
# Switch for the optional "share of the world's poor" cells. It is off, so
# none of the guarded cells below run.
include_poverty_shares = False # E: if true, it will not work due to 'wb_poverty_rate_most_recent', 'wb_poverty_rate_most_recent_year' not being in accumulated_data
if include_poverty_shares:
    # NOTE(review): this path is absolute ('/data/...') while the active cells
    # read from the relative 'data/...' tree - confirm which one is intended.
    poverty_rate_data_from_data_portal = pd.read_csv(
        '/data/eop/compiled_country_data/world_bank_data_download_20250714/API_SI.POV.DDAY_DS2_en_csv_v2_38376.csv',
        skiprows=4
    )


    # Set the World row's 2019 value to the mean of its 2018 and 2020 values.
    poverty_rate_data_from_data_portal.loc[poverty_rate_data_from_data_portal['Country Name'] == 'World', '2019'] = (
        poverty_rate_data_from_data_portal.loc[poverty_rate_data_from_data_portal['Country Name'] == 'World', '2018']
        + poverty_rate_data_from_data_portal.loc[poverty_rate_data_from_data_portal['Country Name'] == 'World', '2020']
    ) / 2



    # Set the World row's 2024 value by linear extrapolation from 2022 and
    # 2023 (2 * value_2023 - value_2022).
    poverty_rate_data_from_data_portal.loc[poverty_rate_data_from_data_portal['Country Name'] == 'World', '2024'] = (
        2 * poverty_rate_data_from_data_portal.loc[poverty_rate_data_from_data_portal['Country Name'] == 'World', '2023']
        - poverty_rate_data_from_data_portal.loc[poverty_rate_data_from_data_portal['Country Name'] == 'World', '2022']
    )

In [None]:
if include_poverty_shares:
    population_data = pd.read_csv(
        'data/eop/compiled_country_data/world_bank_data_download_20250708/API_SP.POP.TOTL_DS2_en_csv_v2_38144.csv',
        skiprows=4
    )

    # Year columns present in both the poverty-rate and the population tables
    year_columns = [
        col for col in poverty_rate_data_from_data_portal.columns
        if col in population_data.columns and col.isdigit()
    ]

    # 'World' rows of both tables, selected once instead of once per year
    world_rate_rows = poverty_rate_data_from_data_portal.loc[
        poverty_rate_data_from_data_portal['Country Name'] == 'World'
    ]
    world_population_rows = population_data.loc[population_data['Country Name'] == 'World']

    # World poverty count per year: rate (in percent) times population,
    # NaN when either input is missing.
    world_poverty_count = []
    for year in year_columns:
        rate = world_rate_rows[year].values[0]
        pop = world_population_rows[year].values[0]
        if pd.notna(rate) and pd.notna(pop):
            poverty_count = rate * pop / 100
        else:
            poverty_count = np.nan
        world_poverty_count.append({'year': int(year), 'world_poverty_count': poverty_count})

    world_poverty_count_df = pd.DataFrame(world_poverty_count)

In [None]:
# Share of the world's poor per country, computed twice: once from the PIP
# poverty rates held in accumulated_data and once from the development
# indicators table. Guarded by include_poverty_shares.
if include_poverty_shares:
    population_raw = pd.read_csv(
        'data/eop/compiled_country_data/world_bank_data_download_20250708/API_SP.POP.TOTL_DS2_en_csv_v2_38144.csv',
        skiprows=4
    )
    population_raw = population_raw.rename(columns={'Country Code': 'country_code'})

    # first, using pip poverty rates
    # NOTE(review): no earlier cell in this file creates the
    # 'wb_poverty_rate_most_recent*' columns selected here (see the comment on
    # the include_poverty_shares flag), so this selection would fail as written.
    population_data = (
        accumulated_data[['country_code', 'survey_year', 'wb_poverty_rate_most_recent', 'wb_poverty_rate_most_recent_year']]
        .merge(population_raw, on='country_code', how='left')
    )
    population_list = []
    for _, row in population_data.iterrows():
        # Headcount = poverty rate x population in the year of that rate; the
        # year (as a string) is also the label of the population column.
        result = {
            'country_code': row.country_code,
            'share_of_worlds_poor_most_recent_year': str(int(row.wb_poverty_rate_most_recent_year)),
            'poverty_headcount': (
                row.wb_poverty_rate_most_recent * row[str(int(row.wb_poverty_rate_most_recent_year))]
            ),
        }
        population_list.append(result)
    share_of_worlds_poor_pip = pd.DataFrame(population_list)

    def compute_share_of_world_poor(row):
        # Divide the country's headcount by the world poverty count of the same year.
        year = row.share_of_worlds_poor_most_recent_year
        world_poverty_headcount = world_poverty_count_df.loc[
            world_poverty_count_df.year.astype(str) == year, 'world_poverty_count'
        ].values[0]
        return row.poverty_headcount / world_poverty_headcount

    share_of_worlds_poor_pip['share_of_worlds_poor_most_recent'] = share_of_worlds_poor_pip.apply(compute_share_of_world_poor, axis=1)


    # now using dev indicators poverty rates
    year_cols = [col for col in poverty_rate_data_from_data_portal.columns if re.match(r'^\d{4}$', col)]
    year_cols.sort()
    poverty_rate_data_from_data_portal[year_cols] = poverty_rate_data_from_data_portal[year_cols].apply(pd.to_numeric, errors='coerce')

    l = []
    for _, row in poverty_rate_data_from_data_portal.iterrows():
        result = {
            'country_code': row['Country Code'],
            'country_dev_indicators': row['Country Name']
        }
        result['poverty_rate_most_recent_dev_indicators_year'] = None
        result['poverty_rate_most_recent_dev_indicators'] = np.nan
        # year_cols is ascending, so the last non-missing year wins.
        for year in year_cols:
            if pd.notna(row[year]):
                result['poverty_rate_most_recent_dev_indicators_year'] = int(year)
                # stored as a fraction (source value divided by 100)
                result['poverty_rate_most_recent_dev_indicators'] = row[year] / 100
        l.append(result)

    poverty_rates_dev_indicators = (
        pd.DataFrame(l)
    )
    # Inner merge: only codes present in both tables are kept.
    population_with_dev_indicators = (
        poverty_rates_dev_indicators[[
            'country_code', 'country_dev_indicators', 'poverty_rate_most_recent_dev_indicators', 'poverty_rate_most_recent_dev_indicators_year'
        ]]
        .merge(population_raw, on='country_code', how='inner')
    )
    population_list = []
    for _, row in population_with_dev_indicators.iterrows():
        result = {
            'country_code': row.country_code,
            'country': row.country_dev_indicators,
        }
        # Rows without any reported rate get a None year and a NaN headcount.
        if not np.isnan(row.poverty_rate_most_recent_dev_indicators_year):
            result.update({
                'share_of_worlds_poor_most_recent_year': str(int(row.poverty_rate_most_recent_dev_indicators_year)),
                'poverty_headcount': (
                    row.poverty_rate_most_recent_dev_indicators * row[str(int(row.poverty_rate_most_recent_dev_indicators_year))]
                )
            })
        else:
            result.update({
                'share_of_worlds_poor_most_recent_year': None,
                'poverty_headcount': np.nan
            })
        population_list.append(result)
    share_of_worlds_poor_dev_indicators = pd.DataFrame(population_list)

    # Redefinition of the helper above, with an added guard for a missing year.
    def compute_share_of_world_poor(row):
        year = row.share_of_worlds_poor_most_recent_year
        if year is None:
            return np.nan
        world_poverty_headcount = world_poverty_count_df.loc[
            world_poverty_count_df.year.astype(str) == year, 'world_poverty_count'
        ].values[0]
        return row.poverty_headcount / world_poverty_headcount

    share_of_worlds_poor_dev_indicators['share_of_worlds_poor_most_recent'] = (
        share_of_worlds_poor_dev_indicators.apply(compute_share_of_world_poor, axis=1)
    )


In [None]:
if include_poverty_shares:
    # Attach the PIP-based share (and the year it refers to) by country code.
    _share_columns = [
        'country_code', 'share_of_worlds_poor_most_recent', 'share_of_worlds_poor_most_recent_year'
    ]
    accumulated_data = pd.merge(
        accumulated_data, share_of_worlds_poor_pip[_share_columns],
        on='country_code', how='left'
    )

### Exchange rates and PPP conversions: start with WB values

In [None]:
def _wb_country_key(names):
    """Turn World Bank country names into lowercase, underscore-separated ASCII keys."""
    return (
        names
        .str.normalize('NFD')
        .str.encode('ascii', errors='ignore')
        .str.decode('ascii').str.replace(r'[^\w\s]', '', regex=True)
        .str.replace(' ', '_')
        .str.lower()
        .replace({'venezuela_rb': 'venezuela, rb'})
        .replace({'yemen_republic_of': 'yemen, rep.'})
    )


# Exchange rates (series PA.NUS.FCRF) and PPP conversion factors (series
# PA.NUS.PRVT.PP); the 2017 and 2021 columns are renamed for the merges below.
wb_exchange_rates = pd.read_csv(
    'data/eop/compiled_country_data/world_bank_data_download_20250715/API_PA.NUS.FCRF_DS2_en_csv_v2_22859.csv',
    skiprows=4,
).rename(
    columns={
        'Country Code': 'country_code', 'Country Name': 'country', '2021': 'market_exchange_rate_2021',
        '2017': 'market_exchange_rate_2017'
    }
)
wb_ppp_conversions = pd.read_csv(
    'data/eop/compiled_country_data/world_bank_data_download_20250715/API_PA.NUS.PRVT.PP_DS2_en_csv_v2_22915.csv',
    skiprows=4,
).rename(
    columns={
        'Country Code': 'country_code', 'Country Name': 'country',
        '2021': 'PPP_conversion_factor_2021', '2017': 'PPP_conversion_factor_2017'
    }
)

# Same name normalization for both tables
wb_exchange_rates['country'] = _wb_country_key(wb_exchange_rates['country'])
wb_ppp_conversions['country'] = _wb_country_key(wb_ppp_conversions['country'])

# Attach the 2017/2021 values by country code.
for _wb_table, _wb_columns in (
    (wb_exchange_rates, ['country_code', 'market_exchange_rate_2017', 'market_exchange_rate_2021']),
    (wb_ppp_conversions, ['country_code', 'PPP_conversion_factor_2021', 'PPP_conversion_factor_2017']),
):
    accumulated_data = accumulated_data.merge(
        _wb_table[_wb_columns], on='country_code', how='left'
    )

### Now impute with IMF

In [None]:
# IMF exchange-rate export, used to fill gaps left by the World Bank table.
imf_exchange_rates = pd.read_csv(
    (
        'data/eop/compiled_country_data/imf_data_download_20250903/'
        'imf_exchange_rates_dataset_2025-09-03T16_58_04.844661537Z_DEFAULT_INTEGRATION_IMF.STA_ER_4.0.1.csv'
    )
)

imf_exchange_rates.rename(columns={'COUNTRY': 'country'}, inplace=True)

# Keep annual period averages of the 'US Dollar per domestic currency'
# indicator. The explicit .copy() makes the filtered frame independent, so the
# column assignment below does not write into a slice of the full table
# (SettingWithCopyWarning).
imf_exchange_rates = imf_exchange_rates[
    (imf_exchange_rates.INDICATOR == 'US Dollar per domestic currency')
    & (imf_exchange_rates.TYPE_OF_TRANSFORMATION == 'Period average')
    & (imf_exchange_rates.FREQUENCY == 'Annual')
].copy()

# Lowercase, underscore-separated ASCII keys, with two names mapped to the
# spellings used elsewhere in this notebook.
imf_exchange_rates['country'] = (
    imf_exchange_rates['country']
    .str.normalize('NFD')
    .str.encode('ascii', errors='ignore')
    .str.decode('ascii').str.replace(r'[^\w\s]', '', regex=True)
    .str.replace(' ', '_')
    .str.lower()
    .replace({'taiwan_province_of_china': 'taiwan, china'})
    .replace({'venezuela_republica_bolivariana_de': 'venezuela, rb'})
)
imf_exchange_rates.country = imf_exchange_rates.country.apply(normalize_text)

In [None]:
# Fill missing World Bank market exchange rates with IMF values, matching on
# the normalized country name, and record which countries were imputed.
with_imf_exchange_rates = pd.merge(accumulated_data, imf_exchange_rates, how='left', on='country')

imputing_2017_exchange_rate = []
imputing_2021_exchange_rate = []
_exchange_rate_targets = (
    ('2017', 'market_exchange_rate_2017', imputing_2017_exchange_rate),
    ('2021', 'market_exchange_rate_2021', imputing_2021_exchange_rate),
)
for _, row in with_imf_exchange_rates.iterrows():
    for _imf_year, _target_column, _imputed_countries in _exchange_rate_targets:
        # Only fill when the WB value is missing and the IMF value exists.
        if pd.isna(row[_target_column]) and pd.notna(row[_imf_year]):
            _imputed_countries.append(row.country)
            accumulated_data.loc[
                accumulated_data.country_code == row.country_code, _target_column
            ] = row[_imf_year]  # market exchange rate


In [None]:
# IMF PPP export, used to fill gaps left by the World Bank table.
imf_ppp_rates = pd.read_csv(
    (
        'data/eop/compiled_country_data/imf_data_download_20250903/'
        'imf_ppp_dataset_2025-09-03T18_24_12.655968722Z_DEFAULT_INTEGRATION_IMF.RES_WEO_6.0.0.csv'
    )
)

# Keep annual rows only. The explicit .copy() makes the filtered frame
# independent, so the rename and column assignment below do not operate on a
# slice of the full table (SettingWithCopyWarning).
imf_ppp_rates = imf_ppp_rates[
    (imf_ppp_rates.FREQUENCY == 'Annual')
].copy()

imf_ppp_rates.rename(columns={'COUNTRY': 'country'}, inplace=True)

# Lowercase, underscore-separated ASCII keys, with three names mapped to the
# spellings used elsewhere in this notebook.
imf_ppp_rates['country'] = (
    imf_ppp_rates['country']
    .str.normalize('NFD')
    .str.encode('ascii', errors='ignore')
    .str.decode('ascii').str.replace(r'[^\w\s]', '', regex=True)
    .str.replace(' ', '_')
    .str.lower()
    .replace({'taiwan_province_of_china': 'taiwan, china'})
    .replace({'venezuela_republica_bolivariana_de': 'venezuela, rb'})
    .replace({'yemen_republic_of': 'yemen, rep.'})
)


# Fill missing World Bank PPP conversion factors with IMF values, matching on
# the country name, and record which countries were imputed.
with_imf_ppp = pd.merge(accumulated_data, imf_ppp_rates, how='left', on='country')

imputing_2017_ppp = []
imputing_2021_ppp = []
_ppp_targets = (
    ('2017', 'PPP_conversion_factor_2017', imputing_2017_ppp),
    ('2021', 'PPP_conversion_factor_2021', imputing_2021_ppp),
)
for _, row in with_imf_ppp.iterrows():
    for _imf_year, _target_column, _imputed_countries in _ppp_targets:
        # Only fill when the WB value is missing and the IMF value exists.
        if pd.isna(row[_target_column]) and pd.notna(row[_imf_year]):
            _imputed_countries.append(row.country)
            accumulated_data.loc[
                accumulated_data.country_code == row.country_code,
                _target_column
            ] = row[_imf_year]

In [None]:
# Report which countries received IMF-imputed values.
for _label, _imputed_countries in (
    ('imputing 2017 exchange rate', imputing_2017_exchange_rate),
    ('imputing 2021 exchange rate', imputing_2021_exchange_rate),
    ('imputing 2017 ppp', imputing_2017_ppp),
    ('imputing 2021 ppp', imputing_2021_ppp),
):
    print(_label, _imputed_countries)

### Extrapolate for still-missing values

In [None]:
# Countries still lacking at least one of the four conversion inputs.
_conversion_columns = [
    'PPP_conversion_factor_2017',
    'PPP_conversion_factor_2021',
    'market_exchange_rate_2017',
    'market_exchange_rate_2021',
]
any_missing_data = accumulated_data[accumulated_data[_conversion_columns].isna().any(axis=1)]

In [None]:
# Change back for merging + extrapolation
# (restore plain year labels so WB and IMF year columns share the same names)
wb_exchange_rates.rename(
    columns={
        'market_exchange_rate_2021': '2021',
        'market_exchange_rate_2017': '2017'
    }, inplace=True
)
wb_ppp_conversions.rename(
    columns={
        'PPP_conversion_factor_2021': '2021',
        'PPP_conversion_factor_2017': '2017'
    }, inplace=True
)

# Restrict all four source tables to countries that still have a gap.
# WB tables are filtered by country code, IMF tables by country name.
wb_exchange_rates = wb_exchange_rates[wb_exchange_rates.country_code.isin(any_missing_data.country_code)]
wb_ppp_conversions = wb_ppp_conversions[wb_ppp_conversions.country_code.isin(any_missing_data.country_code)]

# NOTE(review): the IMF names were rewritten with underscores for spaces,
# whereas any_missing_data.country comes from normalize_text, which keeps
# spaces - confirm that multi-word country names still match here.
imf_exchange_rates = imf_exchange_rates[imf_exchange_rates.country.isin(any_missing_data.country)]
imf_ppp_rates = imf_ppp_rates[imf_ppp_rates.country.isin(any_missing_data.country)]

# Outer merges on the country name; IMF columns that clash with WB columns
# receive the '_imf' suffix.
exchange_rates_merged = pd.merge(
    wb_exchange_rates, imf_exchange_rates, suffixes=('', '_imf'), on='country', how='outer'
)
ppp_conversions_merged = pd.merge(
    wb_ppp_conversions, imf_ppp_rates, suffixes=('', '_imf'), on='country', how='outer'
)

# Add missing PPP (just venezuela)
numeric_cols = [col for col in ppp_conversions_merged.columns if col.isdigit()]

# NOTE(review): for the VEN rows this replaces every year column that has an
# '_imf' counterpart with the IMF value, including years where the WB value is
# present - confirm that an overwrite (rather than a fill of gaps) is intended.
for col in numeric_cols:
    if f'{col}_imf' in ppp_conversions_merged.columns:
        ppp_conversions_merged.loc[ppp_conversions_merged.country_code == 'VEN', col] = ppp_conversions_merged[f'{col}_imf']



In [None]:
# Keep the identifying columns followed by the plain year columns in
# ascending order; everything else (including '_imf' columns) is dropped.
_exchange_year_columns = sorted(filter(str.isdigit, exchange_rates_merged.columns))
exchange_rates_merged = exchange_rates_merged.loc[
    :, ['country', 'country_code', *_exchange_year_columns]
]

_ppp_year_columns = sorted(filter(str.isdigit, ppp_conversions_merged.columns))
ppp_conversions_merged = ppp_conversions_merged.loc[
    :, ['country', 'country_code', *_ppp_year_columns]
]

In [None]:
def _first_scalar(frame, key_column, key, value_column):
    """Return the first ``value_column`` entry among rows where ``key_column == key``.

    Raises IndexError when no row matches.
    """
    return frame.loc[frame[key_column] == key, value_column].values[0]


def _extrapolate_with_ratio(numerator_frame, denominator_frame, key_column, key,
                            ratio_year, target_year):
    """Scale the denominator's ``target_year`` value by the ``ratio_year`` ratio.

    Computes (numerator[ratio_year] / denominator[ratio_year])
    * denominator[target_year] on plain scalars. Scalars are extracted before
    any arithmetic because the two frames are separate merge results: doing
    the arithmetic on Series would align on row labels and give NaN whenever
    the country sits at different row positions in the two frames.
    """
    ratio = (
        _first_scalar(numerator_frame, key_column, key, ratio_year)
        / _first_scalar(denominator_frame, key_column, key, ratio_year)
    )
    return ratio * _first_scalar(denominator_frame, key_column, key, target_year)


# Guinea: Impute 2021 market exchange rate using 2020 ratio and 2021 PPP
accumulated_data.loc[
    accumulated_data.country == 'guinea', 'market_exchange_rate_2021'
] = _extrapolate_with_ratio(
    exchange_rates_merged, ppp_conversions_merged, 'country', 'guinea', '2020', '2021'
)

# Myanmar: Impute 2021 market exchange rate using 2020 ratio and 2021 PPP
accumulated_data.loc[
    accumulated_data.country == 'myanmar', 'market_exchange_rate_2021'
] = _extrapolate_with_ratio(
    exchange_rates_merged, ppp_conversions_merged, 'country', 'myanmar', '2020', '2021'
)

# West bank and gaza: use Israel market exchange rate numbers
for _rate_column in ('market_exchange_rate_2017', 'market_exchange_rate_2021'):
    accumulated_data.loc[
        accumulated_data.country_code == 'PSE', _rate_column
    ] = _first_scalar(accumulated_data, 'country_code', 'ISR', _rate_column)

# Turkmenistan: Can't usefully extrapolate. Omitting.

# Venezuela
# PPP 2017: Extrapolate using ratio from 2011
accumulated_data.loc[
    accumulated_data.country_code == 'VEN', 'PPP_conversion_factor_2017'
] = _extrapolate_with_ratio(
    ppp_conversions_merged, exchange_rates_merged, 'country_code', 'VEN', '2011', '2017'
)

# Exchange rate: Extrapolate using ratio from 2011
accumulated_data.loc[
    accumulated_data.country_code == 'VEN', 'market_exchange_rate_2021'
] = _extrapolate_with_ratio(
    exchange_rates_merged, ppp_conversions_merged, 'country_code', 'VEN', '2011', '2021'
)

# Zimbabwe: impute 2017 market exchange rate using 2020 ratio and 2017 PPP
accumulated_data.loc[
    accumulated_data.country == 'zimbabwe', 'market_exchange_rate_2017'
] = _extrapolate_with_ratio(
    exchange_rates_merged, ppp_conversions_merged, 'country', 'zimbabwe', '2020', '2017'
)


In [None]:
# Print diagnostics
# Both sections are switched off with `if False`; flip the condition by hand
# to inspect the inputs. `display` is expected to come from the notebook
# environment.
if False:
    # The four conversion inputs for rows whose country contains 'turkmen'
    display(
            accumulated_data[accumulated_data.country.str.contains('turkmen')][[
            'market_exchange_rate_2017', 'market_exchange_rate_2021', 'PPP_conversion_factor_2017', 
            'PPP_conversion_factor_2021'
        ]]
    )

if False:
    # Reload the unfiltered WB tables; enabling this rebinds wb_exchange_rates
    # and wb_ppp_conversions.
    # NOTE(review): these paths are absolute ('/data/...') while the active
    # cells read from the relative 'data/...' tree - confirm before enabling.
    wb_exchange_rates = pd.read_csv(
        '/data/eop/compiled_country_data/world_bank_data_download_20250715/API_PA.NUS.FCRF_DS2_en_csv_v2_22859.csv',
        skiprows=4,
    )
    wb_exchange_rates.rename(
        columns={
            'Country Code': 'country_code', 'Country Name': 'country',
        }, inplace=True
    )
    wb_ppp_conversions = pd.read_csv(
        '/data/eop/compiled_country_data/world_bank_data_download_20250715/API_PA.NUS.PRVT.PP_DS2_en_csv_v2_22915.csv',
        skiprows=4,
    )
    wb_ppp_conversions.rename(
        columns={
            'Country Code': 'country_code', 'Country Name': 'country', 
        }, inplace=True
    )
    # Show rows matching 'west' in the merged tables next to rows matching
    # 'rael' in the freshly loaded WB tables.
    with pd.option_context('display.max_columns', 100):
        print('market exchange rate')
        display(exchange_rates_merged[exchange_rates_merged.country.str.contains('west')])
        display(wb_exchange_rates[wb_exchange_rates.country.str.contains('rael')])



        print('ppp')
        display(ppp_conversions_merged[ppp_conversions_merged.country.str.contains('west')])
        display(wb_ppp_conversions[wb_ppp_conversions.country.str.contains('rael')])


In [None]:
# CPI table (series FP.CPI.TOTL) keyed by country code, joined onto the
# running dataset so each row carries its own CPI columns.
cpi = pd.read_csv(
    'data/eop/compiled_country_data/world_bank_data_download_20251117/API_FP.CPI.TOTL_DS2_en_csv_v2_216045.csv',
    skiprows=4,
)
cpi = cpi.rename(columns={'Country Code': 'country_code'})
temp_merged = pd.merge(accumulated_data, cpi, on='country_code', how='left')

def get_cpi_value(row, year):
    """Return the CPI value for ``year`` from a row with one column per year.

    Args:
        row: Row (e.g. a ``pandas.Series``) whose year columns are labelled
            with strings such as ``'2017'`` and which exposes ``row.country``
            for the diagnostic messages.
        year: Base year as a number or numeric string. A fractional year such
            as ``2018.25`` is linearly interpolated between ``'2018'`` and
            ``'2019'``.

    Returns:
        The value stored under the year for a whole year. For a whole year of
        2024 that has no column, the ``'2023'`` value is used instead. For a
        fractional year, ``lower * (1 - w) + upper * w`` where ``w`` is the
        fractional part.

    Raises:
        KeyError: If a required year column is missing from ``row``.
    """
    year = float(year)
    lower_year = int(year)

    if year == lower_year:
        key = str(lower_year)
        if key in row:
            return row[key]
        print(f'CPI {lower_year} not available for {row.country}')
        if lower_year == 2024:
            print('Using 2023 instead; confirm overwrite by curated data')
            return row['2023']
        # A missing whole year must not fall through to the interpolation
        # branch, where it would fail with an unrelated-looking KeyError.
        raise KeyError(f'CPI {lower_year} not available for {row.country}')

    print(f'fractional base year {row.country}')
    upper_year = lower_year + 1
    print(lower_year, upper_year)
    # Weight of the upper year: 0 at the start of lower_year, approaching 1
    # towards the start of upper_year.
    weight = year - lower_year

    lower_value = row[str(lower_year)]
    upper_value = row[str(upper_year)]
    overall = lower_value * (1 - weight) + upper_value * weight
    print(
        f'upper: {upper_value}, lower: {lower_value}, '
        f'overall: {overall}'
    )
    return overall

def assemble(row):
    """Build the survey-currency to PPP conversion factors for one row.

    The CPI at the row's ``currency_base_year`` (via ``get_cpi_value``)
    rescales the 2017 and 2021 CPI values; each rescaled value is then divided
    by the matching ``PPP_conversion_factor_*`` entry.

    Returns:
        A ``pandas.Series`` with the country code, the 2017 and 2021 CPI
        values, both overall conversion factors, and the 2017/2021 factor
        ratio.
    """
    cpi_at_base_year = get_cpi_value(row, row['currency_base_year'])
    cpi_2017 = row['2017']
    cpi_2021 = row['2021']

    to_2017_ppp = (cpi_2017 / cpi_at_base_year) / row['PPP_conversion_factor_2017']
    to_2021_ppp = (cpi_2021 / cpi_at_base_year) / row['PPP_conversion_factor_2021']

    fields = {
        'country_code': row['country_code'],
        '2017_cpi': cpi_2017,
        '2021_cpi': cpi_2021,
        'overall_currency_conversion_to_2017_ppp': to_2017_ppp,
        'overall_currency_conversion_to_2021_ppp': to_2021_ppp,
        'overall_conversion_factor_ratio_from_2021_to_2017': to_2017_ppp / to_2021_ppp,
    }
    return pd.Series(fields)
# Conversion factors are computed only for rows that have a survey year.
assembled = temp_merged.loc[temp_merged['survey_year'].notna()].apply(assemble, axis='columns')
# Attach the three conversion columns to accumulated_data by country code.
accumulated_data = pd.merge(
    accumulated_data,
    assembled[[
        'country_code',
        'overall_conversion_factor_ratio_from_2021_to_2017',
        'overall_currency_conversion_to_2017_ppp',
        'overall_currency_conversion_to_2021_ppp',
    ]],
    how='left',
    on='country_code',
)


### Override with manually curated data

We override the survey-level currency conversion factors rather than the by-year PPP, CPI, or exchange-rate series, because these conversions are survey-specific.

In [None]:
# Stack the dated override file and the ad hoc additions into one table.
conversion_factor_overrides = pd.read_csv('data/eop/compiled_country_data/cpi_ppp_exec_portal_accessed_20251019.csv')
additional_conversion_factor_overrides = pd.read_csv('data/eop/compiled_country_data/cpi_ppp_exec_portal_ad_hoc.csv')
conversion_factor_overrides = pd.concat(
    [conversion_factor_overrides, additional_conversion_factor_overrides],
    ignore_index=True,
)

# Pair each country with its override rows, then keep rows flagged
# pip_using == 'True' whose override year equals the survey year.
matching = pd.merge(
    accumulated_data,
    conversion_factor_overrides,
    how='inner',
    left_on='country_code',
    right_on='code',
)
matching = matching.loc[matching.pip_using == 'True']
matching = matching.loc[matching.survey_year.astype(float) == matching.year]

# Countries left with several rows must agree on every CPI/ICP value before
# the extra rows are dropped.
dup_mask = matching.duplicated(subset='country_code', keep=False)
dup_codes = matching.loc[dup_mask, 'country_code'].unique().tolist()
checked_columns = [
    name for name in ['cpi2017', 'cpi2021', 'icp2017', 'icp2021']
    if name in matching.columns
]

for code in dup_codes:
    code_rows = matching[matching['country_code'] == code]
    for column in checked_columns:
        distinct = code_rows[column].dropna().unique()
        if len(distinct) > 1:
            raise AssertionError(f"Mismatch for {column} in country_code {code}: {distinct}")

matching = matching.drop_duplicates(subset='country_code', keep='first').reset_index(drop=True)

In [None]:
# Show the override rows that will be applied (one row per country_code).
matching

In [None]:
# Replace the computed conversion factors with the curated CPI/ICP values,
# then recompute the 2017/2021 ratio for the affected rows.
for _, override in matching.iterrows():
    country_rows = accumulated_data.country_code == override.country_code

    accumulated_data.loc[country_rows, 'overall_currency_conversion_to_2017_ppp'] = (
        1 / (override['cpi2017'] * override['icp2017'])
    )
    accumulated_data.loc[country_rows, 'overall_currency_conversion_to_2021_ppp'] = (
        1 / (override['cpi2021'] * override['icp2021'])
    )
    accumulated_data.loc[country_rows, 'overall_conversion_factor_ratio_from_2021_to_2017'] = (
        accumulated_data.loc[country_rows, 'overall_currency_conversion_to_2017_ppp']
        / accumulated_data.loc[country_rows, 'overall_currency_conversion_to_2021_ppp']
    )

# The pip_using flag is only needed for selecting overrides above.
accumulated_data.drop(columns=['pip_using'], inplace=True)

In [None]:
# Elizabeth Foster's overrides for the eight WAMU countries
ehcvm_overrides = pd.read_csv('data/eop/compiled_country_data/cpi_icp_from_elizabeth_202510.csv')
ehcvm_codes = ehcvm_overrides.code.values

for _, row in accumulated_data.iterrows():
    if row.country_code not in ehcvm_codes:
        continue

    # Exactly one override row is expected per country for year 2018.
    country_row = ehcvm_overrides[
        (ehcvm_overrides.code == row.country_code)
        & (ehcvm_overrides.year == 2018)
    ]
    assert len(country_row) == 1

    target_rows = accumulated_data.country_code == row.country_code

    accumulated_data.loc[target_rows, 'overall_currency_conversion_to_2017_ppp'] = (
        1 / (country_row['cpi2017'].iloc[0] * country_row['icp2017'].iloc[0])
    )
    accumulated_data.loc[target_rows, 'overall_currency_conversion_to_2021_ppp'] = (
        1 / (country_row['cpi2021'].iloc[0] * country_row['icp2021'].iloc[0])
    )
    # Keep the ratio consistent with the two factors just written.
    accumulated_data.loc[target_rows, 'overall_conversion_factor_ratio_from_2021_to_2017'] = (
        accumulated_data.loc[target_rows, 'overall_currency_conversion_to_2017_ppp']
        / accumulated_data.loc[target_rows, 'overall_currency_conversion_to_2021_ppp']
    )

In [None]:
survey_data_path = Path('data/eop/country_data')

survey_metrics = []

# NOTE(review): name_map is not referenced in this cell; kept in case another
# cell relies on it.
name_map = {
    'burkina_faso': 'burkina faso',
    'cote_divoire': "côte d'ivoire",
    'south_africa': 'south africa',
    'south_sudan': 'south sudan'
}

# Poverty lines applied to the 2017- and 2021-denominated consumption columns.
POVERTY_LINE_2017 = 2.15
POVERTY_LINE_2021 = 3.0

# Explicit column list so that an empty result still has a 'country_code'
# column for the merges below.
SURVEY_METRIC_COLUMNS = [
    'country_code',
    'survey_poverty_rate_povertyline_2017',
    'survey_poverty_rate_povertyline_2021',
    'survey_poverty_gap_index_povertyline_2017',
    'survey_poverty_gap_index_povertyline_2021',
]


def _weighted_poverty_metrics(consumption, weights, poverty_line):
    """Return (weighted headcount rate, weighted poverty gap index) for one line."""
    total = weights.sum()
    rate = weights[consumption < poverty_line].sum() / total
    gap_index = (
        ((poverty_line - consumption).clip(lower=0) * weights).sum()
        / (total * poverty_line)
    )
    return rate, gap_index


for country_dir in survey_data_path.iterdir():
    country_code = country_dir.name
    try:
        if (country_dir / 'cleaned').exists():
            cleaned_dir = country_dir / 'cleaned'
        elif (country_dir / 'clean').exists():
            print(f'found "clean" instead of "cleaned": {country_code}')
            cleaned_dir = country_dir / 'clean'
        else:
            print(f'No cleaned data directory found for {country_code}')
            # Skip this entry: continuing would reuse the previous country's
            # cleaned_dir and record its metrics under this country code.
            continue

        # Assert that train.parquet and test.parquet are newer than any other
        # parquet file in the cleaned directory.
        train_path = cleaned_dir / 'train.parquet'
        test_path = cleaned_dir / 'test.parquet'
        for f in cleaned_dir.glob('*.parquet'):
            if f not in [train_path, test_path]:
                assert train_path.stat().st_mtime > f.stat().st_mtime, f"{train_path.name} is not newer than {f.name}"
                assert test_path.stat().st_mtime > f.stat().st_mtime, f"{test_path.name} is not newer than {f.name}"
        train = pd.read_parquet(train_path)
        test = pd.read_parquet(test_path)

        data = pd.concat((train, test), ignore_index=True)

        # Raises IndexError (caught below) when the country is not listed in
        # accumulated_data.
        conversion_factor = accumulated_data.loc[
            accumulated_data.country_code == country_code, 'overall_conversion_factor_ratio_from_2021_to_2017'
        ].values[0]
        data['consumption_per_capita_per_day_povertyline_2017'] = (
            data.consumption_per_capita_per_day * conversion_factor
        )
        data['consumption_per_capita_per_day_povertyline_2021'] = data.consumption_per_capita_per_day

        rate_povertyline_2017, poverty_gap_index_povertyline_2017 = _weighted_poverty_metrics(
            data['consumption_per_capita_per_day_povertyline_2017'],
            data['headcount_adjusted_hh_wgt'],
            POVERTY_LINE_2017,
        )
        rate_povertyline_2021, poverty_gap_index_povertyline_2021 = _weighted_poverty_metrics(
            data['consumption_per_capita_per_day_povertyline_2021'],
            data['headcount_adjusted_hh_wgt'],
            POVERTY_LINE_2021,
        )

        survey_metrics.append(
            {
                'country_code': country_code,
                'survey_poverty_rate_povertyline_2017': rate_povertyline_2017,
                'survey_poverty_rate_povertyline_2021': rate_povertyline_2021,
                'survey_poverty_gap_index_povertyline_2017': poverty_gap_index_povertyline_2017,
                'survey_poverty_gap_index_povertyline_2021': poverty_gap_index_povertyline_2021,
            }
        )

    except Exception as e:
        # Deliberate best effort: report the problem and move on.
        print(f'Error encountered with {country_dir.name}; skipping')
        print(e)

survey_metrics_df = pd.DataFrame(survey_metrics, columns=SURVEY_METRIC_COLUMNS)

# Restrict to countries that have a survey year; those without computed
# metrics keep NaN.
survey_metrics_df = accumulated_data.loc[
    accumulated_data.survey_year.notna(), ['country_code']
].merge(survey_metrics_df, on='country_code', how='left')

accumulated_data = accumulated_data.merge(survey_metrics_df, on='country_code', how='outer')

In [None]:
# Name the output file after today's date (YYYYMMDD).
today_str = f'{datetime.today():%Y%m%d}'
accumulated_data.to_csv(
    f'data/eop/compiled_country_data/auxiliary_data/auxiliary_data_{today_str}.csv',
    index=False,
)

In [None]:
# E: Comparing two most recent auxiliary data files
# The two snapshot dates are hard-coded; update both paths when newer files exist.
auxiliary_data_20251124, auxiliary_data_20251201 = (
    pd.read_csv(snapshot_path)
    for snapshot_path in (
        'data/eop/compiled_country_data/auxiliary_data/auxiliary_data_20251124.csv',
        'data/eop/compiled_country_data/auxiliary_data/auxiliary_data_20251201.csv',
    )
)

# Print the differences between the two snapshots.
print(auxiliary_data_20251124.compare(auxiliary_data_20251201))

# Currency conversion table

In [None]:
aux_files = glob('data/eop/compiled_country_data/auxiliary_data/auxiliary_data_*.csv')
# Rank files by the text between the last '_' and the following '.', i.e. the
# date stamp written into the file name.
latest_file = max(
    aux_files,
    key=lambda path: path.rpartition('_')[2].partition('.')[0],
)
aux_data = pd.read_csv(latest_file)
print(f'Latest file: {latest_file}')

# Currently stored conversion table, used to report changes.
previous_conversion_factors_table = pd.read_csv('data/eop/compiled_country_data/currency_conversion.csv')


In [None]:
# One row per surveyed country; the 2021-PPP factor is published as 'Conversion Factor'.
new_conversion_factors_table = (
    aux_data
    .loc[
        aux_data['survey_year'].notna(),
        ['country', 'country_code', 'overall_currency_conversion_to_2021_ppp'],
    ]
    .rename(columns={'overall_currency_conversion_to_2021_ppp': 'Conversion Factor'})
)

In [None]:
if True:  # Report changes
    # The outer merge keeps countries present in only one of the two tables.
    merged = pd.merge(
        new_conversion_factors_table,
        previous_conversion_factors_table[['country_code', 'Conversion Factor']],
        on='country_code',
        how='outer',
        suffixes=('_new', '_old'),
    )

    # Rows count as unchanged when the factors are numerically close or both NaN.
    unchanged = np.isclose(
        merged['Conversion Factor_new'],
        merged['Conversion Factor_old'],
        equal_nan=True,
    )
    mismatched = merged[~unchanged]
    display(mismatched)

In [None]:
# Write the new conversion table over the stored currency_conversion.csv.
# NOTE(review): this writes to the absolute path '/data/eop/...', whereas the
# previous table was read from the relative path 'data/eop/...' — confirm both
# resolve to the same file.
new_conversion_factors_table.to_csv('/data/eop/compiled_country_data/currency_conversion.csv', index=False)

# Secondary aux data

In [None]:
# Indicator name -> value; filled in by the cells below.
aux_accumulated_data = {}

In [None]:
gdp_raw = pd.read_csv(
    'data/eop/compiled_country_data/world_bank_data_download_20250628/API_NY.GDP.MKTP.KD_DS2_en_csv_v2_127117.csv',
    skiprows=4,
).rename(columns={'Country Code': 'country_code'})

cpi = pd.read_csv(
    'data/eop/compiled_country_data/world_bank_data_download_20251117/API_FP.CPI.TOTL_DS2_en_csv_v2_216045.csv',
    skiprows=4,
)
is_usa = cpi['Country Code'] == 'USA'
us_cpi_2015 = cpi.loc[is_usa, '2015'].values[0]
us_cpi_2023 = cpi.loc[is_usa, '2023'].values[0]

# 2023 GDP per entity, rescaled by the US CPI ratio 2023/2015 and divided by 1e9.
# NOTE(review): presumably the GDP series is in constant 2015 USD — confirm.
for indicator, entity_code in {
    'china_GDP_2023': 'CHN',
    'OECD_GDP_2023': 'OED',
    'global_GDP_2023': 'WLD',
}.items():
    gdp_2023 = gdp_raw.loc[gdp_raw['country_code'] == entity_code, '2023'].values[0]
    aux_accumulated_data[indicator] = gdp_2023 * (us_cpi_2023 / us_cpi_2015) / 1e9


In [None]:
# In this export the country names sit under the 'Revenue (% of GDP)' header;
# rows without a name are dropped.
revenue_data_raw = (
    pd.read_csv('data/eop/compiled_country_data/imf_data_download_20250714/imf-dm-export-20250714.csv')
    .rename(columns={'Revenue (% of GDP)': 'country'})
    .dropna(subset='country')
)

is_china = revenue_data_raw.country == "China, People's Republic of"
aux_accumulated_data['china_govt_revenue_percentage_2023'] = (
    revenue_data_raw.loc[is_china, '2023'].values[0]
)

In [None]:
# Have to use WB (excluding grants) for OECD
wb_revenue_data = pd.read_csv(
    'data/eop/compiled_country_data/world_bank_data_download_20250904/API_GC.REV.XGRT.GD.ZS_DS2_en_csv_v2_557920.csv',
    skiprows=4,
)
is_oecd = wb_revenue_data['Country Code'] == 'OED'
aux_accumulated_data['OECD_govt_revenue_percentage_2023'] = wb_revenue_data.loc[is_oecd, '2023'].values[0]


In [None]:

# US CPI for every year from the earliest survey year through 2024, rescaled
# so that 2023 equals 1.
cpi_usa = cpi.loc[cpi['Country Code'] == 'USA']
earliest_survey_year = accumulated_data.survey_year.astype('Int64').min()
years = range(earliest_survey_year, 2025)
year_columns = [str(y) for y in years]
cpi_usa = cpi_usa[year_columns].melt(var_name='year', value_name='cpi_usa')
cpi_usa_2023 = cpi_usa.loc[cpi_usa.year == '2023', 'cpi_usa'].values[0]
cpi_usa['cpi_usa'] = cpi_usa['cpi_usa'] / cpi_usa_2023

# Reshape to the (indicator, value) layout of the secondary auxiliary table.
cpi_usa = cpi_usa.rename(columns={'year': 'indicator', 'cpi_usa': 'value'})
cpi_usa['indicator'] = cpi_usa['indicator'].map(
    lambda y: f'conversion_factor_nominal_USD_{y}_to_2023'
)

In [None]:
# Turn the indicator dict into (indicator, value) rows and append the US CPI rows.
indicator_table = pd.DataFrame.from_dict(
    aux_accumulated_data, orient='index', columns=['value']
)
aux_dataframe = pd.concat(
    [indicator_table.reset_index(names='indicator'), cpi_usa],
    ignore_index=True,
)

In [None]:
# Name the output file after today's date (YYYYMMDD).
today_str = f'{datetime.today():%Y%m%d}'
aux_dataframe.to_csv(
    f'data/eop/compiled_country_data/auxiliary_data/secondary_auxiliary_data_{today_str}.csv',
    index=False,
)

# Human-readable

In [None]:
aux_files = glob('data/eop/compiled_country_data/auxiliary_data/auxiliary_data_*.csv')
# Rank files by the text between the last '_' and the following '.', i.e. the
# date stamp written into the file name.
latest_file = max(
    aux_files,
    key=lambda path: path.rpartition('_')[2].partition('.')[0],
)
aux_data = pd.read_csv(latest_file)
print(f'Latest file: {latest_file}')

In [None]:
# WB country classification allows us to get the full list of countries and codes
country_income_classifications = pd.read_csv(
    '/data/eop/compiled_country_data/world_bank_helpdesk_download_20250628/world_bank_income_classification_20250628.csv'
)

# A fully empty row separates the countries+territories from the aggregates;
# everything from that row onwards is discarded.
blank_rows = country_income_classifications.isnull().all(axis=1)
split_index = country_income_classifications.index[blank_rows][0]

all_countries_and_territories = (
    country_income_classifications
    .loc[:split_index - 1, ['Economy', 'Code']]
    .rename(columns={'Economy': 'country', 'Code': 'country_code'})
)

# Manually constructed
not_countries = [
    'ASM', 'ABW', 'BMU', 'VGB', 'CYM', 'PYF', 'GIB', 'GRL',
    'GUM', 'HKG', 'MAC', 'MAF', 'NCL', 'MNP', 'PRI', 'SXM', 'VIR',
]

is_territory = all_countries_and_territories.country_code.isin(not_countries)
all_countries = all_countries_and_territories[~is_territory]

In [None]:
# Source column headers of the WPC sheet mapped to the short names used here.
wpc_source_columns = {
    'Country (color codes: inputs, intermediates, final outputs, error checks)': 'country',
    "Share of country's population that is in extreme poverty (WPC)": 'poverty_rate_wpc',
}
wpc = pd.read_csv(
    '/data/eop/country_inventory/wpc_extended.csv',
    usecols=list(wpc_source_columns),
)
# Rename by name, not by position: usecols does not fix the column order, so
# assigning a positional list could swap the two labels.
wpc = wpc.rename(columns=wpc_source_columns)
# Percent strings such as '12.3%' become fractions.
wpc['poverty_rate_wpc'] = wpc['poverty_rate_wpc'].str.rstrip('%').astype(float) / 100

# Rows without a country name must go before the string matching below.
wpc.dropna(subset='country', inplace=True)

# Each pair is (substring to look for, replacement name). The first two fix
# typos in the sheet; the rest line the names up with the World Bank spelling.
for current, wb in [
    ('Niger ', 'Niger'),
    ('Combodia', 'Cambodia'),
    ('East Timor', 'Timor-Leste'),
    ('Ivory Coast', 'Côte d’Ivoire'),
    ('Bosnia', 'Bosnia and Herzegovina'),
    ('Sao Tome and Principe', 'São Tomé and Príncipe'),
    ('Taiwan', 'Taiwan, China'),
    ('Turkiye', 'Türkiye'),
    ('Viet Nam', 'Vietnam'),
    ('Palestine', 'West Bank and Gaza'),
]:
    # regex=False: the fragments are literal text, not patterns.
    wpc.loc[wpc.country.str.contains(current, regex=False), 'country'] = wb

wpc.drop(wpc[wpc.country == 'Puerto Rico'].index, inplace=True)

In [None]:
our_world_in_data = pd.read_csv(
    '/data/eop/compiled_country_data/our_world_in_data_20250918/pip_dataset.csv'
)
# Keep national-level rows of the 2017 PPP version. The explicit copy makes
# the filtered frame independent so the column assignment below is safe.
our_world_in_data = our_world_in_data[
    (our_world_in_data.ppp_version == 2017)
    & (our_world_in_data.reporting_level == 'national')
].copy()
# Map the dataset's country names onto the World Bank spelling. Assign the
# result back instead of calling replace(..., inplace=True) on the column:
# the chained in-place form can operate on a temporary and leave the frame
# unchanged.
our_world_in_data['country'] = our_world_in_data['country'].replace(
    {
        'Cape Verde': 'Cabo Verde', 'Democratic Republic of Congo': 'Congo, Dem. Rep.',
        "Cote d'Ivoire": "Côte d’Ivoire", 'Congo': 'Congo, Rep.', 'Egypt': 'Egypt, Arab Rep.',
        'Gambia': 'Gambia, The', 'Iran': 'Iran, Islamic Rep.', 'South Korea': 'Korea, Rep.',
        'Kyrgyzstan': 'Kyrgyz Republic', 'Laos': 'Lao PDR', 'Micronesia (country)': 'Micronesia, Fed. Sts.',
        'Palestine': 'West Bank and Gaza', 'Russia': 'Russian Federation', 'Slovakia': 'Slovak Republic',
        'Saint Lucia': 'St. Lucia', 'Syria': 'Syrian Arab Republic', 'Sao Tome and Principe': 'São Tomé and Príncipe',
        'Taiwan': 'Taiwan, China', 'Timor': 'Timor-Leste', 'Turkey': 'Türkiye', 'Venezuela': 'Venezuela, RB',
        'Yemen': 'Yemen, Rep.'
    }
)
def get_latest_value_with_year(df, value_col):
    """Return, per country, the most recent non-missing value and its year.

    Args:
        df: DataFrame with ``country_code``, ``reporting_year`` and
            ``value_col`` columns.
        value_col: Name of the column whose latest value is wanted.

    Returns:
        DataFrame with columns ``country_code``, ``latest_<value_col>`` and
        ``year``. Countries with no non-missing value get NaN in both.
    """
    def latest_func(group):
        # Only rows that actually carry a value are candidates; otherwise a
        # trailing year with a missing value would hide earlier valid data.
        valid = group[group[value_col].notna()]
        if valid.empty:
            return pd.Series({
                f'latest_{value_col}': np.nan,
                'year': np.nan
            })
        idx = valid['reporting_year'].idxmax()
        return pd.Series({
            f'latest_{value_col}': valid.loc[idx, value_col],
            'year': valid.loc[idx, 'reporting_year']
        })

    # Only the two columns the helper reads are passed to apply.
    return (
        df.groupby('country_code')[[value_col, 'reporting_year']]
        .apply(latest_func)
        .reset_index()
    )

# Attach country codes by name, then use the shorter column names.
our_world_in_data_with_codes = pd.merge(
    our_world_in_data,
    all_countries[['country', 'country_code']],
    how='left',
    on='country',
).rename(
    columns={
        'year': 'reporting_year',
        'headcount_ratio_international_povline': 'headcount',
    }
)
# Rescale headcount by 1/100 (presumably percent to fraction).
our_world_in_data_with_codes['headcount'] = our_world_in_data_with_codes['headcount'] / 100

# avg_shortfall_international_povline is the average gap among people below the poverty line; we need to adjust
# it to be the average among the whole population.
def produce_poverty_gap(row):
    """Return the population-wide average shortfall for one row.

    A headcount of exactly zero yields 0 without reading the shortfall;
    otherwise the shortfall is multiplied by the headcount.
    """
    return 0 if row.headcount == 0 else row.avg_shortfall_international_povline * row.headcount
    
# Row-wise: population-wide average shortfall for every record.
our_world_in_data_with_codes['poverty_gap'] = our_world_in_data_with_codes.apply(
    produce_poverty_gap, axis='columns'
)

In [None]:
# Latest OWID headcount per country, with the helper's three columns relabelled positionally.
latest_our_world_in_data_rates = get_latest_value_with_year(
    our_world_in_data_with_codes, 'headcount'
).set_axis(
    ['country_code', 'owid_poverty_rate_most_recent', 'owid_poverty_rate_most_recent_year'],
    axis=1,
)
# Same for the poverty gap, which is then divided by the 2.15 poverty line.
latest_our_world_in_data_gaps = get_latest_value_with_year(
    our_world_in_data_with_codes, 'poverty_gap'
).set_axis(
    ['country_code', 'owid_poverty_gap_index_most_recent', 'owid_poverty_gap_index_most_recent_year'],
    axis=1,
)
latest_our_world_in_data_gaps['owid_poverty_gap_index_most_recent'] = (
    latest_our_world_in_data_gaps['owid_poverty_gap_index_most_recent'] / 2.15
)

In [None]:
# Inputs used by the helper: country_code, survey_year, headcount, poverty_gap.
# Its output is relabelled positionally with OWID-specific names, and the
# interpolation-method column is then discarded.
our_world_in_data_survey_year = (
    gather_survey_year_pip_data(
        aux_data.loc[aux_data['survey_year'].notna()],
        our_world_in_data_with_codes,
        2.15,
    )
    .set_axis(
        [
            'country_code',
            'survey_year',
            'owid_poverty_rate_survey_year',
            'interpolation_method_rate_survey_year',
            'owid_poverty_gap_index_survey_year',
        ],
        axis=1,
    )
    .drop(columns='interpolation_method_rate_survey_year')
)

In [None]:
# Latest rates joined with latest gaps, then the survey-year values; the
# survey_year column itself is not kept.
owid_merged = (
    latest_our_world_in_data_rates
    .merge(latest_our_world_in_data_gaps, how='inner')
    .merge(our_world_in_data_survey_year, on='country_code', how='left')
    .drop(columns='survey_year')
)

In [None]:
# Outer merges throughout, so a country missing from one source is still kept.
human_readable = (
    all_countries
    .merge(wpc, on='country', how='outer')
    .merge(owid_merged, on='country_code', how='outer')
)
# aux_data's own 'country' column is dropped so the World Bank names win.
merged = human_readable.merge(
    aux_data.drop(columns='country'), on='country_code', how='outer'
)

In [None]:
# Leading columns in display order; every OWID column not listed here is appended.
human_readable_columns = [
    'country',
    'survey_year',
    'wb_poverty_rate_povertyline_2017_most_recent_year',
    'wb_poverty_rate_povertyline_2017_most_recent',
    'wb_poverty_rate_povertyline_2017_survey_year',
    'poverty_rate_wpc',
    'survey_poverty_rate_povertyline_2017',
    'wb_poverty_gap_index_povertyline_2017_most_recent_year',
    'wb_poverty_gap_index_povertyline_2017_most_recent',
    'wb_poverty_gap_index_povertyline_2017_survey_year',
    'survey_poverty_gap_index_povertyline_2017',
]
human_readable_columns += [
    c for c in owid_merged.columns if c not in human_readable_columns
]
human_readable = merged[human_readable_columns]


def _share_of_matching_years(left, right):
    """Return the fraction of rows where both years are equal or both missing."""
    return ((left == right) | (left.isna() & right.isna())).mean()


# Confirm latest years match so I can drop one of them
print(
    _share_of_matching_years(
        human_readable.wb_poverty_rate_povertyline_2017_most_recent_year,
        human_readable.wb_poverty_gap_index_povertyline_2017_most_recent_year,
    )
)
print(
    _share_of_matching_years(
        human_readable.owid_poverty_rate_most_recent_year,
        human_readable.owid_poverty_gap_index_most_recent_year,
    )
)

# Drop the redundant gap-index year columns and shorten the remaining names.
human_readable = (
    human_readable
    .drop(columns=[
        'wb_poverty_gap_index_povertyline_2017_most_recent_year',
        'owid_poverty_gap_index_most_recent_year',
    ])
    .rename(columns={
        'wb_poverty_rate_povertyline_2017_most_recent_year': 'wb_most_recent_year',
        'wb_poverty_rate_povertyline_2017_most_recent': 'wb_poverty_rate_most_recent',
        'wb_poverty_rate_povertyline_2017_survey_year': 'wb_poverty_rate_survey_year',
        'poverty_rate_wpc': 'wpc_poverty_rate',
        'survey_poverty_rate_povertyline_2017': 'survey_poverty_rate',
        'wb_poverty_gap_index_povertyline_2017_most_recent': 'wb_poverty_gap_index_most_recent',
        'wb_poverty_gap_index_povertyline_2017_survey_year': 'wb_poverty_gap_index_survey_year',
        'survey_poverty_gap_index_povertyline_2017': 'survey_poverty_gap_index',
        'owid_poverty_rate_most_recent_year': 'owid_most_recent_year',
    })
)

# Put a constant 'wpc_year' column (2022) right after 'wb_most_recent_year'.
wpc_year_position = list(human_readable.columns).index('wb_most_recent_year') + 1
human_readable.insert(wpc_year_position, 'wpc_year', 2022)


In [None]:
# Show the poverty-rate estimates and their reference years for rows that
# have a survey year.
human_readable.loc[
    human_readable.survey_year.notna(), 
    ['country',  'survey_poverty_rate', 'wb_poverty_rate_survey_year', 'wb_poverty_rate_most_recent', 'wpc_poverty_rate', 'owid_poverty_rate_most_recent', 
     'wb_most_recent_year', 'wpc_year', 'owid_most_recent_year'
    ]
]

In [None]:
# Name the output file after today's date (YYYYMMDD).
today_str = f'{datetime.today():%Y%m%d}'
human_readable.to_csv(
    f'/data/eop/compiled_country_data/auxiliary_data/human_readable_{today_str}.csv',
    index=False,
)