In [14]:
import pandas as pd
from pathlib import Path
from glob import glob
import numpy as np
from datetime import datetime


In [4]:
# Directory that holds the country-inventory CSV inputs read below and the
# CSV outputs written by later cells.
country_inventory_path = Path('/data/eop/country_inventory')

In [5]:
# Spreadsheet header -> short column name for the WPC extract.
# Renaming by header (instead of assigning `wpc.columns` positionally) matters
# because `read_csv` returns the selected columns in *file* order, not in the
# order of the `usecols` list, so positional assignment can mislabel columns.
wpc_column_names = {
    'Country (color codes: inputs, intermediates, final outputs, error checks)': 'country',
    'Population': 'population',
    "Share of country's population that is in extreme poverty (WPC)": 'poverty_rate_wpc',
    "Share of world's extremely poor population that live in this country (based on WPC)": 'poverty_share_wpc',
    "Share of country's population that is in extreme poverty (WB PIP)": 'poverty_rate_pip',
}

wpc = pd.read_csv(
    country_inventory_path / 'wpc_extended.csv',
    usecols=list(wpc_column_names),
).rename(columns=wpc_column_names)

# The two WPC columns are read as percent strings (e.g. "12.5%"); convert them
# to fractions in [0, 1].
# NOTE(review): `poverty_rate_pip` is left exactly as read -- confirm whether it
# also needs percent-string conversion.
for pct_col in ['poverty_rate_wpc', 'poverty_share_wpc']:
    wpc[pct_col] = wpc[pct_col].str.rstrip('%').astype(float) / 100

# LSMS survey list; its key column is capitalised in the file, so lower-case
# it to match the other frames.
lsms = pd.read_csv(country_inventory_path / 'LSMS.csv').rename(
    columns={'Country': 'country'}
)

# Non-LSMS surveys (this file already has a `country` column).
non_lsms = pd.read_csv(country_inventory_path / 'non_lsms_1p_share.csv')

# Countries listed in the "no survey" file; the verbose header becomes `country`.
no_non_lsms = pd.read_csv(
    country_inventory_path / 'non_lsms_1p_share_no_survey.csv'
).rename(
    columns={'Countries with no survey but > 1p of world poor': 'country'}
)

shruthi_shortlist = pd.read_csv(country_inventory_path / 'shortlist.csv')

# (substring to look for, name to write) pairs, applied in order.
# Any `country` value that *contains* the substring is replaced wholesale.
COUNTRY_NAME_REPLACEMENTS = [
    # Typos in the input sheets
    ('Niger ', 'Niger'),          # trailing space, so 'Nigeria' is not matched
    ('Combodia', 'Cambodia'),
    # Lining up names with the World Bank spellings
    ('East Timor', 'Timor-Leste'),  # was misspelled 'East Eimor' and never matched
    ('Ivory Coast', 'Côte d’Ivoire'),
    ('Bosnia', 'Bosnia and Herzegovina'),
    ('Sao Tome and Principe', 'São Tomé and Príncipe'),
    ('Taiwan', 'Taiwan, China'),
    ('Turkiye', 'Türkiye'),
    ('Viet Nam', 'Vietnam'),
    ('Palestine', 'West Bank and Gaza'),
]


def _standardize_country_names(df):
    """Clean the `country` column of `df` in place.

    Drops rows with a missing country, applies COUNTRY_NAME_REPLACEMENTS
    (literal substring match, not regex), and removes the 'Puerto Rico' row.
    The frame is mutated in place so the caller's variable sees the changes.
    """
    df.dropna(subset='country', inplace=True)
    for fragment, standard_name in COUNTRY_NAME_REPLACEMENTS:
        matches = df.country.str.contains(fragment, regex=False)
        df.loc[matches, 'country'] = standard_name
    df.drop(df[df.country == 'Puerto Rico'].index, inplace=True)


for df in [wpc, lsms, non_lsms, no_non_lsms, shruthi_shortlist]:
    _standardize_country_names(df)


In [6]:
# World Bank income-classification download (snapshot dated 2025-06-28 per the file name).
income_classification_csv = (
    '/data/eop/compiled_country_data/world_bank_helpdesk_download_20250628/world_bank_income_classification_20250628.csv'
)
country_income_classifications = pd.read_csv(income_classification_csv)

# The sheet lists countries + territories first, then a fully blank row, then
# aggregate rows. Keep only what precedes the first blank row.
all_nan_mask = country_income_classifications.isnull().all(axis=1)
split_index = all_nan_mask[all_nan_mask].index.values[0]

all_countries_and_territories = (
    country_income_classifications
    .loc[: split_index - 1, ['Economy', 'Code']]
    .rename(columns={'Economy': 'country', 'Code': 'country_code'})
)

# Manually constructed list of codes to exclude from the country list.
not_countries = [
    'ASM', 'ABW', 'BMU', 'VGB', 'CYM', 'PYF', 'GIB', 'GRL',
    'GUM', 'HKG', 'MAC', 'MAF', 'NCL', 'MNP', 'PRI', 'SXM', 'VIR',
]

is_excluded = all_countries_and_territories.country_code.isin(not_countries)
all_countries = all_countries_and_territories[~is_excluded]

# Outer join: keeps World Bank countries without WPC data as well as WPC rows
# whose name has no World Bank match.
all_with_wpc = pd.merge(all_countries, wpc, on='country', how='outer')

In [7]:
# LSMS source column -> prefixed name used in the combined inventory.
# (`lsms` already carries a lower-case `country` column, so it needs no entry here.)
lsms_column_names = {
    'Survey': 'shruthi_identified_lsms_survey',
    'HH Sample Size': 'shruthi_identified_lsms_hh_sample_size',
    'Nationally representative?': 'shruthi_identified_lsms_nationally_representative',
    'Consumption Aggregate Available?': 'shruthi_identified_lsms_consumption_available',
    'Comments': 'shruthi_identified_lsms_comments',
}

# Keep the first LSMS row per country so the left merge below cannot add rows,
# then select and rename in one chain (no in-place rename on a sliced frame).
with_lsms = (
    lsms
    .drop_duplicates(subset=['country'], keep='first')
    [['country', *lsms_column_names]]
    .rename(columns=lsms_column_names)
)

accumulator = all_with_wpc.merge(with_lsms, on='country', how='left')

In [8]:
# Attach the non-LSMS survey name for rows flagged 'Yes' in 'survey found?'.
# NOTE(review): `non_lsms` is not de-duplicated on `country`; if a country
# appears more than once this left merge adds rows -- confirm that is intended.
survey_found = non_lsms['survey found?'] == 'Yes'
with_non_lsms = (
    non_lsms
    .loc[survey_found, ['country', 'survey']]
    .rename(columns={'survey': 'shruthi_identified_non_lsms_survey'})
)
accumulator = pd.merge(accumulator, with_non_lsms, on='country', how='left')

In [9]:
# Hand-maintained main country list; only these columns are read.
# NOTE(review): this frame does not go through the country-name cleanup applied
# to the other inputs -- confirm its names already match before merging on `country`.
existing_inventory_columns = [
    'Country',
    'Survey',
    'Survey URL',
    'Survey start year (for poverty-rate baseline)',
    'Accessible? (blank -> yes)',
    'Consumption aggregate available?',
    'Status (blank = nothing to do)',
    'Beg WB',
    'Notes',
]
existing_inventory = pd.read_csv(
    country_inventory_path / 'eop_main_country_list_20250825.csv',
    usecols=existing_inventory_columns,
).rename(
    columns={
        'Country': 'country',
        'Survey': 'already_selected_survey',
        'Survey URL': 'already_selected_survey_url',
    }
)

In [10]:
# Eligibility thresholds, both expressed as fractions.
MIN_POVERTY_RATE = 0.1     # minimum WPC poverty rate within the country
MIN_POVERTY_SHARE = 0.01   # minimum share of the world's extreme poor
MANUALLY_ELIGIBLE_COUNTRIES = ['India', 'Bangladesh', 'Colombia', 'Indonesia']

# The comparisons already yield booleans (NaN compares as False), so no
# np.where(cond, True, False) wrapper is needed.
accumulator['rate_criterion_met'] = accumulator['poverty_rate_wpc'] >= MIN_POVERTY_RATE
accumulator['share_criterion_met'] = accumulator['poverty_share_wpc'] >= MIN_POVERTY_SHARE
accumulator['manually_eligible'] = accumulator.country.isin(MANUALLY_ELIGIBLE_COUNTRIES)

# Eligible = both criteria met, or manually whitelisted.
accumulator['eligible'] = (
    (accumulator['rate_criterion_met'] & accumulator['share_criterion_met'])
    | accumulator['manually_eligible']
)

# Re-run guard: if this cell already ran, `accumulator` carries the inventory
# columns; drop them first so the merge does not produce `_x` / `_y` duplicates.
# On a fresh run there is no overlap and this is a no-op.
already_merged_columns = [
    col for col in existing_inventory.columns
    if col != 'country' and col in accumulator.columns
]

# Right merge: keep every accumulator row, attaching inventory fields where the
# country name matches.
accumulator = existing_inventory.merge(
    accumulator.drop(columns=already_merged_columns), on='country', how='right'
)
accumulator = accumulator.sort_values(by='poverty_share_wpc', ascending=False)


In [11]:
# Output column order, grouped by origin. Columns not listed here are dropped
# by the selection below.
desired_order = (
    # Country key + fields from the hand-maintained inventory
    [
        'country',
        'already_selected_survey',
        'already_selected_survey_url',
        'Survey start year (for poverty-rate baseline)',
        'Accessible? (blank -> yes)',
        'Consumption aggregate available?',
        'Status (blank = nothing to do)',
        'Beg WB',
        'Notes',
    ]
    # WPC poverty figures and the derived eligibility flags
    + [
        'poverty_rate_wpc',
        'poverty_share_wpc',
        'rate_criterion_met',
        'share_criterion_met',
        'manually_eligible',
        'eligible',
    ]
    # Reference data
    + [
        'country_code',
        'population',
        'poverty_rate_pip',
    ]
    # Previously identified surveys
    + [
        'shruthi_identified_lsms_survey',
        'shruthi_identified_lsms_hh_sample_size',
        'shruthi_identified_lsms_nationally_representative',
        'shruthi_identified_lsms_consumption_available',
        'shruthi_identified_lsms_comments',
        'shruthi_identified_non_lsms_survey',
    ]
)

accumulator = accumulator.loc[:, desired_order]

In [16]:
# Date stamp (YYYYMMDD) used in output file names.
today_str = f'{datetime.today():%Y%m%d}'

# Write the full inventory next to the inputs, without the index column.
full_inventory_path = country_inventory_path / f'full_country_inventory_{today_str}.csv'
accumulator.to_csv(full_inventory_path, index=False, encoding='utf-8')

# Merge with existing

In [32]:
# Load the earlier inventory snapshot and keep only real country rows:
# drop rows with no country and the 'Totals' summary row.
existing = pd.read_csv(country_inventory_path / 'country_inventory_20250903.csv')
has_country = existing['country'].notna()
is_totals_row = existing['country'] == 'Totals'
existing = existing[has_country & ~is_totals_row]

In [34]:
# Keep the row set and order of the earlier snapshot (`existing`), pulling every
# other column from the freshly built `accumulator`; unmatched countries get NaN.
merged = pd.merge(existing[['country']], accumulator, on='country', how='left')

In [36]:
# NOTE(review): this is the same path the `accumulator` export was written to
# earlier, so that file is overwritten here -- confirm this is intended.
merged_output_path = country_inventory_path / f'full_country_inventory_{today_str}.csv'
merged.to_csv(merged_output_path, index=False, encoding='utf-8')

# Archive

In [None]:
# PIP country-level table; the code below reads its `country_code`,
# `reporting_year` and `headcount` columns.
pip_data = pd.read_csv('/data/eop/compiled_country_data/pip_interpolated_all_countries_215_at_2017_ppp_20250822.csv')
def get_latest_value_with_year(df, value_col):
    """Return, per country, the most recent non-missing value and its year.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain `country_code`, `reporting_year` and `value_col`.
    value_col : str
        Name of the column whose latest value is wanted.

    Returns
    -------
    pandas.DataFrame
        One row per `country_code` with columns `country_code`,
        `latest_<value_col>` and `year`. Countries with no non-missing
        value get NaN in both result columns.

    Notes
    -----
    The latest year is chosen among rows where `value_col` is present. Picking
    the overall latest year instead would return NaN for a country whose newest
    row is missing the value even though earlier years have one.
    """
    # Only these columns are needed inside each group; selecting them also keeps
    # the grouping column out of the frames passed to `apply`.
    needed = list(dict.fromkeys([value_col, 'reporting_year']))

    def latest_func(group):
        # Restrict to rows that actually have both a value and a year.
        observed = group.dropna(subset=needed)
        if observed.empty:
            return pd.Series({
                f'latest_{value_col}': np.nan,
                'year': np.nan
            })
        idx = observed['reporting_year'].idxmax()
        return pd.Series({
            f'latest_{value_col}': observed.loc[idx, value_col],
            'year': observed.loc[idx, 'reporting_year']
        })

    return df.groupby('country_code')[needed].apply(latest_func).reset_index()

# Latest PIP headcount per country, with result columns renamed to
# poverty-rate terminology.
latest_poverty_rate = get_latest_value_with_year(pip_data, 'headcount').rename(
    columns={
        'latest_headcount': 'latest_poverty_rate',
        'year': 'latest_poverty_rate_year',
    }
)
