In [2]:
import pandas as pd
from pathlib import Path

In [20]:
# Uganda
data = pd.read_parquet('/data/eop/uganda/cleaned/uganda_full.parquet')
summary = pd.read_parquet('/data/eop/uganda/cleaned/summary.parquet')

# Two nested geographic groupings: the coarser one flags only `region`,
# the finer one flags `region` and `subreg`.
summary['geographic_indicator_coarser'] = summary['variable_name'].isin(['region'])
summary['geographic_indicator_finer'] = summary['variable_name'].isin(['region', 'subreg'])

# Column sums over the numeric/boolean columns; for the boolean flag columns
# this is the number of variables flagged in each.
summary.sum(numeric_only=True)

geographic_indicator            3
geographic_indicator_coarser    1
geographic_indicator_finer      2
dtype: int64

In [21]:
# Show the metadata rows whose `geographic_indicator` flag is set.
summary.loc[summary['geographic_indicator']]

Unnamed: 0,variable_name,module_name,variable_description,data_type,geographic_indicator,geographic_indicator_coarser,geographic_indicator_finer
7,region,pov2019_20,Region of Residence in 2019/20,categorical,True,True,True
8,subreg,pov2019_20,Sub region in 2019/20,categorical,True,False,True
9,district,pov2019_20,District Code,categorical,True,False,False


In [22]:
# Write both Uganda frames back to the paths they were loaded from,
# leaving the DataFrame index out of the files.
data.to_parquet('/data/eop/uganda/cleaned/uganda_full.parquet', index=False)
summary.to_parquet('/data/eop/uganda/cleaned/summary.parquet', index=False)

In [5]:
# Togo
data = pd.read_parquet('/data/eop/Togo 2018-19/clean/final_togo.parquet')
summary = pd.read_parquet('/data/eop/Togo 2018-19/clean/summary.parquet')

# Rename the metadata columns to `variable_name` / `data_type`.
summary = summary.rename(columns={'covariate': 'variable_name', 'type': 'data_type'})

# Finer grouping: region and prefecture; coarser grouping: region only.
summary['geographic_indicator_finer'] = summary['variable_name'].isin(
    ['residence_region', 'prefecture']
)
summary['geographic_indicator_coarser'] = summary['variable_name'].isin(
    ['residence_region']
)

# Deliberately disabled: this branch never runs. When enabled it would remove
# the `<var>_m` columns that belong to categorical variables from both `data`
# and `summary`.
if False:
    missing_columns = [name for name in data.columns if name.endswith('_m')]

    # Strip the trailing `_m` to recover the base variable names.
    with_missing = [name[:-len('_m')] for name in missing_columns]

    to_drop = [
        f'{name}_m'
        for name in summary.loc[
            summary['variable_name'].isin(with_missing)
            & summary['data_type'].eq('categorical'),
            'variable_name',
        ]
    ]

    summary = summary.loc[~summary['variable_name'].isin(to_drop)]
    data = data.drop(columns=to_drop)

In [6]:
# Write both Togo frames back to the paths they were loaded from,
# leaving the DataFrame index out of the files.
data.to_parquet('/data/eop/Togo 2018-19/clean/final_togo.parquet', index=False)
summary.to_parquet('/data/eop/Togo 2018-19/clean/summary.parquet', index=False)

In [20]:
# Nigeria
data = pd.read_parquet(
    '/data/eop/Nigeria 2018-19/clean/final_nigeria.parquet'
)
summary = pd.read_parquet(
    '/data/eop/Nigeria 2018-19/clean/summary.parquet'
)

In [None]:
# Rename the metadata columns to `variable_name` / `data_type`.
summary = summary.rename(columns={'covariate': 'variable_name', 'type': 'data_type'})

# Keep only rows whose `hh_wgt_m` flag is 0, then drop the two flag columns.
# The cleaned frame is later written back over the file it was loaded from, so
# on a re-run these columns are already gone. Guard on their presence so that
# Restart & Run All stays repeatable instead of raising AttributeError.
# NOTE(review): the guard also silently skips the filter if the column is
# missing for any other reason (e.g. an upstream schema change). Confirm this
# is acceptable.
if 'hh_wgt_m' in data.columns:
    data = data[data['hh_wgt_m'] == 0]
data = data.drop(
    columns=['hh_wgt_m', 'consumption_per_capita_per_day_m'],
    errors='ignore',
)

# Remaining `<var>_m` columns, and the base variable names they refer to.
missing_columns = [c for c in data.columns if c.endswith('_m')]
with_missing = [c[:-len('_m')] for c in missing_columns]

# For categorical variables, remove the `<var>_m` column from both `data`
# and `summary`.
to_drop = [
    f'{name}_m'
    for name in summary.loc[
        summary['variable_name'].isin(with_missing)
        & (summary['data_type'] == 'categorical'),
        'variable_name',
    ]
]

summary = summary[~summary['variable_name'].isin(to_drop)]
data = data.drop(columns=to_drop)

# Finer grouping: zone and state; coarser grouping: zone only.
summary['geographic_indicator_finer'] = summary['variable_name'].isin(['zone', 'state'])
summary['geographic_indicator_coarser'] = summary['variable_name'].isin(['zone'])


In [23]:
# Write both Nigeria frames back to the paths they were loaded from,
# leaving the DataFrame index out of the files.
data.to_parquet('/data/eop/Nigeria 2018-19/clean/final_nigeria.parquet', index=False)
summary.to_parquet('/data/eop/Nigeria 2018-19/clean/summary.parquet', index=False)

In [28]:
# Ethiopia
data = pd.read_csv(
    '/data/eop/Ethiopia 2018-19/clean/final_ethiopia.csv'
)
summary = pd.read_csv(
    '/data/eop/Ethiopia 2018-19/summary.csv'
)

# Rename the metadata columns to `variable_name` / `data_type`.
summary = summary.rename(columns={'covariate': 'variable_name', 'type': 'data_type'})


In [29]:
# Build a combined key "<region_code>_<zone_code>" and register it in the
# metadata as a categorical variable.
data['region_zone'] = data['region_code'] + '_' + data['zone_code'].astype(str)
summary = pd.concat(
    [
        summary,
        pd.DataFrame([{'variable_name': 'region_zone', 'data_type': 'categorical'}]),
    ],
    ignore_index=True,
)

# `zone_code` is replaced by `region_zone` in both the data and the metadata.
data = data.drop(columns=['zone_code'])
summary = summary.loc[summary['variable_name'].ne('zone_code')]

In [30]:
# Coarser grouping: region only; finer grouping: region and region_zone.
summary['geographic_indicator_coarser'] = summary['variable_name'].isin(['region_code'])
summary['geographic_indicator_finer'] = summary['variable_name'].isin(
    ['region_code', 'region_zone']
)
# Full list of variables flagged as geographic.
summary['geographic_indicator'] = summary['variable_name'].isin(
    [
        'ea_id', 'region_code', 'region_zone', 'woreda_code',
        'city_code', 'subcity_code', 'kebele_code',
    ]
)

# `<var>_m` columns in the data, and the base variable names they refer to.
missing_columns = [name for name in data.columns if name.endswith('_m')]
with_missing = [name[:-len('_m')] for name in missing_columns]

# For categorical variables, remove the `<var>_m` column from both `data`
# and `summary`.
to_drop = [
    f'{name}_m'
    for name in summary.loc[
        summary['variable_name'].isin(with_missing)
        & summary['data_type'].eq('categorical'),
        'variable_name',
    ]
]

summary = summary.loc[~summary['variable_name'].isin(to_drop)]
data = data.drop(columns=to_drop)

In [31]:
# Write both Ethiopia frames as parquet into the `clean` directory,
# leaving the DataFrame index out of the files.
data.to_parquet('/data/eop/Ethiopia 2018-19/clean/final_ethiopia.parquet', index=False)
summary.to_parquet('/data/eop/Ethiopia 2018-19/clean/summary.parquet', index=False)

In [32]:
# Kenya
# The metadata comes from CSV, the survey data from parquet.
summary = pd.read_csv('/data/eop/kenya/cleaned/kenya_metadata.csv')
data = pd.read_parquet('/data/eop/kenya/cleaned/kenya.parquet')

In [None]:

# Add one metadata row per `*_missing_indicator` column found in `data`.
missing_columns = [col for col in data.columns if col.endswith('_missing_indicator')]
new_entries = pd.DataFrame({
    'variable_name': missing_columns,
    'variable_description': ['Missing indicator for ' + col for col in missing_columns],
    'module_name': ['generated'] * len(missing_columns),
    'module_description': ['Generated missing indicator'] * len(missing_columns),
    'data_type': ['categorical'] * len(missing_columns),
    'original_survey_variable_name': missing_columns,
})
summary = pd.concat([summary, new_entries], ignore_index=True)

# Add metadata rows for a fixed list of further variables.
# NOTE(review): presumably these exist in `data` but not in the metadata CSV - confirm.
# All start out as categorical with the name as description; `hh_size` and
# `clid` are corrected just below.
to_add = [
    'mattresses_indicator', 'cooking_equip_indicator', 'radio_tv_comp_indicator',
    'pressure_cooker_indicator', 'hh_size', 'clid',
]
new_entries = pd.DataFrame({
    'variable_name': to_add,
    'variable_description': to_add,
    'module_name': [None] * len(to_add),
    'module_description': [None] * len(to_add),
    'data_type': ['categorical'] * len(to_add),
    'original_survey_variable_name': [None] * len(to_add),
})
summary = pd.concat([summary, new_entries], ignore_index=True)

# Fix: the original read `summary.variable__name` (double underscore), which
# raises AttributeError. Bracket access is used throughout so a wrong column
# name fails as a clear KeyError.
summary.loc[summary['variable_name'] == 'hh_size', 'data_type'] = 'numeric'
summary.loc[summary['variable_name'] == 'clid', 'variable_description'] = 'cluster id'

# `county` is the only level for both groupings; `clid` is additionally
# flagged as geographic.
summary['geographic_indicator_finer'] = summary['variable_name'].isin(['county'])
summary['geographic_indicator_coarser'] = summary['variable_name'].isin(['county'])
summary['geographic_indicator'] = summary['variable_name'].isin(['county', 'clid'])

# Override the recorded types of the household id and household weight.
summary.loc[summary['variable_name'] == 'hhid', 'data_type'] = 'categorical'
summary.loc[summary['variable_name'] == 'hh_wgt', 'data_type'] = 'numeric'

In [34]:
# Only the metadata is written for Kenya, leaving the DataFrame index out of the file.
summary.to_parquet('/data/eop/kenya/cleaned/summary.parquet', index=False)