### Imports

In [8]:
import pandas as pd
from pathlib import Path
import numpy as np
import json
import warnings
import math

from collections import defaultdict
import sklearn.linear_model as sklearn_linear_model
import sklearn.metrics as sklearn_metrics
import sklearn.model_selection as sklearn_model_selection
import sklearn.preprocessing as sklearn_preprocessing
import sklearn.feature_selection as sklearn_feature_selection
import sklearn.ensemble as sklearn_ensemble
import sklearn.decomposition as sklearn_decomposition
from sklearn.impute import SimpleImputer

import dask.dataframe as ddf

import geopandas as gpd
import dask_geopandas as dgpd

import matplotlib.pyplot as plt
import pyreadstat
from pandas.api.types import is_numeric_dtype

In [9]:
data_path = Path('/home/selker/eop/data')

malawi_directory = data_path / 'malawi'
malawi_survey_directory_dta = malawi_directory / 'MWI_2016_IHS-IV_v04_M_STATA14'

mosaiks_directory = data_path / 'mosaiks'

Out: 
* A data file, one-hot encoded and imputed as in roshni's replication code, with all columns included
* A summary like I construct: with "dropped" indicating either dropped for missingness, or omitted because we don't want it (say, consumption qs)
* summary should include a one-hot map, i.e. a column containing one-hat categories

## Geo

In [10]:
geo_vars, _ = pyreadstat.read_dta(
    malawi_survey_directory_dta / 'household_geovariables/householdgeovariablesihs4.dta'
)
filt, _ = pyreadstat.read_dta(
    malawi_survey_directory_dta / 'household/hh_mod_a_filt.dta',
)

In [12]:
geo_vars[['case_id', 'lat_modified', 'lon_modified']]

Unnamed: 0,case_id,lat_modified,lon_modified
0,301025230225,-14.683761,34.915074
1,210374850204,-14.005029,33.794591
2,311057710075,-16.826165,35.269503
3,312048040073,-15.004730,35.163219
4,311097790117,-17.016698,35.079629
...,...,...,...
12442,312048040036,-15.004730,35.163219
12443,305016150137,-15.558742,35.010733
12444,104031830093,,
12445,314408470035,,


In [36]:
with_ea_and_lat_lon = filt[['case_id', 'ea_id']].merge(
    geo_vars[['case_id', 'lat_modified', 'lon_modified']],
    on='case_id', 
)

In [37]:
# A few EAs have two locations. Not sure why.
ea_with_lat_lon = with_ea_and_lat_lon.groupby('ea_id')[['lat_modified', 'lon_modified']].agg(pd.Series.mode)

In [38]:
malawi_admin_3 = gpd.read_file(
    malawi_directory / 'mwi_adm_nso_hotosm_20230405_shp' / 'mwi_admbnda_adm3_nso_hotosm_20230405.shp'
)

ImportError: The 'read_file' function requires the 'pyogrio' or 'fiona' package, but neither is installed or imports correctly.
Importing fiona resulted in: /home/selker/.conda/envs/leo_base/lib/python3.9/site-packages/fiona/../../../libgdal.so.34: undefined symbol: sqlite3_total_changes64
Importing pyogrio resulted in: No module named 'pyogrio'

#### Mosaiks

In [None]:
%%time
def clean_mosaiks_column_name(column_name):
    column_name_stripped = column_name.strip(' .')
    try:
        number = int(column_name_stripped)
    except ValueError:
        if column_name_stripped == '':
            return 'mosaiks_0'
        else:
            return column_name
    else:
        return f'mosaiks_{number}'

mosaiks = ddf.read_csv(str(mosaiks_directory / 'malawi_fine' / '*.csv'))
malawi_outline = gpd.read_file(
    malawi_directory / 'mwi_adm_nso_hotosm_20230405_shp' / 'mwi_admbnda_adm0_nso_hotosm_20230405.shp'
)

mosaiks = dgpd.from_dask_dataframe(
    mosaiks, dgpd.points_from_xy(mosaiks, 'Lon', 'Lat')
)

# this data covers a box containing Malawi; filter down to the points actually within the country.
mosaiks = mosaiks[mosaiks.geometry.within(malawi_outline.iloc[0].geometry)]

mosaiks.columns = mosaiks.columns.map(clean_mosaiks_column_name)

In [None]:
mosaiks.drop(columns='geometry').to_parquet('mosaiks_within_malawi')

In [8]:
mosaiks = ddf.read_parquet('mosaiks_within_malawi/*.parquet')

In [None]:
geo_vars.groupby(['lat_modified', 'lon_modified']).first()

In [None]:
# DOESN'T WORK - finish about; this geo_vars df doesn't have ea_id (it's in HH_MOD_A_FILT)


# associate a moasiks tile with each enumeration area
ea_geo = geo_vars.groupby('ea_id').first()[['lat_modified', 'lon_modified']]
ea_geo = gpd.GeoDataFrame(ea_geo, geometry = gpd.points_from_xy(x=ea_geo.lon_modified, y=ea_geo.lat_modified))


In [None]:
mosaiks_grid_size = 1
max_distance = math.sqrt(2 * (mosaiks_grid_size / 2)**2)

mosaiks_computed = mosaiks.compute()

ea_geo_with_mosaiks = gpd.sjoin_nearest(
    left_df=ea_geo, right_df=mosaiks_computed, how='left', max_distance=mosaiks_grid_size
)

ea_geo_with_mosaiks.rename(
    columns={'Lat': 'lat_mosaiks', 'Lon': 'lon_mosaiks', 'index_right': 'index_mosaiks'},
    inplace=True
)

ea_geo_with_mosaiks.reset_index(inplace=True)
ea_geo_with_mosaiks.ea_id = ea_geo_with_mosaiks.ea_id.astype(int)

In [None]:
ea_geo_with_mosaiks.drop(columns=['geometry', 'BoxLabel']).to_parquet('2016_ea_geo_with_mosaiks.parquet', index=False)

In [10]:
ea_geo_with_mosaiks = pd.read_parquet('2016_ea_geo_with_mosaiks.parquet')

## 2016 LSMS data

In [36]:
def columns_equal(df, col1, col2):
    c1 = df[col1]
    c2 = df[col2]

    if pd.api.types.is_numeric_dtype(c1) and pd.api.types.is_numeric_dtype(c2):
        return np.isclose(c1, c2, rtol=1e-4, equal_nan=True).all()
    else:
        try:
            eq = (c1 == c2).all()
        except TypeError:
            # mismatched categories -> this comparison raises a type error
            eq = False
        return eq

In [183]:
malawi = None
column_names_to_labels = dict()

for file in (
    'household/hh_mod_a_filt',
    'household/hh_mod_f', # housing
    'household/hh_mod_h', # food security
    'household/hh_mod_n1', # household enterprises
    'household/hh_mod_s2', # household credit
    'household/hh_mod_t', # subj assessment of well-being
    'household/hh_mod_x', # ag and fisheries filter,
    'agriculture/ag_mod_a', # ownership of land
    'agriculture/ag_mod_r2', # livestock
    'agriculture/ag_mod_e3', # coupon use - rainy season
    'household_geovariables/householdgeovariablesihs4', # geo
    'consumption_aggregate/ihs4 consumption aggregate' # consumption
):
    
    with warnings.catch_warnings():
        warnings.simplefilter('ignore') # TODO: Investigate. Warning thrown from w/in pyreadstat.

        dataframe, metadata =  pyreadstat.read_dta(
                malawi_survey_directory_dta / f'{file}.dta', apply_value_formats=True
        )

    column_names_to_labels.update(metadata.column_names_to_labels)

    if malawi is None:
        malawi = dataframe
    else:
        malawi = malawi.merge(dataframe, on='case_id', how='outer', suffixes=('_left', '_right'))

        for c in malawi.columns:
            if c.endswith('_left'):
                c_left = c
                base = c_left[:-5]
                c_right = f'{base}_right'

                match = columns_equal(malawi, c_left, c_right)
                
                if match:
                    malawi.drop(columns=c_right, inplace=True)
                    malawi.rename(columns={c_left: base}, inplace=True)
                # geographies are sometimes named and sometimes encoded as integers. If we've got one of each,  
                # keep the string name: that way it won't accidentally be treated as numeric later.
                elif (
                    (base in ['region', 'district'])
                    & (
                        pd.api.types.is_numeric_dtype(malawi[c_left]) 
                        + pd.api.types.is_numeric_dtype(malawi[c_right]) 
                        == 1
                      )
                ):
                    if pd.api.types.is_numeric_dtype(malawi[c_left]):
                        malawi.drop(columns=c_left, inplace=True)
                        malawi.rename(columns={c_right: base}, inplace=True)
                    else:
                        malawi.drop(columns=c_right, inplace=True)
                        malawi.rename(columns={c_left: base}, inplace=True)
                else:
                    print(f'error merging {file}, mismatch in {base}')
                    malawi.drop(columns=c_right, inplace=True)
                    malawi.rename(columns={c_left: base}, inplace=True)

# Add Mosaiks columns. 
if False:
    malawi.ea_id = malawi.ea_id.astype(int)
    malawi = malawi.merge(
        ea_geo_with_mosaiks, on='ea_id', how='outer'
    )
    
    for c in ('lat_modified', 'lon_modified'):
        malawi.drop(columns=f'{c}_x', inplace=True)
        malawi.rename(columns={f'{c}_y' : c}, inplace=True)

malawi_raw = malawi
# Drop rows that are missing critical fields which we don't want to impute.
malawi.dropna(subset=['rexpagg'], inplace=True)

# TODO: Figure out how to detect datetime-like columns automatically
malawi['interviewDate'] = pd.to_datetime(malawi['interviewDate'])

# columns not to be imputed, coerced to numeric, or one-hot encoded.
# summary table won't include these either - for now, this seems fine. 
columns_to_reserve = [
    'hhid', 'case_id', 'hh_wgt', 'interviewDate'
]
malawi_reserved = malawi[columns_to_reserve]
malawi_to_process = malawi[malawi.columns.difference(columns_to_reserve)]

# coerce columns to numeric that can be coerced
for c in malawi_to_process.columns:
    malawi_to_process[c] = pd.to_numeric(malawi_to_process[c], errors='ignore')

# coerce known categorical columns to string
known_categorical = [
    'region', 'district', 'hh_t01', 'hh_t02', 'hh_t03', 'hh_t04'
]
for c in known_categorical:
    malawi_to_process[c] = malawi_to_process[c].astype(str)

# summarize columns
missing_counts = malawi_to_process.isnull().sum()+ (malawi_to_process == "").sum()  
means = malawi_to_process.mean(skipna=True, numeric_only=True)
medians = malawi_to_process.median(skipna=True, numeric_only=True)
stds = malawi_to_process.std(skipna=True, numeric_only=True)

summary = pd.concat((missing_counts, means, medians, stds), axis=1)
summary.columns = ['missing_count', 'mean', 'median', 'std']
summary['missing_fraction'] = summary.missing_count / len(malawi_to_process)

summary.reset_index(names='covariate', inplace=True)

# Split into numeric and non-numeric columns
malawi_numeric = malawi_to_process.select_dtypes(include=[np.number])
malawi_non_numeric = malawi_to_process.select_dtypes(exclude=[np.number, np.datetime64])

def get_covariate_type(cov):
    
    if cov in malawi_numeric.columns:
        return 'numeric'
    elif cov in malawi_non_numeric.columns:
        return 'categorical'

summary['type'] = summary['covariate'].apply(get_covariate_type)
covariate_to_columns_map = {
    covariate: [covariate] for covariate in summary.covariate
}

# impute missing values with the mean.
MISSINGNESS_CUTOFF = 0.15
covariates_over_cutoff = summary[summary.missing_fraction > MISSINGNESS_CUTOFF].covariate.values
for covariate in malawi_numeric.columns:
    if covariate in covariates_over_cutoff:
        dummy_column = f'{covariate}_nan'
        malawi_numeric[dummy_column] = malawi_numeric[covariate].isna()
        covariate_to_columns_map[covariate].append(dummy_column)

# This is different from what roshni does: She uses 0 to impute
# if missingness is >15%. 
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
imputer.fit(malawi_numeric)

columns = malawi_numeric.columns
malawi_numeric = pd.DataFrame(imputer.transform(malawi_numeric))
malawi_numeric.columns = columns

# one-hot encode categoricals.
# This is different from what roshni does. I'm encoding missing values
# with a category regardless of missing fraction.
one_hot_encoder = sklearn_preprocessing.OneHotEncoder(
    drop='if_binary', sparse_output=False
).fit(malawi_non_numeric)
encoded_data = one_hot_encoder.transform(malawi_non_numeric)
malawi_non_numeric_encoded = pd.DataFrame(encoded_data)
malawi_non_numeric_encoded.columns = one_hot_encoder.get_feature_names_out()

# populate the map from original column names to the list of one-hot columns. 
for i in range(len(one_hot_encoder.feature_names_in_)):

    covariate = one_hot_encoder.feature_names_in_[i]
    categories = one_hot_encoder.categories_[i]

    if one_hot_encoder.drop_idx_[i] is not None:
        categories = np.delete(categories, one_hot_encoder.drop_idx_[i])

    covariate_to_columns_map[covariate] = [
        f'{covariate}_{category}' for category in categories
    ]

malawi = malawi_reserved.join(malawi_numeric).join(malawi_non_numeric_encoded)

# create map from one-hot columns to original columns + values
inverse_one_hot_map = dict()
for feature, categories in one_hot_map.items():
    for category in categories:
        inverse_one_hot_map[f'{feature}_{category}'] = (feature, category)

# TODO: replace with a dict get() with default
def interpret_column_name(column_name):

    if column_name in column_names_to_labels:
        return column_names_to_labels[column_name]

    return column_name

summary['description'] = summary.covariate.apply(interpret_column_name)

summary.missing_fraction = summary.missing_fraction.round(2)
summary['median'] = summary['median'].round(2)
summary['mean'] = summary['mean'].round(2)
summary['std'] = summary['std'].round(2)

ADULT_MIN_AGE = 18

roster, _ =  pyreadstat.read_dta(
    malawi_survey_directory_dta / 'household/hh_mod_b.dta', apply_value_formats=True
)

roster['adult'] = roster.hh_b05a >= ADULT_MIN_AGE
hh_adult_counts = (
    roster[roster.adult].groupby('case_id')[['hhid']].count().rename(columns={'hhid': 'num_adults'})
)
hh_child_counts = (
    roster[~roster.adult].groupby('case_id')[['hhid']].count().rename(columns={'hhid': 'num_children'})
)

malawi = (
    malawi
    .merge(hh_adult_counts, how='left', on='case_id')
    .merge(hh_child_counts, how='left', on='case_id')
)

malawi[['num_adults', 'num_childrens']] = (
    malawi[['num_adults', 'num_children']].fillna(value=0)
)

malawi = malawi[malawi.num_adults + malawi.num_children > 0]
summary['columns'] = summary.covariate.map(covariate_to_columns_map)

# https://docs.google.com/spreadsheets/d/11I0U413LgiVYuvgPhVL1M-5bfJabCFql75tQWG551U0/edit#gid=0
MALAWI_CONSUMPTION_CONVERSION_FACTOR = 0.00461055475

# Convert outcome to consumption per capita per day in terms of 2017 USD
#    1. Use conversion factor to convert to 2017 USD
#    2. Divide by household size
#    2. Convert consumption to consumption per day
malawi["outcome"] = malawi["rexpagg"].copy()
malawi["outcome"] /= (malawi.num_adults + malawi.num_children)
malawi["outcome"] *= MALAWI_CONSUMPTION_CONVERSION_FACTOR
malawi["outcome"] /= 365

error merging agriculture/ag_mod_r2, mismatch in hhid
error merging agriculture/ag_mod_e3, mismatch in hhid
error merging consumption_aggregate/ihs4 consumption aggregate, mismatch in region


  malawi_to_process[c] = pd.to_numeric(malawi_to_process[c], errors='ignore')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  malawi_to_process[c] = pd.to_numeric(malawi_to_process[c], errors='ignore')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  malawi_to_process[c] = malawi_to_process[c].astype(str)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-co

In [184]:
if True:
    out_path = Path('/home/selker/eop/data/malawi')
    malawi.to_parquet(out_path / 'malawi_cleaned_2016.parquet', index=False)
    summary.set_index('covariate', drop=True).to_parquet(out_path / 'malawi_summary_2016.parquet')

### Calculate baseline poverty rate

In [4]:
out_path = Path('/home/selker/eop/data/malawi')
malawi = pd.read_parquet(out_path / 'malawi_cleaned_2016.parquet')

In [7]:
poverty_line = 2.15
below = len(malawi[malawi.outcome < poverty_line])
total = len(malawi)

rate = below / total
display(rate)

0.6417307692307692

## 2018 Census

In [44]:
census = pd.read_csv(malawi_directory / 'census_2018_pop_tables.csv')

FileNotFoundError: [Errno 2] No such file or directory: '/home/selker/eop/data/malawi/census_2018_pop_tables.csv'

# Cleaning section from select_predictors, for 2019 Malawi LSMS 

In [6]:
def columns_equal(df, col1, col2):
    c1 = df[col1]
    c2 = df[col2]
    
    if pd.api.types.is_numeric_dtype(c1) and pd.api.types.is_numeric_dtype(c2):
        return np.isclose(c1, c2, rtol=1e-4).all()
    else:
        return (c1 == c2).all()

In [97]:
data_path = Path('/home/selker/eop/data')

malawi_directory = data_path / 'malawi'
malawi_survey_directory_csv = malawi_directory / 'MWI_2019_IHS-V_v06_M_CSV'
malawi_survey_directory_dta = malawi_directory / 'MWI_2019_IHS-V_v06_M_Stata'

mosaiks_directory = data_path / 'mosaiks'

RANDOM_STATE=11

### Load + process Mosaiks data

In [4]:
%%time
def clean_mosaiks_column_name(column_name):
    column_name_stripped = column_name.strip(' .')
    try:
        number = int(column_name_stripped)
    except ValueError:
        if column_name_stripped == '':
            return 'mosaiks_0'
        else:
            return column_name
    else:
        return f'mosaiks_{number}'

mosaiks = ddf.read_csv(str(mosaiks_directory / 'malawi_fine' / '*.csv'))
malawi_outline = gpd.read_file(
    malawi_directory / 'mwi_adm_nso_hotosm_20230405_shp' / 'mwi_admbnda_adm0_nso_hotosm_20230405.shp'
)

mosaiks = dgpd.from_dask_dataframe(
    mosaiks, dgpd.points_from_xy(mosaiks, 'Lon', 'Lat')
)

# this data covers a box containing Malawi; filter down to the points actually within the country.
mosaiks = mosaiks[mosaiks.geometry.within(malawi_outline.iloc[0].geometry)]

mosaiks.columns = mosaiks.columns.map(clean_mosaiks_column_name)

ERROR 1: PROJ: proj_create_from_database: Open of /home/selker/.conda/envs/leo_base_new/share/proj failed


AttributeError: 'Series' object has no attribute 'within'

In [154]:
geo_vars, _ = pyreadstat.read_dta(
    malawi_survey_directory_dta / 'householdgeovariables_ihs5.dta'
)

# associate a moasiks tile with each enumeration area
ea_geo = geo_vars.groupby('ea_id').first()[['ea_lat_mod', 'ea_lon_mod']]
ea_geo = gpd.GeoDataFrame(ea_geo, geometry = gpd.points_from_xy(x=ea_geo.ea_lon_mod, y=ea_geo.ea_lat_mod))


In [None]:
mosaiks_grid_size = 1
max_distance = math.sqrt(2 * (mosaiks_grid_size / 2)**2)

mosaiks_computed = mosaiks.compute()

ea_geo_with_mosaiks = gpd.sjoin_nearest(
    left_df=ea_geo, right_df=mosaiks_computed, how='left', max_distance=mosaiks_grid_size
)

ea_geo_with_mosaiks.rename(
    columns={'Lat': 'lat_mosaiks', 'Lon': 'lon_mosaiks', 'index_right': 'index_mosaiks'},
    inplace=True
)

ea_geo_with_mosaiks.reset_index(inplace=True)
ea_geo_with_mosaiks.ea_id = ea_geo_with_mosaiks.ea_id.astype(int)

In [None]:
ea_geo_with_mosaiks.drop(columns=['geometry', 'BoxLabel']).to_parquet('ea_geo_with_mosaiks.parquet', index=False)

In [13]:
ea_geo_with_mosaiks = pd.read_parquet('ea_geo_with_mosaiks.parquet')

### Load survey data

In [14]:
# https://docs.google.com/spreadsheets/d/1lHoEWEIhl7DR2SwFdHiBnBBuC75SzW39pDIVyNBh3JQ/edit#gid=1019974521
malawi_consumption_conversion_factor = 0.003361742723912196

malawi = None
column_names_to_labels = dict()

# malawi_directory.iterdir():
for file in (
    'HH_MOD_F',
    'HH_MOD_H',
    'HH_MOD_N1',
    'HH_MOD_S2',
    'HH_MOD_T',
    'HH_MOD_X',
    'ag_mod_a',
    'ag_mod_e3',
    'hh_mod_a_filt',
    'ihs5_consumption_aggregate',
    'householdgeovariables_ihs5'
): 
    
    # dataframe = pd.read_csv(malawi_survey_directory / file, low_memory=False)
    # 
    with warnings.catch_warnings():
        warnings.simplefilter('ignore') # TODO: Investigate. Warning thrown from w/in pyreadstat.

        dataframe, metadata =  pyreadstat.read_dta(
                malawi_survey_directory_dta / f'{file}.dta', apply_value_formats=True
        )

    column_names_to_labels.update(metadata.column_names_to_labels)
    
    # print(f'file: {file}, {dataframe.case_id.value_counts().head(10)}')
    if malawi is None:
        malawi = dataframe
    else:
        malawi = malawi.merge(dataframe, on='case_id', how='outer', suffixes=('_left', '_right'))    

        for c in malawi.columns:
            if c.endswith('_left'):
                c_left = c
                base = c_left[:-5]
                c_right = f'{base}_right'

                # sometimes categorical types mess up this check; fail conservatively
                try:
                    match = columns_equal(malawi, c_left, c_right)
                except:
                    match = False
                
                if match:
                    malawi.drop(columns=c_left, inplace=True)
                    malawi.rename(columns={c_right: base}, inplace=True)
                # geographies are sometimes named and sometimes encoded as integers. If we've got one of each,  
                # keep the string name: that way it won't accidentally be treated as numeric later.
                elif (
                    (base in ['region', 'district'])
                    & (
                        pd.api.types.is_numeric_dtype(malawi[c_left]) 
                        + pd.api.types.is_numeric_dtype(malawi[c_right]) 
                        == 1
                      )
                ):
                    if pd.api.types.is_numeric_dtype(malawi[c_left]):
                        malawi.drop(columns=c_left, inplace=True)
                        malawi.rename(columns={c_right: base}, inplace=True)
                    else:
                        malawi.drop(columns=c_right, inplace=True)
                        malawi.rename(columns={c_left: base}, inplace=True)
                else:
                    # print(pd.api.types.is_numeric_dtype(c_left) + pd.api.types.is_numeric_dtype(c_right))
                    print(f'error merging {file}, mismatch in {base}')
                    # TODO: Examine these cases
                    malawi.drop(columns=c_left, inplace=True)
                    malawi.rename(columns={c_right: base}, inplace=True)

# Add Mosaiks columns. 
malawi.ea_id = malawi.ea_id.astype(int)
malawi = malawi.merge(
    ea_geo_with_mosaiks, on='ea_id', how='outer'
)

for c in ('ea_lat_mod', 'ea_lon_mod'):
    malawi.drop(columns=f'{c}_x', inplace=True)
    malawi.rename(columns={f'{c}_y' : c}, inplace=True)

# save to allow exporting parts of this data later
malawi_raw = malawi

# Drop rows that are missing critical fields which we don't want to impute.
malawi.dropna(subset=['HHID', 'rexpaggpc'], inplace=True)

# TODO: Figure out how to detect datetime-like columns automatically
malawi['interviewDate'] = pd.to_datetime(malawi['interviewDate'])

# columns not to be imputed, coerced to numeric, or one-hot encoded.
# summary table won't include these either - for now, this seems fine. 
columns_to_reserve = [
    'HHID', 'case_id', 'hh_wgt', 'interviewDate'
]
malawi_reserved = malawi[columns_to_reserve]
malawi_to_process = malawi[malawi.columns.difference(columns_to_reserve)]

# coerce columns to numeric that can be coerced
for c in malawi_to_process.columns:
    malawi_to_process[c] = pd.to_numeric(malawi_to_process[c], errors='ignore')

# coerce known categorical columns to string
known_categorical = [
    'region', 'district', 'hh_t01', 'hh_t02', 'hh_t03', 'hh_t04'
]
for c in known_categorical:
    malawi_to_process[c] = malawi_to_process[c].astype(str)

# Before imputing or dropping highly-missing columns, summarize columns
missing_counts = malawi_to_process.isnull().sum()+ (malawi_to_process == "").sum()  
means = malawi_to_process.mean(skipna=True, numeric_only=True)
medians = malawi_to_process.median(skipna=True, numeric_only=True)
stds = malawi_to_process.std(skipna=True, numeric_only=True)
summary = pd.concat((missing_counts, means, medians, stds), axis=1)
summary.columns = ['missing_count', 'mean', 'median', 'std']
summary.reset_index(names='covariate', inplace=True)

# Drop highly missing columns.
print(f'pre-dropping: num columns {len(malawi_to_process.columns)}')
threshold = 0.15

missing_percent = missing_counts  / len(malawi_to_process)
dropped_for_missingness = malawi_to_process[missing_percent[missing_percent >= threshold].index].columns
malawi_to_process = malawi_to_process[missing_percent[missing_percent < threshold].index] 
not_dropped_for_missingness = [c for c in malawi_to_process.columns if c not in dropped_for_missingness]

print(f'dropping {len(dropped_for_missingness)} columns')
print(f'post-dropping: num columns {len(malawi_to_process.columns)}')

# Split into numeric and non-numeric columns
malawi_numeric = malawi_to_process.select_dtypes(include=[np.number])
malawi_non_numeric = malawi_to_process.select_dtypes(exclude=[np.number, np.datetime64])

def get_covariate_type(cov):
    
    if cov in malawi_numeric.columns:
        return 'numeric'
    elif cov in malawi_non_numeric.columns:
        return 'categorical'
    else:
        return 'dropped'

summary['type'] = summary['covariate'].apply(get_covariate_type)

# impute missing values
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
imputer.fit(malawi_numeric)

columns = malawi_numeric.columns
malawi_numeric = pd.DataFrame(imputer.transform(malawi_numeric))
malawi_numeric.columns = columns

# one-hot encode categoricals: First, fill missing values.
imputer = SimpleImputer(missing_values=np.nan, strategy='constant', fill_value='MISSING')
imputer.fit(malawi_non_numeric)
imputer.transform(malawi_non_numeric)

# next, one-hot encode
one_hot_encoder = sklearn_preprocessing.OneHotEncoder(
    drop='if_binary', sparse_output=False
).fit(malawi_non_numeric)
encoded_data = one_hot_encoder.transform(malawi_non_numeric)
malawi_non_numeric_encoded = pd.DataFrame(encoded_data)
malawi_non_numeric_encoded.columns = one_hot_encoder.get_feature_names_out()

# Set up a map from original column names to the list of one-hot columns. We'll use it later.
one_hot_map = dict()
for i in range(len(one_hot_encoder.feature_names_in_)):
    
    categories = one_hot_encoder.categories_[i]
    if one_hot_encoder.drop_idx_[i] is not None:
        categories = np.delete(categories, one_hot_encoder.drop_idx_[i])

    one_hot_map[one_hot_encoder.feature_names_in_[i]] = categories

malawi = malawi_reserved.join(malawi_numeric).join(malawi_non_numeric_encoded)

malawi['consumption_ppp_2017'] = malawi.rexpaggpc * malawi_consumption_conversion_factor

# create map from one-hot columns to original columns + values
inverse_one_hot_map = dict()
for feature, categories in one_hot_map.items():
    for category in categories:
        inverse_one_hot_map[f'{feature}_{category}'] = (feature, category)

def interpret_column_name(column_name):

    if column_name in inverse_one_hot_map:
        original_column_name, value = inverse_one_hot_map[column_name]
        return f'Covariate: {column_names_to_labels[original_column_name]}, value: {value}'

    elif column_name in column_names_to_labels:
        return column_names_to_labels[column_name]

    return column_name

summary['description'] = summary.covariate.apply(interpret_column_name)
summary['missing_fraction'] = summary['missing_count'] / len(malawi)

summary.missing_fraction = summary.missing_fraction.round(2)
summary['median'] = summary['median'].round(2)
summary['mean'] = summary['mean'].round(2)
summary['std'] = summary['std'].round(2)

error merging ag_mod_e3, mismatch in HHID
error merging hh_mod_a_filt, mismatch in HHID
error merging ihs5_consumption_aggregate, mismatch in region
error merging ihs5_consumption_aggregate, mismatch in district


  malawi_to_process[c] = pd.to_numeric(malawi_to_process[c], errors='ignore')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  malawi_to_process[c] = pd.to_numeric(malawi_to_process[c], errors='ignore')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  malawi_to_process[c] = malawi_to_process[c].astype(str)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-co

pre-dropping: num columns 4357
dropping 144 columns
post-dropping: num columns 4213


In [93]:
if False:
    
    malawi_raw.to_parquet('/home/selker/eop/data/malawi/malawi_merged.parquet', index=False)

    pd.DataFrame.from_dict(
        column_names_to_labels, orient='index', columns=['description']
    ).reset_index(names='covariate').to_csv('column_description_map.csv', index=False)