In [219]:
import pandas as pd
from pathlib import Path
import numpy as np
import warnings
import math

import dask.dataframe as ddf

import sklearn.linear_model as sklearn_linear_model
import sklearn.metrics as sklearn_metrics
import sklearn.model_selection as sklearn_model_selection
import sklearn.preprocessing as sklearn_preprocessing
import sklearn.feature_selection as sklearn_feature_selection
import sklearn.ensemble as sklearn_ensemble
from sklearn.impute import SimpleImputer

import geopandas as gpd
import dask_geopandas as dgpd

import matplotlib.pyplot as plt
import pyreadstat
from pandas.api.types import is_numeric_dtype

In [2]:
def columns_equal(df, col1, col2):
    c1 = df[col1]
    c2 = df[col2]
    
    if pd.api.types.is_numeric_dtype(c1) and pd.api.types.is_numeric_dtype(c2):
        return np.isclose(c1, c2, rtol=1e-4).all()
    else:
        return (c1 == c2).all()

In [3]:
data_path = Path('/Users/leo/Documents/gpl/eop/data')

malawi_directory = data_path / 'malawi'
malawi_survey_directory_csv = malawi_directory / 'MWI_2019_IHS-V_v06_M_CSV'
malawi_survey_directory_dta = malawi_directory / 'MWI_2019_IHS-V_v06_M_Stata'

mosaiks_directory = data_path / 'mosaiks'

### Load + process Mosaiks data

In [None]:
%%time
def clean_mosaiks_column_name(column_name):
    column_name_stripped = column_name.strip(' .')
    try:
        number = int(column_name_stripped)
    except ValueError:
        if column_name_stripped == '':
            return 'mosaiks_0'
        else:
            return column_name
    else:
        return f'mosaiks_{number}'

mosaiks = ddf.read_csv(str(mosaiks_directory / 'malawi_fine' / '*.csv'))
malawi_outline = gpd.read_file(
    malawi_directory / 'mwi_adm_nso_hotosm_20230405_shp' / 'mwi_admbnda_adm0_nso_hotosm_20230405.shp'
)

mosaiks = dgpd.from_dask_dataframe(
    mosaiks, dgpd.points_from_xy(mosaiks, 'Lon', 'Lat')
)

# this data covers a box containing Malawi; filter down to the points actually within the country.
mosaiks = mosaiks[mosaiks.geometry.within(malawi_outline.iloc[0].geometry)]

mosaiks.columns = mosaiks.columns.map(clean_mosaiks_column_name)

geo_vars, _ = pyreadstat.read_dta(
    malawi_survey_directory_dta / 'householdgeovariables_ihs5.dta'
)

# associate a moasiks tile with each enumeration area
ea_geo = geo_vars.groupby('ea_id').first()[['ea_lat_mod', 'ea_lon_mod']]
ea_geo = gpd.GeoDataFrame(ea_geo, geometry = gpd.points_from_xy(x=ea_geo.ea_lon_mod, y=ea_geo.ea_lat_mod))

mosaiks_computed = mosaiks.compute()

In [136]:
mosaiks_grid_size = 1
max_distance = math.sqrt(2 * (mosaiks_grid_size / 2)**2)

ea_geo_with_mosaiks = gpd.sjoin_nearest(
    left_df=ea_geo, right_df=mosaiks_computed, how='left', max_distance=mosaiks_grid_size
)

ea_geo_with_mosaiks.rename(
    columns={'Lat': 'lat_mosaiks', 'Lon': 'lon_mosaiks', 'index_right': 'index_mosaiks'},
    inplace=True
)

ea_geo_with_mosaiks.reset_index(names='ea_id', inplace=True)
ea_geo_with_mosaiks.ea_id = ea_geo_with_mosaiks.ea_id.astype(int)

In [160]:
ea_geo_with_mosaiks.drop(columns=['geometry', 'BoxLabel']).to_parquet('ea_geo_with_mosaiks.parquet', index=False)

In [161]:
ea_geo_with_mosaiks = pd.read_parquet('ea_geo_with_mosaiks.parquet')

### Load survey data

In [205]:
# https://docs.google.com/spreadsheets/d/1lHoEWEIhl7DR2SwFdHiBnBBuC75SzW39pDIVyNBh3JQ/edit#gid=1019974521
malawi_consumption_conversion_factor = 0.01406191874

malawi = None
column_names_to_labels = dict()

# malawi_directory.iterdir():
for file in (
    'HH_MOD_F',
    'HH_MOD_H',
    'HH_MOD_N1',
    'HH_MOD_S2',
    'HH_MOD_T',
    'HH_MOD_X',
    'ag_mod_a',
    'ag_mod_e3',
    'hh_mod_a_filt',
    'ihs5_consumption_aggregate',
    'householdgeovariables_ihs5'
): 
    
    # dataframe = pd.read_csv(malawi_survey_directory / file, low_memory=False)
    # 
    with warnings.catch_warnings():
        warnings.simplefilter('ignore') # TODO: Investigate

        dataframe, metadata =  pyreadstat.read_dta(
                malawi_survey_directory_dta / f'{file}.dta', apply_value_formats=True
        )

    column_names_to_labels.update(metadata.column_names_to_labels)
    
    # print(f'file: {file}, {dataframe.case_id.value_counts().head(10)}')
    if malawi is None:
        malawi = dataframe
    else:
        malawi = malawi.merge(dataframe, on='case_id', how='outer', suffixes=('_left', '_right'))    

        for c in malawi.columns:
            if c.endswith('_left'):
                c_left = c
                base = c_left[:-5]
                c_right = f'{base}_right'

                # sometimes categorical types mess up this check; fail conservatively
                try:
                    match = columns_equal(malawi, c_left, c_right)
                except:
                    match = False
                
                if match:
                    malawi.drop(columns=c_left, inplace=True)
                    malawi.rename(columns={c_right: base}, inplace=True)
                # geographies are sometimes named and sometimes encoded as integers. If we've got one of each,  
                # keep the string name: that way it won't accidentally be treated as numeric later.
                elif (
                    (base in ['region', 'district'])
                    & (
                        pd.api.types.is_numeric_dtype(malawi[c_left]) 
                        + pd.api.types.is_numeric_dtype(malawi[c_right]) 
                        == 1
                      )
                ):
                    if pd.api.types.is_numeric_dtype(malawi[c_left]):
                        malawi.drop(columns=c_left, inplace=True)
                        malawi.rename(columns={c_right: base}, inplace=True)
                    else:
                        malawi.drop(columns=c_right, inplace=True)
                        malawi.rename(columns={c_left: base}, inplace=True)
                else:
                    # print(pd.api.types.is_numeric_dtype(c_left) + pd.api.types.is_numeric_dtype(c_right))
                    print(f'error merging {file}, mismatch in {base}')
                    # TODO: Examine these cases
                    malawi.drop(columns=c_left, inplace=True)
                    malawi.rename(columns={c_right: base}, inplace=True)

# Drop rows that are missing critical fields which we don't want to impute.
malawi.dropna(subset=['HHID', 'rexpaggpc'], inplace=True)

# TODO: Figure out how to detect datetime-like columns automatically
malawi['interviewDate'] = pd.to_datetime(malawi['interviewDate'])

# columns not to be imputed, coerced to numeric, or one-hot encoded.
# summary table won't include these either - for now, this seems fine. 
columns_to_reserve = [
    'HHID', 'case_id', 'hh_wgt', 'interviewDate'
]
malawi_reserved = malawi[columns_to_reserve]
malawi_to_process = malawi[malawi.columns.difference(columns_to_reserve)]

# coerce columns to numeric that can be coerced
for c in malawi_to_process.columns:
    malawi_to_process[c] = pd.to_numeric(malawi_to_process[c], errors='ignore')

# coerce known categorical columns to string
known_categorical = [
    'region', 'district', 'hh_t01', 'hh_t02', 'hh_t03', 'hh_t04'
]
for c in known_categorical:
    malawi_to_process[c] = malawi_to_process[c].astype(str)

# Add Mosaiks columns. 
malawi_to_process.ea_id = malawi_to_process.ea_id.astype(int)
malawi_to_process = malawi_to_process.merge(
    ea_geo_with_mosaiks, on='ea_id', how='outer'
)

# Before imputing or dropping highly-missing columns, summarize columns
missing_counts = malawi_to_process.isnull().sum()+ (malawi_to_process == "").sum()  
means = malawi_to_process.mean(skipna=True, numeric_only=True)
medians = malawi_to_process.median(skipna=True, numeric_only=True)
stds = malawi_to_process.std(skipna=True, numeric_only=True)
summary = pd.concat((missing_counts, means, medians, stds), axis=1)
summary.columns = ['missing_count', 'mean', 'median', 'std']
summary.reset_index(names='covariate', inplace=True)

# Drop highly missing columns.
print(f'pre-dropping: num columns {len(malawi_to_process.columns)}')
threshold = 0.15

missing_percent = missing_counts  / len(malawi_to_process)
dropped_for_missingness = malawi_to_process[missing_percent[missing_percent >= threshold].index].columns
malawi_to_process = malawi_to_process[missing_percent[missing_percent < threshold].index] 
not_dropped_for_missingness = [c for c in malawi_to_process.columns if c not in dropped_for_missingness]

print(f'dropping {len(dropped_for_missingness)} columns')
print(f'post-dropping: num columns {len(malawi_to_process.columns)}')

# Split into numeric and non-numeric columns
malawi_numeric = malawi_to_process.select_dtypes(include=[np.number])
malawi_non_numeric = malawi_to_process.select_dtypes(exclude=[np.number, np.datetime64])

# impute missing values
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
imputer.fit(malawi_numeric)

columns = malawi_numeric.columns
malawi_numeric = pd.DataFrame(imputer.transform(malawi_numeric))
malawi_numeric.columns = columns

# one-hot encode categoricals: First, fill missing values.
imputer = SimpleImputer(missing_values=np.nan, strategy='constant', fill_value='MISSING')
imputer.fit(malawi_non_numeric)
imputer.transform(malawi_non_numeric)

one_hot_encoder = sklearn_preprocessing.OneHotEncoder(
    drop='if_binary', sparse_output=False
).fit(malawi_non_numeric)
encoded_data = one_hot_encoder.transform(malawi_non_numeric)
malawi_non_numeric_encoded = pd.DataFrame(encoded_data)
malawi_non_numeric_encoded.columns = one_hot_encoder.get_feature_names_out()

# Set up a map from original column names to the list of one-hot columns. We'll use it later.
one_hot_map = dict()
for i in range(len(one_hot_encoder.feature_names_in_)):
    
    categories = one_hot_encoder.categories_[i]
    if one_hot_encoder.drop_idx_[i] is not None:
        categories = np.delete(categories, one_hot_encoder.drop_idx_[i])

    one_hot_map[one_hot_encoder.feature_names_in_[i]] = categories

malawi = malawi_reserved.join(malawi_numeric).join(malawi_non_numeric_encoded)

malawi['consumption_ppp_2017'] = malawi.rexpaggpc * malawi_consumption_conversion_factor

# create map from one-hot columns to original columns + values
inverse_one_hot_map = dict()
for feature, categories in one_hot_map.items():
    for category in categories:
        inverse_one_hot_map[f'{feature}_{category}'] = (feature, category)

def interpret_column_name(column_name):

    if column_name is None or column_name.startswith('mosaiks'):
        return column_name

    elif column_name in inverse_one_hot_map:
        original_column_name, value = inverse_one_hot_map[column_name]
        return f'Covariate: {column_names_to_labels[original_column_name]}, value: {value}'

    return column_names_to_labels[column_name]

summary['description'] = summary.covariate.apply(interpret_column_name)
summary['missing_fraction'] = summary['missing_count'] / len(malawi)

summary.missing_fraction = summary.missing_fraction.round(2)
summary['median'] = summary['median'].round(2)
summary['mean'] = summary['mean'].round(2)
summary['std'] = summary['std'].round(2)

error merging ag_mod_e3, mismatch in HHID
error merging hh_mod_a_filt, mismatch in HHID
error merging ihs5_consumption_aggregate, mismatch in region
error merging ihs5_consumption_aggregate, mismatch in district


  malawi_to_process[c] = pd.to_numeric(malawi_to_process[c], errors='ignore')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  malawi_to_process[c] = pd.to_numeric(malawi_to_process[c], errors='ignore')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  malawi_to_process[c] = malawi_to_process[c].astype(str)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-co

pre-dropping: num columns 4359
dropping 144 columns
post-dropping: num columns 4215


KeyError: 'ea_lat_mod_x'

### Use forward selection to select covariates

In [201]:
columns_to_exclude = {
    'hh_a02a', 
    'hh_a03', 
    'hh_a23', 
    'hh_a22', 
    'interviewDate', 
    'consumption_ppp_2017', 
    'hh_f18', # total value of firewood you used in the last week,
    'index_mosaiks'
}

_, consumption_metadata = pyreadstat.read_dta(
        malawi_survey_directory_dta / 'ihs5_consumption_aggregate.dta', metadataonly=True
)
consumption_columns_excluded = (
    set(consumption_metadata.column_names)
    # we DO want region and district included - remove them from the excluded set
    - {'region', 'district'} 
)


columns_excluded = consumption_columns_excluded | columns_to_exclude

# exclude the corresponding one-hot-encoded columns
columns_excluded_one_hot = []
for column in columns_excluded:
    if column in one_hot_map:
        for category in one_hot_map[column]:
            columns_excluded_one_hot.append(f'{column}_{category}')

columns_excluded_one_hot = set(columns_excluded_one_hot)
malawi_covariate_columns = (
    set(malawi.columns)
    - (columns_excluded | columns_excluded_one_hot)
)

def forward_select_features(dataset, candidate_covariates, num_to_select=30, random_state=None):
    # split into selection and calibration. 
    # Then run the forward selection by fitting models on selection + testing them on calibration. Ultimately test
    # the whole thing on eval.
    
    selection, calibration = sklearn_model_selection.train_test_split(
        dataset, test_size=0.25, random_state=random_state
    )
    selected_covariate_list = []
    
    # first elements describe the null model.
    mses_cumulative = [calibration.consumption_ppp_2017.std() ** 2]
    r2s_cumulative = [0]
    r2s_univariate = [np.nan]
    
    unselected_covariates = candidate_covariates.copy()
    
    while((unselected_covariates is not None) and (len(selected_covariate_list) < num_to_select)):
        
        best_mse_this_step = np.inf
        best_covariate_this_step = None
        cumulative_r2_this_step = None
    
        for c in unselected_covariates:
        
            covariates_to_try = selected_covariate_list + [c]
            lr = sklearn_linear_model.LinearRegression()
            lr.fit(
                selection[covariates_to_try], 
                selection.consumption_ppp_2017,
                sample_weight=selection.hh_wgt
            )
            
            # Make predictions on test data
            y_pred = lr.predict(calibration[covariates_to_try])
            
            # Compute MSE 
            mse = sklearn_metrics.mean_squared_error(
                calibration.consumption_ppp_2017, y_pred, sample_weight=calibration.hh_wgt
            )
            cumulative_r2 = sklearn_metrics.r2_score(
                calibration.consumption_ppp_2017, y_pred, sample_weight=calibration.hh_wgt
            )
            if mse < best_mse_this_step:
                best_mse_this_step = mse
                cumulative_r2_this_step = cumulative_r2
                best_covariate_this_step = c
        
        if best_mse_this_step < mses_cumulative[-1]:
            selected_covariate_list.append(best_covariate_this_step)
            mses_cumulative.append(best_mse_this_step)
            r2s_cumulative.append(cumulative_r2_this_step)
            unselected_covariates.remove(best_covariate_this_step)
    
        else:
            break

    selected_covariates = pd.DataFrame(
        # add 'none' in front to indicate variance pre-covariates
        np.array([[None] + selected_covariate_list, mses_cumulative, r2s_cumulative]).transpose(), 
        columns=['Covariate', 'Cumulative Model MSE', 'Cumulative Model r2']
    )
    selected_covariates['Description'] = selected_covariates.Covariate.apply(interpret_column_name)
    selected_covariates['Cumulative Model MSE'] = (
        selected_covariates['Cumulative Model MSE'].astype(float).round(1)
    )
    selected_covariates['Cumulative Model r2'] = (
        selected_covariates['Cumulative Model r2'].astype(float).round(3)
    )
    
    return selected_covariates

In [171]:
selected_covariates = forward_select_features(
    malawi, malawi_covariate_columns, num_to_select = 30, random_state=11
)

In [None]:
def add_univariate_r2(selected_covariates):
    # drop the "none"
    selected_covariate_list = selected_covariates.Covariate[1:]
    
    r2s_univariate = [np.nan]
    
    for selected_covariate in selected_covariate_list:
        lr = sklearn_linear_model.LinearRegression()
        lr.fit(
            malawi[[selected_covariate]], 
            malawi.consumption_ppp_2017,
            sample_weight=malawi.hh_wgt
        )
        # Make predictions on test data
        y_pred = lr.predict(malawi[[selected_covariate]])
        
        r2 = sklearn_metrics.r2_score(
            malawi.consumption_ppp_2017, y_pred, sample_weight=malawi.hh_wgt
        )
        r2s_univariate.append(r2)
    
    selected_covariates['Univariate r2'] = r2s_univariate
    selected_covariates['Univariate r2'] = selected_covariates['Univariate r2'].astype(float).round(3)

    return selected_covariates

In [None]:
for_display = add_univariate_r2(selected_covariates)
with pd.option_context('display.max_rows', 200, 'display.max_colwidth', 1):

    display(
        for_display[['Covariate', 'Description', 'Cumulative Model MSE', 'Cumulative Model r2', 'Univariate r2']]
        .set_index('Covariate')
    )

In [None]:
mosaiks_columns = [c for c in malawi.columns if c.startswith('mosaiks')]
selected_covariates_mosaiks = forward_select_features(
    malawi, mosaiks_columns, num_to_select = 10, random_state=11
)

In [None]:
selected_covariates_mosaiks

In [202]:
malawi_covariate_columns_no_mosaiks = [
    c for c in malawi_covariate_columns if not c.startswith('mosaiks_')
]
selected_covariates_no_mosaiks = forward_select_features(
    malawi, malawi_covariate_columns_no_mosaiks, num_to_select = 30, random_state=11
)

In [204]:
for_display = add_univariate_r2(selected_covariates_no_mosaiks)
with pd.option_context('display.max_rows', 200, 'display.max_colwidth', 1):

    display(
        for_display[['Covariate', 'Description', 'Cumulative Model MSE', 'Cumulative Model r2', 'Univariate r2']]
        .set_index('Covariate')
    )

Unnamed: 0_level_0,Description,Cumulative Model MSE,Cumulative Model r2,Univariate r2
Covariate,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
,,15209406.5,0.0,
hh_t10_BED &amp; MATTRESS,"Covariate: What do you (HH HEAD) sleep on?, value: BED &amp; MATTRESS",9246612.1,0.173,0.101
hh_f12_ELECTRICITY,"Covariate: What is your main source of cooking fuel?, value: ELECTRICITY",8097828.2,0.276,0.127
hh_f41_Flush to septic tank,"Covariate: What kind of toilet facility does your household use?, value: Flush to septic tank",7528403.6,0.327,0.139
hh_f12_CHARCOAL,"Covariate: What is your main source of cooking fuel?, value: CHARCOAL",7067763.1,0.368,0.055
hh_t03_It was less than adequate for household needs,"Covariate: Concerning your household's clothing, which of the following is true?, value: It was less than adequate for household needs",6860562.5,0.387,0.056
hh_t19_NO,"Covariate: ..HH were hungry but did not eat b'se not enough money/resources for food?, value: NO",6751474.4,0.396,0.074
hh_t04_It was more than adequate for household needs,"Covariate: Concerning the standard of health care you receive for HH members which is true, value: It was more than adequate for household needs",6667698.6,0.404,0.032
hh_f01_OWNED,"Covariate: Do you own or are purchasing this property, is it provided to you by an emp, value: OWNED",6599528.7,0.41,0.035
hh_h02d,"In the past 7 days, ...? (Restrict consumption by adults in order for small chil",6531459.2,0.416,0.015


### Geo-only model using district

In [259]:
district_columns = [c for c in malawi.columns if 'district' in c]
lr = sklearn_linear_model.LinearRegression()
cross_val_score = sklearn_model_selection.cross_val_score(
    lr,
    X=malawi[district_columns], 
    y=malawi.consumption_ppp_2017,
    fit_params={'sample_weight': malawi.hh_wgt},
    scoring='r2',
    cv=5
)
display(cross_val_score.mean())

0.12262191166535308

### Mosaiks-only models

In [217]:
mosaiks_columns = [c for c in malawi.columns if c.startswith('mosaiks')]
lr = sklearn_linear_model.LinearRegression()
cross_val_score_lr = sklearn_model_selection.cross_val_score(
    lr,
    X=malawi[mosaiks_columns], 
    y=malawi.consumption_ppp_2017,
    params={'sample_weight': malawi.hh_wgt},
    scoring='r2',
    cv=5
)

In [None]:
lasso = sklearn_linear_model.Lasso(max_iter=5000)
alphas = [1, 3, 10, 30, 100, 300, 1000, 3000]

lasso_grid_search = sklearn_model_selection.GridSearchCV(
    lasso,
    {'alpha': alphas},
    scoring='r2',
    cv=5
)
lasso_grid_search.fit(    
    X=malawi[mosaiks_columns], 
    y=malawi.consumption_ppp_2017,
    sample_weight=malawi.hh_wgt
)

In [214]:
display(lasso_grid_search.best_score_)

0.014044557528915248

In [227]:
"""
class sklearn.ensemble.GradientBoostingClassifier(*, loss='squared_error', learning_rate=0.1, n_estimators=100,
subsample=1.0, criterion='friedman_mse', min_samples_split=2, min_samples_leaf=1, min_weight_fraction_leaf=0.0,
max_depth=3, min_impurity_decrease=0.0, init=None, random_state=None, max_features=None, alpha=0.9, verbose=0, 
max_leaf_nodes=None, warm_start=False, validation_fraction=0.1, n_iter_no_change=None, tol=0.0001, ccp_alpha=0.0
)

Hyperparams for classifier, probably similar to regressor:
['ccp_alpha', 'criterion', 'init', 'learning_rate', 'loss', 'max_depth', 'max_features', 'max_leaf_nodes', 
'min_impurity_decrease', 'min_samples_leaf', 'min_samples_split', 'min_weight_fraction_leaf', 'n_estimators', 
'n_iter_no_change', 'random_state', 'subsample', 'tol', 'validation_fraction', 'verbose', 'warm_start'].
"""
gb_hyperparameters_from_cider = {
    'min_samples_leaf': [20],# [10, 20, 50],
    'max_leaf_nodes': [10],# [ 5, 10, 20 ],
    'learning_rate': [.1],#[ 0.05, 0.075, 0.1 ],
    'n_estimators': [100],#[ 50, 100, 200 ]
}
gb_classifier = sklearn_ensemble.GradientBoostingRegressor()
gb_grid_search = sklearn_model_selection.GridSearchCV(
    gb_classifier, gb_hyperparameters_from_cider, scoring='r2', cv=5, verbose=1
)

gb_grid_search.fit(    
    X=malawi[mosaiks_columns], 
    y=malawi.consumption_ppp_2017,
    sample_weight=malawi.hh_wgt
)

Fitting 5 folds for each of 1 candidates, totalling 5 fits


In [230]:
gb_grid_search.cv_results_

{'mean_fit_time': array([216.19630632]),
 'std_fit_time': array([1.72899845]),
 'mean_score_time': array([0.10447245]),
 'std_score_time': array([0.01442885]),
 'param_learning_rate': masked_array(data=[0.1],
              mask=[False],
        fill_value='?',
             dtype=object),
 'param_max_leaf_nodes': masked_array(data=[10],
              mask=[False],
        fill_value='?',
             dtype=object),
 'param_min_samples_leaf': masked_array(data=[20],
              mask=[False],
        fill_value='?',
             dtype=object),
 'param_n_estimators': masked_array(data=[100],
              mask=[False],
        fill_value='?',
             dtype=object),
 'params': [{'learning_rate': 0.1,
   'max_leaf_nodes': 10,
   'min_samples_leaf': 20,
   'n_estimators': 100}],
 'split0_test_score': array([-0.1082747]),
 'split1_test_score': array([0.00374832]),
 'split2_test_score': array([0.03941962]),
 'split3_test_score': array([-0.16596888]),
 'split4_test_score': array([-0.47200

### Univariate r2s for all covariates considered

In [214]:
r2s_univariate = []
malawi_covariate_columns_list = list(malawi_covariate_columns)
for selected_covariate in malawi_covariate_columns_list:
    lr = sklearn_linear_model.LinearRegression()
    lr.fit(
        malawi[[selected_covariate]], 
        malawi.consumption_ppp_2017,
        sample_weight=malawi.hh_wgt
    )
    # Make predictions on test data
    y_pred = lr.predict(malawi[[selected_covariate]])
    
    r2 = sklearn_metrics.r2_score(
        malawi.consumption_ppp_2017, y_pred, sample_weight=malawi.hh_wgt
    )
    r2s_univariate.append(r2)

In [226]:
all_univariate_r2s = pd.DataFrame(
    data=np.array([malawi_covariate_columns_list, r2s_univariate]).transpose(),
    columns=['covariate', 'univariate_r2']
)
all_univariate_r2s.univariate_r2 = all_univariate_r2s.univariate_r2.astype(float)
display(all_univariate_r2s.sort_values('univariate_r2', ascending=False).head(50))

Unnamed: 0,covariate,univariate_r2
1532,hh_f03a,0.340331
827,hh_f37,0.338343
2542,hh_f41_Flush to septic tank,0.138543
4172,hh_f19_YES,0.131945
615,hh_f27_nan,0.131861
299,hh_f11_ELECTRICITY,0.130127
1086,hh_f12_ELECTRICITY,0.126646
45,hh_f36_PIPED INTO DWELLING,0.122557
119,hh_f12_GAS,0.115442
3829,hh_f36_1_PIPED INTO DWELLING,0.112674


### Print summaries of covariates

In [None]:
# We determine what is included by omitting what's not included. This approach handles one-hot encoded
# columns correctly.

covariates_considered = [c for c in not_dropped_for_missingness if c not in columns_excluded]
with pd.option_context('display.max_rows', 300, 'display.max_colwidth', 1):

    display(
        summary[
            summary.covariate.isin(covariates_considered)
        ]
        [['covariate', 'description', 'missing_fraction', 'mean', 'median', 'std']]
    )

In [None]:
with pd.option_context('display.max_rows', 300, 'display.max_colwidth', 1):

    display(
        summary[
            (summary.covariate.isin(columns_excluded))
            & (~summary.covariate.isin(dropped_for_missingness))
        ]
        [['covariate', 'description', 'missing_fraction', 'mean', 'median', 'std']]
    )

In [None]:
with pd.option_context('display.max_rows', 200, 'display.max_colwidth', 1):

    display(
        summary[summary.covariate.isin(dropped_for_missingness)]
        [['covariate', 'description', 'missing_fraction', 'mean', 'median', 'std']]
    )

## Mosaiks preprocessing

#### Preparing map query

In [329]:
malawi_geo = gpd.read_file(
    malawi_directory / 'mwi_adm_nso_hotosm_20230405_shp' / 'mwi_admbnda_adm2_nso_hotosm_20230405.shp'
)

In [19]:
num_y_partitions = 3
ys = np.linspace(bounds.miny, bounds.maxy, num_y_partitions + 1)
for i in range(num_y_partitions):
    print(ys[i], ys[i+1])

-17.12974811999993 -14.542280706999938
-14.542280706999938 -11.954813293999944
-11.954813293999944 -9.367345880999949


#### Fine-grained mosaiks

In [121]:
mosaiks = ddf.read_csv(mosaiks_path / 'malawi_fine' / '*.csv')

In [122]:
mosaiks = dgpd.from_dask_dataframe(
    mosaiks, dgpd.points_from_xy(mosaiks, 'Lon', 'Lat')
)

In [142]:
geo_mosaiks = mosaiks.sjoin(malawi_geo, predicate='within', how='inner')

Use `to_crs()` to reproject one of the input geometries to match the CRS of the other.

Left CRS: None
Right CRS: EPSG:4326

  meta = geopandas.sjoin(left._meta, right._meta, how=how, predicate=predicate)


#### Coarse-grained mosaiks

unusable - for now, it appears Africa's mosaiks files are incomplete.

In [8]:
mosaiks = ddf.read_csv(mosaiks_directory / 'coarse' / '*.csv')

malawi_geo = gpd.read_file(
    malawi_directory / 'mwi_adm_nso_hotosm_20230405_shp' / 'mwi_admbnda_adm0_nso_hotosm_20230405.shp'
)
bounds = malawi_geo.bounds.iloc[0]

In [10]:
mosaiks.lon.max().compute()

32.65

In [345]:
mosaiks_bounded = mosaiks[
    (mosaiks.lon >= bounds.minx)
    & (mosaiks.lon <= bounds.maxx)
    & (mosaiks.lat >= bounds.miny)
    & (mosaiks.lat <= bounds.maxx)
]

In [346]:
mosaiks_bounded_computed = mosaiks_bounded.compute()

In [6]:
mosaiks = dgpd.from_dask_dataframe(
    mosaiks, dgpd.points_from_xy(mosaiks, 'lon', 'lat')
)

NameError: name 'mosaiks' is not defined

In [357]:
computed = mosaiks[['geometry']].compute()

# .to_file(data_directory / 'mosaiks_output' / 'africa_coarse_location_only.shp')

In [360]:
computed.to_file(mosaiks_directory / 'output' / 'africa_coarse_location_only.shp')

In [5]:
for file in (mosaiks_directory / 'coarse').glob('*.csv'):
    df = ddf.read_csv(file)
    df = dgpd.from_dask_dataframe(
        df, dgpd.points_from_xy(df, 'lon', 'lat')
    )
    df[['geometry']].compute().to_file(mosaiks_directory / 'output' / f'{file.name}_location_only.shp')

  df[name] = val


In [326]:
mosaiks_in_malawi = mosaiks.sjoin(malawi_geo, predicate='within', how='inner')

Use `to_crs()` to reproject one of the input geometries to match the CRS of the other.

Left CRS: None
Right CRS: EPSG:4326

  meta = geopandas.sjoin(left._meta, right._meta, how=how, predicate=predicate)


In [327]:
mosaiks_in_malawi_computed = mosaiks_in_malawi.compute()

In [328]:
len(mosaiks_in_malawi_computed)

0

In [320]:
len(mosaiks_in_malawi_computed)

77910

In [298]:
mosaiks_in_malawi = mosaiks[mosaiks.geometry.within(malawi_geo.iloc[0].geometry)]
mosaiks_in_malawi_computed = mosaiks_in_malawi.compute()

In [311]:
mosaiks = ddf.read_csv(str(mosaiks_directory / 'malawi_fine' / '*.csv'))

mosaiks = dgpd.from_dask_dataframe(
    mosaiks, dgpd.points_from_xy(mosaiks, 'Lon', 'Lat')
)


In [315]:
mosaiks_head = mosaiks.head(10)

In [316]:
mosaiks_head[
    mosaiks_head.geometry.within(malawi_geo.iloc[0].geometry)
]

Unnamed: 0,Lat,Lon,BoxLabel,Unnamed: 4,.1,.2,.3,.4,.5,.6,...,.3991,.3992,.3993,.3994,.3995,.3996,.3997,.3998,.3999,geometry
0,-11.955,33.635,Malawi 2,0.170008,0.63444,0.019116,0.68551,0.338629,0.35248,0.104404,...,0.0329,0.193308,0.135092,0.13475,1.062124,1.219159,0.709582,0.748263,0.024786,POINT (33.63500 -11.95500)
1,-13.775,33.635,Malawi 2,0.082285,0.367268,0.010162,0.462284,0.178959,0.206822,0.057531,...,0.023203,0.126262,0.081328,0.068765,0.68804,0.776211,0.482071,0.530758,0.019135,POINT (33.63500 -13.77500)
2,-13.785,33.635,Malawi 2,0.070622,0.323596,0.008341,0.373956,0.153765,0.22834,0.087386,...,0.019909,0.122797,0.088507,0.055622,0.663131,0.707385,0.434476,0.42967,0.034416,POINT (33.63500 -13.78500)
3,-13.795,33.635,Malawi 2,0.073091,0.405169,0.003787,0.676295,0.168738,0.147581,0.047011,...,0.006762,0.046159,0.034311,0.035998,0.760553,0.454671,0.359011,0.611862,0.006365,POINT (33.63500 -13.79500)
4,-13.805,33.635,Malawi 2,0.049633,0.28821,0.005367,0.505401,0.112409,0.136541,0.047413,...,0.012273,0.079417,0.051379,0.030846,0.618406,0.463685,0.269003,0.485262,0.011441,POINT (33.63500 -13.80500)
5,-13.815,33.635,Malawi 2,0.071722,0.339016,0.007961,0.446428,0.153106,0.184042,0.058747,...,0.014938,0.096399,0.068643,0.047508,0.640277,0.632705,0.355021,0.464346,0.017225,POINT (33.63500 -13.81500)
6,-13.825,33.635,Malawi 2,0.078247,0.317984,0.009977,0.400218,0.157336,0.195162,0.06625,...,0.018159,0.087256,0.076097,0.056665,0.668922,0.600207,0.415302,0.477198,0.01744,POINT (33.63500 -13.82500)
7,-13.835,33.635,Malawi 2,0.063673,0.324298,0.007354,0.453272,0.141424,0.151106,0.055621,...,0.014669,0.074235,0.059847,0.051411,0.653411,0.548116,0.388714,0.483854,0.016156,POINT (33.63500 -13.83500)
8,-13.845,33.635,Malawi 2,0.070136,0.339884,0.008587,0.333813,0.15246,0.18406,0.065987,...,0.016783,0.11347,0.079152,0.060439,0.613129,0.730515,0.479701,0.403412,0.020996,POINT (33.63500 -13.84500)
9,-13.855,33.635,Malawi 2,0.06669,0.345752,0.005758,0.523793,0.156903,0.144562,0.060406,...,0.009037,0.070228,0.052238,0.039818,0.652444,0.482536,0.337546,0.499016,0.010873,POINT (33.63500 -13.85500)


# Unused

In [232]:
selected_covariate_list = selected_covariates_old

['hh_f11_ELECTRICITY',
 'hh_f12_GAS',
 'hh_f12_ELECTRICITY',
 'hh_t17_YES',
 'hh_f12_CHARCOAL',
 'hh_f36_PIPED INTO DWELLING',
 'hh_t10_BED &amp; MATTRESS',
 'hh_g09_YES',
 'hh_h04_YES',
 'hh_t03_It was more than adequate for household needs',
 'hh_f01_OWNED',
 'hh_h02d',
 'hh_t14_YES',
 'region_North',
 'hh_t07',
 'hh_f41_4_NO',
 'hh_t04_It was more than adequate for household needs',
 'hh_t11_BLANKET &amp; SHEETS',
 'hh_h01_YES',
 'hh_f52_YES',
 'mosaiks_2712',
 'district_Zomba',
 'district_Blantyre',
 'district_Chiradzulu',
 'hh_h03a',
 'hh_o0a_YES',
 'hh_f07_CONCRETE',
 'hh_f09_OTHER(SPECIFY)',
 'hh_f09_SAND',
 'hh_t03_It was less than adequate for household needs']

In [237]:
r2s_univariate = [np.nan]
for selected_covariate in selected_covariate_list:
    lr = sklearn_linear_model.LinearRegression()
    lr.fit(
        malawi[[selected_covariate]], 
        malawi.consumption_ppp_2017,
        sample_weight=malawi.hh_wgt
    )
    # Make predictions on test data
    y_pred = lr.predict(malawi[[selected_covariate]])
    
    r2 = sklearn_metrics.r2_score(
        malawi.consumption_ppp_2017, y_pred, sample_weight=malawi.hh_wgt
    )
    r2s_univariate.append(r2)

selected_covariates = pd.DataFrame(
    # add 'none' in front to indicate variance pre-covariates
    np.array([[None] + selected_covariate_list, mses_cumulative, r2s_cumulative, r2s_univariate]).transpose(), 
    columns=['Covariate', 'Cumulative Model MSE', 'Cumulative Model r2', 'Univariate r2']
)

selected_covariates['Description'] = selected_covariates.Covariate.apply(interpret_column_name)
selected_covariates['Cumulative Model MSE'] = selected_covariates['Cumulative Model MSE'].astype(float).round(1)
selected_covariates['Cumulative Model r2'] = selected_covariates['Cumulative Model r2'].astype(float).round(3)
selected_covariates['Univariate r2'] = selected_covariates['Univariate r2'].astype(float).round(3)

with pd.option_context('display.max_rows', 200, 'display.max_colwidth', 1):

    display(
        selected_covariates[['Covariate', 'Description', 'Cumulative Model MSE', 'Cumulative Model r2', 'Univariate r2']]
        .set_index('Covariate')
    )

Unnamed: 0_level_0,Description,Cumulative Model MSE,Cumulative Model r2,Univariate r2
Covariate,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
,,18244076.8,0.0,
hh_f11_ELECTRICITY,"Covariate: What is your main source of lighting fuel?, value: ELECTRICITY",11732814.1,0.195,0.13
hh_f12_GAS,"Covariate: What is your main source of cooking fuel?, value: GAS",9920509.3,0.32,0.115
hh_f12_ELECTRICITY,"Covariate: What is your main source of cooking fuel?, value: ELECTRICITY",8931944.7,0.388,0.127
hh_t17_YES,"Covariate: ..HH ate less than you thought you sh'd b'se of a lack of money/other resources?, value: YES",8226515.5,0.436,0.09
hh_f12_CHARCOAL,"Covariate: What is your main source of cooking fuel?, value: CHARCOAL",7895538.4,0.459,0.055
hh_f36_PIPED INTO DWELLING,"Covariate: What is your main source of drinking water?, value: PIPED INTO DWELLING",7653646.6,0.475,0.123
hh_t10_BED &amp; MATTRESS,"Covariate: What do you (HH HEAD) sleep on?, value: BED &amp; MATTRESS",7464199.9,0.488,0.101
hh_g09_YES,"Covariate: Over the past one week (7 days), did any people that you did nonlist as househol, value: YES",7338646.5,0.497,0.002
hh_h04_YES,"Covariate: ..12 months..faced with a situation when did not have enough food to feed the hh, value: YES",7193763.0,0.507,0.066


In [823]:
dropped_for_missingness

Index(['ag_e27a', 'ag_e27b', 'ag_e27c', 'ag_e27d', 'ag_e27e', 'ag_e27f',
       'ag_e27g', 'ag_e27h', 'ag_e28', 'ag_e29a',
       ...
       'hh_s16_oth', 'hh_s16a', 'hh_s16b', 'hh_s17', 'hh_s17_oth',
       'hh_s19_oth', 'hh_s19a', 'hh_s19b', 'hh_t10_oth', 'hh_t12_oth'],
      dtype='object', length=144)

In [673]:
# Using sklearn: Haven't figured out how to incorporate sample weights.
linear_regression = sklearn_linear_model.LinearRegression()
sfs = sklearn_feature_selection.SequentialFeatureSelector(
    linear_regression, n_features_to_select=10
)
sfs.fit(
    selection[list(malawi_covariate_columns)], 
    selection.consumption_ppp_2017,
    sample_weights=selection.hh_wgt
)

TypeError: fit() got an unexpected keyword argument 'sample_weights'

In [656]:
sklearn_selected_covariates = pd.DataFrame(
    sfs.get_feature_names_out(), 
    columns=['covariate']
)

sklearn_selected_covariates['description'] = sklearn_selected_covariates.covariate.apply(interpret_column_name)
# selected_covariates.mse = selected_covariates.mse.astype(float).round(1)