### Imports

In [2]:
import pandas as pd
from pathlib import Path
import numpy as np
import json
import warnings
import math

import dask.dataframe as ddf

import sklearn.linear_model as sklearn_linear_model
import sklearn.metrics as sklearn_metrics
import sklearn.model_selection as sklearn_model_selection
import sklearn.preprocessing as sklearn_preprocessing
import sklearn.feature_selection as sklearn_feature_selection
import sklearn.ensemble as sklearn_ensemble
import sklearn.decomposition as sklearn_decomposition
from sklearn.impute import SimpleImputer

from scipy import stats

import geopandas as gpd
import dask_geopandas as dgpd

import matplotlib.pyplot as plt
import pyreadstat
from pandas.api.types import is_numeric_dtype

In [3]:
data_path = Path('/home/selker/eop/data/malawi')

# for reproducibility
RANDOM_STATE=11

### Load cleaned data

In [4]:
malawi = pd.read_parquet(data_path / 'malawi_cleaned_2016.parquet')
summary = pd.read_parquet(data_path / 'malawi_summary_2016.parquet')

### Select covariates

Partition your data into 10 random subsets, in order to perform 10-fold cross-validation.
Initialize model parameters m as empty set {}
For counter c from 1 to 30:
   For each variable v (that is not already included in m)
      For each fold i:
            temporarily hold out data in partition i (10% of observations)
            train a model (linear regression?) to predict consumption from {m+v} on the 90% of observations not in i 
                record the in-sample performance (RMSE and R2) on those 90% of observations, and store it [this is the in-sample cross-val performance for fold i]
                using this trained model (trained on the 90% of observations not in i), calculate performance (RMSE and R2) on the held-out observation in i, store those values [these represent the  out-of-sample cross-val performance for fold i]
      Averaging across the 10 folds from the above loop, calculate the average in-sample and out-of-sample performance that you obtain for the model that includes m and v. Store these values.
   Choose the v* that maximizes the out-of-sample performance (among all of the v's tested in the above loop)
   Add v* to your model, and then iterate, until you've identified the best 30 variables

In [150]:
def evenly_partition(dataset, n_partitions, random_state=None):

    shuffled = dataset.sample(frac=1, random_state=random_state)
    indices = np.linspace(0, len(dataset), n_partitions + 1)
    indices = [round(i) for i in indices]
    indices[-1] = len(dataset)
    
    [shuffled[indices[i] : indices[i+1]] for i in range(len(indices) - 1)]
    
def forward_select_features(dataset, candidate_covariates, num_to_select=30, random_state=None):

    n_folds = 10
    
    folds = evenly_partition(dataset, n_folds, random_state)
    
    selected_covariate_list = []
    columns_so_far = []
    
    mses_
    
    unselected_covariates = list(candidate_covariates.copy())
    
    # avoid set-order non-determinism
    unselected_covariates.sort()
    
    while((unselected_covariates is not None) and (len(selected_covariate_list) < num_to_select)):
        
        best_mse_this_step = np.inf
        best_covariate_this_step = None
        cumulative_r2_this_step = None
    
        for covariate in unselected_covariates:

            # the values in the summary are a numpy array - need a list for list concatenation to work.
            columns_to_add = list(summary.loc[covariate].columns)
            columns_to_try = columns_so_far + columns_to_add

            lr = sklearn_linear_model.LinearRegression()
            lr.fit(
                selection[columns_to_try], 
                selection.outcome,
                sample_weight=selection.hh_wgt
            )
            
            # Make predictions on test data
            y_pred = lr.predict(calibration[columns_to_try])
            
            # Compute MSE 
            mse = sklearn_metrics.mean_squared_error(
                calibration.outcome, y_pred, sample_weight=calibration.hh_wgt
            )
            cumulative_r2 = sklearn_metrics.r2_score(
                calibration.outcome, y_pred, sample_weight=calibration.hh_wgt
            )

            if mse < best_mse_this_step:

                best_mse_this_step = mse
                cumulative_r2_this_step = cumulative_r2
                best_covariate_this_step = covariate
                best_columns_this_step = columns_to_add
        
        if best_mse_this_step < mses_cumulative[-1]:
            selected_covariate_list.append(best_covariate_this_step)
            columns_so_far += best_columns_this_step
            mses_cumulative.append(best_mse_this_step)
            r2s_cumulative.append(cumulative_r2_this_step)
            unselected_covariates.remove(best_covariate_this_step)
    
        else:
            print('No more improvement.')
            break

    selected_covariates = pd.DataFrame(
        # add 'none' in front to indicate variance pre-covariates
        np.array([[None] + selected_covariate_list, mses_cumulative, r2s_cumulative]).transpose(), 
        columns=['Covariate', 'Cumulative Model MSE', 'Cumulative Model r2']
    )
    selected_covariates = (
        selected_covariates.join(summary['description'], on='Covariate', how='left')
        .rename(columns={'description': 'Description'})
    )
    selected_covariates['Cumulative Model MSE'] = (
        selected_covariates['Cumulative Model MSE'].astype(float).round(3)
    )
    selected_covariates['Cumulative Model r2'] = (
        selected_covariates['Cumulative Model r2'].astype(float).round(3)
    )
    
    return selected_covariates

def add_covariate_r2(selected_covariates):

    # drop the "none"
    selected_covariate_list = selected_covariates.Covariate[1:]
    
    r2s_univariate = [np.nan]
    
    for selected_covariate in selected_covariate_list:
        columns = list(summary.loc[selected_covariate].columns)
        lr = sklearn_linear_model.LinearRegression()
        lr.fit(
            malawi[columns], 
            malawi.outcome,
            sample_weight=malawi.hh_wgt
        )

        y_pred = lr.predict(malawi[columns])
        
        r2 = sklearn_metrics.r2_score(
            malawi.outcome, y_pred, sample_weight=malawi.hh_wgt
        )
        r2s_univariate.append(r2)
    
    selected_covariates['Covariate r2'] = r2s_univariate
    selected_covariates['Covariate r2'] = selected_covariates['Covariate r2'].astype(float).round(3)

    return selected_covariates

In [154]:
manually_excluded = {
    'outcome',
    'hh_wgt',
    'af_bio_1', # annual mean temp
    'hh_f01_4a', # This and next 3: confusing questions about names listed on ownership doc for property
    'hh_f01_4b',
    'hh_f01_4c',
    'hh_f01_4d'
} 

_, consumption_metadata = pyreadstat.read_dta(
        data_path / 'MWI_2016_IHS-IV_v04_M_STATA14/consumption_aggregate/ihs4 consumption aggregate.dta', metadataonly=True
)
consumption_columns_excluded = (
    set(consumption_metadata.column_names)
    # we DO want some of the columns from the consumption module - remove them from the excluded set.
    # To do: what is adulteq?
    - {'region', 'district', 'ea_id', 'area', 'urban', 'hhsize'} 
)

columns_excluded = consumption_columns_excluded | manually_excluded

# Create a list of covariates to consider, treating each column output of one-hot encoding separately.
if False:
        
    # exclude the corresponding one-hot-encoded columns
    columns_excluded_one_hot = []
    for column in columns_excluded:
        if column in one_hot_map:
            for category in one_hot_map[column]:
                columns_excluded_one_hot.append(f'{column}_{category}')
    
    columns_excluded_one_hot = set(columns_excluded_one_hot)
    covariates_to_consider = (
        set(malawi.columns)
        - (columns_excluded | columns_excluded_one_hot)
    )

else:
    covariates_to_consider = set(summary[summary.type != 'dropped'].index.values) - columns_excluded

In [157]:
'hh_f01_4b' in covariates_to_consider

False

#### Using forward selection from the full set

In [152]:
%%time
selected_covariates = forward_select_features(
    malawi, covariates_to_consider, num_to_select = 30, random_state=RANDOM_STATE
)

CPU times: user 29min 3s, sys: 2h 9min 51s, total: 2h 38min 54s
Wall time: 4min 3s


In [158]:
for_display = add_covariate_r2(selected_covariates)
with pd.option_context('display.max_rows', 200, 'display.max_colwidth', 1):

    display(
        for_display[['Covariate', 'Description', 'Cumulative Model MSE', 'Cumulative Model r2', 'Covariate r2']]
        .set_index('Covariate')
    )

Unnamed: 0_level_0,Description,Cumulative Model MSE,Cumulative Model r2,Covariate r2
Covariate,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
,,4.775,0.0,
hh_f34,How many working cell phones in total does your household own?,3.574,0.258,0.004
hh_f03a,F03a. Estimate the rent you could receive if you rented this property?,3.198,0.336,0.001
hh_h03a,"How many meals, including b/fast are taken per day in household? (Adults)",2.941,0.389,0.003
ag_r29,"...total on housing equipment, feeding utensils/any other input for all types of",2.835,0.411,0.001
hh_f02,"If you sold this property today, how much would you receive for it?",2.692,0.441,0.003
hhsize,Household size,2.53,0.475,0.001
hh_h02b,"In the past 7 days, ...? (Limit portion size at meal times)",2.515,0.478,0.001
af_bio_8,Mean Temperature of Wettest Quarter (degC * 10),2.508,0.479,0.0
hh_f30,How many weeks have you been waiting for?,2.5,0.481,0.0


In [147]:
r2s = dict()
for c in covariates_to_consider:
    columns = list(summary.loc[c].columns)

    lr = sklearn_linear_model.LinearRegression()
    lr.fit(
        malawi[columns],
        malawi.outcome,
        sample_weight=malawi.hh_wgt
    )

    y_pred = lr.predict(malawi[columns])
    r2s[c] = sklearn_metrics.r2_score(
        malawi.outcome, y_pred, sample_weight=malawi.hh_wgt
    )
    

r2s = pd.DataFrame.from_dict(r2s, orient='index').rename(columns={0: 'r2'})

#### Using forward selection from subsets of columns

In [210]:
mosaiks_covariates = [c for c in covariates_to_consider if c.startswith('mosaiks')]
covariates_to_consider_no_mosaiks = [
    c for c in covariates_to_consider if not c.startswith('mosaiks_')
]

In [None]:
selected_covariates_mosaiks = forward_select_features(
    malawi, mosaiks_covariates, num_to_select = 10, random_state=RANDOM_STATE
)

In [213]:
selected_covariates_no_mosaiks = forward_select_features(
    malawi, covariates_to_consider_no_mosaiks, num_to_select = 30, random_state=RANDOM_STATE
)

In [None]:
for_display = add_univariate_r2(selected_covariates_no_mosaiks)
with pd.option_context('display.max_rows', 200, 'display.max_colwidth', 1):

    display(
        for_display[['Covariate', 'Description', 'Cumulative Model MSE', 'Cumulative Model r2', 'Univariate r2']]
        .set_index('Covariate')
    )

#### Output column list

In [168]:
selected_covariates[~selected_covariates.Covariate.isna()].Covariate.to_csv('2016/selected_columns_no_mosaiks.csv', index=False)

In [None]:
columns = pd.read_csv('2016/selected_columns_no_mosaiks.csv')

##### Shuffle to simulate less-selected features

In [24]:
columns_shuffled = columns.sample(frac=1)

In [26]:
columns_shuffled.to_csv('selected_columns_shuffled.csv', index=False)

### LASSO on many covariates

In [137]:
len(malawi_covariate_columns_no_mosaiks)

420

In [None]:
%%time
lasso = sklearn_linear_model.Lasso(max_iter=5000)
alphas = [1, 3, 10, 30, 100, 300, 1000, 3000]

lasso_grid_search = sklearn_model_selection.GridSearchCV(
    lasso,
    {'alpha': alphas},
    scoring='r2',
    cv=sklearn_model_selection.KFold(n_splits=5, shuffle=True, random_state=RANDOM_STATE)
)
lasso_grid_search.fit(    
    X=malawi[list(malawi_covariate_columns_no_mosaiks)], 
    y=malawi.outcome,
    sample_weight=malawi.hh_wgt
)

  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


In [145]:
lasso_grid_search.best_score_

0.36162471609894153

### Geo-only models

#### By district

In [100]:
district_columns = [c for c in malawi.columns if 'district' in c]
lr = sklearn_linear_model.LinearRegression()
cross_val_score = sklearn_model_selection.cross_val_score(
    lr,
    X=malawi[district_columns], 
    y=malawi.outcome,
    fit_params={'sample_weight': malawi.hh_wgt},
    scoring='r2',
    cv=sklearn_model_selection.KFold(n_splits=5, shuffle=True, random_state=RANDOM_STATE)
)
display(cross_val_score.mean())



0.1215089498868964

In [136]:
%%time
lasso = sklearn_linear_model.Lasso(max_iter=5000)
alphas = [0.01, 0.03, 0.1, 0.3]
lasso_grid_search = sklearn_model_selection.GridSearchCV(
    lasso,
    {'alpha': alphas},
    scoring='r2',
    cv=sklearn_model_selection.KFold(n_splits=5, shuffle=True, random_state=RANDOM_STATE)
)
lasso_grid_search.fit(    
    X=malawi[district_columns], 
    y=malawi.outcome,
    sample_weight=malawi.hh_wgt
)

CPU times: user 51.2 s, sys: 2min 49s, total: 3min 41s
Wall time: 6.34 s


In [137]:
display(lasso_grid_search.best_params_)

{'alpha': 0.01}

In [138]:
display(lasso_grid_search.best_score_)

0.121497395850383

#### By EA

In [50]:
one_hot_encoder = sklearn_preprocessing.OneHotEncoder(
    sparse_output=False
).fit(malawi[['ea_id']])
encoded_data = one_hot_encoder.transform(malawi[['ea_id']])
ea_one_hot_encoded = pd.DataFrame(encoded_data)
ea_one_hot_encoded.columns = one_hot_encoder.get_feature_names_out()

ea_one_hot_encoded[['hh_wgt', 'outcome']] = malawi[['hh_wgt', 'outcome']]
ea_columns = [c for c in ea_one_hot_encoded if c.startswith('ea_id')]

In [58]:
lr = sklearn_linear_model.LinearRegression()
lr.fit(
    ea_one_hot_encoded[ea_columns], 
    ea_one_hot_encoded.outcome,
    sample_weight=ea_one_hot_encoded.hh_wgt
)

y_pred = lr.predict(ea_one_hot_encoded[ea_columns])

r2_score = sklearn_metrics.r2_score(
    ea_one_hot_encoded.outcome, y_pred, sample_weight=ea_one_hot_encoded.hh_wgt
)

In [59]:
r2_score

0.25881958233028335

In [73]:
ea_one_hot_encoded[ea_columns] = (
    ea_one_hot_encoded[ea_columns] - ea_one_hot_encoded[ea_columns].mean()
) / ea_one_hot_encoded[ea_columns].std()

In [126]:
lr = sklearn_linear_model.LinearRegression()
cv_results = sklearn_model_selection.cross_validate(
    lr, 
    ea_one_hot_encoded[ea_columns], 
    ea_one_hot_encoded.outcome,
    params={
        'sample_weight': ea_one_hot_encoded.hh_wgt
    },
    cv=sklearn_model_selection.KFold(n_splits=5, shuffle=True, random_state=RANDOM_STATE)
)
cv_results

{'fit_time': array([0.55383325, 0.68389297, 0.67866611, 0.83493209, 0.66769886]),
 'score_time': array([0.05156279, 0.02536654, 0.02550459, 0.04184675, 0.02517891]),
 'test_score': array([0.20083605, 0.10153452, 0.04738064, 0.16071741, 0.09975301])}

In [129]:
cv_results['test_score'].mean()

0.12204432847353888

In [110]:
%%time
lasso = sklearn_linear_model.Lasso(max_iter=5000)
alphas = [0.1, 0.3, 1, 3, 10, 30]
lasso_grid_search = sklearn_model_selection.GridSearchCV(
    lasso,
    {'alpha': alphas},
    scoring='r2',
    cv=sklearn_model_selection.KFold(n_splits=5, shuffle=True, random_state=RANDOM_STATE)
)
lasso_grid_search.fit(    
    X=ea_one_hot_encoded[one_hot_encoder.get_feature_names_out()], 
    y=ea_one_hot_encoded.outcome,
    sample_weight=ea_one_hot_encoded.hh_wgt
)

CPU times: user 7min 53s, sys: 32min 51s, total: 40min 45s
Wall time: 1min 13s


In [108]:
display(lasso_grid_search.best_score_)

0.1242394930163857

In [109]:
display(lasso_grid_search.best_params_)

{'alpha': 1}

### Mosaiks-only models

In [111]:
mosaiks_columns = [c for c in malawi.columns if c.startswith('mosaiks')]

In [112]:
mosaiks_means = malawi[mosaiks_columns].mean()
mosaiks_stds = malawi[mosaiks_columns].std()
malawi_mosaiks_normalized = (malawi[mosaiks_columns] - mosaiks_means) / mosaiks_stds
# std of 0 -> div by 0 in previous step; fill with zeros.
malawi_mosaiks_normalized.fillna(value=0, inplace=True)

malawi_mosaiks_normalized[['HHID', 'case_id', 'hh_wgt', 'outcome']] = (
    malawi[['HHID', 'case_id', 'hh_wgt', 'outcome']]
)

In [114]:
%%time
lasso = sklearn_linear_model.Lasso(max_iter=5000)
alphas = [0.1, 0.3, 1, 3, 10, 30]

lasso_grid_search = sklearn_model_selection.GridSearchCV(
    lasso,
    {'alpha': alphas},
    scoring='r2',
    cv=sklearn_model_selection.KFold(n_splits=5, shuffle=True, random_state=RANDOM_STATE),
    n_jobs=40

)
lasso_grid_search.fit(    
    X=malawi_mosaiks_normalized[mosaiks_columns], 
    y=malawi_mosaiks_normalized.outcome,
    sample_weight=malawi_mosaiks_normalized.hh_wgt
)

  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


CPU times: user 3h 28min 11s, sys: 6h 59min 48s, total: 10h 28min
Wall time: 1h 16min 20s


  model = cd_fast.enet_coordinate_descent(


In [124]:
display(lasso_grid_search.best_score_)

0.12415856363486044

In [123]:
%%time

"""
class sklearn.ensemble.GradientBoostingRegressor(*, loss='squared_error', learning_rate=0.1, n_estimators=100,
subsample=1.0, criterion='friedman_mse', min_samples_split=2, min_samples_leaf=1, min_weight_fraction_leaf=0.0,
max_depth=3, min_impurity_decrease=0.0, init=None, random_state=None, max_features=None, alpha=0.9, verbose=0, 
max_leaf_nodes=None, warm_start=False, validation_fraction=0.1, n_iter_no_change=None, tol=0.0001, ccp_alpha=0.0
)

Parameter grid from chi et al:
Hyperparameters were tuned to minimize the cross-validated mean-squared error, using a grid search over 
several possible values for maximum tree depth (1, 3, 5, 10, 15, 20, 31) and the minimum sum of instance 
weight needed in a child (1, 3, 5, 7, 10).
"""
gb_hyperparameters_from_cider = {
    'min_samples_leaf': [1,7,20],
    'max_depth': [1, 5, 10, 25],
}
gb_classifier = sklearn_ensemble.GradientBoostingRegressor()
gb_grid_search = sklearn_model_selection.GridSearchCV(
    gb_classifier, 
    gb_hyperparameters_from_cider, 
    scoring='r2',
    cv=sklearn_model_selection.KFold(n_splits=5, shuffle=True, random_state=RANDOM_STATE),
    verbose=1,
    n_jobs=40
)

gb_grid_search.fit(    
    X=malawi_mosaiks_normalized[mosaiks_columns], 
    y=malawi_mosaiks_normalized.outcome,
    sample_weight=malawi_mosaiks_normalized.hh_wgt
)

Fitting 5 folds for each of 12 candidates, totalling 60 fits
CPU times: user 8min 53s, sys: 2.86 s, total: 8min 56s
Wall time: 55min


In [125]:
gb_grid_search.best_score_

0.12287005209113795

#### PCA on Mosaiks features

In [139]:
pca = sklearn_decomposition.PCA(n_components=200)
mosaiks_pca_array = pca.fit_transform(malawi_mosaiks_normalized[mosaiks_columns])
mosaiks_pca = pd.DataFrame(data=mosaiks_pca_array)
mosaiks_pca_columns = [f'pca_{c}' for c in mosaiks_pca.columns]
mosaiks_pca.columns = mosaiks_pca_columns

In [140]:
mosaiks_pca[['HHID', 'case_id', 'hh_wgt', 'outcome']] = (
    malawi_mosaiks_normalized[['HHID', 'case_id', 'hh_wgt', 'outcome']]
)

In [143]:
%%time
gb_hyperparameters_from_cider = {
    'min_samples_leaf': [1,7,20],
    'max_depth': [1, 5, 10, 25],
}
gb_classifier = sklearn_ensemble.GradientBoostingRegressor()
gb_grid_search = sklearn_model_selection.GridSearchCV(
    gb_classifier, 
    gb_hyperparameters_from_cider, 
    scoring='r2', 
    cv=sklearn_model_selection.KFold(n_splits=5, shuffle=True, random_state=RANDOM_STATE),
    verbose=1, 
    n_jobs=40
)

gb_grid_search.fit(    
    X=mosaiks_pca[mosaiks_pca_columns[:100]], 
    y=mosaiks_pca.outcome,
    sample_weight=mosaiks_pca.hh_wgt
)

Fitting 5 folds for each of 12 candidates, totalling 60 fits
CPU times: user 14.1 s, sys: 116 ms, total: 14.2 s
Wall time: 1min 31s


In [144]:
gb_grid_search.best_score_

0.12147961046829074

### Univariate r2s for all covariates considered

In [214]:
r2s_univariate = []
malawi_covariate_columns_list = list(covariates_to_consider)
for selected_covariate in malawi_covariate_columns_list:
    lr = sklearn_linear_model.LinearRegression()
    lr.fit(
        malawi[[selected_covariate]], 
        malawi.outcome,
        sample_weight=malawi.hh_wgt
    )
    # Make predictions on test data
    y_pred = lr.predict(malawi[[selected_covariate]])
    
    r2 = sklearn_metrics.r2_score(
        malawi.outcome, y_pred, sample_weight=malawi.hh_wgt
    )
    r2s_univariate.append(r2)

In [226]:
all_univariate_r2s = pd.DataFrame(
    data=np.array([malawi_covariate_columns_list, r2s_univariate]).transpose(),
    columns=['covariate', 'univariate_r2']
)
all_univariate_r2s.univariate_r2 = all_univariate_r2s.univariate_r2.astype(float)
display(all_univariate_r2s.sort_values('univariate_r2', ascending=False).head(50))

Unnamed: 0,covariate,univariate_r2
1532,hh_f03a,0.340331
827,hh_f37,0.338343
2542,hh_f41_Flush to septic tank,0.138543
4172,hh_f19_YES,0.131945
615,hh_f27_nan,0.131861
299,hh_f11_ELECTRICITY,0.130127
1086,hh_f12_ELECTRICITY,0.126646
45,hh_f36_PIPED INTO DWELLING,0.122557
119,hh_f12_GAS,0.115442
3829,hh_f36_1_PIPED INTO DWELLING,0.112674


### Print summaries of covariates

In [None]:
# We determine what is included by omitting what's not included. This approach handles one-hot encoded
# columns correctly.

covariates_considered = [c for c in not_dropped_for_missingness if c not in columns_excluded]
with pd.option_context('display.max_rows', 300, 'display.max_colwidth', 1):

    display(
        summary[
            summary.covariate.isin(covariates_considered)
        ]
        [['covariate', 'description', 'missing_fraction', 'mean', 'median', 'std']]
    )

In [None]:
with pd.option_context('display.max_rows', 300, 'display.max_colwidth', 1):

    display(
        summary[
            (summary.covariate.isin(columns_excluded))
            & (~summary.covariate.isin(dropped_for_missingness))
        ]
        [['covariate', 'description', 'missing_fraction', 'mean', 'median', 'std']]
    )

In [None]:
with pd.option_context('display.max_rows', 200, 'display.max_colwidth', 1):

    display(
        summary[summary.covariate.isin(dropped_for_missingness)]
        [['covariate', 'description', 'missing_fraction', 'mean', 'median', 'std']]
    )

## Mosaiks preprocessing

#### Preparing map query

In [329]:
malawi_geo = gpd.read_file(
    malawi_directory / 'mwi_adm_nso_hotosm_20230405_shp' / 'mwi_admbnda_adm2_nso_hotosm_20230405.shp'
)

In [19]:
num_y_partitions = 3
ys = np.linspace(bounds.miny, bounds.maxy, num_y_partitions + 1)
for i in range(num_y_partitions):
    print(ys[i], ys[i+1])

-17.12974811999993 -14.542280706999938
-14.542280706999938 -11.954813293999944
-11.954813293999944 -9.367345880999949


#### Fine-grained mosaiks

In [121]:
mosaiks = ddf.read_csv(mosaiks_path / 'malawi_fine' / '*.csv')

In [122]:
mosaiks = dgpd.from_dask_dataframe(
    mosaiks, dgpd.points_from_xy(mosaiks, 'Lon', 'Lat')
)

In [142]:
geo_mosaiks = mosaiks.sjoin(malawi_geo, predicate='within', how='inner')

Use `to_crs()` to reproject one of the input geometries to match the CRS of the other.

Left CRS: None
Right CRS: EPSG:4326

  meta = geopandas.sjoin(left._meta, right._meta, how=how, predicate=predicate)


#### Coarse-grained mosaiks

unusable - for now, it appears Africa's mosaiks files are incomplete.

In [8]:
mosaiks = ddf.read_csv(mosaiks_directory / 'coarse' / '*.csv')

malawi_geo = gpd.read_file(
    malawi_directory / 'mwi_adm_nso_hotosm_20230405_shp' / 'mwi_admbnda_adm0_nso_hotosm_20230405.shp'
)
bounds = malawi_geo.bounds.iloc[0]

In [10]:
mosaiks.lon.max().compute()

32.65

In [345]:
mosaiks_bounded = mosaiks[
    (mosaiks.lon >= bounds.minx)
    & (mosaiks.lon <= bounds.maxx)
    & (mosaiks.lat >= bounds.miny)
    & (mosaiks.lat <= bounds.maxx)
]

In [346]:
mosaiks_bounded_computed = mosaiks_bounded.compute()

In [6]:
mosaiks = dgpd.from_dask_dataframe(
    mosaiks, dgpd.points_from_xy(mosaiks, 'lon', 'lat')
)

NameError: name 'mosaiks' is not defined

In [357]:
computed = mosaiks[['geometry']].compute()

# .to_file(data_directory / 'mosaiks_output' / 'africa_coarse_location_only.shp')

In [360]:
computed.to_file(mosaiks_directory / 'output' / 'africa_coarse_location_only.shp')

In [5]:
for file in (mosaiks_directory / 'coarse').glob('*.csv'):
    df = ddf.read_csv(file)
    df = dgpd.from_dask_dataframe(
        df, dgpd.points_from_xy(df, 'lon', 'lat')
    )
    df[['geometry']].compute().to_file(mosaiks_directory / 'output' / f'{file.name}_location_only.shp')

  df[name] = val


In [326]:
mosaiks_in_malawi = mosaiks.sjoin(malawi_geo, predicate='within', how='inner')

Use `to_crs()` to reproject one of the input geometries to match the CRS of the other.

Left CRS: None
Right CRS: EPSG:4326

  meta = geopandas.sjoin(left._meta, right._meta, how=how, predicate=predicate)


In [327]:
mosaiks_in_malawi_computed = mosaiks_in_malawi.compute()

In [328]:
len(mosaiks_in_malawi_computed)

0

In [320]:
len(mosaiks_in_malawi_computed)

77910

In [298]:
mosaiks_in_malawi = mosaiks[mosaiks.geometry.within(malawi_geo.iloc[0].geometry)]
mosaiks_in_malawi_computed = mosaiks_in_malawi.compute()

In [311]:
mosaiks = ddf.read_csv(str(mosaiks_directory / 'malawi_fine' / '*.csv'))

mosaiks = dgpd.from_dask_dataframe(
    mosaiks, dgpd.points_from_xy(mosaiks, 'Lon', 'Lat')
)


In [315]:
mosaiks_head = mosaiks.head(10)

In [316]:
mosaiks_head[
    mosaiks_head.geometry.within(malawi_geo.iloc[0].geometry)
]

Unnamed: 0,Lat,Lon,BoxLabel,Unnamed: 4,.1,.2,.3,.4,.5,.6,...,.3991,.3992,.3993,.3994,.3995,.3996,.3997,.3998,.3999,geometry
0,-11.955,33.635,Malawi 2,0.170008,0.63444,0.019116,0.68551,0.338629,0.35248,0.104404,...,0.0329,0.193308,0.135092,0.13475,1.062124,1.219159,0.709582,0.748263,0.024786,POINT (33.63500 -11.95500)
1,-13.775,33.635,Malawi 2,0.082285,0.367268,0.010162,0.462284,0.178959,0.206822,0.057531,...,0.023203,0.126262,0.081328,0.068765,0.68804,0.776211,0.482071,0.530758,0.019135,POINT (33.63500 -13.77500)
2,-13.785,33.635,Malawi 2,0.070622,0.323596,0.008341,0.373956,0.153765,0.22834,0.087386,...,0.019909,0.122797,0.088507,0.055622,0.663131,0.707385,0.434476,0.42967,0.034416,POINT (33.63500 -13.78500)
3,-13.795,33.635,Malawi 2,0.073091,0.405169,0.003787,0.676295,0.168738,0.147581,0.047011,...,0.006762,0.046159,0.034311,0.035998,0.760553,0.454671,0.359011,0.611862,0.006365,POINT (33.63500 -13.79500)
4,-13.805,33.635,Malawi 2,0.049633,0.28821,0.005367,0.505401,0.112409,0.136541,0.047413,...,0.012273,0.079417,0.051379,0.030846,0.618406,0.463685,0.269003,0.485262,0.011441,POINT (33.63500 -13.80500)
5,-13.815,33.635,Malawi 2,0.071722,0.339016,0.007961,0.446428,0.153106,0.184042,0.058747,...,0.014938,0.096399,0.068643,0.047508,0.640277,0.632705,0.355021,0.464346,0.017225,POINT (33.63500 -13.81500)
6,-13.825,33.635,Malawi 2,0.078247,0.317984,0.009977,0.400218,0.157336,0.195162,0.06625,...,0.018159,0.087256,0.076097,0.056665,0.668922,0.600207,0.415302,0.477198,0.01744,POINT (33.63500 -13.82500)
7,-13.835,33.635,Malawi 2,0.063673,0.324298,0.007354,0.453272,0.141424,0.151106,0.055621,...,0.014669,0.074235,0.059847,0.051411,0.653411,0.548116,0.388714,0.483854,0.016156,POINT (33.63500 -13.83500)
8,-13.845,33.635,Malawi 2,0.070136,0.339884,0.008587,0.333813,0.15246,0.18406,0.065987,...,0.016783,0.11347,0.079152,0.060439,0.613129,0.730515,0.479701,0.403412,0.020996,POINT (33.63500 -13.84500)
9,-13.855,33.635,Malawi 2,0.06669,0.345752,0.005758,0.523793,0.156903,0.144562,0.060406,...,0.009037,0.070228,0.052238,0.039818,0.652444,0.482536,0.337546,0.499016,0.010873,POINT (33.63500 -13.85500)


# Unused

In [44]:
geo = gpd.read_file(
    malawi_directory / 'mwi_adm_nso_hotosm_20230405_shp' / 'mwi_admbnda_adm3_nso_hotosm_20230405.shp'
)

ImportError: The 'read_file' function requires the 'pyogrio' or 'fiona' package, but neither is installed or imports correctly.
Importing fiona resulted in: /home/selker/.conda/envs/leo_base/lib/python3.9/site-packages/fiona/../../../libgdal.so.34: undefined symbol: sqlite3_total_changes64
Importing pyogrio resulted in: No module named 'pyogrio'

In [43]:
[c for c in malawi_raw if 'district' in c]

['district']

In [232]:
selected_covariate_list = selected_covariates_old

['hh_f11_ELECTRICITY',
 'hh_f12_GAS',
 'hh_f12_ELECTRICITY',
 'hh_t17_YES',
 'hh_f12_CHARCOAL',
 'hh_f36_PIPED INTO DWELLING',
 'hh_t10_BED &amp; MATTRESS',
 'hh_g09_YES',
 'hh_h04_YES',
 'hh_t03_It was more than adequate for household needs',
 'hh_f01_OWNED',
 'hh_h02d',
 'hh_t14_YES',
 'region_North',
 'hh_t07',
 'hh_f41_4_NO',
 'hh_t04_It was more than adequate for household needs',
 'hh_t11_BLANKET &amp; SHEETS',
 'hh_h01_YES',
 'hh_f52_YES',
 'mosaiks_2712',
 'district_Zomba',
 'district_Blantyre',
 'district_Chiradzulu',
 'hh_h03a',
 'hh_o0a_YES',
 'hh_f07_CONCRETE',
 'hh_f09_OTHER(SPECIFY)',
 'hh_f09_SAND',
 'hh_t03_It was less than adequate for household needs']

In [237]:
r2s_univariate = [np.nan]
for selected_covariate in selected_covariate_list:
    lr = sklearn_linear_model.LinearRegression()
    lr.fit(
        malawi[[selected_covariate]], 
        malawi.consumption_ppp_2017,
        sample_weight=malawi.hh_wgt
    )
    # Make predictions on test data
    y_pred = lr.predict(malawi[[selected_covariate]])
    
    r2 = sklearn_metrics.r2_score(
        malawi.consumption_ppp_2017, y_pred, sample_weight=malawi.hh_wgt
    )
    r2s_univariate.append(r2)

selected_covariates = pd.DataFrame(
    # add 'none' in front to indicate variance pre-covariates
    np.array([[None] + selected_covariate_list, mses_cumulative, r2s_cumulative, r2s_univariate]).transpose(), 
    columns=['Covariate', 'Cumulative Model MSE', 'Cumulative Model r2', 'Univariate r2']
)

selected_covariates['Description'] = selected_covariates.Covariate.apply(interpret_column_name)
selected_covariates['Cumulative Model MSE'] = selected_covariates['Cumulative Model MSE'].astype(float).round(1)
selected_covariates['Cumulative Model r2'] = selected_covariates['Cumulative Model r2'].astype(float).round(3)
selected_covariates['Univariate r2'] = selected_covariates['Univariate r2'].astype(float).round(3)

with pd.option_context('display.max_rows', 200, 'display.max_colwidth', 1):

    display(
        selected_covariates[['Covariate', 'Description', 'Cumulative Model MSE', 'Cumulative Model r2', 'Univariate r2']]
        .set_index('Covariate')
    )

Unnamed: 0_level_0,Description,Cumulative Model MSE,Cumulative Model r2,Univariate r2
Covariate,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
,,18244076.8,0.0,
hh_f11_ELECTRICITY,"Covariate: What is your main source of lighting fuel?, value: ELECTRICITY",11732814.1,0.195,0.13
hh_f12_GAS,"Covariate: What is your main source of cooking fuel?, value: GAS",9920509.3,0.32,0.115
hh_f12_ELECTRICITY,"Covariate: What is your main source of cooking fuel?, value: ELECTRICITY",8931944.7,0.388,0.127
hh_t17_YES,"Covariate: ..HH ate less than you thought you sh'd b'se of a lack of money/other resources?, value: YES",8226515.5,0.436,0.09
hh_f12_CHARCOAL,"Covariate: What is your main source of cooking fuel?, value: CHARCOAL",7895538.4,0.459,0.055
hh_f36_PIPED INTO DWELLING,"Covariate: What is your main source of drinking water?, value: PIPED INTO DWELLING",7653646.6,0.475,0.123
hh_t10_BED &amp; MATTRESS,"Covariate: What do you (HH HEAD) sleep on?, value: BED &amp; MATTRESS",7464199.9,0.488,0.101
hh_g09_YES,"Covariate: Over the past one week (7 days), did any people that you did nonlist as househol, value: YES",7338646.5,0.497,0.002
hh_h04_YES,"Covariate: ..12 months..faced with a situation when did not have enough food to feed the hh, value: YES",7193763.0,0.507,0.066


In [823]:
dropped_for_missingness

Index(['ag_e27a', 'ag_e27b', 'ag_e27c', 'ag_e27d', 'ag_e27e', 'ag_e27f',
       'ag_e27g', 'ag_e27h', 'ag_e28', 'ag_e29a',
       ...
       'hh_s16_oth', 'hh_s16a', 'hh_s16b', 'hh_s17', 'hh_s17_oth',
       'hh_s19_oth', 'hh_s19a', 'hh_s19b', 'hh_t10_oth', 'hh_t12_oth'],
      dtype='object', length=144)

In [673]:
# Using sklearn: Haven't figured out how to incorporate sample weights.
linear_regression = sklearn_linear_model.LinearRegression()
sfs = sklearn_feature_selection.SequentialFeatureSelector(
    linear_regression, n_features_to_select=10
)
sfs.fit(
    selection[list(malawi_covariate_columns)], 
    selection.consumption_ppp_2017,
    sample_weights=selection.hh_wgt
)

TypeError: fit() got an unexpected keyword argument 'sample_weights'

In [656]:
sklearn_selected_covariates = pd.DataFrame(
    sfs.get_feature_names_out(), 
    columns=['covariate']
)

sklearn_selected_covariates['description'] = sklearn_selected_covariates.covariate.apply(interpret_column_name)
# selected_covariates.mse = selected_covariates.mse.astype(float).round(1)