## Imports

In [3]:
import pandas as pd
from pathlib import Path
import numpy as np
import json
import warnings
import math

import dask.dataframe as ddf

import sklearn.linear_model as sklearn_linear_model
import sklearn.metrics as sklearn_metrics
import sklearn.model_selection as sklearn_model_selection
import sklearn.preprocessing as sklearn_preprocessing
import sklearn.feature_selection as sklearn_feature_selection
import sklearn.ensemble as sklearn_ensemble
import sklearn.decomposition as sklearn_decomposition
from sklearn.impute import SimpleImputer

from scipy import stats

from matplotlib import pyplot as plt

import geopandas as gpd
import dask_geopandas as dgpd

import matplotlib.pyplot as plt
import pyreadstat
from pandas.api.types import is_numeric_dtype

In [5]:
data_path = Path('/home/selker/eop/data/malawi')
out_path = Path('/home/selker/eop/eop/select_predictors')
# for reproducibility
RANDOM_STATE=11

## Load cleaned data

In [72]:
year = 2019

malawi = pd.read_parquet(data_path / f'malawi_cleaned_{year}.parquet')
summary = pd.read_parquet(data_path / f'malawi_summary_{year}.parquet')

## Select covariates

Partition your data into 10 random subsets, in order to perform 10-fold cross-validation.
Initialize model parameters m as empty set {}
For counter c from 1 to 30:
   For each variable v (that is not already included in m)
      For each fold i:
            temporarily hold out data in partition i (10% of observations)
            train a model (linear regression?) to predict consumption from {m+v} on the 90% of observations not in i 
                record the in-sample performance (RMSE and R2) on those 90% of observations, and store it [this is the in-sample cross-val performance for fold i]
                using this trained model (trained on the 90% of observations not in i), calculate performance (RMSE and R2) on the held-out observation in i, store those values [these represent the  out-of-sample cross-val performance for fold i]
      Averaging across the 10 folds from the above loop, calculate the average in-sample and out-of-sample performance that you obtain for the model that includes m and v. Store these values.
   Choose the v* that maximizes the out-of-sample performance (among all of the v's tested in the above loop)
   Add v* to your model, and then iterate, until you've identified the best 30 variables

### Common functions

In [73]:
class Score:

    def __init__(self, is_mse, oos_mse, is_r2, oos_r2):
        self.is_mse = is_mse
        self.oos_mse = oos_mse
        self.is_r2 = is_r2
        self.oos_r2 = oos_r2


    def get(self, key):
        if key == 'is_mse':
            return self.is_mse
        elif key == 'oos_mse':
            return self.oos_mse
        elif key == 'is_r2':
            return self.is_r2
        elif key == 'oos_r2':
            return self.oos_r2
        else:
            return ValueError(f'key {key} not recognized')

    def is_improvement_over(self, other):
        if other is None:
            return True
        else:
            return self.oos_r2 > other.oos_r2 

    def __add__(self, other):
        
        if not isinstance(other, Score):
            raise TypeError()

        return Score(
            is_mse=self.is_mse + other.is_mse,
            oos_mse=self.oos_mse + other.oos_mse,
            is_r2=self.is_r2 + other.is_r2,
            oos_r2=self.oos_r2 + other.oos_r2
        )

    def __truediv__(self, denom):
        return Score(
            is_mse=self.is_mse / denom,
            oos_mse=self.oos_mse / denom,
            is_r2=self.is_r2 / denom,
            oos_r2=self.oos_r2 / denom,
        )


def get_columns_for_covariates(covariates):
    columns = []
    for covariate in covariates:
        columns_to_add = list(summary.loc[covariate].columns)
        columns = columns + columns_to_add
    return columns

def evenly_partition(dataset, n_partitions, random_state=None):
    
    shuffled = dataset.sample(frac=1, random_state=random_state)
    indices = np.linspace(0, len(dataset), n_partitions + 1)
    indices = [round(i) for i in indices]
    indices[-1] = len(dataset)
    
    folds = []
    
    for i in range(n_partitions):
        test = shuffled[indices[i] : indices[i+1]]
        train = pd.concat((
            shuffled[:indices[i]], shuffled[indices[i+1]:]
        ))
        folds.append((train, test))
    
    return folds


def forward_select_features(dataset, candidate_covariates, num_to_select=30, random_state=None):

    n_folds = 10
    
    folds = evenly_partition(dataset, n_folds, random_state)
    
    selected_covariate_list = []
    columns_so_far = []
    
    cumulative_scores = []

    # avoid set-order non-determinism
    unselected_covariates = list(candidate_covariates.copy())
    unselected_covariates.sort()

    global global_values
    global_values = dict()

    while((len(unselected_covariates) > 0) and (len(selected_covariate_list) < num_to_select)):
        
        best_score_this_step = None
        best_covariate_this_step = None

        for covariate in unselected_covariates:

            # the values in the summary are a numpy array - need a list for list concatenation to work.
            columns_to_add = list(summary.loc[covariate].columns)
            columns_to_try = columns_so_far + columns_to_add

            fold_scores = []
            i = 0
            for train, test in folds:

                # fit the model on training data
                lr = sklearn_linear_model.LinearRegression()
                lr.fit(
                    train[columns_to_try], 
                    train.outcome,
                    sample_weight=train.hh_wgt
                )
                
                # Make predictions on training data, score
                y_pred = lr.predict(train[columns_to_try])
                is_mse = sklearn_metrics.mean_squared_error(
                    train.outcome, y_pred, sample_weight=train.hh_wgt
                )
                is_r2 = sklearn_metrics.r2_score(
                    train.outcome, y_pred, sample_weight=train.hh_wgt
                )

                # Make predictions on test data, score
                y_pred = lr.predict(test[columns_to_try])
                oos_mse = sklearn_metrics.mean_squared_error(
                    test.outcome, y_pred, sample_weight=test.hh_wgt
                )
                oos_r2 = sklearn_metrics.r2_score(
                    test.outcome, y_pred, sample_weight=test.hh_wgt
                )

                fold_scores.append(Score(
                    is_mse=is_mse, is_r2=is_r2, oos_mse=oos_mse, oos_r2=oos_r2
                ))

                global_values[(len(selected_covariate_list), covariate, i)] = (
                    test.outcome, y_pred
                )
                i += 1
            average_scores = sum(
                fold_scores, start=Score(0,0,0,0)
            ) / n_folds

            if average_scores.is_improvement_over(best_score_this_step):

                best_score_this_step = average_scores
                best_covariate_this_step = covariate
                best_columns_this_step = columns_to_add
        
        if (
            (len(cumulative_scores) < 1) 
            or (best_score_this_step.is_improvement_over(cumulative_scores[-1]))
        ):
            selected_covariate_list.append(best_covariate_this_step)
            columns_so_far += best_columns_this_step
            cumulative_scores.append(best_score_this_step)
            unselected_covariates.remove(best_covariate_this_step)
    
        else:
            print('No more improvement.')
            break

    selected_covariates = pd.DataFrame(
        selected_covariate_list, 
        columns=['covariate']
    )

    for score_type in ('is_mse', 'oos_mse', 'is_r2', 'oos_r2'):
        selected_covariates[score_type] = [s.get(score_type) for s in cumulative_scores]
        selected_covariates[score_type] = selected_covariates[score_type].astype(float).round(3)
  
    selected_covariates = (
        selected_covariates.join(summary['description'], on='covariate', how='left')
    )
    
    return add_covariate_r2(dataset, selected_covariates)

def add_covariate_r2(dataset, selected_covariates):
    
    r2s_univariate = []

    selected_covariate_list = selected_covariates.covariate.values
    for selected_covariate in selected_covariate_list:
        columns = list(summary.loc[selected_covariate].columns)
        lr = sklearn_linear_model.LinearRegression()
        lr.fit(
            dataset[columns], 
            dataset.outcome,
            sample_weight=dataset.hh_wgt
        )

        y_pred = lr.predict(dataset[columns])
        
        r2 = sklearn_metrics.r2_score(
            dataset.outcome, y_pred, sample_weight=dataset.hh_wgt
        )
        r2s_univariate.append(r2)
    
    selected_covariates['single_covariate_r2'] = r2s_univariate
    selected_covariates['single_covariate_r2'] = (
        selected_covariates['single_covariate_r2'].astype(float).round(3)
    )

    return selected_covariates

In [8]:
if year == 2016:
    manually_excluded = {
        'outcome',
        'hh_wgt',
        'af_bio_1', # annual mean temp
        'hh_f01_4a', # This and next 3: confusing questions about names listed on ownership doc for property
        'hh_f01_4b',
        'hh_f01_4c',
        'hh_f01_4d',
        'asset_index'
    } 
    
    _, consumption_metadata = pyreadstat.read_dta(
            data_path / 'MWI_2016_IHS-IV_v04_M_STATA14/consumption_aggregate/ihs4 consumption aggregate.dta', metadataonly=True
    )


elif year == 2019:
    manually_excluded = {
        'asset_index',
        'hh_a02a', 
        'hh_a03', 
        'hh_a23', 
        'hh_a22', 
        'interviewDate', 
        'consumption_ppp_2017', 
        'hh_f18', # total value of firewood you used in the last week
        'index_mosaiks',
        'ea_id' # debatable
    }

    durable_verifiable_modules = {
        'hh_mod_a_filt',
        # 'HH_MOD_F',
        'HH_MOD_X',
        'ag_mod_a',
        'householdgeovariables_ihs5',
        'HH_MOD_L_durable_goods',
        'HH_MOD_M_ag_goods'
    }

    non_durable_verifiable_covariates_override = {
        'hh_g09',
        'hh_s01',
        'hh_w01',
        'hh_x09',
        'hh_a06',
        'hh_a11',
        'hh_a13',
        'hh_a22',
        'hh_a23',
        'ag_s01',
    }
    durable_verifiable_covariates_override= {
        'area',
        'district',
        'region',
        'hhsize',
        'urban',
        'num_adults',
        'num_children',
        'hh_f01',
        'yearly_rent',
        'hh_f06',
        'hh_f07',
        'hh_f07_oth',
        'hh_f08',
        'hh_f08_oth',
        'hh_f09',
        'hh_f09_oth',
        'hh_f10',
        'hh_f11',
        'hh_f11_oth',
        'hh_f12',
        'hh_f12_oth',
        'hh_f19',
        'hh_f31',
        'hh_f34',
        'hh_f36',
        'hh_f36_oth',
        'hh_f41',
        'hh_f41_oth',
        'hh_f43',
        'hh_f43_oth',
        'TA',
        'adulteq',
    }
        
    _, consumption_metadata = pyreadstat.read_dta(
        data_path / 'MWI_2019_IHS-V_v06_M_Stata/ihs5_consumption_aggregate.dta', metadataonly=True
    )

consumption_columns_excluded = (
    set(consumption_metadata.column_names)
    # columns we don't want to exclude as consumption columns
    - {'region', 'district', 'ea_id', 'area', 'urban', 'hhsize'} 
)

mosaiks_columns_excluded = {
    covariate for covariate in summary.index if covariate.startswith('mosaiks')
}

columns_excluded = consumption_columns_excluded | mosaiks_columns_excluded | manually_excluded

covariates_to_consider = set(summary[summary.type != 'dropped'].index.values) - columns_excluded

summary_to_consider = summary[summary.index.isin(covariates_to_consider)]
durable_verifiable_covariates_table = summary_to_consider[
    (
        (summary_to_consider.module.isin(durable_verifiable_modules))
        | (summary_to_consider.index.isin(durable_verifiable_covariates_override))
    )
    & ~(summary_to_consider.index.isin(non_durable_verifiable_covariates_override))
]
non_durable_verifiable_covariates_table = summary_to_consider[~summary_to_consider.index.isin(durable_verifiable_covariates_table.index)]

durable_verifiable_covariates = set(durable_verifiable_covariates_table.index.values)

### Using forward selection from the full set

All cov, all households

In [8]:
%%time
selected_covariates = forward_select_features(
    malawi, covariates_to_consider, num_to_select = 30, random_state=RANDOM_STATE
)
selected_covariates.to_csv(out_path / f'{year}' / 'covariates_country_all.csv', index=False)

No more improvement.
CPU times: user 5h 23min 18s, sys: 23h 14min 22s, total: 1d 4h 37min 40s
Wall time: 37min 29s


Durable/verifiable cov, all households

In [None]:
%%time
selected_covariates_durable = forward_select_features(
    malawi, durable_verifiable_covariates, num_to_select = 30, random_state=RANDOM_STATE
)
selected_covariates_durable.to_csv(out_path / f'{year}' / 'covariates_country_durable.csv', index=False)

### Using forward selection on a subset of households

In [16]:
malawi_rural = malawi[malawi.reside_URBAN == 0]

In [17]:
%%time
selected_covariates_rural = forward_select_features(
    malawi_rural, covariates_to_consider, num_to_select = 30, random_state=RANDOM_STATE
)
selected_covariates_rural.to_csv(out_path / f'{year}' / 'covariates_rural_all.csv', index=False)


CPU times: user 14h 12min 15s, sys: 2d 10h 21min, total: 3d 33min 15s
Wall time: 1h 35min 2s


In [23]:
%%time
durable_verifiable_selected_covariates_rural = forward_select_features(
    malawi_rural, durable_verifiable_covariates, num_to_select = 30, random_state=RANDOM_STATE
)
durable_verifiable_selected_covariates_rural.to_csv(
    out_path / f'{year}' / 'covariates_rural_durable.csv', index=False
)

CPU times: user 4h 18min 29s, sys: 18h 20min 51s, total: 22h 39min 21s
Wall time: 29min 38s


#### Only households < $5/person/day

In [8]:
malawi_below_5 = malawi[malawi.outcome < 5]

In [None]:
%%time
selected_covariates_below_5 = forward_select_features(
    malawi_below_5, covariates_to_consider, num_to_select = 30, random_state=RANDOM_STATE
)
selected_covariates_below_5.to_csv(out_path / f'{year}' / 'covariates_below_5_all.csv', index=False)


In [9]:
%%time
durable_verifiable_selected_covariates_below_5 = forward_select_features(
    malawi_below_5, durable_verifiable_covariates, num_to_select = 30, random_state=RANDOM_STATE
)
durable_verifiable_selected_covariates_below_5.to_csv(
    out_path / f'{year}' /  'covariates_below_5_durable.csv', index=False
)


CPU times: user 5h 16min 20s, sys: 22h 55min 33s, total: 1d 4h 11min 54s
Wall time: 37min 43s


#### By district

In [30]:
%%time
malawi_below_5 = malawi[malawi.outcome < 5]
district_columns = [c for c in malawi.columns if c.startswith('district')]
for c in district_columns:
    district = c.split('_')[1]
    in_district = malawi_below_5[malawi_below_5[c] == 1]
    if len(in_district) < 100:
        print(f'data size for {district} is small: {len(in_district)}')

    selected_covariates = forward_select_features(
        in_district, durable_verifiable_covariates, num_to_select = 10, random_state=RANDOM_STATE
    )

    selected_covariates.to_csv(Path('district_level_covariates') / f'{district}.csv', index=False)

data size for Likoma is small: 28
No more improvement.
CPU times: user 1h 28min 59s, sys: 3h 37min 14s, total: 5h 6min 13s
Wall time: 47min 48s


### Print output tables

In [None]:
selected_covariates_durable = pd.read_csv(out_path / f'{year}' /  'covariates_country_durable.csv')
selected_covariates = pd.read_csv(out_path / f'{year}' /  'covariates_country_all.csv')
selected_covariates_rural = pd.read_csv(out_path / f'{year}' /  'covariates_rural_all.csv')
selected_covariates_rural_durable = pd.read_csv(out_path / f'{year}' /  'covariates_rural_durable.csv')
selected_covariates_under_5 = pd.read_csv(out_path / f'{year}' /  'covariates_below_5_all.csv')

In [None]:
selected_covariates_under_5_durable = pd.read_csv(out_path / f'{year}' /  'covariates_below_5_durable.csv')

with pd.option_context('display.max_rows', 200, 'display.max_colwidth', None):

    display(
        selected_covariates_under_5_durable[['covariate', 'description', 'oos_r2', 'is_r2', 'single_covariate_r2']]
        .set_index('covariate')
    )

### Output column list

In [168]:
selected_covariates[~selected_covariates.Covariate.isna()].Covariate.to_csv('2016/selected_columns_no_mosaiks.csv', index=False)

In [None]:
columns = pd.read_csv('2016/selected_columns_no_mosaiks.csv')

##### Shuffle to simulate less-selected features

In [24]:
columns_shuffled = columns.sample(frac=1)

In [26]:
columns_shuffled.to_csv('selected_columns_shuffled.csv', index=False)

## LASSO on many covariates

In [70]:
%%time
lasso = sklearn_linear_model.Lasso(max_iter=5000)
alphas = [1e-2, 3e-2, 1e-1, 3e-1, 1]

lasso_grid_search = sklearn_model_selection.GridSearchCV(
    lasso,
    {'alpha': alphas},
    scoring='r2',
    cv=sklearn_model_selection.KFold(n_splits=5, shuffle=True, random_state=RANDOM_STATE)
)
lasso_grid_search.fit(    
    X=malawi[get_columns_for_covariates(covariates_to_consider)], 
    y=malawi.outcome,
    sample_weight=malawi.hh_wgt
)

CPU times: user 51min 31s, sys: 2h 43min 46s, total: 3h 35min 17s
Wall time: 5min 7s


In [72]:
lasso_grid_search.best_score_

0.3906709953918759

In [71]:
lasso_grid_search.best_params_

{'alpha': 0.1}

In [54]:
%%time
lasso = sklearn_linear_model.Lasso(max_iter=5000)
alphas = [1e-4, 3e-4, 1e-3, 3e-3, 1e-2, 3e-2]

lasso_grid_search = sklearn_model_selection.GridSearchCV(
    lasso,
    {'alpha': alphas},
    scoring='r2',
    cv=sklearn_model_selection.KFold(n_splits=5, shuffle=True, random_state=RANDOM_STATE)
)
lasso_grid_search.fit(    
    X=malawi[get_columns_for_covariates(selected_covariates_under_5_durable.covariate)], 
    y=malawi.outcome,
    sample_weight=malawi.hh_wgt
)

CPU times: user 1min 42s, sys: 6min 37s, total: 8min 19s
Wall time: 14.9 s


In [55]:
lasso_grid_search.best_score_

0.47853737835156157

In [56]:
lasso_grid_search.best_params_

{'alpha': 0.003}

## Geo-only models

### By district

#### Predicting consumption

In [9]:
district_columns = [c for c in malawi.columns if 'district' in c]

In [80]:
lr = sklearn_linear_model.LinearRegression()
cross_val = sklearn_model_selection.cross_validate(
    lr,
    X=malawi[district_columns], 
    y=malawi.outcome,
    # params={'sample_weight': weight_mangled},
    scoring='r2',
    cv=sklearn_model_selection.KFold(n_splits=5, shuffle=True, random_state=RANDOM_STATE),
    return_train_score=True
)
#display(cross_val_score.mean())

In [79]:
display(cross_val['train_score'])
display(cross_val['test_score'])


array([0.10732983, 0.10747813, 0.11018216, 0.1410627 , 0.10573383])

array([0.14322041, 0.15433102, 0.11530569, 0.06894499, 0.14109288])

In [94]:
lr = sklearn_linear_model.LinearRegression()
lr.fit(
    malawi[district_columns], 
    malawi.outcome,
    sample_weight=malawi.hh_wgt
)

y_pred = lr.predict(malawi[district_columns])

r2_score = sklearn_metrics.r2_score(
    malawi.outcome, y_pred, sample_weight=malawi.hh_wgt
)

In [74]:
weight_mangled = malawi.hh_wgt.copy()
weight_mangled.iloc[5600] = 0

In [90]:
malawi.outcome.sort_values()

6797      0.205450
4976      0.240865
4675      0.258202
6304      0.262698
9167      0.280653
           ...    
1884     61.832170
1875     66.963074
5589     83.218633
2126     91.278029
5600    170.947620
Name: outcome, Length: 11434, dtype: float64

In [91]:
sklearn_metrics.r2_score(
    malawi.outcome, y_pred, sample_weight=malawi.hh_wgt
)

0.09661181354426485

In [92]:
sklearn_metrics.r2_score(
    malawi.outcome, y_pred, sample_weight=weight_mangled
)

0.13147505766354173

In [69]:
np.abs((malawi.outcome - y_pred) * malawi.hh_wgt).sort_values()

4877         0.178288
10515        0.182463
6986         0.200275
5036         0.204838
7725         0.318399
             ...     
5739     16016.253362
5793     16232.328424
11329    16369.078415
5589     41987.096895
5600     89036.536275
Length: 11434, dtype: float64

In [10]:
%%time
lasso = sklearn_linear_model.Lasso(max_iter=10000)
alphas = [1e-6, 1e-5, 1e-4]
lasso_grid_search = sklearn_model_selection.GridSearchCV(
    lasso,
    {'alpha': alphas},
    scoring='r2',
    cv=sklearn_model_selection.KFold(n_splits=5, shuffle=True, random_state=RANDOM_STATE)
)
lasso_grid_search.fit(    
    X=malawi[district_columns], 
    y=malawi.outcome,
    sample_weight=malawi.hh_wgt
)

  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


CPU times: user 3min 46s, sys: 9min 52s, total: 13min 39s
Wall time: 36.2 s


  model = cd_fast.enet_coordinate_descent(


In [11]:
display(lasso_grid_search.best_params_)

{'alpha': 1e-06}

In [12]:
display(lasso_grid_search.best_score_)

0.12149778517312788

#### Predicting asset index

In [8]:
district_columns = [c for c in malawi.columns if 'district' in c]

In [28]:
lr = sklearn_linear_model.LinearRegression()
cross_val_score = sklearn_model_selection.cross_val_score(
    lr,
    X=malawi[district_columns], 
    y=malawi.asset_index,
    params={'sample_weight': malawi.hh_wgt},
    scoring='r2',
    cv=sklearn_model_selection.KFold(n_splits=5, shuffle=True, random_state=RANDOM_STATE)
)
display(cross_val_score.mean())

0.14704508260667845

In [25]:
%%time
lasso = sklearn_linear_model.Lasso(max_iter=10000)
alphas = [1e-6, 1e-5, 1e-4]
lasso_grid_search = sklearn_model_selection.GridSearchCV(
    lasso,
    {'alpha': alphas},
    scoring='r2',
    cv=sklearn_model_selection.KFold(n_splits=5, shuffle=True, random_state=RANDOM_STATE)
)
lasso_grid_search.fit(    
    X=malawi[district_columns], 
    y=malawi.asset_index,
    sample_weight=malawi.hh_wgt
)

  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


CPU times: user 3min 55s, sys: 10min 9s, total: 14min 5s
Wall time: 35.4 s


  model = cd_fast.enet_coordinate_descent(


In [26]:
display(lasso_grid_search.best_params_)

{'alpha': 1e-06}

In [27]:
display(lasso_grid_search.best_score_)

0.14717503161064333

### By EA

#### Predicting consumption

In [96]:
ea_columns = [c for c in malawi.columns if c.startswith('ea_id')]

In [97]:
lr = sklearn_linear_model.LinearRegression()
lr.fit(
    malawi[ea_columns], 
    malawi.outcome,
    sample_weight=malawi.hh_wgt
)

y_pred = lr.predict(malawi[ea_columns])

r2_score = sklearn_metrics.r2_score(
    malawi.outcome, y_pred, sample_weight=malawi.hh_wgt
)

In [18]:
cross_val_score = sklearn_model_selection.cross_val_score(
    lr,
    X=malawi[ea_columns], 
    y=malawi.outcome,
    params={'sample_weight': malawi.hh_wgt},
    scoring='r2',
    cv=sklearn_model_selection.KFold(n_splits=5, shuffle=True, random_state=RANDOM_STATE)
)
display(cross_val_score.mean())

0.08266767656355493

In [111]:
%%time
lasso = sklearn_linear_model.Lasso(max_iter=5000)
alphas = [1e-6, 1e-5, 1e-4, 1e-3]
lasso_grid_search = sklearn_model_selection.GridSearchCV(
    lasso,
    {'alpha': alphas},
    scoring='r2',
    cv=sklearn_model_selection.KFold(n_splits=5, shuffle=True, random_state=RANDOM_STATE)
)
lasso_grid_search.fit(    
    X=malawi[ea_columns], 
    y=malawi.outcome,
    sample_weight=malawi.hh_wgt
)

CPU times: user 38min 41s, sys: 2h 44min 35s, total: 3h 23min 17s
Wall time: 4min 19s


In [112]:
display(lasso_grid_search.best_score_)

0.12423916624082476

In [114]:
display(lasso_grid_search.best_params_)

{'alpha': 0.0001}

In [None]:
%%time

"""
class sklearn.ensemble.GradientBoostingRegressor(*, loss='squared_error', learning_rate=0.1, n_estimators=100,
subsample=1.0, criterion='friedman_mse', min_samples_split=2, min_samples_leaf=1, min_weight_fraction_leaf=0.0,
max_depth=3, min_impurity_decrease=0.0, init=None, random_state=None, max_features=None, alpha=0.9, verbose=0, 
max_leaf_nodes=None, warm_start=False, validation_fraction=0.1, n_iter_no_change=None, tol=0.0001, ccp_alpha=0.0
)

Parameter grid from chi et al:
Hyperparameters were tuned to minimize the cross-validated mean-squared error, using a grid search over 
several possible values for maximum tree depth (1, 3, 5, 10, 15, 20, 31) and the minimum sum of instance 
weight needed in a child (1, 3, 5, 7, 10).
"""
gb_hyperparameters_from_cider = {
    'min_samples_leaf': [1,7,20],
    'max_depth': [1, 5, 10, 25],
}
gb_classifier = sklearn_ensemble.GradientBoostingRegressor()
gb_grid_search = sklearn_model_selection.GridSearchCV(
    gb_classifier, 
    gb_hyperparameters_from_cider, 
    scoring='r2',
    cv=sklearn_model_selection.KFold(n_splits=5, shuffle=True, random_state=RANDOM_STATE),
    verbose=1,
    n_jobs=40
)

gb_grid_search.fit(    
    X=malawi[ea_columns], 
    y=malawi.outcome,
    sample_weight=malawi.hh_wgt
)

Fitting 5 folds for each of 12 candidates, totalling 60 fits


In [21]:
gb_grid_search.best_score_

0.11337282319103255

#### Predicting asset index

In [31]:
lr = sklearn_linear_model.LinearRegression()
lr.fit(
    malawi[ea_columns], 
    malawi.asset_index,
    sample_weight=malawi.hh_wgt
)

y_pred = lr.predict(malawi[ea_columns])

r2_score = sklearn_metrics.r2_score(
    malawi.asset_index, y_pred, sample_weight=malawi.hh_wgt
)

In [32]:
r2_score

0.2958634898250986

In [33]:
cross_val_score = sklearn_model_selection.cross_val_score(
    lr,
    X=malawi[ea_columns], 
    y=malawi.asset_index,
    params={'sample_weight': malawi.hh_wgt},
    scoring='r2',
    cv=sklearn_model_selection.KFold(n_splits=5, shuffle=True, random_state=RANDOM_STATE)
)
display(cross_val_score.mean())

0.16087145042569134

In [38]:
%%time
lasso = sklearn_linear_model.Lasso(max_iter=5000)
alphas = [3e-5, 1e-4, 3e-4]
lasso_grid_search = sklearn_model_selection.GridSearchCV(
    lasso,
    {'alpha': alphas},
    scoring='r2',
    cv=sklearn_model_selection.KFold(n_splits=5, shuffle=True, random_state=RANDOM_STATE)
)
lasso_grid_search.fit(    
    X=malawi[ea_columns], 
    y=malawi.asset_index,
    sample_weight=malawi.hh_wgt
)

CPU times: user 1min 44s, sys: 7min 10s, total: 8min 54s
Wall time: 17.2 s


In [39]:
display(lasso_grid_search.best_score_)

0.1975326417750394

In [40]:
display(lasso_grid_search.best_params_)

{'alpha': 0.0001}

In [41]:
%%time

"""
class sklearn.ensemble.GradientBoostingRegressor(*, loss='squared_error', learning_rate=0.1, n_estimators=100,
subsample=1.0, criterion='friedman_mse', min_samples_split=2, min_samples_leaf=1, min_weight_fraction_leaf=0.0,
max_depth=3, min_impurity_decrease=0.0, init=None, random_state=None, max_features=None, alpha=0.9, verbose=0, 
max_leaf_nodes=None, warm_start=False, validation_fraction=0.1, n_iter_no_change=None, tol=0.0001, ccp_alpha=0.0
)

Parameter grid from chi et al:
Hyperparameters were tuned to minimize the cross-validated mean-squared error, using a grid search over 
several possible values for maximum tree depth (1, 3, 5, 10, 15, 20, 31) and the minimum sum of instance 
weight needed in a child (1, 3, 5, 7, 10).
"""
gb_hyperparameters_from_cider = {
    'min_samples_leaf': [1,7,20],
    'max_depth': [1, 5, 10, 25],
}
gb_classifier = sklearn_ensemble.GradientBoostingRegressor()
gb_grid_search = sklearn_model_selection.GridSearchCV(
    gb_classifier, 
    gb_hyperparameters_from_cider, 
    scoring='r2',
    cv=sklearn_model_selection.KFold(n_splits=5, shuffle=True, random_state=RANDOM_STATE),
    verbose=1,
    n_jobs=40
)

gb_grid_search.fit(    
    X=malawi[ea_columns], 
    y=malawi.asset_index,
    sample_weight=malawi.hh_wgt
)

Fitting 5 folds for each of 12 candidates, totalling 60 fits
CPU times: user 51 s, sys: 1.08 s, total: 52.1 s
Wall time: 1min 46s


In [42]:
gb_grid_search.best_score_

0.1856745929489712

## Mosaiks-only models

In [58]:
mosaiks_columns = [c for c in malawi.columns if c.startswith('mosaiks')]

mosaiks_means = malawi[mosaiks_columns].mean()
mosaiks_stds = malawi[mosaiks_columns].std()
malawi_mosaiks_normalized = (malawi[mosaiks_columns] - mosaiks_means) / mosaiks_stds
# std of 0 -> div by 0 in previous step; fill with zeros.
malawi_mosaiks_normalized.fillna(value=0, inplace=True)

malawi_mosaiks_normalized[['case_id', 'hh_wgt', 'outcome', 'asset_index']] = (
    malawi[['case_id', 'hh_wgt', 'outcome', 'asset_index']]
)

### Predicting consumption

In [84]:
%%time
lasso = sklearn_linear_model.Lasso(max_iter=5000)
alphas = [0.001]

lasso_grid_search = sklearn_model_selection.GridSearchCV(
    lasso,
    {'alpha': alphas},
    scoring='r2',
    cv=sklearn_model_selection.KFold(n_splits=5, shuffle=True, random_state=RANDOM_STATE),
    n_jobs=40

)
lasso_grid_search.fit(    
    X=malawi_mosaiks_normalized[mosaiks_columns], 
    y=malawi_mosaiks_normalized.outcome,
    sample_weight=malawi_mosaiks_normalized.hh_wgt
)

  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


CPU times: user 1h 21min 13s, sys: 3h 47min 13s, total: 5h 8min 26s
Wall time: 9min 43s


  model = cd_fast.enet_coordinate_descent(


In [85]:
display(lasso_grid_search.best_params_)

{'alpha': 0.001}

In [86]:
display(lasso_grid_search.best_score_)

0.11609345002255211

In [87]:
%%time

"""
class sklearn.ensemble.GradientBoostingRegressor(*, loss='squared_error', learning_rate=0.1, n_estimators=100,
subsample=1.0, criterion='friedman_mse', min_samples_split=2, min_samples_leaf=1, min_weight_fraction_leaf=0.0,
max_depth=3, min_impurity_decrease=0.0, init=None, random_state=None, max_features=None, alpha=0.9, verbose=0, 
max_leaf_nodes=None, warm_start=False, validation_fraction=0.1, n_iter_no_change=None, tol=0.0001, ccp_alpha=0.0
)

Parameter grid from chi et al:
Hyperparameters were tuned to minimize the cross-validated mean-squared error, using a grid search over 
several possible values for maximum tree depth (1, 3, 5, 10, 15, 20, 31) and the minimum sum of instance 
weight needed in a child (1, 3, 5, 7, 10).
"""
gb_hyperparameters_from_cider = {
    'min_samples_leaf': [1,7,20],
    'max_depth': [1, 5, 10, 25],
}
gb_classifier = sklearn_ensemble.GradientBoostingRegressor()
gb_grid_search = sklearn_model_selection.GridSearchCV(
    gb_classifier, 
    gb_hyperparameters_from_cider, 
    scoring='r2',
    cv=sklearn_model_selection.KFold(n_splits=5, shuffle=True, random_state=RANDOM_STATE),
    verbose=1,
    n_jobs=40
)

gb_grid_search.fit(    
    X=malawi_mosaiks_normalized[mosaiks_columns], 
    y=malawi_mosaiks_normalized.outcome,
    sample_weight=malawi_mosaiks_normalized.hh_wgt
)

Fitting 5 folds for each of 12 candidates, totalling 60 fits
CPU times: user 9min 25s, sys: 3.48 s, total: 9min 28s
Wall time: 53min 46s


In [88]:
gb_grid_search.best_score_

0.12287005218374855

#### PCA on Mosaiks features

In [139]:
pca = sklearn_decomposition.PCA(n_components=200)
mosaiks_pca_array = pca.fit_transform(malawi_mosaiks_normalized[mosaiks_columns])
mosaiks_pca = pd.DataFrame(data=mosaiks_pca_array)
mosaiks_pca_columns = [f'pca_{c}' for c in mosaiks_pca.columns]
mosaiks_pca.columns = mosaiks_pca_columns

In [140]:
mosaiks_pca[['HHID', 'case_id', 'hh_wgt', 'outcome']] = (
    malawi_mosaiks_normalized[['HHID', 'case_id', 'hh_wgt', 'outcome']]
)

In [143]:
%%time
gb_hyperparameters_from_cider = {
    'min_samples_leaf': [1,7,20],
    'max_depth': [1, 5, 10, 25],
}
gb_classifier = sklearn_ensemble.GradientBoostingRegressor()
gb_grid_search = sklearn_model_selection.GridSearchCV(
    gb_classifier, 
    gb_hyperparameters_from_cider, 
    scoring='r2', 
    cv=sklearn_model_selection.KFold(n_splits=5, shuffle=True, random_state=RANDOM_STATE),
    verbose=1, 
    n_jobs=40
)

gb_grid_search.fit(    
    X=mosaiks_pca[mosaiks_pca_columns[:100]], 
    y=mosaiks_pca.outcome,
    sample_weight=mosaiks_pca.hh_wgt
)

Fitting 5 folds for each of 12 candidates, totalling 60 fits
CPU times: user 14.1 s, sys: 116 ms, total: 14.2 s
Wall time: 1min 31s


In [144]:
gb_grid_search.best_score_

0.12147961046829074

### Predicting asset index

In [127]:
%%time
lasso = sklearn_linear_model.Lasso(max_iter=5000)
alphas = [3e-6, 1e-5]

lasso_grid_search = sklearn_model_selection.GridSearchCV(
    lasso,
    {'alpha': alphas},
    scoring='r2',
    cv=sklearn_model_selection.KFold(n_splits=5, shuffle=True, random_state=RANDOM_STATE),
    n_jobs=40

)
lasso_grid_search.fit(    
    X=malawi_mosaiks_normalized[mosaiks_columns], 
    y=malawi_mosaiks_normalized.asset_index,
    sample_weight=malawi_mosaiks_normalized.hh_wgt
)

  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


CPU times: user 2h 52min 27s, sys: 5h 26min 14s, total: 8h 18min 42s
Wall time: 15min 47s


  model = cd_fast.enet_coordinate_descent(


In [126]:
display(lasso_grid_search.best_params_)
display(lasso_grid_search.best_score_)

{'alpha': 3e-05}

0.19045245221653836

In [129]:
%%time

"""
class sklearn.ensemble.GradientBoostingRegressor(*, loss='squared_error', learning_rate=0.1, n_estimators=100,
subsample=1.0, criterion='friedman_mse', min_samples_split=2, min_samples_leaf=1, min_weight_fraction_leaf=0.0,
max_depth=3, min_impurity_decrease=0.0, init=None, random_state=None, max_features=None, alpha=0.9, verbose=0, 
max_leaf_nodes=None, warm_start=False, validation_fraction=0.1, n_iter_no_change=None, tol=0.0001, ccp_alpha=0.0
)

Parameter grid from chi et al:
Hyperparameters were tuned to minimize the cross-validated mean-squared error, using a grid search over 
several possible values for maximum tree depth (1, 3, 5, 10, 15, 20, 31) and the minimum sum of instance 
weight needed in a child (1, 3, 5, 7, 10).
"""
gb_hyperparameters_from_cider = {
    'min_samples_leaf': [1,7,20],
    'max_depth': [1, 5, 10, 25],
}
gb_classifier = sklearn_ensemble.GradientBoostingRegressor()
gb_grid_search = sklearn_model_selection.GridSearchCV(
    gb_classifier, 
    gb_hyperparameters_from_cider, 
    scoring='r2',
    cv=sklearn_model_selection.KFold(n_splits=5, shuffle=True, random_state=RANDOM_STATE),
    verbose=1,
    n_jobs=40
)

gb_grid_search.fit(    
    X=malawi_mosaiks_normalized[mosaiks_columns], 
    y=malawi_mosaiks_normalized.asset_index,
    sample_weight=malawi_mosaiks_normalized.hh_wgt
)

Fitting 5 folds for each of 12 candidates, totalling 60 fits
CPU times: user 8min 49s, sys: 3.37 s, total: 8min 52s
Wall time: 54min 5s


In [130]:
gb_grid_search.best_score_

0.1923870657494606

#### PCA on Mosaiks features

In [133]:
pca = sklearn_decomposition.PCA(n_components=200)
mosaiks_pca_array = pca.fit_transform(malawi_mosaiks_normalized[mosaiks_columns])
mosaiks_pca = pd.DataFrame(data=mosaiks_pca_array)
mosaiks_pca_columns = [f'pca_{c}' for c in mosaiks_pca.columns]
mosaiks_pca.columns = mosaiks_pca_columns

In [136]:
mosaiks_pca[['case_id', 'hh_wgt', 'outcome', 'asset_index']] = (
    malawi_mosaiks_normalized[['case_id', 'hh_wgt', 'outcome', 'asset_index']]
)

In [None]:
%%time
gb_hyperparameters_from_cider = {
    'min_samples_leaf': [1,7,20],
    'max_depth': [1, 5, 10, 25],
}
gb_classifier = sklearn_ensemble.GradientBoostingRegressor()
gb_grid_search = sklearn_model_selection.GridSearchCV(
    gb_classifier, 
    gb_hyperparameters_from_cider, 
    scoring='r2', 
    cv=sklearn_model_selection.KFold(n_splits=5, shuffle=True, random_state=RANDOM_STATE),
    verbose=1, 
    n_jobs=40
)

gb_grid_search.fit(    
    X=mosaiks_pca[mosaiks_pca_columns[:100]], 
    y=mosaiks_pca.asset_index,
    sample_weight=mosaiks_pca.hh_wgt
)

Fitting 5 folds for each of 12 candidates, totalling 60 fits


In [139]:
gb_grid_search.best_score_

0.19119701442596204

## Univariate r2s for all covariates considered

In [214]:
r2s_univariate = []
malawi_covariate_columns_list = list(covariates_to_consider)
for selected_covariate in malawi_covariate_columns_list:
    lr = sklearn_linear_model.LinearRegression()
    lr.fit(
        malawi[[selected_covariate]], 
        malawi.outcome,
        sample_weight=malawi.hh_wgt
    )
    # Make predictions on test data
    y_pred = lr.predict(malawi[[selected_covariate]])
    
    r2 = sklearn_metrics.r2_score(
        malawi.outcome, y_pred, sample_weight=malawi.hh_wgt
    )
    r2s_univariate.append(r2)

In [None]:
all_univariate_r2s = pd.DataFrame(
    data=np.array([malawi_covariate_columns_list, r2s_univariate]).transpose(),
    columns=['covariate', 'univariate_r2']
)
all_univariate_r2s.univariate_r2 = all_univariate_r2s.univariate_r2.astype(float)
display(all_univariate_r2s.sort_values('univariate_r2', ascending=False).head(50))

## Print summaries of covariates

In [None]:
# We determine what is included by omitting what's not included. This approach handles one-hot encoded
# columns correctly.

covariates_considered = [c for c in not_dropped_for_missingness if c not in columns_excluded]
with pd.option_context('display.max_rows', 300, 'display.max_colwidth', 1):

    display(
        summary[
            summary.covariate.isin(covariates_considered)
        ]
        [['covariate', 'description', 'missing_fraction', 'mean', 'median', 'std']]
    )

In [None]:
with pd.option_context('display.max_rows', 300, 'display.max_colwidth', 1):

    display(
        summary[
            (summary.covariate.isin(columns_excluded))
            & (~summary.covariate.isin(dropped_for_missingness))
        ]
        [['covariate', 'description', 'missing_fraction', 'mean', 'median', 'std']]
    )

In [None]:
with pd.option_context('display.max_rows', 200, 'display.max_colwidth', 1):

    display(
        summary[summary.covariate.isin(dropped_for_missingness)]
        [['covariate', 'description', 'missing_fraction', 'mean', 'median', 'std']]
    )

# Unused

In [120]:
%%time
lasso_cv = sklearn_linear_model.LassoCV(
    max_iter=5000,
    cv=sklearn_model_selection.KFold(n_splits=5, shuffle=True, random_state=RANDOM_STATE),
    n_jobs=40,
    eps=1e-3,
    random_state=RANDOM_STATE,
    selection='random'
)

lasso_cv = lasso_cv.fit(    
    X=malawi_mosaiks_normalized[mosaiks_columns], 
    y=malawi_mosaiks_normalized.asset_index,
    sample_weight=malawi_mosaiks_normalized.hh_wgt
)

cv_scores = sklearn_model_selection.cross_val_score(
    lasso_cv, 
    malawi_mosaiks_normalized[mosaiks_columns],
    malawi_mosaiks_normalized.asset_index,
    cv=5,
    scoring='r2', 
    params={'sample_weight': malawi_mosaiks_normalized.hh_wgt},

)

  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descen

CPU times: user 3h 8min 6s, sys: 6h 28min 19s, total: 9h 36min 26s
Wall time: 30min 15s


  model = cd_fast.enet_coordinate_descent(


In [None]:
cv_scores = sklearn_model_selection.cross_val_score(
    lasso_cv, 
    malawi_mosaiks_normalized[mosaiks_columns],
    malawi_mosaiks_normalized.asset_index,
    cv=5,
    scoring='r2', 
    params={'sample_weight': malawi_mosaiks_normalized.hh_wgt},
)

  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descen

In [44]:
geo = gpd.read_file(
    malawi_directory / 'mwi_adm_nso_hotosm_20230405_shp' / 'mwi_admbnda_adm3_nso_hotosm_20230405.shp'
)

ImportError: The 'read_file' function requires the 'pyogrio' or 'fiona' package, but neither is installed or imports correctly.
Importing fiona resulted in: /home/selker/.conda/envs/leo_base/lib/python3.9/site-packages/fiona/../../../libgdal.so.34: undefined symbol: sqlite3_total_changes64
Importing pyogrio resulted in: No module named 'pyogrio'

In [43]:
[c for c in malawi_raw if 'district' in c]

['district']

In [232]:
selected_covariate_list = selected_covariates_old

['hh_f11_ELECTRICITY',
 'hh_f12_GAS',
 'hh_f12_ELECTRICITY',
 'hh_t17_YES',
 'hh_f12_CHARCOAL',
 'hh_f36_PIPED INTO DWELLING',
 'hh_t10_BED &amp; MATTRESS',
 'hh_g09_YES',
 'hh_h04_YES',
 'hh_t03_It was more than adequate for household needs',
 'hh_f01_OWNED',
 'hh_h02d',
 'hh_t14_YES',
 'region_North',
 'hh_t07',
 'hh_f41_4_NO',
 'hh_t04_It was more than adequate for household needs',
 'hh_t11_BLANKET &amp; SHEETS',
 'hh_h01_YES',
 'hh_f52_YES',
 'mosaiks_2712',
 'district_Zomba',
 'district_Blantyre',
 'district_Chiradzulu',
 'hh_h03a',
 'hh_o0a_YES',
 'hh_f07_CONCRETE',
 'hh_f09_OTHER(SPECIFY)',
 'hh_f09_SAND',
 'hh_t03_It was less than adequate for household needs']

In [237]:
r2s_univariate = [np.nan]
for selected_covariate in selected_covariate_list:
    lr = sklearn_linear_model.LinearRegression()
    lr.fit(
        malawi[[selected_covariate]], 
        malawi.consumption_ppp_2017,
        sample_weight=malawi.hh_wgt
    )
    # Make predictions on test data
    y_pred = lr.predict(malawi[[selected_covariate]])
    
    r2 = sklearn_metrics.r2_score(
        malawi.consumption_ppp_2017, y_pred, sample_weight=malawi.hh_wgt
    )
    r2s_univariate.append(r2)

selected_covariates = pd.DataFrame(
    # add 'none' in front to indicate variance pre-covariates
    np.array([[None] + selected_covariate_list, mses_cumulative, r2s_cumulative, r2s_univariate]).transpose(), 
    columns=['Covariate', 'Cumulative Model MSE', 'Cumulative Model r2', 'Univariate r2']
)

selected_covariates['Description'] = selected_covariates.Covariate.apply(interpret_column_name)
selected_covariates['Cumulative Model MSE'] = selected_covariates['Cumulative Model MSE'].astype(float).round(1)
selected_covariates['Cumulative Model r2'] = selected_covariates['Cumulative Model r2'].astype(float).round(3)
selected_covariates['Univariate r2'] = selected_covariates['Univariate r2'].astype(float).round(3)

with pd.option_context('display.max_rows', 200, 'display.max_colwidth', 1):

    display(
        selected_covariates[['Covariate', 'Description', 'Cumulative Model MSE', 'Cumulative Model r2', 'Univariate r2']]
        .set_index('Covariate')
    )

Unnamed: 0_level_0,Description,Cumulative Model MSE,Cumulative Model r2,Univariate r2
Covariate,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
,,18244076.8,0.0,
hh_f11_ELECTRICITY,"Covariate: What is your main source of lighting fuel?, value: ELECTRICITY",11732814.1,0.195,0.13
hh_f12_GAS,"Covariate: What is your main source of cooking fuel?, value: GAS",9920509.3,0.32,0.115
hh_f12_ELECTRICITY,"Covariate: What is your main source of cooking fuel?, value: ELECTRICITY",8931944.7,0.388,0.127
hh_t17_YES,"Covariate: ..HH ate less than you thought you sh'd b'se of a lack of money/other resources?, value: YES",8226515.5,0.436,0.09
hh_f12_CHARCOAL,"Covariate: What is your main source of cooking fuel?, value: CHARCOAL",7895538.4,0.459,0.055
hh_f36_PIPED INTO DWELLING,"Covariate: What is your main source of drinking water?, value: PIPED INTO DWELLING",7653646.6,0.475,0.123
hh_t10_BED &amp; MATTRESS,"Covariate: What do you (HH HEAD) sleep on?, value: BED &amp; MATTRESS",7464199.9,0.488,0.101
hh_g09_YES,"Covariate: Over the past one week (7 days), did any people that you did nonlist as househol, value: YES",7338646.5,0.497,0.002
hh_h04_YES,"Covariate: ..12 months..faced with a situation when did not have enough food to feed the hh, value: YES",7193763.0,0.507,0.066


In [823]:
dropped_for_missingness

Index(['ag_e27a', 'ag_e27b', 'ag_e27c', 'ag_e27d', 'ag_e27e', 'ag_e27f',
       'ag_e27g', 'ag_e27h', 'ag_e28', 'ag_e29a',
       ...
       'hh_s16_oth', 'hh_s16a', 'hh_s16b', 'hh_s17', 'hh_s17_oth',
       'hh_s19_oth', 'hh_s19a', 'hh_s19b', 'hh_t10_oth', 'hh_t12_oth'],
      dtype='object', length=144)

In [673]:
# Using sklearn: Haven't figured out how to incorporate sample weights.
linear_regression = sklearn_linear_model.LinearRegression()
sfs = sklearn_feature_selection.SequentialFeatureSelector(
    linear_regression, n_features_to_select=10
)
sfs.fit(
    selection[list(malawi_covariate_columns)], 
    selection.consumption_ppp_2017,
    sample_weights=selection.hh_wgt
)

TypeError: fit() got an unexpected keyword argument 'sample_weights'

In [656]:
sklearn_selected_covariates = pd.DataFrame(
    sfs.get_feature_names_out(), 
    columns=['covariate']
)

sklearn_selected_covariates['description'] = sklearn_selected_covariates.covariate.apply(interpret_column_name)
# selected_covariates.mse = selected_covariates.mse.astype(float).round(1)