Code accompanying the manuscript under review - `Nowcasting household energy access for up-to-date monitoring of Sustainable Development Goals` by Pokhriyal, Letouze and Vosoughi. Data availability is described in the Methods section and the reporting summary.

In [None]:
%run Utilities.ipynb

## Load administrative information

In [None]:
def _pad_id8(raw_id):
    """Render an identifier as a string left-padded with zeros to 8 characters."""
    return str(raw_id).zfill(8)


# Distance table between pairs of units; both ID columns are normalised to
# zero-padded 8-character strings.
map_distances = pd.read_csv('./com_distances.csv')
for id_col in ('InputID', 'TargetID'):
    map_distances[id_col] = map_distances[id_col].map(_pad_id8)

In [None]:
# Region lookup table; the IDDR code is stored as a zero-padded 2-character string.
region_names = pd.read_csv('./region_names.csv').assign(
    IDDR=lambda frame: frame['IDDR'].astype(str).str.zfill(2)
)

## Load all targets

In [None]:
def _load_commune_targets(survey_year):
    """Read the targets CSV for `survey_year`, indexed by zero-padded CCOD_CRCA.

    Passing the path (rather than an open handle) lets pandas open and close
    the file itself, so no file descriptor is leaked.
    """
    frame = pd.read_csv('dhs_data/{}_targets/targets_comm.csv'.format(survey_year))
    frame['CCOD_CRCA'] = frame['CCOD_CRCA'].astype(str).str.zfill(8)
    return frame.set_index('CCOD_CRCA')


# The 2013 targets are stored as a pickle.
# NOTE: unpickling executes arbitrary code - only load a trusted targets.pickle.
with open('targets.pickle', 'rb') as targets_file:
    targets_2013 = pickle.load(targets_file)

targets_2015 = _load_commune_targets(2015)
targets_2017 = _load_commune_targets(2017)
targets_2019 = _load_commune_targets(2019)


## Load all inputs

In [None]:
# Build the PCA model from the 2013 landsat (mean) data only; the fitted
# model is then handed to loadAllData for every year.
landsat = loadLandsatData(2013)

# number of principal components kept as features
n_components = 20
pcamd = PCA(n_components=n_components).fit(landsat.values)

# total share of variance captured by the retained components
print(pcamd.explained_variance_ratio_.sum())

In [None]:
# For each year: load the inputs, tag them with the year, join the targets
# and replace missing values in the joined frame with 0.

# 2013 uses the default (left) join, so every row of the inputs is kept.
data_2013_all = loadAllData(2013, pcamd)
data_2013_all['year'] = 2013
data_2013_aug = data_2013_all.join(targets_2013)
data_2013_aug.fillna(0, inplace=True)

# Later years keep only rows that also appear in the targets (inner join).
data_2015_all = loadAllData(2015, pcamd)
data_2015_all['year'] = 2015
data_2015_aug = data_2015_all.join(targets_2015, how='inner')
data_2015_aug.fillna(0, inplace=True)

data_2017_all = loadAllData(2017, pcamd)
data_2017_all['year'] = 2017
data_2017_aug = data_2017_all.join(targets_2017, how='inner')
data_2017_aug.fillna(0, inplace=True)

# The 2020 inputs are paired with the 2019 targets.
data_2020_all = loadAllData(2020, pcamd)
data_2020_all['year'] = 2020
data_2020_aug = data_2020_all.join(targets_2019, how='inner')
# Fix: the joined frame was never filled for 2020 (the original filled
# data_2020_all instead), unlike every other year.
data_2020_aug.fillna(0, inplace=True)
# Kept so that later cells reading data_2020_all see the same values as before.
data_2020_all.fillna(0, inplace=True)


In [None]:
# Target columns, and the landsat feature column names
# (one mean and one variance column per PCA component, numbered from 1).
outputs = ['Candle', 'Electric', 'Lamp', 'Other_lighting','Coal', 'Gas', 'Wood','Other_fuel']
_component_numbers = range(1, pcamd.n_components + 1)
ls_cols = [f'ls_{k}' for k in _component_numbers]
ls_var_cols = [f'ls_var_{k}' for k in _component_numbers]

In [None]:
def _feature_columns(yr):
    """Return (nl/aod column names for `yr`, full ordered list of input columns).

    The input list is: landsat means, landsat variances, the four
    year-specific nl/aod columns, and finally 'year' (always last).
    """
    nl_aod = [f'{source}_{yr}_{stat}' for stat in ('avg', 'var') for source in ('nl', 'aod')]
    return nl_aod, ls_cols + ls_var_cols + nl_aod + ['year']


# 2013: only the labelled (joined) frame is needed
year = 2013
nl_aod_cols, inputs = _feature_columns(year)
X2013 = data_2013_aug[inputs].fillna(0).values
y2013 = data_2013_aug[outputs].values

# 2015: features for all rows (prediction) and for labelled rows (training)
year = 2015
nl_aod_cols, inputs = _feature_columns(year)
X2015all = data_2015_all[inputs].fillna(0).values
X2015 = data_2015_aug[inputs].fillna(0).values
y2015 = data_2015_aug[outputs].values

# 2017
year = 2017
nl_aod_cols, inputs = _feature_columns(year)
X2017all = data_2017_all[inputs].fillna(0).values
X2017 = data_2017_aug[inputs].fillna(0).values
y2017 = data_2017_aug[outputs].values

# 2020: no target matrix is built for this year
year = 2020
nl_aod_cols, inputs = _feature_columns(year)
X2020all = data_2020_all[inputs].fillna(0).values
X2020 = data_2020_aug[inputs].fillna(0).values


## Nowcasting the household energy access using EO data

### Predictions for 2015

In [None]:
_ur_code = {'rural': 0, 'urban': 1}

# --- Training data: labelled 2013 rows only ---
Xtrain = np.vstack([X2013])
ytrain = np.vstack([y2013])

# Standardise every feature except the last column ('year'), which is
# appended back unscaled.
scalerX = StandardScaler().fit(Xtrain[:, :-1])
Xtrain = np.hstack([scalerX.transform(Xtrain[:, :-1]), Xtrain[:, -1:]])

# coordinates and urban/rural indicator of the training rows
strain = np.vstack([data_2013_aug[['lon', 'lat']].values])
urtrain = np.hstack([data_2013_aug['TYPE'].map(_ur_code)])

# --- Test data: every 2015 row, scaled with the training scaler ---
Xtest = np.hstack([scalerX.transform(X2015all[:, :-1]), X2015all[:, -1:]])
stest = data_2015_all[['lon', 'lat']].values
urtest = data_2015_all['TYPE'].map(_ur_code).values

In [None]:
# Fit the GPR model on the training set and predict for all 2015 rows.
mo_kernel = True
alpha = 0.1
YPred, var, models = run_GPyTorchMO(
    Xtrain, ytrain, Xtest, alpha,
    strain, stest,
    urtrain[:, None], urtest[:, None],  # urban/rural flags as column vectors
    fitlinear=True, ard=False, mo_ard=False,
    initPhis=None, return_models=True, hasTime=True,
)

# Package predicted means and variances for 2015.
urban_flag_2015 = data_2015_all['TYPE'].map({'rural': 0, 'urban': 1})
pred_means_df_2015, pred_vars_df_2015 = prepPredictions(
    YPred, var, data_2015_all, outputs, urban_flag_2015
)

### Predictions for 2017

In [None]:
_ur_code = {'rural': 0, 'urban': 1}
_labelled_frames = [data_2013_aug, data_2015_aug]

# --- Training data: labelled 2013 and 2015 rows, stacked in that order ---
Xtrain = np.vstack([X2013, X2015])
ytrain = np.vstack([y2013, y2015])

# Standardise every feature except the last column ('year'), which is
# appended back unscaled.
scalerX = StandardScaler().fit(Xtrain[:, :-1])
Xtrain = np.hstack([scalerX.transform(Xtrain[:, :-1]), Xtrain[:, -1:]])

# coordinates and urban/rural indicator, in the same row order as Xtrain
strain = np.vstack([frame[['lon', 'lat']].values for frame in _labelled_frames])
urtrain = np.hstack([frame['TYPE'].map(_ur_code) for frame in _labelled_frames])

# --- Test data: every 2017 row, scaled with the training scaler ---
Xtest = np.hstack([scalerX.transform(X2017all[:, :-1]), X2017all[:, -1:]])
stest = data_2017_all[['lon', 'lat']].values
urtest = data_2017_all['TYPE'].map(_ur_code).values

In [None]:
# Fit the GPR model on the training set and predict for all 2017 rows.
mo_kernel = True
alpha = 0.1
YPred, var, models = run_GPyTorchMO(
    Xtrain, ytrain, Xtest, alpha,
    strain, stest,
    urtrain[:, None], urtest[:, None],  # urban/rural flags as column vectors
    fitlinear=True, ard=False, mo_ard=False,
    initPhis=None, return_models=True, hasTime=True,
)

# Package predicted means and variances for 2017.
urban_flag_2017 = data_2017_all['TYPE'].map({'rural': 0, 'urban': 1})
pred_means_df_2017, pred_vars_df_2017 = prepPredictions(
    YPred, var, data_2017_all, outputs, urban_flag_2017
)

### Predictions for 2020

In [None]:
_ur_code = {'rural': 0, 'urban': 1}
_labelled_frames = [data_2013_aug, data_2015_aug, data_2017_aug]

# --- Training data: labelled 2013, 2015 and 2017 rows, stacked in that order ---
Xtrain = np.vstack([X2013, X2015, X2017])
ytrain = np.vstack([y2013, y2015, y2017])

# Standardise every feature except the last column ('year'), which is
# appended back unscaled.
scalerX = StandardScaler().fit(Xtrain[:, :-1])
Xtrain = np.hstack([scalerX.transform(Xtrain[:, :-1]), Xtrain[:, -1:]])

# coordinates and urban/rural indicator, in the same row order as Xtrain
strain = np.vstack([frame[['lon', 'lat']].values for frame in _labelled_frames])
urtrain = np.hstack([frame['TYPE'].map(_ur_code) for frame in _labelled_frames])

# --- Test data: every 2020 row, scaled with the training scaler ---
Xtest = np.hstack([scalerX.transform(X2020all[:, :-1]), X2020all[:, -1:]])
stest = data_2020_all[['lon', 'lat']].values
urtest = data_2020_all['TYPE'].map(_ur_code).values

In [None]:
# Fit the GPR model on the training set and predict for all 2020 rows.
mo_kernel=True
alpha=0.1
# Fix: the original passed `ard=ard`, but `ard` is never assigned in this
# notebook's visible cells. The 2015 and 2017 runs use ard=False, so the same
# setting is used here.
YPred,var,models = run_GPyTorchMO(Xtrain,ytrain,Xtest,alpha,
                                  strain,stest,urtrain[:,np.newaxis],urtest[:,np.newaxis],
                                  fitlinear=True,ard=False,mo_ard=False,
                                  initPhis=None,return_models=True,hasTime=True)
# Package predicted means and variances for 2020.
pred_means_df_2020,pred_vars_df_2020 = prepPredictions(YPred,var,data_2020_all,outputs,
                                                       data_2020_all['TYPE'].map({'rural':0,'urban':1}))

## Analyzing the interconnectedness of energy access and population growth for microregions

In [None]:
# Population in 2020 and 2013 side by side (aligned on the index of the 2020
# frame), plus the absolute change between the two.
pop_df = data_2020_aug[['pop2020']].join(data_2013_aug[['pop2013']])
pop_df = pop_df.assign(pop_increase=pop_df['pop2020'] - pop_df['pop2013'])

In [None]:
# Attach settlement type and 2013 population to the 2013 targets, then add a
# '<column>_pop' column (value * population) for each of the first eight columns.
# NOTE(review): presumably the first eight columns are the target shares - confirm.
targets_2013_aug = targets_2013.join(data_2013_aug[['TYPE', 'pop2013']])
for col in targets_2013_aug.columns[:8]:
    targets_2013_aug[f'{col}_pop'] = targets_2013_aug[col] * targets_2013_aug['pop2013']

In [None]:
# Attach the 2020 population to the 2020 predictions, then add a
# '<column>_pop' column (value * population) for each of the first eight columns.
# Fix: the original joined `data_2019_all`, which does not exist; the 2020
# inputs (and their 'pop2020' column) live in `data_2020_all`.
# NOTE(review): presumably the first eight columns are the predicted target
# shares - confirm against prepPredictions.
pred_means_df_2020_aug = pred_means_df_2020.join(data_2020_all['pop2020'])
for t in pred_means_df_2020_aug.columns[0:8]:
    pred_means_df_2020_aug['{}_pop'.format(t)] = pred_means_df_2020_aug[t]*pred_means_df_2020_aug['pop2020']

In [None]:
# Region code = first two characters of the index; combined with the
# settlement type it forms the grouping key '<region>_<TYPE>'.
targets_2013_aug['region'] = targets_2013_aug.index.str[0:2]
targets_2013_aug['region_TYPE'] = targets_2013_aug['region']+'_'+targets_2013_aug['TYPE']

# Aggregate 2013 at region+type level: mean of the numeric columns, but
# population is summed rather than averaged.
# numeric_only=True skips the string columns ('region', 'TYPE'); without it,
# recent pandas versions raise instead of silently dropping them.
targets_2013_region = targets_2013_aug.groupby(['region_TYPE']).mean(numeric_only=True)
targets_2013_region.drop(columns=['pop2013'],inplace=True)
targets_2013_region = targets_2013_region.join(targets_2013_aug.groupby('region_TYPE')['pop2013'].sum())
targets_2013_region.name = '2013'

# Same aggregation for the 2020 predictions.
# NOTE(review): this expects pred_means_df_2020_aug to already carry a
# 'region_TYPE' column - it is not created in this cell; confirm it is
# produced upstream (e.g. by prepPredictions).
pred_2020_region = pred_means_df_2020_aug.groupby(['region_TYPE']).mean(numeric_only=True)
pred_2020_region.drop(columns=['pop2020'],inplace=True)
pred_2020_region = pred_2020_region.join(pred_means_df_2020_aug.groupby('region_TYPE')['pop2020'].sum())
pred_2020_region.name = '2020'

In [None]:
# Commune level: put 2013 targets and 2020 predictions side by side.
# Overlapping column names get the suffixes '2013' / '2020'.
# Fix: the original referenced `pred_means_df_2019_aug`, which does not exist;
# the augmented 2020 predictions are in `pred_means_df_2020_aug`.
delta_c_df = targets_2013_aug[['Electric','Gas','Lamp','Wood','pop2013']].join(
    pred_means_df_2020_aug[['Electric','Gas','Lamp','Wood','pop2020']],lsuffix='2013',rsuffix='2020').join(targets_2013_aug[['TYPE']])

# Change between 2013 and 2020, multiplied by 100 (percentage points).
for fuel in ('Electric','Gas','Wood','Lamp'):
    delta_c_df['Delta_{}'.format(fuel)] = 100*(delta_c_df['{}2020'.format(fuel)] - delta_c_df['{}2013'.format(fuel)])

# Relative population change in percent of the 2013 population.
delta_c_df['Delta_pop'] = 100*(delta_c_df['pop2020'] - delta_c_df['pop2013'])/delta_c_df['pop2013']

In [None]:
# Expose the 2020 population under the name 'pop2019' as well.
# NOTE(review): presumably a helper looks up 'pop2019' - confirm.
data_2020_all['pop2019'] = data_2020_all['pop2020']

# Scatter for 'Electric' using the 2020 predictions and the 2020 population.
est2 = plotScatter(
    pred_means_df_2020,
    pred_vars_df_2020,
    2020,
    'Electric',
    data_2020_all['pop2020'],
)

In [None]:
target = 'Gas'

# Keep only rows whose 2013 value for the target exceeds 0.0001.
# NOTE: this overwrites delta_c_df, so the filter persists in later cells.
delta_c_df = delta_c_df.loc[delta_c_df['{}2013'.format(target)] > 0.0001]

fig, ax = plt.subplots(figsize=(16,12))
ax.set_ylabel('% population change (2013-2020)', fontsize=32)
ax.set_xlabel('% point change in access to gas for cooking (2013-2020)', fontsize=32)

# One scatter per settlement type: marker area = ml * 2020 population.
# Urban is drawn first so that it matches the first legend entry.
ml = 0.01
delta_col = 'Delta_{}'.format(target)
for settlement, colour, opacity in (('urban', 'r', 0.6), ('rural', 'b', 0.4)):
    subset = delta_c_df.loc[delta_c_df['TYPE'] == settlement]
    ax.scatter(subset[delta_col],
               subset['Delta_pop'],
               s=ml * subset['pop2020'],
               alpha=opacity,
               c=colour)
ax.legend(['Urban','Rural'], fontsize=32, loc="upper right", markerscale=0.5)

# Second legend explaining marker size, built from empty proxy scatters that
# use the same ml scaling as the data.
ls = [1000,10000,100000]
ps = [ax.scatter([], [], c='gray', alpha=0.8, s=p*ml, label=str(p)) for p in ls]
leg = Legend(ax, ps, ['1k','10k','100k'], fontsize=30,
             loc='upper left', frameon=True, markerscale=1,
             title='Population\n(2020)', labelspacing=1)
ax.add_artist(leg);

# vertical reference line (drawn at the default x position)
ax.axvline(c='k')


In [None]:
# Region level: 2013 aggregates next to 2020 aggregates; overlapping column
# names receive the suffixes '2013' / '2020'.
_left_cols = ['Electric','Gas','Electric_pop','Gas_pop','pop2013']
_right_cols = ['Electric','Gas','Electric_pop','Gas_pop','pop2020']
delta_df = targets_2013_region[_left_cols].join(
    pred_2020_region[_right_cols], lsuffix='2013', rsuffix='2020')

# change in the share columns, multiplied by 100
for fuel in ('Electric', 'Gas'):
    delta_df[f'Delta_{fuel}'] = 100 * (delta_df[f'{fuel}2020'] - delta_df[f'{fuel}2013'])

# plain difference of the '<fuel>_pop' columns
for fuel in ('Electric', 'Gas'):
    delta_df[f'Delta_{fuel}_pop'] = delta_df[f'{fuel}_pop2020'] - delta_df[f'{fuel}_pop2013']

# relative population change, in percent of the 2013 population
delta_df['Delta_pop'] = 100 * (delta_df['pop2020'] - delta_df['pop2013']) / delta_df['pop2013']

# The index has the form '<2-char region>_<TYPE>'; everything after the
# underscore is the settlement type.
delta_df['type'] = delta_df.index.str[3:]

In [None]:
target = 'Gas'
fig, ax = plt.subplots(figsize=(16,12))
ax.set_ylabel('% population change (2013-2020)', fontsize=32)
ax.set_xlabel('% point change in access to gas for cooking (2013-2020)', fontsize=32)

# One scatter per settlement type: marker area = 0.001 * 2020 population.
# Urban is drawn first so that it matches the first legend entry.
delta_col = 'Delta_{}'.format(target)
for settlement, colour in (('urban', 'r'), ('rural', 'b')):
    subset = delta_df.loc[delta_df['type'] == settlement]
    ax.scatter(subset[delta_col],
               subset['Delta_pop'],
               s=0.001 * subset['pop2020'],
               alpha=0.6,
               c=colour)
ax.legend(['Urban','Rural'], fontsize=32, loc="upper right", markerscale=0.6)
ax.axvline(c='k')

# Second legend explaining marker size, built from empty proxy scatters.
# NOTE(review): the proxies use a 0.005 factor while the data uses 0.001 -
# confirm the legend sizes are meant to differ from the plotted scaling.
ls = [5000,50000,500000]
ps = [ax.scatter([], [], c='gray', alpha=0.8, s=p*0.005, label=str(p)) for p in ls]
leg = Legend(ax, ps, ['5k','50k','500k'], fontsize=30,
             loc='center right', frameon=True, markerscale=0.6,
             title='Population\n(2020)')
ax.add_artist(leg);


## Results 3 - Highlighting the urban-rural disparity in energy access through the years

In [None]:
def _clipped_panel(wide, settlement, year_cols):
    """Rows of `wide` with TYPE == settlement, transposed to one row per year.

    Values are clipped to [0, 1] and the index is converted to annual periods.
    """
    panel = wide.loc[wide['TYPE'] == settlement, year_cols].transpose().clip(lower=0, upper=1)
    panel.index = pd.PeriodIndex(pd.to_datetime(panel.index, format='%Y'), freq='A')
    return panel


t = 'Electric'
_years = [2013, 2015, 2017, 2020]
# NOTE(review): targets_2013_x is not defined in the visible cells - confirm
# it is provided elsewhere (e.g. by Utilities.ipynb).
_sources = [targets_2013_x, pred_means_df_2015, pred_means_df_2017, pred_means_df_2020]

# One column per year holding the value of `t`, plus the settlement type.
df_ts = pd.concat(
    [frame[[t]].rename(columns={t: yr}) for yr, frame in zip(_years, _sources)],
    axis=1,
)
df_ts = df_ts.join(targets_2013_x.loc[:, ['TYPE']], how='right')

fig, ax = plt.subplots(figsize=(10,6))

df_ts_u = _clipped_panel(df_ts, 'urban', _years)
df_ts_r = _clipped_panel(df_ts, 'rural', _years)

# Thick mean lines first (urban, then rural) so they are the first two
# artists and pick up the two legend labels ...
for panel, colour in ((df_ts_u, 'r'), (df_ts_r, 'b')):
    panel.mean(axis=1).plot(ax=ax, xticks=panel.index, linewidth=4, c=colour,
                            marker='s', markersize=12)
# ... then one faint line per column of each panel.
for panel, colour, opacity in ((df_ts_u, 'r', 0.1), (df_ts_r, 'b', 0.05)):
    panel.plot(ax=ax, xticks=panel.index, c=colour, legend=None, alpha=opacity)

ax.set_xlabel('Year')
ax.set_ylabel('Electricity Access')
ax.legend(['Urban','Rural'])
ax.set_ylim([0,1])
ax.set_xticklabels(_years)
plt.tight_layout()


In [None]:
def _clipped_panel(wide, settlement, year_cols):
    """Rows of `wide` with TYPE == settlement, transposed to one row per year.

    Values are clipped to [0, 1] and the index is converted to annual periods.
    """
    panel = wide.loc[wide['TYPE'] == settlement, year_cols].transpose().clip(lower=0, upper=1)
    panel.index = pd.PeriodIndex(pd.to_datetime(panel.index, format='%Y'), freq='A')
    return panel


t = 'Gas'
_years = [2013, 2015, 2017, 2020]
# NOTE(review): targets_2013_x is not defined in the visible cells - confirm
# it is provided elsewhere (e.g. by Utilities.ipynb).
_sources = [targets_2013_x, pred_means_df_2015, pred_means_df_2017, pred_means_df_2020]

# One column per year holding the value of `t`, plus the settlement type.
df_ts = pd.concat(
    [frame[[t]].rename(columns={t: yr}) for yr, frame in zip(_years, _sources)],
    axis=1,
)
df_ts = df_ts.join(targets_2013_x.loc[:, ['TYPE']], how='right')

fig, ax = plt.subplots(figsize=(10,6))

df_ts_u = _clipped_panel(df_ts, 'urban', _years)
df_ts_r = _clipped_panel(df_ts, 'rural', _years)

# Thick mean lines first (urban, then rural) so they are the first two
# artists and pick up the two legend labels ...
for panel, colour in ((df_ts_u, 'r'), (df_ts_r, 'b')):
    panel.mean(axis=1).plot(ax=ax, xticks=panel.index, linewidth=4, c=colour,
                            marker='s', markersize=12)
# ... then one faint line per column of each panel.
for panel, colour, opacity in ((df_ts_u, 'r', 0.1), (df_ts_r, 'b', 0.05)):
    panel.plot(ax=ax, xticks=panel.index, c=colour, legend=None, alpha=opacity)

ax.set_xlabel('Year')
ax.set_ylabel('Gas (as cooking fuel) Access')
ax.legend(['Urban','Rural'])
ax.set_ylim([0,1])
ax.set_xticklabels(_years)
plt.tight_layout()


In [None]:
# Violin plot of the distribution of `t` per year, split by settlement type.
t = 'Gas' #'Electric'
violin_years = [2013,2015,2017,2020]
# Fix: the original used `pred_means_df_2019`, which does not exist; the
# column labelled 2020 must come from the 2020 predictions.
# NOTE(review): targets_2013_x is not defined in the visible cells - confirm
# it is provided elsewhere (e.g. by Utilities.ipynb).
violin_sources = [targets_2013_x,pred_means_df_2015,pred_means_df_2017,pred_means_df_2020]

# Wide table: one column per year, plus the settlement type.
df_ts = pd.concat([df[[t]].rename(columns={t:yy}) for yy,df in zip(violin_years,violin_sources)],
                  axis=1)
df_ts = df_ts.join(targets_2013_x.loc[:,['TYPE']],how='right')

# Long table with columns value / year / TYPE, years stacked in order.
# rename() and assign() return new frames, so nothing is written into a
# slice of df_ts (the original .loc assignments triggered
# SettingWithCopyWarning).
df_ts_list = [
    df_ts[[y]].rename(columns={y:'value'}).assign(year=y,TYPE=df_ts['TYPE'])
    for y in violin_years
]
df_ts_merged = pd.concat(df_ts_list,ignore_index=True)

fig, ax = plt.subplots(figsize=(10,6))
# red / blue palette, matching the other figures
colors = ['r', 'b']
sns.set_palette(sns.color_palette(colors))
ax = sns.violinplot(x="year", y="value", hue="TYPE",
                    data=df_ts_merged,ax=ax)
ax.set_xlabel('Year')
ax.legend(ncol=2)
ax.set_ylabel('Clean Cooking Fuel Access')
plt.tight_layout()
# NOTE: the output name is fixed, so change it when plotting another target.
plt.savefig('plots/violin_gas.png',dpi=192)

## Spatial cross-validation

In [None]:
# Lookup table: one row per training row, holding its ID, its region code
# (first two characters of the ID) and its positional row index.
# Fix: the original read `data_aug`, which is not defined in this notebook;
# the cross-validation below trains on the 2013 rows (X2013 / data_2013_aug),
# so the IDs are taken from that frame to keep row_index aligned with Xtrain.
commune_ids = list(data_2013_aug.index)
comreg_df = pd.DataFrame(commune_ids,columns=['CCOD_CRCA'])
comreg_df['REG'] = comreg_df['CCOD_CRCA'].str[0:2]
comreg_df['row_index'] = np.arange(comreg_df.shape[0])

In [None]:
# input columns
nl_aod_cols = ['nl_2013_avg','aod_2013_avg','nl_2013_var','aod_2013_var']
# Prepare data
outputs = ['Candle', 'Electric', 'Lamp', 'Other_lighting','Coal', 'Gas', 'Wood','Other_fuel']
inputs = ls_cols + ls_var_cols + nl_aod_cols 
# Fix: `data_aug` is not defined in this notebook; the 2013 frame is used,
# matching the training rows below.
# NOTE(review): X and y are not used further in this cell - Xtrain/ytrain come
# from X2013/y2013, whose last column is 'year'. Confirm which is intended.
X = data_2013_aug[inputs].fillna(0).values
y = data_2013_aug[outputs].values
# Training data
Xtrain = np.vstack([X2013])
ytrain = np.vstack([y2013])

# standardise every column except the last one, which is appended back unscaled
scalerX = StandardScaler()
scalerX.fit(Xtrain[:,0:-1])
Xtrain = np.hstack([scalerX.transform(Xtrain[:,0:-1]),Xtrain[:,-1:]])

strain = np.vstack([data_2013_aug[['lon','lat']].values])
urtrain = np.hstack([data_2013_aug['TYPE'].map({'rural':0,'urban':1})])

lmbd = 0.1
niters = 1
args = {}
args['mpi'] = []
args['s'] = strain
# Fix: np.hstack returns a plain ndarray, which has no `.values` attribute;
# index it directly to get the column vector.
args['urtype'] = urtrain[:,np.newaxis]
args['fitlinear'] = True # always true
args['initPhis'] = None 
args['mokernel'] = True
# ard entries are always false
args['ard'] = False
args['mo_ard'] = False
method = 'GPRSpatial'
results = crossvalidate_spatial(Xtrain,ytrain,lmbd,niters,method,args,map_distances,comreg_df)
# unpack the 13 result arrays in the order returned by crossvalidate_spatial
pcorrvec,ppvalvec,scorrvec,spvalvec,rmsevec,map_YPreds,map_Yvar1,map_Yvar2,map_Yvar_combined,rmseurbanvec,rmseruralvec,rmseurban_dvec,rmseurban_ndvec = results
predictedMeans,predictedVars = getPredStats(map_YPreds,map_Yvar_combined,comreg_df,commune_ids)


In [None]:
# Summary table: one row per metric (averaged over axis 0), one column per
# output; printed as markdown.
_metric_rows = [
    ('Pearson', pcorrvec),
    ('PPVal', ppvalvec),
    ('Spearman', scorrvec),
    ('SPVal', spvalvec),
    ('RMSE (all)', rmsevec),
    ('RMSE (urban)', rmseurbanvec),
    ('RMSE (rural)', rmseruralvec),
    ('RMSE (urban-dakar)', rmseurban_dvec),
    ('RMSE (urban-nondakar)', rmseurban_ndvec),
    ('Variances', predictedVars),
]
res = np.vstack([np.mean(values, axis=0) for _, values in _metric_rows])
res = pd.DataFrame(res,
                   columns=outputs,
                   index=pd.Index([label for label, _ in _metric_rows]))
print(res.to_markdown())


## Temporal validation 

In [None]:
# Scatter for 'Electric' using the 2020 predictions and the 2020 population.
est2 = plotScatter(
    pred_means_df_2020,
    pred_vars_df_2020,
    2020,
    'Electric',
    data_2020_all['pop2020'],
)

In [None]:
# Scatter for 'Gas' using the 2020 predictions and the 2020 population.
est2 = plotScatter(
    pred_means_df_2020,
    pred_vars_df_2020,
    2020,
    'Gas',
    data_2020_all['pop2020'],
)