In [1]:
import os
import json
import numpy as np
import pandas as pd
from argparse import Namespace
from glob import glob

import rasterio

from maskrcnn.postprocess.validate import transform_coord

In [2]:
# Locations used by this notebook: the survey file, the folder holding the
# prediction JSON files, the nightlight raster, and the merged CSV output
SVY_IN_DIR = 'data/GiveDirectly/Survey/household.dta'
SAT_IN_DIR = 'data/GiveDirectly/Pred/infer/'
NL_IN_DIR = 'data/Nightlight/VIIRS_DNB_KE_2018.tif'
OUT_DIR = 'data/GiveDirectly/merged.csv'

# Predictions must score above this value to be kept
cfg = Namespace(visual_score_cutoff=0.9)

In [3]:
# Open the nightlight raster read-only and load its first band into memory.
# The handle is kept open because dataset.transform is needed further down.
dataset = rasterio.open(NL_IN_DIR, mode='r')
band = dataset.read(indexes=1)

In [4]:
# Load the household survey, keeping only rows where both GPS coordinates are present
gps_cols = ['s19_gps_latitude', 's19_gps_longitude']
df = pd.read_stata(SVY_IN_DIR).dropna(subset=gps_cols)

In [5]:
# Read the satellite predictions from the JSON annotation files.
# The paths are sorted so the row order is reproducible: glob() returns
# matches in arbitrary, filesystem-dependent order.
pred_files = sorted(glob(os.path.join(SAT_IN_DIR, '*.json')))
if not pred_files:
    # Fail with a clear message rather than a KeyError on 'score' below
    raise FileNotFoundError(
        'no *.json prediction files found in {!r}'.format(SAT_IN_DIR))

# Collect the records of every file into one list, then build the frame once
records = []
for pred_file in pred_files:
    with open(pred_file, 'r') as f:
        records.extend(json.load(f))
df_sat = pd.DataFrame(records)

# Drop low score predictions. The .copy() makes df_sat an independent frame,
# so later modifications do not act on a slice of the unfiltered data.
df_sat = df_sat.loc[df_sat['score'] > cfg.visual_score_cutoff, :].copy()

In [6]:
# Replace the numeric category ids with descriptive labels.
# The result is assigned back instead of calling replace(inplace=True) on the
# selected column: the in-place form mutates a temporary Series, which raises
# chained-assignment warnings and leaves df_sat unchanged under pandas
# Copy-on-Write.
df_sat = df_sat.assign(
    category_id=df_sat['category_id'].replace(
        {1: 'thatched', 2: 'metal', 3: 'colored'}))

In [7]:
# display the score-filtered predictions (the cell's last expression is rendered)
df_sat

Unnamed: 0,segmentation,bbox,score,area,image_id_str,image_id,category_id
0,"{'size': [800, 800], 'counts': 'bmZ;6hh04M3M3M...","[462.1242370605469, 400.2402038574219, 99.2250...",0.999265,4828.0,601040301010-056,2509,metal
1,"{'size': [800, 800], 'counts': 'kgP34gh09J5J5L...","[119.26983642578125, 627.6113891601562, 61.272...",0.998286,2046.0,601040301010-056,2509,metal
2,"{'size': [800, 800], 'counts': 'l`\=3kh05L3M3M...","[546.2864990234375, 389.5267028808594, 50.6158...",0.975503,1275.0,601040301010-056,2509,metal
15,"{'size': [800, 800], 'counts': ']im`0;dh06J5L4...","[693.110107421875, 388.4268798828125, 53.11779...",0.993461,2109.0,601030203007-040,1308,thatched
16,"{'size': [800, 800], 'counts': '[iT=5hh07K9H7H...","[533.3128662109375, 470.8623046875, 102.736450...",0.970277,5517.0,601030203007-040,1308,metal
...,...,...,...,...,...,...,...
97053,"{'size': [800, 800], 'counts': 'bg_a03hh0:J4K6...","[714.5397338867188, 343.2801513671875, 73.3020...",0.998080,3286.0,601020301003-030,728,metal
97054,"{'size': [800, 800], 'counts': 'eUa:1lh06K2O2M...","[427.1697692871094, 415.1800537109375, 83.7474...",0.997889,3391.0,601020301003-030,728,metal
97055,"{'size': [800, 800], 'counts': 'Rjh:1g0?mf0n0M...","[441.29888916015625, 279.7791442871094, 64.599...",0.997216,2991.0,601020301003-030,728,metal
97056,"{'size': [800, 800], 'counts': '_U\a0c0[h09G6K...","[706.9384155273438, 686.0633544921875, 86.8361...",0.989834,4457.0,601020301003-030,728,thatched


In [8]:
# Per image and category: number of predictions, plus mean and total area.
# The string aggregators 'mean' / 'sum' replace np.nanmean / np.nansum:
# passing those NumPy callables to groupby.agg is deprecated in pandas
# (FutureWarning), and the strings select the same NaN-skipping
# implementation pandas already dispatches to.
df_cat = df_sat.groupby(['image_id_str', 'category_id']).agg(
    sat_house=('area', 'count'),
    sat_size_mean=('area', 'mean'),
    sat_size_sum=('area', 'sum'),
).reset_index()

In [9]:
# Per image, across all categories: number of predictions, mean and total area.
# The string aggregators 'mean' / 'sum' replace np.nanmean / np.nansum:
# passing those NumPy callables to groupby.agg is deprecated in pandas
# (FutureWarning), and the strings select the same NaN-skipping
# implementation pandas already dispatches to.
df_all = df_sat.groupby(['image_id_str']).agg(
    sat_house=('area', 'count'),
    sat_size_mean=('area', 'mean'),
    sat_size_sum=('area', 'sum'),
).reset_index()

In [10]:
# Stack the per-category rows on top of the per-image totals; the totals are
# labelled with the pseudo-category 'all'
df_all_labeled = df_all.assign(category_id='all')
df_sat = pd.concat([df_cat, df_all_labeled], sort=True)

In [11]:
# Rescale every sat_size_* column by one constant factor
# NOTE(review): the constants look like a ground resolution, an image resize
# ratio and a latitude correction — confirm their provenance
size_cols = [name for name in df_sat.columns if name.startswith('sat_size')]
area_factor = (
    (0.298 ** 2) * (640 ** 2) / (800 ** 2) * np.cos(23 / 180 * np.pi))
df_sat[size_cols] = df_sat[size_cols] * area_factor  # in sq meters

In [12]:
# Reshape to one row per image, with one column per (metric, category) pair
metrics = ['sat_house', 'sat_size_mean', 'sat_size_sum']
df_sat = df_sat.pivot(index='image_id_str', columns='category_id', values=metrics)

In [13]:
# Flatten the two-level column labels into single strings by joining the
# levels in reverse order with '_' (the last level comes first in the name)
df_sat_columns = [
    '_'.join(reversed(level_pair)) for level_pair in df_sat.columns
]
df_sat.columns = df_sat_columns

In [14]:
# Left-join the satellite features onto the survey rows, so every survey row
# is kept and rows without a matching image get NaN features
df = df.merge(
    df_sat,
    how='left',
    left_on='s1_hhid_key',
    right_on='image_id_str',
)

In [15]:
# Counts and area sums that are missing after the left join become 0;
# the *_sat_size_mean columns are deliberately left as NaN
zero_fill_suffixes = ('sat_house', 'sat_size_sum')
cols = [
    name for name in df_sat_columns
    if name.endswith(zero_fill_suffixes)
]
df[cols] = df[cols].fillna(0)

In [16]:
# Convert each row's (longitude, latitude) into integer raster (col, row)
# indices. astype(int) replaces np.int, a deprecated alias of the builtin int
# that was removed in NumPy 1.24 and raises AttributeError there.
lonlat = df.loc[:, ['s19_gps_longitude', 's19_gps_latitude']].values
idx = np.round(transform_coord(
    transform=dataset.transform,
    to='colrow',
    xy=lonlat)).astype(int)

# Look up the raster value for all rows in a single indexing operation.
# band is indexed [row, col], hence column 1 of idx comes first.
# NOTE(review): assumes transform_coord returns an (n, 2) array — confirm.
df['nightlight'] = band[idx[:, 1], idx[:, 0]]

In [17]:
# Write the merged table to disk as CSV, leaving out the row index
df.to_csv(path_or_buf=OUT_DIR, index=False)