In [1]:
import os
import json
import numpy as np
import pandas as pd
from argparse import Namespace
from glob import glob

In [2]:
# input / output locations used by the cells below
SVY_IN_DIR = 'data/GiveDirectly/Survey/household.dta'
SAT_IN_DIR = 'data/GiveDirectly/Pred/infer/'
OUT_DIR = 'data/GiveDirectly/merged.csv'

# tunable settings; visual_score_cutoff is the score threshold for predictions
cfg = Namespace(visual_score_cutoff=0.9)

In [3]:
# load the survey file and discard rows where either GPS column is missing
df = pd.read_stata(SVY_IN_DIR).dropna(
    subset=['s19_gps_latitude', 's19_gps_longitude'])

In [4]:
# read satellite predictions from the JSON annotation files in SAT_IN_DIR
sat_records = []
# sorted() makes the row order independent of the filesystem's listing order
for file_name in sorted(glob(os.path.join(SAT_IN_DIR, '*.json'))):
    # JSON is UTF-8 by specification; do not rely on the platform default
    with open(file_name, 'r', encoding='utf-8') as f:
        sat_records += json.load(f)

if not sat_records:
    # without this guard the score filter below fails with an opaque
    # KeyError: 'score', because an empty frame has no columns at all
    raise ValueError(
        'no satellite predictions found in {!r}'.format(SAT_IN_DIR))

df_sat = pd.DataFrame(sat_records)

# drop low score predictions (keep scores strictly above the cutoff)
df_sat = df_sat.loc[df_sat['score'] > cfg.visual_score_cutoff, :]

In [5]:
# grouping into localities: one row per (image_id_str, category_id) holding
# the number of non-null areas plus their mean and sum.
# The aggregations are given as strings: passing np.nanmean / np.nansum is
# deprecated in recent pandas (FutureWarning), and 'mean' / 'sum' are the
# NaN-skipping reductions those callables were mapped to.
df_sat = df_sat.groupby(['image_id_str', 'category_id']).agg(
    sat_house=pd.NamedAgg(column='area', aggfunc='count'),
    sat_size_mean=pd.NamedAgg(column='area', aggfunc='mean'),
    sat_size_sum=pd.NamedAgg(column='area', aggfunc='sum')
).reset_index()

# scale areas / distances
# NOTE(review): 0.298 is presumably the ground resolution in meters per pixel,
# 640 / 800 an image resize ratio and cos(23 deg) a latitude correction --
# confirm against the imagery / inference settings.
area_scale = (
    (0.298 ** 2) * (640 ** 2) / (800 ** 2) * np.cos(23 / 180 * np.pi))  # in sq meters
size_cols = [col for col in df_sat.columns if col.startswith('sat_size')]
df_sat[size_cols] = df_sat[size_cols] * area_scale

In [6]:
# map numeric category ids to readable labels.
# The result is assigned back to the column: replace(..., inplace=True) on the
# df_sat['category_id'] selection is a chained in-place update, which warns in
# pandas 2.2 and leaves df_sat unchanged when Copy-on-Write is enabled.
df_sat['category_id'] = df_sat['category_id'].replace(
    {1: 'thatched', 2: 'metal', 3: 'colored'})

In [7]:
# reshape long -> wide: one row per image_id_str, one column per
# (statistic, category_id) pair
df_sat = df_sat.pivot(
    values=['sat_house', 'sat_size_mean', 'sat_size_sum'],
    columns='category_id',
    index='image_id_str',
)

In [8]:
# cells left empty by the reshape (no rows for that combination) become 0
df_sat = df_sat.fillna(0)

In [9]:
# flatten each multi-level column label into a single string: the levels are
# reversed and joined with '_' (innermost level first)
df_sat_columns = ['_'.join(reversed(levels)) for levels in df_sat.columns]
df_sat.columns = df_sat_columns

In [10]:
# left join: keep every survey row and attach the satellite columns where
# s1_hhid_key matches image_id_str
df = df.merge(
    df_sat, how='left', left_on='s1_hhid_key', right_on='image_id_str')

In [11]:
# rows without a satellite match get 0 in every satellite column
df[df_sat_columns] = df.loc[:, df_sat_columns].fillna(value=0)

In [12]:
# write the merged table to OUT_DIR as CSV, without the row index
df.to_csv(path_or_buf=OUT_DIR, index=False)