# Geo 

This notebook exists to make it easier to use a separate kernel for geo processing, to avoid dependency conflict issues I've encountered w/ dependencies for Geopandas and Dask-Geopandas.

In [14]:
import pandas as pd
from pathlib import Path
import numpy as np
import warnings
import math

import geopandas as gpd
import dask_geopandas as dgpd
import dask.dataframe as ddf

import pyreadstat

In [15]:
year = 2019

data_path = Path('/home/selker/eop/data')

malawi_directory = data_path / 'malawi'
mosaiks_directory = data_path / 'mosaiks'

survey_file_lists = {
    2016: [
        malawi_directory / 'MWI_2016_IHS-IV_v04_M_STATA14/household_geovariables/householdgeovariablesihs4.dta',
        malawi_directory / 'MWI_2016_IHS-IV_v04_M_STATA14/household/hh_mod_a_filt.dta'
    ],
    2019: [
        malawi_directory / 'MWI_2019_IHS-V_v06_M_Stata/householdgeovariables_ihs5.dta',
        malawi_directory / 'MWI_2019_IHS-V_v06_M_Stata/hh_mod_a_filt.dta'
    ]
}

In [16]:
def load_data(file_list):
    
    data = None
    column_names_to_labels = dict()

    for file in file_list:
        with warnings.catch_warnings():
            warnings.simplefilter('ignore') # TODO: Investigate. Warning thrown from w/in pyreadstat.
    
            dataframe, metadata =  pyreadstat.read_dta(
                    file, apply_value_formats=True
            )
    
        column_names_to_labels.update(metadata.column_names_to_labels)
    
        if data is None:
            data = dataframe
        else:
            data = data.merge(dataframe, on='case_id', how='outer', suffixes=('_left', '_right'))

    if 'lat_modified' in data.columns:
        data.rename(columns={'lat_modified': 'ea_lat_mod'}, inplace=True)

    if 'lon_modified' in data.columns:
        data.rename(columns={'lon_modified': 'ea_lon_mod'}, inplace=True)

    # TA code column in 2019 data.
    if 'hh_a02a' in data.columns:
        data.rename(columns={'hh_a02a': 'TA'}, inplace=True)

    if 'ea_id_left' in data.columns:
        data.drop(columns='ea_id_right', inplace=True)
        data.rename(columns={'ea_id_left': 'ea_id'}, inplace=True)

    return data

In [17]:
survey_data = load_data(survey_file_lists[year])
survey_data = survey_data[['ea_id', 'ea_lat_mod', 'ea_lon_mod', 'TA']]

In [18]:
# A few EAs have two locations in 2016 data. Not sure why (and not true in 2019 data).
ea_with_lat_lon = survey_data.groupby('ea_id')[['ea_lat_mod', 'ea_lon_mod']].agg(pd.Series.mode)
ea_with_lat_lon = gpd.GeoDataFrame(
    ea_with_lat_lon, geometry = gpd.points_from_xy(
        x=ea_with_lat_lon.ea_lon_mod, y=ea_with_lat_lon.ea_lat_mod
    )
)

In [19]:
malawi_admin_3 = gpd.read_file(
    malawi_directory / 'mwi_adm_nso_hotosm_20230405_shp' / 'mwi_admbnda_adm3_nso_hotosm_20230405.shp'
)

ERROR 1: PROJ: proj_create_from_database: Open of /home/selker/.conda/envs/geo/share/proj failed


In [None]:
ea_with_lat_lon_geo = gpd.sjoin(
    ea_with_lat_lon, malawi_admin_3, predicate='within', how='left'
)

ea_with_lat_lon_geo = ea_with_lat_lon_geo[[
    'ea_lat_mod', 'ea_lon_mod', 'geometry', 'ADM2_EN', 'ADM3_EN'
]]

## Mosaiks

In [None]:
mosaiks = ddf.read_csv(str(mosaiks_directory / 'malawi_fine' / '*.csv'))

In [None]:
%%time
def clean_mosaiks_column_name(column_name):
    column_name_stripped = column_name.strip(' .')
    try:
        number = int(column_name_stripped)
    except ValueError:
        if column_name_stripped == '':
            return 'mosaiks_0'
        else:
            return column_name
    else:
        return f'mosaiks_{number}'

mosaiks = ddf.read_csv(str(mosaiks_directory / 'malawi_fine' / '*.csv'))
malawi_outline = gpd.read_file(
    malawi_directory / 'mwi_adm_nso_hotosm_20230405_shp' / 'mwi_admbnda_adm0_nso_hotosm_20230405.shp'
)

mosaiks = dgpd.from_dask_dataframe(
    mosaiks, dgpd.points_from_xy(mosaiks, 'Lon', 'Lat')
)

# this data covers a box containing Malawi; filter down to the points actually within the country.
mosaiks = mosaiks[mosaiks.geometry.within(malawi_outline.iloc[0].geometry)]

mosaiks.columns = mosaiks.columns.map(clean_mosaiks_column_name)

mosaiks.drop(columns='geometry').to_parquet('mosaiks_within_malawi')

In [24]:
mosaiks = ddf.read_parquet('mosaiks_within_malawi/*.parquet')

In [25]:
mosaiks[['Lat', 'Lon']].compute().to_csv('malawi_mosaiks_locations.csv', index=False)


KeyboardInterrupt



### Assign mosaiks feature values to EAs

In [142]:
ea_lat_lon = ea_with_lat_lon_geo[['ea_lat_mod', 'ea_lon_mod']]
ea_lat_lon = gpd.GeoDataFrame(ea_lat_lon, geometry = gpd.points_from_xy(x=ea_lat_lon.ea_lon_mod, y=ea_lat_lon.ea_lat_mod))

mosaiks_computed = mosaiks.compute()
mosaiks_computed = gpd.GeoDataFrame(
    mosaiks_computed, geometry=gpd.points_from_xy(
        x=mosaiks_computed.Lon, y=mosaiks_computed.Lat
    )
)

#### EA <-> nearest MOSAIKS tile

In [197]:
mosaiks_grid_size = 0.1
max_distance = math.sqrt(2 * (mosaiks_grid_size / 2)**2)

ea_geo_with_mosaiks = gpd.sjoin_nearest(
    left_df=ea_lat_lon, right_df=mosaiks_computed, how='left', max_distance=mosaiks_grid_size
)

ea_geo_with_mosaiks.rename(
    columns={'Lat': 'lat_mosaiks', 'Lon': 'lon_mosaiks', 'index_right': 'index_mosaiks'},
    inplace=True
)

ea_geo_with_mosaiks.reset_index(inplace=True)

mosaiks_feature_columns = [
    c for c in mosaiks_with_nearest_ea.columns if c.startswith('mosaiks_')
]
ea_geo_with_mosaiks[['ea_id'] + mosaiks_feature_columns].to_parquet(
    f'{year}_ea_with_mosaiks_nearest.parquet', index=False
)

#### EA <- mean of mosaiks w/in Voronoi

In [161]:
mosaiks_computed_reset = mosaiks_computed.reset_index(drop=True)

mosaiks_with_nearest_ea = gpd.sjoin_nearest(
    left_df=ea_lat_lon, right_df=mosaiks_computed_reset, how='right'
)

mosaiks_feature_columns = [c for c in mosaiks_with_nearest_ea.columns if c.startswith('mosaiks_')]

ea_mosaiks_averages_voronoi = mosaiks_with_nearest_ea.groupby('index_left')[mosaiks_feature_columns].mean()

ea_mosaiks_averages_voronoi = ea_lat_lon.join(ea_mosaiks_averages_voronoi)

ea_mosaiks_averages_voronoi.reset_index(inplace=True)

ea_mosaiks_averages_voronoi[['ea_id'] + mosaiks_feature_columns].to_parquet(
    f'{year}_ea_with_mosaiks_voronoi_average.parquet', index=False
)

#### EA <- mean of mosaiks w/in admin 3

In [190]:
mosaiks_with_admin_3 = gpd.sjoin(
    malawi_admin_3, mosaiks_computed, predicate='contains', how='inner'
)

mosaiks_feature_columns = [c for c in mosaiks_with_admin_3.columns if c.startswith('mosaiks_')]
mosaiks_averaged_by_admin_3 = (
    mosaiks_with_admin_3.groupby('ADM3_PCODE')[mosaiks_feature_columns].mean()
)

mosaiks_averaged_by_admin_3 = malawi_admin_3.join(mosaiks_averaged_by_admin_3, on='ADM3_PCODE')

ea_with_mosaiks_averaged_by_admin_3 = gpd.sjoin(
    mosaiks_averaged_by_admin_3, ea_with_lat_lon.reset_index(), predicate='contains', how='inner'
)

Use `to_crs()` to reproject one of the input geometries to match the CRS of the other.

Left CRS: EPSG:4326
Right CRS: None

  mosaiks_with_admin_3 = gpd.sjoin(
Use `to_crs()` to reproject one of the input geometries to match the CRS of the other.

Left CRS: EPSG:4326
Right CRS: None

  ea_with_mosaiks_averaged_by_admin_3 = gpd.sjoin(


In [163]:
ea_with_mosaiks_averaged_by_admin_3[['ea_id'] + mosaiks_feature_columns].to_parquet(
    f'{year}_ea_with_mosaiks_admin_3_average.parquet', index=False
)

### Index by household for merging w/ survey data

In [188]:
ea_with_mosaiks_nearest = pd.read_parquet(f'{year}_ea_with_mosaiks_nearest.parquet')
ea_with_mosaiks_voronoi = pd.read_parquet(f'{year}_ea_with_mosaiks_voronoi_average.parquet')
ea_with_mosaiks_admin_3 = pd.read_parquet(f'{year}_ea_with_mosaiks_admin_3_average.parquet')

def rename_mosaiks_columns(original_column_name, prefix):
    if original_column_name.startswith('mosaiks_'):
        return f'{prefix}_{original_column_name}'
    else:
        return original_column_name

ea_with_mosaiks_nearest.rename(
    columns=lambda c: rename_mosaiks_columns(c, 'nearest'), inplace=True
)
ea_with_mosaiks_voronoi.rename(
    columns=lambda c: rename_mosaiks_columns(c, 'voronoi'), inplace=True
)
ea_with_mosaiks_admin_3.rename(
    columns=lambda c: rename_mosaiks_columns(c, 'admin_3'), inplace=True
)

ea_with_all_mosaiks = (
    ea_with_mosaiks_nearest
    .merge(ea_with_mosaiks_voronoi, on='ea_id', how='outer')
    .merge(ea_with_mosaiks_admin_3, on='ea_id', how='outer')
)

# ea_with_all_mosaiks.ea_id = ea_with_all_mosaiks.ea_id.astype(str)

In [180]:
# load survey data (again)
survey_data = load_data(survey_file_lists[year])
survey_data = survey_data[['case_id', 'ea_id']]

mosaiks_by_case_id = survey_data.merge(
    ea_with_all_mosaiks, on='ea_id', how='left'
)

In [205]:
mosaiks_columns = [c for c in mosaiks_by_case_id.columns if 'mosaiks' in c]
mosaiks_by_case_id[['case_id'] + mosaiks_columns].to_parquet(f'{year}_mosaiks_by_case_id.parquet', index=False)

In [206]:
mosaiks_by_case_id[
    ['case_id'] + [c for c in mosaiks_columns if c.startswith('nearest')]
].to_parquet(f'{year}_mosaiks_nearest_by_case_id.parquet', index=False)