# Geo 

This notebook exists to make it easier to use a separate kernel for geo processing, to avoid dependency conflict issues I've encountered w/ dependencies for Geopandas and Dask-Geopandas.

In [1]:
import pandas as pd
from pathlib import Path
import numpy as np
import warnings
import math

import geopandas as gpd
import dask_geopandas as dgpd
import dask.dataframe as ddf

import pyreadstat

Dask dataframe query planning is disabled because dask-expr is not installed.

You can install it with `pip install dask[dataframe]` or `conda install dask`.
This will raise in a future version.



In [2]:
year = 2019

data_path = Path('/home/selker/eop/data')

malawi_directory = data_path / 'malawi'
mosaiks_directory = data_path / 'mosaiks'

survey_file_lists = {
    2016: [
        malawi_directory / 'MWI_2016_IHS-IV_v04_M_STATA14/household_geovariables/householdgeovariablesihs4.dta',
        malawi_directory / 'MWI_2016_IHS-IV_v04_M_STATA14/household/hh_mod_a_filt.dta'
    ],
    2019: [
        malawi_directory / 'MWI_2019_IHS-V_v06_M_Stata/householdgeovariables_ihs5.dta',
        malawi_directory / 'MWI_2019_IHS-V_v06_M_Stata/hh_mod_a_filt.dta'
    ]
}

In [3]:
def load_data(file_list):
    
    data = None
    column_names_to_labels = dict()

    for file in file_list:
        with warnings.catch_warnings():
            warnings.simplefilter('ignore') # TODO: Investigate. Warning thrown from w/in pyreadstat.
    
            dataframe, metadata =  pyreadstat.read_dta(
                    file, apply_value_formats=True
            )
    
        column_names_to_labels.update(metadata.column_names_to_labels)
    
        if data is None:
            data = dataframe
        else:
            data = data.merge(dataframe, on='case_id', how='outer', suffixes=('_left', '_right'))

    if 'lat_modified' in data.columns:
        data.rename(columns={'lat_modified': 'ea_lat_mod'}, inplace=True)

    if 'lon_modified' in data.columns:
        data.rename(columns={'lon_modified': 'ea_lon_mod'}, inplace=True)

    # TA code column in 2019 data.
    if 'hh_a02a' in data.columns:
        data.rename(columns={'hh_a02a': 'TA'}, inplace=True)

    if 'ea_id_left' in data.columns:
        data.drop(columns='ea_id_right', inplace=True)
        data.rename(columns={'ea_id_left': 'ea_id'}, inplace=True)

    return data

In [4]:
survey_data = load_data(survey_file_lists[year])
survey_data = survey_data[['ea_id', 'ea_lat_mod', 'ea_lon_mod', 'TA']]

In [5]:
# A few EAs have two locations in 2016 data. Not sure why (and not true in 2019 data).
ea_with_lat_lon = survey_data.groupby('ea_id')[['ea_lat_mod', 'ea_lon_mod']].agg(pd.Series.mode)
ea_with_lat_lon = gpd.GeoDataFrame(
    ea_with_lat_lon, geometry = gpd.points_from_xy(
        x=ea_with_lat_lon.ea_lon_mod, y=ea_with_lat_lon.ea_lat_mod
    )
)

In [6]:
malawi_admin_3 = gpd.read_file(
    malawi_directory / 'mwi_adm_nso_hotosm_20230405_shp' / 'mwi_admbnda_adm3_nso_hotosm_20230405.shp'
)

ERROR 1: PROJ: proj_create_from_database: Open of /home/selker/.conda/envs/geo/share/proj failed


In [7]:
ea_with_lat_lon_geo = gpd.sjoin(
    ea_with_lat_lon, malawi_admin_3, predicate='within', how='left'
)

Use `to_crs()` to reproject one of the input geometries to match the CRS of the other.

Left CRS: None
Right CRS: EPSG:4326

  ea_with_lat_lon_geo = gpd.sjoin(


In [8]:
ea_with_lat_lon_geo = ea_with_lat_lon_geo[[
    'ea_lat_mod', 'ea_lon_mod', 'geometry', 'ADM2_EN'
]]

#### Mosaiks

In [3]:
mosaiks = ddf.read_csv(str(mosaiks_directory / 'malawi_fine' / '*.csv'))

In [None]:
%%time
def clean_mosaiks_column_name(column_name):
    column_name_stripped = column_name.strip(' .')
    try:
        number = int(column_name_stripped)
    except ValueError:
        if column_name_stripped == '':
            return 'mosaiks_0'
        else:
            return column_name
    else:
        return f'mosaiks_{number}'

mosaiks = ddf.read_csv(str(mosaiks_directory / 'malawi_fine' / '*.csv'))
malawi_outline = gpd.read_file(
    malawi_directory / 'mwi_adm_nso_hotosm_20230405_shp' / 'mwi_admbnda_adm0_nso_hotosm_20230405.shp'
)

mosaiks = dgpd.from_dask_dataframe(
    mosaiks, dgpd.points_from_xy(mosaiks, 'Lon', 'Lat')
)

# this data covers a box containing Malawi; filter down to the points actually within the country.
mosaiks = mosaiks[mosaiks.geometry.within(malawi_outline.iloc[0].geometry)]

mosaiks.columns = mosaiks.columns.map(clean_mosaiks_column_name)

mosaiks.drop(columns='geometry').to_parquet('mosaiks_within_malawi')

In [None]:
mosaiks = ddf.read_parquet('mosaiks_within_malawi/*.parquet')

In [11]:
mosaiks[['Lat', 'Lon']].compute().to_csv('malawi_mosaiks_locations.csv', index=False)

In [31]:
# associate a moasiks tile with each enumeration area
ea_geo = ea_with_lat_lon_geo[['ea_lat_mod', 'ea_lon_mod']]
ea_geo = gpd.GeoDataFrame(ea_geo, geometry = gpd.points_from_xy(x=ea_geo.ea_lon_mod, y=ea_geo.ea_lat_mod))

In [34]:
mosaiks_grid_size = 1
max_distance = math.sqrt(2 * (mosaiks_grid_size / 2)**2)

mosaiks_computed = mosaiks.compute()
mosaiks_computed = gpd.GeoDataFrame(
    mosaiks_computed, geometry=gpd.points_from_xy(
        x=mosaiks_computed.Lon, y=mosaiks_computed.Lat
    )
)

ea_geo_with_mosaiks = gpd.sjoin_nearest(
    left_df=ea_geo, right_df=mosaiks_computed, how='left', max_distance=mosaiks_grid_size
)

ea_geo_with_mosaiks.rename(
    columns={'Lat': 'lat_mosaiks', 'Lon': 'lon_mosaiks', 'index_right': 'index_mosaiks'},
    inplace=True
)

ea_geo_with_mosaiks.reset_index(inplace=True)
ea_geo_with_mosaiks.ea_id = ea_geo_with_mosaiks.ea_id.astype(int)

ea_geo_with_mosaiks.drop(columns=['geometry', 'BoxLabel']).to_parquet(
    f'{year}_ea_geo_with_mosaiks', index=False
)

In [59]:
ea_geo_with_mosaiks = pd.read_parquet(f'{year}_ea_geo_with_mosaiks')
# for the merge
ea_geo_with_mosaiks['ea_id'] = ea_geo_with_mosaiks['ea_id'].astype(str)

# load survey data (again)
survey_data = load_data(survey_file_lists[year])
survey_data = survey_data[['case_id', 'ea_id']]

mosaiks_by_case_id = survey_data.merge(
    ea_geo_with_mosaiks, on='ea_id', how='left'
)

mosaiks_feature_columns = [
    c for c in mosaiks_by_case_id.columns if c.startswith('mosaiks_')
]

mosaiks_by_case_id = mosaiks_by_case_id[['case_id'] + mosaiks_feature_columns]

In [61]:
mosaiks_by_case_id.to_parquet(f'{year}_mosaiks_by_case_id', index=False)