In [1]:
import numpy as np
import scipy.spatial
import pandas as pd
import shapely
import shapely.geometry
import geopandas as gpd
import matplotlib.pyplot as plt

from sklearn.decomposition import PCA

from maskrcnn.preprocess.sample_aoi import load_df, aoi_to_chip

In [2]:
# Input paths (relative to the working directory).
# Polygon shapefile, read with geopandas in a later cell.
IN_DIR_SHP = 'data/Shapefile/SHP2010/mglu2010v5_0/poligonos_urbanos.shp'
# Census table in CSV form, read through `load_df` in a later cell.
IN_DIR_CEN = 'data/CPV/Raw/ITER2010/ITER_NALDBF10.csv'

In [3]:
# Output paths. Despite the `_DIR` suffix, both are file paths, not directories.
# Shapefile that receives the sampled frame via `df.to_file(...)`.
OUT_LOC_DIR = 'data/Experiment3/census.shp'
# CSV that receives the chip-level frame via `df_chip.to_csv(...)`.
OUT_IMG_DIR = 'data/Experiment3/aoi.csv'

In [4]:
# read urban shapefiles and reproject them to EPSG:4326
df_shp = gpd.read_file(IN_DIR_SHP)
# Pass the EPSG code directly: the proj4-style `{'init': 'epsg:4326'}` dict
# is deprecated by pyproj >= 2 and emits a FutureWarning.
df_shp = df_shp.to_crs(epsg=4326)
# Cast every CVE* column to int.
# NOTE(review): presumably so the keys match the dtype produced by
# `load_df` for the later merge - confirm.
for col in df_shp.columns:
    if col.startswith('CVE'):
        df_shp[col] = df_shp[col].astype(int)
# Positional rename: this relies on the shapefile exposing exactly five
# columns in this order, and raises on a length mismatch.
df_shp.columns = ['ent', 'mun', 'loc', 'NOM_LOC', 'geometry']

In [5]:
# read census data
df_cen = load_df(IN_DIR_CEN, drop=False)
# Build one point per census record from its 'lon' / 'lat' columns and
# attach the points as the geometry of the census frame.
census_points = list(map(
    shapely.geometry.Point,
    df_cen['lon'].values,
    df_cen['lat'].values))
df_cen = gpd.GeoDataFrame(df_cen, geometry=census_points)

In [6]:
# Inner join of the polygons with the census records on ent/mun/loc.
# Notes carried over from the original analysis:
# - The inner join drops two localities from df_shp: one has its VPH
#   variables recorded as N/D and the other as * (masked). Dropping them
#   is fine because these data are only used for validation.
# - Most census points lie inside their polygon. The exceptions sit just
#   outside it, either because the polygon is non-convex or because of
#   measurement error; they are never far apart, so they are taken to be
#   the same unit.
join_keys = ['ent', 'mun', 'loc']
df = pd.merge(
    left=df_shp,
    right=df_cen,
    on=join_keys,
    how='inner',
    # columns of df_cen that clash with df_shp get the '_point' suffix
    suffixes=('', '_point'))

In [7]:
# Discard the census point geometry (suffixed '_point' by the merge);
# the polygon geometry from the shapefile is the one kept.
df = df.drop('geometry_point', axis=1)

In [8]:
# Keep only the localities whose polygon area is strictly below the median:
# some localities are so big that this exercise does not make sense.
# The area is taken from the geometry column as-is, i.e. in the units of
# the CRS the polygons are currently stored in.
locality_area = df['geometry'].area
area_median = locality_area.median()
is_small = locality_area < area_median
df = df.loc[is_small, :].reset_index(drop=True)

In [9]:
# Columns feeding the asset score: every VPH* column except the ones
# removed below. `list.remove` raises ValueError if one of the listed
# columns is absent from the frame.
vph_cols = [col for col in df.columns if col.startswith('VPH')]
for col in ['VPH_PISOTI', 'VPH_1DOR', 'VPH_1CUART', 'VPH_2CUART',
            'VPH_S_ELEC', 'VPH_AGUAFV', 'VPH_NODREN', 'VPH_SNBIEN']:
    vph_cols.remove(col)

# compute asset score via pca: projection of the mean-centered VPH columns
# onto their first principal component
vph_values = df.loc[:, vph_cols].values
centered = vph_values - vph_values.mean(axis=0)[np.newaxis, :]
m = PCA(n_components=1)
# Store the score by column assignment instead of concatenating a new
# one-column frame: re-running this cell then overwrites `cen_asset`
# rather than appending a second column with the same name, which would
# break the later `df['cen_asset']` lookups.
df = df.reset_index(drop=True)
df['cen_asset'] = m.fit_transform(centered)[:, 0]

# Alternative that is not used: score as the plain row sum of vph_cols,
# i.e. df.loc[:, vph_cols].sum(axis=1).

In [10]:
# construct pairs of nearby localities from their polygon centroids
centroid_pts = df['geometry'].centroid
centroids = np.column_stack([centroid_pts.x.values, centroid_pts.y.values])
tree = scipy.spatial.cKDTree(centroids)
# find neighbors: query k hits per centroid and discard the first column,
# which is the query point itself as long as centroids are distinct
k = 4
tree_d, tree_i = (hits[:, 1:] for hits in tree.query(centroids, k=k))
# tree_j[r, c] == r, i.e. the index of the querying locality for each
# of the k - 1 remaining neighbor slots
tree_j = np.repeat(
    np.arange(centroids.shape[0])[:, np.newaxis], k - 1, axis=1)
# max distance between centroids: sqrt(area_median)
is_close = tree_d < np.sqrt(area_median)
pairs = np.column_stack([tree_j[is_close], tree_i[is_close]])
# add the mirrored pairs, de-duplicate, then keep a single orientation
# (first index < second index) of every pair
pairs = np.unique(np.concatenate([pairs, pairs[:, ::-1]], axis=0), axis=0)
pairs = pairs[pairs[:, 0] < pairs[:, 1]]  # drop duplicates

In [11]:
# Materialize both sides of every pair: row p of df_i and row p of df_j
# hold the two localities of pair p.
first_idx, second_idx = pairs[:, 0], pairs[:, 1]
df_i = df.loc[first_idx, :]
df_j = df.loc[second_idx, :]

# `diff` is the asset score of the row's own locality minus that of its
# partner, so the two sides carry opposite signs.
asset_i = df_i['cen_asset'].values
asset_j = df_j['cen_asset'].values
df_i['diff'] = asset_i - asset_j
df_j['diff'] = asset_j - asset_i

# After resetting the index, the row position doubles as the pair id
# shared by the matching rows of df_i and df_j.
df_i = df_i.reset_index(drop=True)
df_j = df_j.reset_index(drop=True)
df_i['pair_id'] = df_i.index
df_j['pair_id'] = df_j.index

In [12]:
# Select the pairs to keep: every pair with a large asset-score gap, plus
# a fixed-seed random subset of the pairs with a small gap.
DIFF_THRESHOLD = 0.5        # |diff| above this counts as a large gap
N_SMALL_DIFF_PAIRS = 50     # at most this many small-gap pairs are sampled

abs_diff = np.abs(df_i['diff'])
large_gap_ids = df_i.loc[abs_diff > DIFF_THRESHOLD, 'pair_id']
small_gap_ids = df_i.loc[abs_diff <= DIFF_THRESHOLD, 'pair_id']

# Cap the sample size at the number of available pairs: `Series.sample`
# raises ValueError when n exceeds the population (sampling without
# replacement). With at least N_SMALL_DIFF_PAIRS candidates the draw is
# the same as sampling exactly that many.
n_small = min(N_SMALL_DIFF_PAIRS, len(small_gap_ids))
sampled_small_ids = (
    small_gap_ids.sample(n=n_small, random_state=0).tolist()
    if n_small else [])

pair_sample = large_gap_ids.tolist() + sampled_small_ids

In [13]:
# Stack both members of every pair and keep only the sampled pairs.
# `.copy()` makes the filtered frame independent so the column
# reassignment below does not act on a slice of the concatenated frame.
df = pd.concat([df_i, df_j])
df = df.loc[df['pair_id'].isin(pair_sample), :].copy()
# Int32 cannot be serialized, there is no N/A so safe to cast to int.
# Reassign each column instead of writing `df.loc[:, cols] = ...`:
# pandas >= 2.0 performs that assignment in place and keeps the existing
# Int32 dtype, which would silently turn the cast into a no-op.
cols = df.select_dtypes(include='Int32').columns
for col in cols:
    df[col] = df[col].astype('int')
# save locality level census data
df.to_file(OUT_LOC_DIR, index=False)

In [14]:
# save chip level data
# A locality can appear in several pairs; keep its first row only, and
# only the key columns plus the polygon.
loc_keys = ['ent', 'mun', 'loc']
df_chip = df.drop_duplicates(list(loc_keys))
df_chip = df_chip.loc[:, loc_keys + ['geometry']]
# Hand the polygons to the project helper, then write its result as CSV.
df_chip = aoi_to_chip(
    df=df_chip,
    indices=list(loc_keys),
    file_name='ENT{:02d}MUN{:03d}LOC{:04d}CHIP{:06d}',
    input_type='polygon')
df_chip.to_csv(OUT_IMG_DIR)