In [1]:
import numpy as np
import scipy.spatial
import pandas as pd
import shapely
import shapely.geometry
import geopandas as gpd
import matplotlib.pyplot as plt

from sklearn.decomposition import PCA

from maskrcnn.preprocess.sample_aoi import load_df

In [2]:
# Input paths (relative to the directory the notebook is run from).
# Shapefile that is read into `df_shp` below.
IN_DIR_SHP = 'data/Shapefile/SHP2010/mglu2010v5_0/poligonos_urbanos.shp'
# CSV that is passed to `load_df` to build `df_cen` below.
IN_DIR_CEN = 'data/CPV/Raw/ITER2010/ITER_NALDBF10.csv'

In [3]:
# Load the polygons and reproject them to WGS84 (EPSG:4326).
df_shp = gpd.read_file(IN_DIR_SHP)
# `epsg=4326` replaces the deprecated {'init': 'epsg:4326'} dict, which
# emits a FutureWarning with pyproj >= 2 and is slated for removal.
df_shp = df_shp.to_crs(epsg=4326)
# The CVE* key columns are cast to int so they can be used as merge keys.
for col in df_shp.columns:
    if col.startswith('CVE'):
        df_shp[col] = df_shp[col].astype(int)
# Positional rename: assumes the file has exactly five columns, ordered as
# three CVE* keys, the locality name and the geometry -- TODO confirm.
df_shp.columns = ['ent', 'mun', 'loc', 'NOM_LOC', 'geometry']

In [4]:
# Load the census table and attach one point geometry per row,
# built from its `lon` / `lat` columns.
df_cen = load_df(IN_DIR_CEN, drop=False)
census_points = [
    shapely.geometry.Point(lon, lat)
    for lon, lat in zip(df_cen['lon'].values, df_cen['lat'].values)
]
df_cen = gpd.GeoDataFrame(df_cen, geometry=census_points)

In [5]:
# Join the polygons with the census records on the locality key.
#
# Inner join: two localities are dropped from df_shp, because one of them
# has its VPH variables recorded as N/D and the other has them masked (*).
# Dropping them is acceptable since these data are only used for
# validation.
#
# Most census points fall inside their polygon. The exceptions are points
# that lie just outside it, either because the polygon is non-convex or
# because of measurement error; they are never far apart, so they are
# treated as the same locality.
merge_keys = ['ent', 'mun', 'loc']
df = pd.merge(
    df_shp,
    df_cen,
    on=merge_keys,
    how='inner',
    # polygon columns keep their names; clashing census columns
    # (e.g. the point geometry) get the `_point` suffix
    suffixes=('', '_point'),
)

In [6]:
# Columns that feed the asset score: every column whose name starts
# with 'VPH'.
vph_cols = [name for name in df.columns if name.startswith('VPH')]

In [7]:
# Asset scores: the first three principal components of the VPH* columns.
score_cols = ['asset_score1', 'asset_score2', 'asset_score3']

# Read the VPH matrix once and centre each column on its mean.
vph_values = df.loc[:, vph_cols].values
centered = vph_values - vph_values.mean(axis=0)[np.newaxis, :]

m = PCA(n_components=len(score_cols))
scores = pd.DataFrame(m.fit_transform(centered), columns=score_cols)
# The sign of a principal component is arbitrary; the first one is flipped
# here (presumably so that a larger score means more assets -- TODO confirm).
scores['asset_score1'] = - scores['asset_score1']

# Drop score columns left over from a previous run of this cell before
# concatenating, so that re-running it does not create duplicate
# `asset_score*` columns (which would break `df['asset_score1']`).
df = pd.concat([
    df.drop(columns=score_cols, errors='ignore').reset_index(drop=True),
    scores],
    axis=1)

In [8]:
# Keep only the polygons whose area is below the median area.
# NOTE(review): the geometries are in EPSG:4326 at this point, so the
# area is in squared degrees, not a metric unit. This cell also overwrites
# `df`, so running it twice filters twice.
areas = df['geometry'].area
area_median = areas.median()
df = df.loc[areas < area_median, :].reset_index(drop=True)

In [9]:
# (n, 2) array of polygon centroids as (x, y), indexed in a KD-tree
# for nearest-neighbour look-ups.
centroid_points = df['geometry'].centroid
centroids = np.array([centroid_points.x.values,
                      centroid_points.y.values]).T
tree = scipy.spatial.cKDTree(centroids)

In [10]:
# Query the k nearest centroids of every centroid, then drop the first
# column of both results (the closest hit, which is expected to be the
# query point itself), leaving k - 1 neighbours per row.
k = 4
tree_d, tree_i = (result[:, 1:] for result in tree.query(centroids, k=k))

In [11]:
# Row-index grid with the same shape as tree_i / tree_d:
# tree_j[r, c] == r, i.e. the index of the query point for each neighbour.
n_points = centroids.shape[0]
tree_j = np.repeat(np.arange(n_points)[:, np.newaxis], k - 1, axis=1)

In [12]:
# Keep (query index, neighbour index) pairs whose centroid distance is
# below sqrt(area_median).
is_close = tree_d < np.sqrt(area_median)
pairs = np.array([tree_j[is_close], tree_i[is_close]]).T

In [13]:
# Append every pair in swapped order so both (a, b) and (b, a) are present.
pairs = np.vstack((pairs, pairs[:, [1, 0]]))

In [14]:
# Remove duplicate rows, then keep one orientation of each pair
# (first index strictly smaller than the second).
unique_pairs = np.unique(pairs, axis=0)
is_ordered = unique_pairs[:, 0] < unique_pairs[:, 1]
pairs = unique_pairs[is_ordered, :]

In [15]:
# One row per pair in each frame: df_i holds the first member of the pair,
# df_j the second, in the same order as `pairs`.
df_i = df.loc[pairs[:, 0], :]
df_j = df.loc[pairs[:, 1], :]

# `diff` is this member's asset_score1 minus its partner's.
score_i = df_i.loc[:, 'asset_score1'].values
score_j = df_j.loc[:, 'asset_score1'].values
df_i['diff'] = score_i - score_j
df_j['diff'] = score_j - score_i

In [16]:
# Renumber both frames 0..n-1 and store that position as `pair_id`, so a
# row in df_i and its partner in df_j share the same id.
df_i, df_j = (frame.reset_index(drop=True) for frame in (df_i, df_j))
df_i['pair_id'] = df_i.index
df_j['pair_id'] = df_j.index

In [17]:
# Write the pairs whose absolute asset-score difference exceeds 0.5:
# first members go to urban_0.shp, second members to urban_1.shp.
# NOTE(review): shapefile field names are limited to 10 characters, so
# 'asset_score1' will likely be truncated on write -- confirm downstream.
export_cols = [
    'ent', 'mun', 'loc', 'geometry', 'asset_score1', 'diff', 'pair_id'
]
for frame, out_path in ((df_i, 'data/tmp/urban_0.shp'),
                        (df_j, 'data/tmp/urban_1.shp')):
    large_gap = np.abs(frame['diff'].values) > .5
    frame.loc[large_gap, export_cols].to_file(out_path)