# Install and load packages

In [1]:
!pip install -q ohsome

In [20]:
import pandas as pd
import numpy as np
import geopandas as gpd
from ohsome import OhsomeClient
import folium

In [3]:
# mount the google drive (if operating from within colab)
from google.colab import drive
drive.mount('/content/gdrive')
%cd /content/gdrive/MyDrive/master_thesis/predicting_poverty/preprocess_OSM

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).
/content/gdrive/MyDrive/master_thesis/predicting_poverty/preprocess_OSM


In [4]:
# import the OSM package
from OSM_utils.process import *
from OSM_utils.describe import *

# load LSMS data

In [5]:
LSMS_CSV_PATH = '../../Data/lsms/processed/labels_cluster.csv'
lsms_df = pd.read_csv(LSMS_CSV_PATH)
lsms_df['lat'] = lsms_df['lat'].astype(np.float32)
lsms_df['lon'] = lsms_df['lon'].astype(np.float32)
lsms_df = lsms_df[['country', 'cluster_id', 'rural', 'lat', 'lon']]
cluster_df = lsms_df.drop_duplicates().reset_index(drop = True).copy()

# Recenter locations

## Create the ROI

In [6]:
# create a multipolygon for all the

# first translate the dataset into a geopands dataframe containing the
# given coordinates as a point as well as the bounding box around it as a polygon
pt_geoms = gpd.points_from_xy(
    x=cluster_df["lon"],
    y=cluster_df["lat"],
    crs="EPSG:4326",
)

cluster_gpd = gpd.GeoDataFrame(cluster_df.copy(), geometry = pt_geoms)
# create buffer. First project into projection that calculates in meters

# create region of interest
roi_5k = cluster_gpd['geometry'].to_crs("EPSG:3857").buffer(5000)
roi_2k = cluster_gpd['geometry'].to_crs("EPSG:3857").buffer(2000)
cluster_gpd['roi'] = roi_5k
urban_mask = cluster_gpd['rural'] == 0
cluster_gpd['roi'][urban_mask] = roi_2k[urban_mask]

# reset the crs to degrees
cluster_gpd['roi'] = cluster_gpd['roi'].to_crs("EPSG:4326")

## First iteration

In [7]:
# first recenter locations based on village, neighbourhood, quarter etc.
client = OhsomeClient(log=False)
fltr = '(place in (village, neighbourhood, quarter, city_block, suburb, town, city) or landuse=residential)'

new_lats, new_lons, old_count, old_ids = recenter_data(cluster_gpd, client, fltr, extreme = False)

# set to array
new_lats = np.array(new_lats)
new_lons = np.array(new_lons)

# add to the cluster gpd data.
new_points = gpd.points_from_xy(
    x=new_lons,
    y=new_lats,
    crs="EPSG:4326",
)

# add to the cluster data
cluster_gpd['new_lat'] = new_lats
cluster_gpd['new_lon'] = new_lons
cluster_gpd['new_point'] = new_points
cluster_gpd['no_recenter'] = [i in old_ids for i in cluster_gpd['cluster_id']]

Download data from OSM
Downloading the OSM data took 443 seconds
fixing 1 invalid geometries
Recenter locations...


  0%|          | 0/2255 [00:00<?, ?it/s]

eth_010401088801204 could not find new location, use old location instead
eth_010403020100109 could not find new location, use old location instead
eth_010503088801501 could not find new location, use old location instead
eth_020102010100102 could not find new location, use old location instead
eth_020103088800802 could not find new location, use old location instead
eth_020106088800402 could not find new location, use old location instead
eth_020302088800102 could not find new location, use old location instead
eth_020304088800601 could not find new location, use old location instead
eth_020306088800601 could not find new location, use old location instead
eth_030104088800901 could not find new location, use old location instead
eth_030112088800203 could not find new location, use old location instead
eth_030201088800703 could not find new location, use old location instead
eth_030209010100104 could not find new location, use old location instead
eth_030302088802104 could not find new

In [8]:
describe_results(cluster_gpd)

The minimum distance between old and new points: 0.0
The maximum distance between old and new points: 4990.7438799954825
Maximum distance if urban: 1976.6647987328945
Maximum distance if rural: 4990.7438799954825
Number of clusters where displacement exceedes the max raidus (Urban): 0
Number of clusters where displacement exceedes the max raidus (Rural): 0
Number of clusters that were not relocated: 150
Number of clusters with same geo-locations: 48
Note that two clusters have the same geo-locations by default


## Second Iteration using buildings

In [9]:
# Some clusters cannot be relocated using this technique or are relocated to the same
# location, which happens sometimes in urban areas, where only houses are mapped, but not residential areas
# To fix this, I reattempt the recentering for those clusters using data on all buildings:
client = OhsomeClient(log=False)
fltr = '(place in (hamlet, isolated_dwelling) or building=*)'
old_id_mask = np.array([i in old_ids for i in cluster_gpd['cluster_id']])

same_location_mask = cluster_gpd[['new_lat', 'new_lon']].duplicated(keep = False)
mask = old_id_mask | np.array(same_location_mask)

new_lats_2, new_lons_2, old_count_2, old_ids_2 = recenter_data(cluster_gpd[mask], client, fltr, extreme = False)

# add the new lons and lats to the new lat / lon arrays.
new_lats[mask] = new_lats_2
new_lons[mask] = new_lons_2

# add to the cluster gpd data.
new_points = gpd.points_from_xy(
    x=new_lons,
    y=new_lats,
    crs="EPSG:4326",
)

# add to the cluster data
cluster_gpd['new_lat'] = new_lats
cluster_gpd['new_lon'] = new_lons
cluster_gpd['new_point'] = new_points
cluster_gpd['no_recenter'] = [i in old_ids_2 for i in cluster_gpd['cluster_id']]

Download data from OSM
Downloading the OSM data took 119 seconds
fixing 1 invalid geometries
Recenter locations...


  0%|          | 0/198 [00:00<?, ?it/s]

eth_010401088801204 could not find new location, use old location instead
eth_010403020100109 could not find new location, use old location instead
eth_020102010100102 could not find new location, use old location instead
eth_020103088800802 could not find new location, use old location instead
eth_020106088800402 could not find new location, use old location instead
eth_020302088800102 could not find new location, use old location instead
eth_020304088800601 could not find new location, use old location instead
eth_020306088800601 could not find new location, use old location instead
eth_030201088800703 could not find new location, use old location instead
eth_030209010100104 could not find new location, use old location instead
eth_030302088802104 could not find new location, use old location instead
eth_030411088800903 could not find new location, use old location instead
eth_030507088801204 could not find new location, use old location instead
eth_030611088802207 could not find new

In [10]:
describe_results(cluster_gpd)

The minimum distance between old and new points: 0.0
The maximum distance between old and new points: 4990.7438799954825
Maximum distance if urban: 1976.6647987328945
Maximum distance if rural: 4990.7438799954825
Number of clusters where displacement exceedes the max raidus (Urban): 0
Number of clusters where displacement exceedes the max raidus (Rural): 0
Number of clusters that were not relocated: 70
Number of clusters with same geo-locations: 2
Note that two clusters have the same geo-locations by default


## Third iteration using roads

In [11]:
# use the nearest highway instead
client = OhsomeClient(log=False)
fltr = 'highway=*'

old_id_mask = np.array([i in old_ids for i in cluster_gpd['cluster_id']])
mask = old_id_mask

new_lats_3, new_lons_3, old_count_3, old_ids_3 = recenter_data(cluster_gpd[mask], client, fltr, extreme = False )

# add the new lons and lats to the new lat / lon arrays.
new_lats[mask] = new_lats_3
new_lons[mask] = new_lons_3

# add to the cluster gpd data.
new_points = gpd.points_from_xy(
    x=new_lons,
    y=new_lats,
    crs="EPSG:4326",
)

# add to the cluster data
cluster_gpd['new_lat'] = new_lats
cluster_gpd['new_lon'] = new_lons
cluster_gpd['new_point'] = new_points
cluster_gpd['no_recenter'] = [i in old_ids_3 for i in cluster_gpd['cluster_id']]

Download data from OSM
Downloading the OSM data took 99 seconds
fixing 0 invalid geometries
Recenter locations...


  0%|          | 0/150 [00:00<?, ?it/s]

eth_010503088801501 could not find new location, use old location instead
eth_020102010100102 could not find new location, use old location instead
eth_020304088800601 could not find new location, use old location instead
eth_030104088800901 could not find new location, use old location instead
eth_030112088800203 could not find new location, use old location instead
eth_030201088800703 could not find new location, use old location instead
eth_030209010100104 could not find new location, use old location instead
eth_050102088802401 could not find new location, use old location instead
eth_050206088800901 could not find new location, use old location instead
eth_050904010100101 could not find new location, use old location instead
eth_050904088800302 could not find new location, use old location instead
eth_070907088801501 could not find new location, use old location instead
eth_120201088801701 could not find new location, use old location instead
eth_150102088803102 could not find new

In [12]:
describe_results(cluster_gpd)

The minimum distance between old and new points: 0.0
The maximum distance between old and new points: 4990.7438799954825
Maximum distance if urban: 1976.6647987328945
Maximum distance if rural: 4990.7438799954825
Number of clusters where displacement exceedes the max raidus (Urban): 0
Number of clusters where displacement exceedes the max raidus (Rural): 0
Number of clusters that were not relocated: 20
Number of clusters with same geo-locations: 2
Note that two clusters have the same geo-locations by default


## Fourth Iteration using extended buffer

In [13]:
# if none of the above worked, try the same just using a larger buffer (some clusters are displaced by more than 2 or 5 km

# first translate the dataset into a geopands dataframe containing the
# given coordinates as a point as well as the bounding box around it as a polygon
pt_geoms = gpd.points_from_xy(
    x=cluster_df["lon"],
    y=cluster_df["lat"],
    crs="EPSG:4326",
)

cluster_gpd_extreme = gpd.GeoDataFrame(cluster_df.copy(), geometry = pt_geoms)

# create region of interest
roi_10k = cluster_gpd_extreme['geometry'].to_crs("EPSG:3857").buffer(10000)
roi_5k = cluster_gpd_extreme['geometry'].to_crs("EPSG:3857").buffer(5000)
cluster_gpd_extreme['roi'] = roi_10k
urban_mask = cluster_gpd_extreme['rural'] == 0
cluster_gpd_extreme['roi'][urban_mask] = roi_5k[urban_mask]

# reset the crs to degrees
cluster_gpd_extreme['roi'] = cluster_gpd_extreme['roi'].to_crs("EPSG:4326")

client = OhsomeClient(log=False)
fltr = '(place in (isolated_dwelling, hamlet ,village, neighbourhood, quarter, city_block, suburb, town, city) or landuse=residential or building=* or highway=*)'

old_id_mask = np.array([i in old_ids_3 for i in cluster_gpd_extreme['cluster_id']])
mask = old_id_mask

new_lats_4, new_lons_4, old_count_4, old_ids_4 = recenter_data(cluster_gpd_extreme[mask], client, fltr, extreme = True)

# add the new lons and lats to the new lat / lon arrays.
new_lats[mask] = new_lats_4
new_lons[mask] = new_lons_4

# add to the cluster gpd data.
new_points = gpd.points_from_xy(
    x=new_lons,
    y=new_lats,
    crs="EPSG:4326",
)

# add to the cluster data
cluster_gpd['new_lat'] = new_lats
cluster_gpd['new_lon'] = new_lons
cluster_gpd['new_point'] = new_points
cluster_gpd['no_recenter'] = [i in old_ids_4 for i in cluster_gpd['cluster_id']]

Download data from OSM
Downloading the OSM data took 104 seconds
fixing 0 invalid geometries
Recenter locations...


  0%|          | 0/20 [00:00<?, ?it/s]

In [14]:
describe_results(cluster_gpd)

The minimum distance between old and new points: 0.0
The maximum distance between old and new points: 7896.828082530719
Maximum distance if urban: 2779.686636398385
Maximum distance if rural: 7896.828082530719
Number of clusters where displacement exceedes the max raidus (Urban): 4
Number of clusters where displacement exceedes the max raidus (Rural): 8
Number of clusters that were not relocated: 0
Number of clusters with same geo-locations: 2
Note that two clusters have the same geo-locations by default


## Save the data to the cluster_df

In [15]:
# save the data containing the new coordinates
cluster_df['new_lat'] = new_lats
cluster_df['new_lon'] = new_lons
cluster_df = cluster_df.rename(columns = {'lat':'lsms_lat','lon':'lsms_lon', 'new_lat':'lat', 'new_lon':'lon'})
cluster_df['no_recenter'] = [i in old_ids_4 for i in cluster_df['cluster_id']]

In [17]:
# save the data
pth = "../../Data/lsms/processed/recentered_cluster_gps_v1.csv"
cluster_df.to_csv(pth, index = False)

## Save the new coordinates in the actual labels dataset

In [18]:
LSMS_CSV_PATH = '../../Data/lsms/processed/labels_cluster.csv'
lsms_df = pd.read_csv(LSMS_CSV_PATH)
lsms_df = lsms_df.rename(columns = {'lat':'lsms_lat','lon':'lsms_lon'})

lsms_df_v1 = pd.merge(lsms_df, cluster_df[['cluster_id', 'lat', 'lon']], on = 'cluster_id')
new_file_path = '../../Data/lsms/processed/labels_cluster_v1.csv'
lsms_df_v1.to_csv(new_file_path, index = False)

# Map the recentered locations

In [21]:
# plot the old and new locations
location = [6.48676,9.15792]

old_point_marker = folium.CircleMarker(radius = 4, # Radius in metres
weight = 0, #outline weight
fill_color = '#000000',
fill_opacity = 1)
m = folium.Map(location = location, zoom_start = 5)

new_point_marker = folium.CircleMarker(radius = 4, # Radius in metres
weight = 0, #outline weight
fill_color = '#ff0000',
fill_opacity = 1)

def style_function(feature):
  return {'fillColor': 'red', 'opacity':0.1}

folium.GeoJson(cluster_gpd['geometry'], marker = old_point_marker).add_to(m)
folium.GeoJson(cluster_gpd['new_point'], marker = new_point_marker).add_to(m)

folium.GeoJson(cluster_gpd['roi'][cluster_gpd['rural'] == 1], style_function).add_to(m)
folium.GeoJson(cluster_gpd['roi'][cluster_gpd['rural'] == 0]).add_to(m)

m

# Old functions

In [None]:
# def get_distance(cluster_info, reponse_df):
#   lon, lat = cluster_info.lon, cluster_info.lat
#   point = gpd.points_from_xy(x=[lon], y=[lat], crs="EPSG:4326")
#   proj_point = point.to_crs('EPSG:3857').copy()
#   proj_geom = reponse_df['geometry'].to_crs('EPSG:3857')
#   # dist = []
#   # for i in range(len(proj_geom)):
#   #   with warnings.catch_warnings():
#   #     warnings.filterwarnings('error')
#   #     try:
#   #       d = proj_point.distance(proj_geom[i])[0]
#   #     except RuntimeWarning:
#   #       print(f"Warning in iteration {i}")
#   #       d = np.nan
#   #     dist.append(d)
#   dist = np.array([i.distance(proj_point)[0] for i in proj_geom])
#   return dist

# def subset_response_roi(cluster_info, response_df):
#   rural = cluster_info['rural']
#   lat = cluster_info['lat']
#   lon = cluster_info['lon']
#   roi = cluster_info['roi']

#   if rural == 1: radius = 5000
#   else: radius = 2000

#   # get the closest geometry:
#   nearest_geom, dist = get_nearest_geom(cluster_info, response_df)
#   # # calculate the distance to every object in the response_df
#   # dist = get_distance(cluster_info, response_df)
#   # dist_mask = dist < radius #np.array([i < radius for i in dist])

#   # subset the response dataframe to the roi
#   #resp['dist'] = dist
#   response_roi = resp.loc[dist_mask,:].reset_index(drop = True).copy()

#   # # procede only if at least one entry in OSM
#   # if len(response_roi) > 0:
#   #   response_roi['area'] = response_roi['geometry'].to_crs('EPSG:3857').area
#   #   area_mask = (response_roi['area'] > 100000) | (response_roi['area'] == 0) # villages, cities or alike are points, thus area = 0

#   #   # subset the data if there is at least one object that meets the requirements
#   #   if sum(area_mask) > 0:
#   #     response_roi = response_roi.loc[area_mask,:].reset_index(drop = True)

#   return response_roi.reset_index(drop = True).copy()


# def recenter_location(cluster_info, response_df):
#   if len(response_df) > 0:
#     # calculate the distance
#     dist = get_distance(cluster_info, response_df)
#     response_df['dist'] = dist
#     # get the new coordinates - those that are the closest to the original point
#     min_idx = response_df['dist'].idxmin() # if there is more than one element with equal distance, it just choses the first
#     min_dist = response_df['dist'].min()
#     if min_dist < 1:
#       # if the min distance is smaller than 1 meter, just stay
#       new_lon, new_lat = float(cluster_info.lon), float(cluster_info.lat)
#     else:
#       min_object = response_df.iloc[[min_idx]].reset_index(drop = True).copy()
#       closest_point = nearest_points(min_object['geometry'], cluster_info['geometry'])[0]
#       # intersection = min_object['geometry'].intersection(cluster_info.roi)[0]
#       # intersection_df = gpd.GeoDataFrame(geometry = [intersection], crs = 'EPSG:4326')
#       # recentered_loc = intersection_df['geometry'].to_crs('EPSG:3857').centroid.to_crs('EPSG:4326')
#       new_lon, new_lat = float(closest_point.x), float(closest_point.y)
#   else:
#     new_lon, new_lat = np.nan, np.nan
#   return new_lat, new_lon