In [2]:
import pandas as pd
import numpy as np
from haversine import haversine, haversine_vector, Unit
import geopandas as gpd
import matplotlib.pyplot as plt
import cma

In [3]:
np.set_printoptions(suppress=True)

In [528]:
# read in the prepared dataset. Evaluate point data to make it readible by geopandas
df = pd.read_csv('data/test_dataset.csv', converters={'point': pd.eval})

In [531]:
# Filter to only one region, and read in geojson file.
region = 'Amhara' # Test Amhara region
woreda = 'ET030908'
proposed_schools= 5

### Region Only

In [532]:
# # limit geojson to only selected region
# # limit clean dataset to only selected region
# gdf_region = gpd.read_file('eth_shape_files/json//eth_admin1v2.json') # read in geojson
# gdf_region_shp = gdf_region.loc[gdf_region['ADM1_EN']==region]['geometry'].reset_index(drop=True)
# df = df.loc[df['region'] == region]
# bounds = gdf_region_shp.bounds

### Woreda Only

In [533]:
gdf_woreda = gpd.read_file('eth_shape_files/json/eth_admin3v2.json')
gdf_woreda_shp = gdf_woreda.loc[gdf_woreda['ADM3_PCODE']==woreda]['geometry'].reset_index(drop=True)
df = df.loc[df['ADM3_PCODE'] == woreda]
bounds = gdf_woreda_shp.bounds
bounds

Unnamed: 0,minx,miny,maxx,maxy
0,36.689328,10.713719,36.96973,10.964773


In [534]:
# Establish boundaries for CMA
# Latitude is the Y axis, longitude is the X axis.
# bounds = gdf_region_shp.bounds
lat_bounds = bounds[['miny','maxy']].to_numpy()[0]
lon_bounds = bounds[['minx','maxx']].to_numpy()[0]
bounds = np.array([[lat_bounds[0], lon_bounds[0]], [lat_bounds[1], lon_bounds[1]]])
bounds # array - [[lower lat bounds, lower lon bounds],[upper lat bounds, upper lon bounds]]
# CMA expects a list of size 2 for bounds
x1y1 = np.repeat([bounds[0,:]],proposed_schools, axis=0).flatten()
x2y2 = np.repeat([bounds[1,:]],proposed_schools, axis=0).flatten()
boundsxy = [x1y1,x2y2]
boundsxy

[array([10.71371939, 36.68932805, 10.71371939, 36.68932805, 10.71371939,
        36.68932805, 10.71371939, 36.68932805, 10.71371939, 36.68932805]),
 array([10.96477258, 36.96973029, 10.96477258, 36.96973029, 10.96477258,
        36.96973029, 10.96477258, 36.96973029, 10.96477258, 36.96973029])]

In [535]:
# Create subset arrays required as input for enrollment function.
# 1. Primary school enrollment data
# 2. Primary school location data: lat lon point data. 
# 3. Secondary schoool location data: lat lon point data. 
# 4. Secondary school enrollment data. Potentially required for calibration function.

df_prim = df.loc[ (df['gr_offer'] == 'G.1-8') | (df['gr_offer'] == 'G.5-8')]
df_prim_enroll = df_prim['grade5_8'].reset_index(drop=True).to_numpy()
df_prim_loc = df_prim['point'].reset_index(drop=True).to_numpy()
df_prim_loc = np.array([np.array(i) for i in df_prim_loc])

df_sec = df.loc[ (df['gr_offer'] == 'G. 9-10') | (df['gr_offer'] == 'G. 9-12')]
df_sec_loc = df_sec['point'].reset_index(drop=True).to_numpy()
df_sec_enroll = df_sec['grade9_10'].reset_index(drop=True).to_numpy()
df_sec_loc = np.array([np.array(i) for i in df_sec_loc])

In [536]:
def shape(distance, enrollment):
    min_walk = 2 # distance not a factor issuing enrollment until 2km
    max_walk = 5 # distance greater than 5km assumes zero enrollment
    answer = np.rint(np.where(distance<min_walk, enrollment,
             np.where(distance>max_walk, 0,
                     enrollment*(1-(distance-min_walk)/(max_walk-min_walk)))
            ))
    return answer

In [537]:
# shape function not included.
def expected_enroll(prim_loc, x, prim_enroll, sec_loc):
    x = np.append(sec_loc, x) # The genotype
    x = np.array(np.array_split(x, (len(sec_loc)+proposed_schools)))
    distance = haversine_vector(prim_loc, x, Unit.KILOMETERS, comb=True)
    min_d = np.min(distance, axis=0) # array with minimum distance from each primacy school to every secondary.
    shapey = shape(min_d, prim_enroll)
    return np.sum(shapey*min_d)

In [92]:
# Shape Function included
def expected_enroll2(prim_loc, x, prim_enroll, sec_loc):
    if check_woreda(x):
        total_sec = (len(sec_loc)+proposed_schools)
        x = np.append(sec_loc, x) # The existing secondary (sec_loc) added to proposed secondary(x) i.e. the genotype
        x = np.array(np.array_split(x, total_sec)) # reshape for haversine fn (2,7)
        distance = haversine_vector(prim_loc,x, Unit.KILOMETERS, comb=True) # calculate distance - result= shape (21,7)
#         return np.sum(shape(distance, prim_enroll))

#         max_enroll = np.max(shape(distance, np.repeat([prim_enroll], total_sec, axis=0)), axis=0)
# min_d = np.min(distance, axis=1) # shape (21,)
#         print(distance.shape)
#         print(min_d.shape)
#         shaped_enroll = shape(min_d, (prim_enroll*0.79901)) # shape(21,)
#         return np.sum(s_e)
    else:
        return -100

In [93]:
# Shape Function included
def expected_enroll4(prim_loc, x, prim_enroll, sec_loc):
    if check_woreda(x):
        total_sec = (len(sec_loc)+proposed_schools)
        x = np.append(sec_loc, x) # The existing secondary (sec_loc) added to proposed secondary(x) i.e. the genotype
        x = np.array(np.array_split(x, total_sec)) # reshape for haversine fn (2,7)
        distance = np.min(haversine_vector(x,df_prim_loc, Unit.KILOMETERS, comb=True), axis=1)
        return np.sum(distance*df_prim_enroll)
#         arg_d = np.argmin(haversine_vector(x,df_prim_loc, Unit.KILOMETERS, comb=True), axis=0)
#         shaped_enroll = np.array(shape(distance, prim_enroll))
#         new_schools = np.zeros((total_sec,))
#         np.add.at(new_schools, arg_d, shaped_enroll) # overall enrollment per new school
#         return np.sum(new_schools)
    else:
        return 0

In [538]:
def expected_enroll8(prim_loc, x, prim_enroll, sec_loc):
    x = np.append(sec_loc, x) # The genotype
    x = np.array(np.array_split(x, (len(sec_loc)+proposed_schools)))
    distance = haversine_vector(prim_loc, x, Unit.KILOMETERS, comb=True)
    min_d = np.min(distance, axis=0) # array with minimum distance from each primacy school to every secondary.
    return np.sum(min_d*prim_enroll)

In [539]:
# Shape Function included
def expected_enroll9(prim_loc, x, prim_enroll, sec_loc):
    if check_woreda(x):
        x = np.append(sec_loc, x) # The existing secondary (sec_loc) added to proposed secondary(x) i.e. the genotype
        x = np.array(np.array_split(x, (len(sec_loc)+proposed_schools))) # reshape for haversine fn (2,7)
        distance = haversine_vector(x,prim_loc, Unit.KILOMETERS, comb=True) # calculate distance - result= shape (21,7)
#         print(distance.shape)
        min_d = np.min(distance, axis=1) # shape (21,)
#         print(min_d.shape)
        shaped_enroll = shape(min_d, prim_enroll) # shape(21,)
        return np.sum(shaped_enroll)
    else:
        return 0.01

In [94]:
p_s = 2
bb1 = es.result[0]
bb2 = np.array(np.array_split(bb1, p_s))

In [126]:
x = [create_random_sp() for _ in range(10000)] fx = [(f(xi), xi) for xi in x] bestf, best_solution = min(fx)

SyntaxError: invalid syntax (<ipython-input-126-7da0fd300ced>, line 2)

In [95]:
es.result[1]

-62502.09403276609

In [96]:
# distance = haversine_vector(df_prim_loc,bb2, Unit.KILOMETERS, comb=True) # calculate distance - result= shape (21,7)
# distance_min = np.min(distance, axis=0)
# d1 = shape(distance_min, df_prim_enroll)
# np.sum(distance_min*df_prim_enroll)
# # shapey = shape(distance_min, df_prim_enroll)
# # np.round(np.sum(distance_min*df_prim_enroll),0)
# # arg_d = np.argmin(haversine_vector(bb2,df_prim_loc, Unit.KILOMETERS, comb=True),axis=1)
# # shaped_enroll = shape(distance, df_prim_enroll)
# # np.mean(distance, axis=1)
# # shaped_enroll
# # new_schools = np.zeros((p_s,))
# # np.add.at(new_schools, arg_d, shaped_enroll)
# # new_schools

75077.71118030728

In [97]:
np.sum(distance_min*d1)

62502.09403276609

In [98]:
# es.result[0]

In [None]:
np.min(distance, axis=0)

In [None]:
df_prim_loc

In [None]:
# arr1 = prop
# arg_arr = arg_d

In [None]:
# arr1 = prop
# arg_arr = arg_d
# arr2 = np.zeros((5,))
# np.add.at(arr2, arg_arr, arr1)
# arr2

# # new_sch
# # np.add(new_sch[arg_d], arg_d, prop)
# # new_sch[arg_d] = prop
# # new_sch

# # new_sch
# # np.insert(new_sch, arg_d, prop, axis=0)

# # np.add.at(new_sch, arg_d, prop)


In [568]:
# The Objective Function
def f(x):
    """ The objective function which takes as input a 1d array.Every two elements 
    represent the lat/lon of a point in a given area. A locations variable is created
    which splits the points into pairs. The expected_enrollment function is then run 
    on this (5,2) list to return the total expected enrollment for the five locations.
    """
    locations = np.array(np.array_split(x, proposed_schools))
    test_case = expected_enroll8(df_prim_loc, locations, df_prim_enroll, df_sec_loc)
#     return test_case # for expect_enroll
    return test_case*-1 # for expect_enroll2

In [100]:
def check_woreda(vec):
    # lat = y, x=lon
    vec = gpd.points_from_xy(vec[:, 1], vec[:, 0])
    return vec.within(gdf_woreda_shp[0]).all()
            
def check_region(vec):
    # lat = y, x=lon
    vec = gpd.points_from_xy(vec[:, 1], vec[:, 0])
    return vec.within(gdf_region_shp[0]).all()

In [101]:
# Create starting points within regional box boundaries.

def create_random_sp():
    sp1 = np.random.uniform(low=lat_bounds[0], high=lat_bounds[1], size=proposed_schools)
    sp2 = np.random.uniform(low=lon_bounds[0], high=lon_bounds[1], size=proposed_schools)
    sp = np.vstack((sp1, sp2)).T
    return sp

# create a random starting point within the target region.
def get_random_sp():
    sp = create_random_sp()
    for i in range(0,10000):
        if check_woreda(sp) == True:
            sp = sp.flatten()
            return sp
            break
        else:
            sp = create_random_sp()

In [102]:
# sigma should be 1/4th of the search domain width?
sigma= np.min(bounds, axis=1)
sigma= np.max(sigma) - np.min(sigma)
sigma = round(sigma / 4, 4)

In [413]:
def random_search(f, n):
    x = [create_random_sp() for _ in range(n)] 
    fx = [(f(xi), xi) for xi in x]
    best_f, best_solution = max(fx, key=lambda x:x[0])
    return best_f, best_solution

In [377]:
bf, bs = random_search(f, 10000)

In [569]:
x = [get_random_sp() for _ in range(10000)] 
fx = [(f(xi), xi) for xi in x]

best_f, best_s = max(fx, key=lambda x:x[0])
best_f, best_s

(-48843.37929545716,
 array([10.91798638, 36.81866913, 10.79243935, 36.78634211, 10.79110162,
        36.92518429, 10.89873765, 36.73168349, 10.77931901, 36.94066287]))

In [571]:
best_fm, best_sm = min(fx, key=lambda x:x[0])
best_fm, best_sm

(-332433.5468907285,
 array([10.94542191, 36.71089279, 10.94575144, 36.71396595, 10.90827512,
        36.90325946, 10.8735292 , 36.69663964, 10.90930696, 36.89125428]))

In [403]:
best_s

array([10.94314696, 36.71653434, 10.96228894, 36.70368371])

In [433]:
# if there are two best, then choose the lowest overall distance. 
fx

[(561741.3882685842,
  array([10.8221614 , 36.77251808, 10.83278599, 36.77077205])),
 (518292.63335885777,
  array([10.91927592, 36.85999186, 10.92071221, 36.85992265])),
 (270121.4302032855,
  array([10.86479739, 36.92629211, 10.90892432, 36.73928443])),
 (675185.0268177064,
  array([10.90097981, 36.77073463, 10.89616898, 36.76229699])),
 (176221.3729489386,
  array([10.77664153, 36.88713258, 10.86458425, 36.71773236])),
 (318820.92933889065,
  array([10.93273284, 36.72449494, 10.81774403, 36.84832146])),
 (383106.87113138376,
  array([10.78919275, 36.78641703, 10.78227382, 36.8229845 ])),
 (305806.73397007963,
  array([10.85907835, 36.88460905, 10.90743512, 36.79460708])),
 (301975.52935659426,
  array([10.80923185, 36.85097853, 10.94961076, 36.72004784])),
 (351294.2219282582,
  array([10.84328406, 36.85034817, 10.92775277, 36.82382523])),
 (197594.14874533706,
  array([10.82321604, 36.89354604, 10.89514445, 36.78237372])),
 (142035.26022552972,
  array([10.74942445, 36.91754812, 10

In [312]:
# maxits=  1000
# # starting_point = np.array([[10.7875, 36.934],[10.79, 36.940]]).flatten()
# es = cma.CMAEvolutionStrategy(get_random_sp(), sigma0=0.2,
#                           inopts={'bounds': boundsxy,'seed':1234})
# es.optimize(f, iterations=maxits / es.popsize)

In [396]:
bs

array([[10.77539199, 36.85362335],
       [10.78157196, 36.94394426]])

In [576]:
best_solution = np.array(np.array_split(best_s, proposed_schools))
best_solution

array([[10.91798638, 36.81866913],
       [10.79243935, 36.78634211],
       [10.79110162, 36.92518429],
       [10.89873765, 36.73168349],
       [10.77931901, 36.94066287]])

In [577]:
# ff = np.array([10.78346285, 36.92888455, 10.77372189, 36.89949052])

In [578]:
# es.result[1]

In [2]:
# it is latitude then longitude.
# box = np.array([[10.713719, 36.689328], [10.713719, 36.96973],[10.964773, 36.96973], [10.964773, 36.689328], [10.713719, 36.689328]])
plt.figure(figsize=(15, 10))
# plt.plot(box[:,1], box[:,0], '.r-')
plt.scatter(df_prim_loc[:, 1], df_prim_loc[:, 0], s=df_prim_enroll/100, label="Prim") # s gives size
# plt.scatter(df_sec_loc[:, 1], df_sec_loc[:, 0], s=df_sec_enroll/100, label="Secondary") # s gives size
plt.scatter(best_solution[:, 1], best_solution[:, 0], s = 35, marker="o", label="New Secondary") # stars for supermarkets
plt.gca().set_aspect('equal')
plt.legend()
plt.show()

NameError: name 'plt' is not defined

In [470]:
# es.result

In [337]:
df_prim_loc

array([[10.7806, 36.9441],
       [10.794 , 36.9308]])