In [1]:
import pandas as pd
import numpy as np
from haversine import haversine, haversine_vector, Unit
import geopandas as gpd
import matplotlib.pyplot as plt
import cma

In [2]:
# Suppress scientific notation for easier reading.
np.set_printoptions(suppress=True)

# Test Development

While developing the code, three main tests were completed within one region, Amhara. The first is at a micro level, using a custom dummy dataset with only two primary schools and two proposed secondary schools. The second covers one woreda (district) with 21 primary schools and two existing secondary schools, for which five new secondary schools are proposed. The third is a region-wide test with 1765 primary schools and 1658 secondary schools, with five new secondary schools proposed.

In [3]:
# Specify which test to perform:
#   1 = micro test, 2 = single-woreda test, any other value = region-wide test
declare_test = 2 # woreda test

In [4]:
# Load the dataset, the number of schools to propose and the search
# boundary that belong to the test selected in `declare_test`.
region = 'Amhara' # region used by the region-wide test
woreda = 'ET030908' # ADM3 code used by the micro and woreda tests

if declare_test in (1, 2):
    # The micro test (1) and the woreda test (2) share the same woreda geometry;
    # they differ only in the dataset read and the number of proposed schools.
    # The 'point' column is evaluated so that geopandas can read it.
    if declare_test == 1:
        df = pd.read_csv('data/test_dataset2.csv', converters={'point': pd.eval})
        proposed_schools = 2
    else:
        df = pd.read_csv('data/test_dataset.csv', converters={'point': pd.eval})
        proposed_schools = 5
    gdf_woreda = gpd.read_file('eth_shape_files/json/eth_admin3v2.json')
    gdf_woreda_shp = gdf_woreda.loc[gdf_woreda['ADM3_PCODE'] == woreda]['geometry'].reset_index(drop=True)
    # keep only the schools of the selected woreda
    df = df.loc[df['ADM3_PCODE'] == woreda]
    bounds = gdf_woreda_shp.bounds
else:
    # Region-wide test.
    proposed_schools = 5
    df = pd.read_csv('data/clean_dataset.csv', converters={'point': pd.eval})
    # limit the geojson to the selected region only
    gdf_region = gpd.read_file('eth_shape_files/json//eth_admin1v2.json')
    gdf_region_shp = gdf_region.loc[gdf_region['ADM1_EN'] == region]['geometry'].reset_index(drop=True)
    # limit the clean dataset to the selected region only
    df = df.loc[df['region'] == region]
    bounds = gdf_region_shp.bounds

## Data Preparation

In [6]:
# Derive the optimiser's search boundaries from the bounding box of the
# selected region or woreda.
# Latitude is the Y axis, longitude is the X axis.

lat_bounds = bounds[['miny', 'maxy']].to_numpy(dtype=float)[0]
lon_bounds = bounds[['minx', 'maxx']].to_numpy(dtype=float)[0]
# rows: [lower lat bound, lower lon bound], [upper lat bound, upper lon bound]
bounds = np.column_stack((lat_bounds, lon_bounds))
# CMA expects a list of size 2 for bounds (lower, upper); each entry repeats
# the (lat, lon) limits once per proposed school.
x1y1 = np.tile(bounds[0], proposed_schools)
x2y2 = np.tile(bounds[1], proposed_schools)
boundsxy = np.array([x1y1, x2y2])
boundsxy

array([[10.71371939, 36.68932805, 10.71371939, 36.68932805, 10.71371939,
        36.68932805, 10.71371939, 36.68932805, 10.71371939, 36.68932805],
       [10.96477258, 36.96973029, 10.96477258, 36.96973029, 10.96477258,
        36.96973029, 10.96477258, 36.96973029, 10.96477258, 36.96973029]])

In [7]:
# Build the arrays used as input for the enrollment function.
# 1. Primary school enrollment data.
# 2. Primary school location data: lat lon point data.
# 3. Secondary school location data: lat lon point data.
# 4. Secondary school enrollment data. Potentially required for calibration function.

df_prim = df.loc[df['gr_offer'].isin(['G.1-8', 'G.5-8'])]
df_prim_enroll = df_prim['grade5_8'].to_numpy(dtype=float)
# each 'point' entry is converted to a float row, giving one row per school
df_prim_loc = np.array([np.array(pt) for pt in df_prim['point'].to_numpy()], dtype=float)

df_sec = df.loc[df['gr_offer'].isin(['G. 9-10', 'G. 9-12'])]
df_sec_enroll = df_sec['grade9_10'].to_numpy(dtype=float)
df_sec_loc = np.array([np.array(pt) for pt in df_sec['point'].to_numpy()], dtype=float)

In [8]:
# Number of secondary schools evaluated by the objective: existing plus proposed.
total_sec = proposed_schools + len(df_sec_enroll)
total_sec

7

## Key functions

In [9]:
def check_woreda(vec):
    """Return True when every point in `vec` lies within the selected woreda.

    vec: (lat, lon) coordinates, either as an (n, 2) array or as a flat
        [lat0, lon0, lat1, lon1, ...] vector such as the ones returned by
        create_random_sp and by the CMA optimiser.
    """
    coords = np.asarray(vec, dtype=float).reshape(-1, 2)
    # lat = y, x=lon
    points = gpd.points_from_xy(coords[:, 1], coords[:, 0])
    return points.within(gdf_woreda_shp[0]).all()
            
def check_region(vec):
    """Return True when every point in `vec` lies within the selected region.

    vec: (lat, lon) coordinates, either as an (n, 2) array or as a flat
        [lat0, lon0, lat1, lon1, ...] vector such as the ones returned by
        create_random_sp and by the CMA optimiser.
    """
    coords = np.asarray(vec, dtype=float).reshape(-1, 2)
    # lat = y, x=lon
    points = gpd.points_from_xy(coords[:, 1], coords[:, 0])
    return points.within(gdf_region_shp[0]).all()

In [10]:
def shape(distance, enrollment, min_walk=2, max_walk=5):
    """Scale enrollment by walking distance.

    Enrollment is kept in full below `min_walk`, drops linearly between
    `min_walk` and `max_walk`, and is zero beyond `max_walk`.

    distance: distance(s) to the school, in the same unit as the thresholds.
    enrollment: enrollment value(s), broadcastable against `distance`.
    min_walk: distance is not a factor in enrollment until this value (default 2).
    max_walk: distances greater than this assume zero enrollment (default 5).

    Returns an array of shaped enrollment values.
    Raises ValueError when `max_walk` is not greater than `min_walk`.
    """
    if max_walk <= min_walk:
        raise ValueError("max_walk must be greater than min_walk")
    distance = np.asarray(distance, dtype=float)
    # linear decay from full enrollment at min_walk to zero at max_walk
    decayed = enrollment * (1 - (distance - min_walk) / (max_walk - min_walk))
    return np.where(distance < min_walk, enrollment,
                    np.where(distance > max_walk, 0, decayed))

In [11]:
def expected_enroll(prim_loc, x, prim_enroll):
    """Return the total expected enrollment for the secondary schools in `x`.

    prim_loc: primary school (lat, lon) locations.
    x: secondary school (lat, lon) locations.
    prim_enroll: enrollment of each primary school.
    """
    # comb=True requests every pairwise distance between the two location sets
    pairwise_km = haversine_vector(prim_loc, x, Unit.KILOMETERS, comb=True)
    # distance from each primary school to its nearest secondary school
    nearest_km = pairwise_km.min(axis=0)
    return shape(nearest_km, prim_enroll).sum()

In [12]:
# Objective function (the shape function is applied inside expected_enroll).
def f(x):
    """Return the expected enrollment when the proposed schools `x` are added.

    x: flat [lat0, lon0, lat1, lon1, ...] vector of proposed school locations.
    """
    # existing and proposed secondary schools as one (total_sec, 2) array
    flat_locations = np.concatenate((np.ravel(df_sec_loc), np.ravel(x)))
    all_secondary = flat_locations.reshape(total_sec, 2)
    return expected_enroll(df_prim_loc, all_secondary, df_prim_enroll)

In [23]:
# Create random points within the regional or district boundaries.

def create_random_sp(sp_type='woreda'):
    """Return random starting locations that all lie inside the chosen shape.

    sp_type: 'region' samples inside the region geometry; any other value
        (default 'woreda') samples inside the woreda geometry.

    Returns a flat [lat0, lon0, lat1, lon1, ...] array holding
    `proposed_schools` points.
    """
    if sp_type == 'region':
        shape_file = gdf_region_shp[0]
    else:
        shape_file = gdf_woreda_shp[0]
    accepted = np.empty((0, 2))
    # Rejection sampling per point: candidates are drawn uniformly from the
    # bounding box and only those inside the shape are kept. Accepting points
    # individually avoids discarding a whole batch because of one outlier.
    while len(accepted) < proposed_schools:
        lat = np.random.uniform(low=lat_bounds[0], high=lat_bounds[1], size=proposed_schools)
        lon = np.random.uniform(low=lon_bounds[0], high=lon_bounds[1], size=proposed_schools)
        candidates = np.vstack((lat, lon)).T
        # lat = y, x = lon
        inside = np.asarray(
            gpd.points_from_xy(candidates[:, 1], candidates[:, 0]).within(shape_file),
            dtype=bool)
        accepted = np.vstack((accepted, candidates[inside]))
    return accepted[:proposed_schools].flatten()

In [None]:
gdf_woreda_shp[0]

In [None]:
sp_points.within(shape_file).all()

## Random Search

In [None]:
def random_search(f, n, test_type):
    """Evaluate `f` at `n` random candidate solutions and return the best one.

    f: objective function taking a flat location vector.
    n: number of random candidates to generate.
    test_type: passed to create_random_sp to select the sampling shape.

    Returns (best_f, best_solution) for the candidate with the largest f value.
    """
    candidates = [create_random_sp(test_type) for _ in range(n)]
    scored = ((f(candidate), candidate) for candidate in candidates)
    # compare on the objective value only; the solution arrays are not ordered
    return max(scored, key=lambda pair: pair[0])

In [None]:
%time fx = [random_search(f, 1000, 'woreda') for _ in range(4)]

array([6.4200536 , 0.97208858, 4.32832206, 5.29183205, 9.11916044,
       5.27713026, 1.73032317, 3.33592388, 5.22039308, 9.74686057,
       4.88778134, 4.00496738, 0.756862  , 7.40811787, 4.05068789,
       7.28259465, 9.52187629, 4.88278866, 5.29026977, 4.59711784,
       2.82367034])

array([10.89509746, 36.83795863, 10.90310309, 36.85796824, 10.84827089,
       36.77392375, 10.73741988, 36.90180524, 10.7663454 , 36.90314876])

In [191]:
x = create_random_sp('woreda')
x = x.reshape(proposed_schools,2)

In [376]:
# Expected enrollment increase (EEI) from the proposed schools in `x`.
EEI = 0
current_ps_distance = df_prim['nearest_lwr_sec'].to_numpy()
distance = haversine_vector(df_prim_loc, x, Unit.KILOMETERS, comb=True)
ff = (distance < 5) & (distance < current_ps_distance) # < 5km and < other secondary schools.
# A primary school can be within range of several proposed schools. Count it
# once, at the distance of its nearest proposed school, instead of once per
# proposed school.
nearest_new = distance.min(axis=0)
served = ff.any(axis=0)
dd = df_prim_enroll[served]
EEI += np.sum(shape(nearest_new[served], dd)) # add enrollment to expected enrollment increase.

In [387]:
for i in distance:
    for j in range(0, len(i)):
        print(i[j])

14.50699770804724
11.35584917135505
15.704716729603387
11.409989801118783
2.4761731018760984
6.351289851296947
9.767600263475675
14.841820841074307
7.771842573671798
8.789239053577836
7.628409503785916
14.113840932263892
11.152259726017087
8.035558591056743
15.474500092397834
12.924634499668567
15.02759222592423
13.640977870626068
12.206195300885357
10.510131602986617
12.84027801444574
19.99084181998597
4.674353203934396
16.872083654649543
8.815000490428602
8.510352635276016
3.844415592789101
11.637803071663791
7.672433323121746
7.5864734429575105
4.590122840740848
5.4979838116576
13.174649269714424
12.949787686517626
13.812519440097535
7.601152606569728
18.030516345860708
21.658469457944456
9.680201241329234
16.25285308369192
2.621515364012487
7.909573555018298
22.21930594569133
9.214285658373733
20.841125185923627
13.372177749784155
9.876662639253915
7.709110781490203
15.177466529702576
7.494365601288641
11.441950547817338
0.8234100362967812
9.66479663072288
17.569897597637194
16.586

In [381]:
u,c = np.unique(np.where(ff)[1], return_counts=True)
dup = u[c > 1]
dup

array([4, 9], dtype=int64)

In [383]:
np.where(ff)

(array([0, 1, 1, 1, 2, 3, 4, 4], dtype=int64),
 array([ 4,  5,  9, 19,  9, 14,  4, 13], dtype=int64))

In [374]:
np.bincount(ff[1])

array([18,  3], dtype=int64)

In [350]:
haversine_vector(df_prim_loc, x, Unit.KILOMETERS, comb=True)

array([[14.50699771, 11.35584917, 15.70471673, 11.4099898 ,  2.4761731 ,
         6.35128985,  9.76760026, 14.84182084,  7.77184257,  8.78923905,
         7.6284095 , 14.11384093, 11.15225973,  8.03555859, 15.47450009,
        12.9246345 , 15.02759223, 13.64097787, 12.2061953 , 10.5101316 ,
        12.84027801],
       [19.99084182,  4.6743532 , 16.87208365,  8.81500049,  8.51035264,
         3.84441559, 11.63780307,  7.67243332,  7.58647344,  4.59012284,
         5.49798381, 13.17464927, 12.94978769, 13.81251944,  7.60115261,
        18.03051635, 21.65846946,  9.68020124, 16.25285308,  2.62151536,
         7.90957356],
       [22.21930595,  9.21428566, 20.84112519, 13.37217775,  9.87666264,
         7.70911078, 15.17746653,  7.4943656 , 11.44195055,  0.82341004,
         9.66479663, 17.5698976 , 16.58600325, 15.73535479, 10.23147831,
        20.44204322, 23.09274416, 14.39738853, 19.14373255,  6.01622878,
        12.62557988],
       [24.45254119,  3.94630308, 18.32058285,  9.47082822

In [356]:
np.min(distance, axis=0)

array([10.33630113,  3.94630308, 11.63735847,  8.81500049,  2.08648512,
        3.84441559,  5.95121628,  7.15618299,  5.79471765,  0.82341004,
        5.49798381, 10.98753105,  7.18290915,  3.8672342 ,  2.24185541,
        8.59780467, 11.59191873,  8.25548937,  7.72068907,  2.62151536,
        6.29129072])

In [292]:
np.min([distance[ff]], axis=0)

array([2.4761731 , 3.84441559, 4.59012284, 2.62151536, 0.82341004,
       2.24185541, 2.08648512, 3.8672342 ])

In [307]:
ff

array([[False, False, False, False,  True, False, False, False, False,
        False, False, False, False, False, False, False, False, False,
        False, False, False],
       [False, False, False, False, False,  True, False, False, False,
         True, False, False, False, False, False, False, False, False,
        False,  True, False],
       [False, False, False, False, False, False, False, False, False,
         True, False, False, False, False, False, False, False, False,
        False, False, False],
       [False, False, False, False, False, False, False, False, False,
        False, False, False, False, False,  True, False, False, False,
        False, False, False],
       [False, False, False, False,  True, False, False, False, False,
        False, False, False, False,  True, False, False, False, False,
        False, False, False]])

In [339]:
np.unique(ff == True, return_index=True)

(array([False,  True]), array([0, 4], dtype=int64))

In [325]:
for i in range(proposed_schools):
    np.

IndexError: boolean index did not match indexed array along dimension 0; dimension is 5 but corresponding boolean dimension is 21

In [297]:
Conditional selection of values based on comparison across vectors. 
    

(array([0, 1, 1, 1, 2, 3, 4, 4], dtype=int64),
 array([ 4,  5,  9, 19,  9, 14,  4, 13], dtype=int64))

In [273]:
ss = np.where(ff)
ss

(array([0, 1, 1, 1, 2, 3, 4, 4], dtype=int64),
 array([ 4,  5,  9, 19,  9, 14,  4, 13], dtype=int64))

In [281]:
np.unique(ss[1], return_inverse=False, return_counts=True, return_index=True)

(array([ 4,  5,  9, 13, 14, 19], dtype=int64),
 array([0, 1, 2, 7, 5, 3], dtype=int64),
 array([2, 1, 2, 1, 1, 1], dtype=int64))

In [267]:
np.bincount(ff[1])

array([18,  3], dtype=int64)

In [265]:
np.unique(ff)

array([[False, False, False, False,  True, False, False, False, False,
        False, False, False, False, False, False, False, False, False,
        False, False, False],
       [False, False, False, False, False,  True, False, False, False,
         True, False, False, False, False, False, False, False, False,
        False,  True, False],
       [False, False, False, False, False, False, False, False, False,
         True, False, False, False, False, False, False, False, False,
        False, False, False],
       [False, False, False, False, False, False, False, False, False,
        False, False, False, False, False,  True, False, False, False,
        False, False, False],
       [False, False, False, False,  True, False, False, False, False,
        False, False, False, False,  True, False, False, False, False,
        False, False, False]])

In [254]:
EEI

2767.3808873388402

In [227]:
[distance[i] for i in ff]

IndexError: boolean index did not match indexed array along dimension 0; dimension is 5 but corresponding boolean dimension is 21

array([[10.81665275, 36.88701445],
       [10.83406083, 36.76029432],
       [10.82165553, 36.92905401],
       [10.78225412, 36.86877747],
       [10.9070319 , 36.76519061]])

In [181]:
ll = [10.9070319 , 36.76519061]

In [184]:
haversine_vector(ll, df_prim_loc, Unit.KILOMETERS, comb=True)

array([[24.07076218],
       [ 3.93164668],
       [18.98719204],
       [10.07868335],
       [13.49111516],
       [ 7.93721566],
       [14.81324261],
       [ 5.08715308],
       [10.80486161],
       [ 7.60243069],
       [ 8.63614735],
       [14.50550116],
       [15.88200635],
       [18.29845013],
       [ 2.36086773],
       [22.02413793],
       [26.21619799],
       [ 9.50976878],
       [19.88495891],
       [ 2.63851422],
       [ 7.41693572]])

In [164]:
haversine_vector(x, df_prim_loc, Unit.KILOMETERS, comb=True)

array([[ 7.41277497],
       [13.24229097],
       [ 8.12004878],
       [ 9.01846775],
       [ 5.9165214 ],
       [ 8.91690618],
       [ 3.46359133],
       [20.28516042],
       [ 6.06932388],
       [15.75621063],
       [ 8.08088888],
       [ 8.72017463],
       [ 4.11585991],
       [ 2.64399233],
       [18.75245616],
       [ 5.40520203],
       [ 9.86132162],
       [11.59902135],
       [ 3.89821598],
       [14.68540056],
       [12.17668457]])

In [160]:
less_than5 = np.where(distance < 5)
less_than5

(array([ 6, 12, 13, 18], dtype=int64),)

In [None]:
EEI += np.sum(shape(distance[ff], df_prim_enroll[ff]))

In [156]:
# then i need to find number of primary schools that are 

# less_than5.flatten() < current_distance
# df_prim_enroll[distance.flatten() < current_distance]
# np.where(distance > )
# np.where(distance < 5)
# EEI

In [125]:
distance[less_than5]

array([1.71112112, 3.26969206, 3.2762255 ])

In [118]:
np.where(distance[less_than5] < current_distance[less_than5], EEI +=

SyntaxError: invalid syntax (<ipython-input-118-1ad0532a8cd8>, line 1)

array([ 208.        , 5767.69313345,   64.35424813])

ValueError: The truth value of an array with more than one element is ambiguous. Use a.any() or a.all()

In [92]:
distance[less_than5]

IndexError: index 13 is out of bounds for axis 0 with size 1

IndexError: index 13 is out of bounds for axis 0 with size 1

In [84]:
distance.flatten()[less_than5]

array([1.71112112, 3.26969206, 3.2762255 ])

In [41]:
distance

array([[ 5.044768  ],
       [15.87673969],
       [ 9.16207275],
       [11.63796154],
       [ 7.40260374],
       [11.38018826],
       [ 5.86725427],
       [22.65573425],
       [ 8.73104073],
       [17.74857666],
       [10.69604236],
       [10.84469902],
       [ 6.08722119],
       [ 1.71112112],
       [21.35137037],
       [ 3.26969206],
       [ 7.18894129],
       [14.19118556],
       [ 3.2762255 ],
       [17.15517878],
       [14.83760664]])

In [None]:
# NOTE(review): this cell reads as an unfinished draft of the expected
# enrollment increase (EEI) calculation. `haversine_distance`, `ps`,
# `ps_enrollment` and `pop` are not defined or imported anywhere in the visible
# notebook, and `Unit.KM` / `true` differ from the `Unit.KILOMETERS` / `True`
# spellings used by the other cells - confirm before running.
EEI = 0
current_dist_ss = df['distance'].to_numpy() # existing distance to nearest ss for each ps
for i in ss:
    # calculate distance from proposed secondary to all primary.
    ps_d = haversine_distance(i, ps, Unit.KM, comb=true)
    ps_feeder = np.where(ps_d < 5) # vector of ps with distance less than 5km to secondary.
    # compare distances to existing secondary schools, if less, then pop.
    # NOTE(review): if ps_d is an array, this comparison cannot be used directly in `if`.
    if ps_d > current_dist_ss:
        pop(ps_feeder[i]) # remove school from vector.
    else:
        # subtract the previous enrollment
        EEI += shape(ps_d, ps_enrollment) # add ps enrollment to EEI.
        
        

In [None]:
# Plot the four random-search results, one panel per run.
fig, ((ax0, ax1), (ax2, ax3)) = plt.subplots(2, 2, figsize=(15,15))
fig.suptitle('Random Search F(2). 4 Outputs with Different Starting Points')

# Iterate over the axes objects directly instead of eval()-ing their names.
for i, panel in enumerate((ax0, ax1, ax2, ax3)):
    # longitude on the x axis, latitude on the y axis; s gives the marker size
    panel.scatter(df_prim_loc[:, 1], df_prim_loc[:, 0], s=df_prim_enroll/100, label="Prim")
    if len(df_sec) != 0:
        panel.scatter(df_sec_loc[:, 1], df_sec_loc[:, 0], s=df_sec_enroll/100, label="Secondary")
    # solutions are flat [lat0, lon0, lat1, lon1, ...] vectors
    panel.scatter(fx[i][1][1::2], fx[i][1][::2], s = 35, marker="o", label="New Secondary")
    panel.set_title(np.round(fx[i][0],0), fontstyle='italic')

for ax in fig.get_axes():
    ax.label_outer()

## CMA

In [None]:
# Initial step sizes; each one is passed as sigma0 to a CMA run in the cells below.
sigmas = (0.01, 0.05, 0.1, 0.12, 0.14, 0.16, 0.18, 0.2, 0.22, 0.24)
# Budget for each run: the CMA cells pass maxits / es.popsize as the iteration count.
maxits = 10

In [None]:
# Run CMA-ES from four random starting points for every sigma and keep the
# four runs with the highest expected enrollment.
fcma = []

for i in range(4):
    for j in sigmas:
        es = cma.CMAEvolutionStrategy(create_random_sp('woreda'), sigma0=j,
                                  inopts={'bounds': boundsxy,'seed':1234})
        # CMA-ES minimises its objective, while f is an enrollment total that
        # should be maximised, so the negated objective is optimised.
        es.optimize(lambda x: -f(x), iterations=maxits / es.popsize)
        # es.result[0] is the best solution, es.result[1] its (negated) value
        fcma.append((-es.result[1], es.result[0], j))

# entries are (enrollment, solution, sigma); largest enrollment first
fcma_s = sorted(fcma, key=lambda t: t[0], reverse=True)[:4]

In [None]:
fcma_s

In [None]:
# Plot the four selected CMA results, one panel per run.
fig, ((ax0, ax1), (ax2, ax3)) = plt.subplots(2, 2, figsize=(15,15))
fig.suptitle('CMA f(1). 4 Outputs with Different Starting Points')

# Iterate over the axes objects directly instead of eval()-ing their names.
for i, panel in enumerate((ax0, ax1, ax2, ax3)):
    # longitude on the x axis, latitude on the y axis; s gives the marker size
    panel.scatter(df_prim_loc[:, 1], df_prim_loc[:, 0], s=df_prim_enroll/100, label="Prim")
    if len(df_sec) != 0:
        panel.scatter(df_sec_loc[:, 1], df_sec_loc[:, 0], s=df_sec_enroll/100, label="Secondary")
    # solutions are flat [lat0, lon0, lat1, lon1, ...] vectors
    panel.scatter(fcma_s[i][1][1::2], fcma_s[i][1][::2], s = 35, marker="o", label="New Secondary")
    panel.set_title('Max: ' + str(np.round(fcma_s[i][0],0))+ ' . Sigma: ' + str(fcma_s[i][2]), fontstyle='italic')

for ax in fig.get_axes():
    ax.label_outer()

In [None]:
# Second CMA experiment: same loop as the first CMA cell, with a larger budget.
# NOTE(review): `get_random_sp` and `f2` are not defined anywhere in the visible
# notebook (the visible sampler is `create_random_sp`, the visible objective is
# `f`) - confirm where they come from before running on a fresh kernel.
# NOTE(review): the results are sorted in ascending order while the plot cell
# that follows labels them 'Max' - confirm the intended direction for f2.
fcma2 = []
maxits= 10000

for i in range(4):
    for j in sigmas:
        es = cma.CMAEvolutionStrategy(get_random_sp(), sigma0=j,
                                  inopts={'bounds': boundsxy,'seed':1234})
        es.optimize(f2, iterations=maxits / es.popsize)
        # stored as (es.result[1], es.result[0], sigma)
        fcma2.append((es.result[1], es.result[0], j))
        
fcma2_s = sorted(fcma2, key=lambda t: t[0])[:4]

In [None]:
fcma2_s

In [None]:
# Plot the four selected results of the second CMA experiment, one panel per run.
fig, ((ax0, ax1), (ax2, ax3)) = plt.subplots(2, 2, figsize=(15,15))
fig.suptitle('CMA F(2). 4 Outputs with Different Starting Points')

# Iterate over the axes objects directly instead of eval()-ing their names.
for i, panel in enumerate((ax0, ax1, ax2, ax3)):
    # longitude on the x axis, latitude on the y axis; s gives the marker size
    panel.scatter(df_prim_loc[:, 1], df_prim_loc[:, 0], s=df_prim_enroll/100, label="Prim")
    if len(df_sec) != 0:
        panel.scatter(df_sec_loc[:, 1], df_sec_loc[:, 0], s=df_sec_enroll/100, label="Secondary")
    # solutions are flat [lat0, lon0, lat1, lon1, ...] vectors
    panel.scatter(fcma2_s[i][1][1::2], fcma2_s[i][1][::2], s = 35, marker="o", label="New Secondary")
    panel.set_title('Max: ' + str(np.round(fcma2_s[i][0],0))+ ' . Sigma: ' + str(fcma2_s[i][2]), fontstyle='italic')

for ax in fig.get_axes():
    ax.label_outer()

In [None]:
# First entry of fcma2 when sorted by objective value (ascending).
chosen = sorted(fcma2, key=lambda t: t[0])[:1]
# Solution vector of that entry, as one (lat, lon) row per school; the cells
# below index it as a 2-D array.
chosenll = np.asarray(chosen[0][1], dtype=float).reshape(-1, 2)

In [None]:
gdf_woreda_shp[0]

In [None]:
create_random_sp()

In [None]:
vec = gpd.points_from_xy(chosenll[:, 1], chosenll[:, 0])


In [None]:
gdf_woreda['geometry'] # check all woredas and return corect. 

In [None]:
chosenll

In [None]:
gdf_region = gpd.read_file('eth_shape_files/json//eth_admin1v2.json') # read in geojson
gdf_region_shp = gdf_region.loc[gdf_region['ADM1_EN']==region]['geometry'].reset_index(drop=True)

In [None]:
for i in chosenll:
    for j in gdf_region:
        vec.within(gdf_region['geometry'])
        
#     for j in gdf_woreda:
# for i in gdf_woreda:
    

# vec.within(gdf_woreda['geometry'])

In [None]:
check_woreda(chosenll)

In [None]:
chosenll

In [None]:
# Convert the chosen locations to point geometries.
# lat = y, x = lon (same convention as check_woreda / check_region).
coords = np.asarray(chosenll, dtype=float).reshape(-1, 2)
points = list(gpd.points_from_xy(coords[:, 1], coords[:, 0]))

In [None]:
chosenll[:1:2]

In [None]:
dd = [gpd.points_from_xy(i[::2], i[1::2]) for i in chosen]
[i.within(gdf_woreda_shp[0]) for i in dd]

In [None]:
i

In [None]:
def get_location(vec):
    """Placeholder: not implemented yet.

    NOTE(review): presumably meant to find the region, zone and woreda of a new
    school (see the comment in the notebook's last cell) - confirm. As written,
    the body only names `region`, `zone` and `woreda` and returns None; `zone`
    is not defined anywhere in the visible notebook.
    """
    
    
    region
    zone
    woreda
    
    
    
    return 

In [None]:
# NOTE(review): check_woreda is already defined earlier in the notebook; this
# later definition replaces it when the cell is run.
def check_woreda(vec):
    """Return True when every point in `vec` lies within the selected woreda.

    vec: (lat, lon) coordinates, either as an (n, 2) array or as a flat
        [lat0, lon0, lat1, lon1, ...] vector such as the ones returned by
        create_random_sp and by the CMA optimiser.
    """
    coords = np.asarray(vec, dtype=float).reshape(-1, 2)
    # lat = y, x=lon
    points = gpd.points_from_xy(coords[:, 1], coords[:, 0])
    return points.within(gdf_woreda_shp[0]).all()
            
# NOTE(review): check_region is already defined earlier in the notebook; this
# later definition replaces it when the cell is run.
def check_region(vec):
    """Return True when every point in `vec` lies within the selected region.

    vec: (lat, lon) coordinates, either as an (n, 2) array or as a flat
        [lat0, lon0, lat1, lon1, ...] vector such as the ones returned by
        create_random_sp and by the CMA optimiser.
    """
    coords = np.asarray(vec, dtype=float).reshape(-1, 2)
    # lat = y, x=lon
    points = gpd.points_from_xy(coords[:, 1], coords[:, 0])
    return points.within(gdf_region_shp[0]).all()

In [None]:
# function to find region, zone and woreda of new school. 