In [110]:
import pandas as pd
import numpy as np
from haversine import haversine, haversine_vector, Unit

In [8]:
df = pd.read_csv('data/gis_correct_admin.csv')
# df1 = pd.read_csv('merged_dataset.csv')

In [12]:
df.shape

(26227, 33)

In [26]:
# produces a tuple of coordinates for use in haversine function.
df['point'] = [(i,j) for i,j in zip(df.lat,df.lon)]

In [260]:
# Reduce complexity of calculations by reducing size of dataset
dfx = df[['school_code', 'point', 'gr_offer']]
dfx = dfx.assign(nearest_dist='',nearest_school = '')
dfx = dfx.sort_values(['point'])

In [278]:
# split into levels and vectorise
df_lwr_prim = dfx.loc[df["gr_offer"].isin(['G.1-4'])].to_numpy() # lower primary only
df_up_prim = dfx.loc[df["gr_offer"].isin(['G.1-8','G.5-8'])].to_numpy() # primary and upper primary only
df_lwr_sec = dfx.loc[df["gr_offer"].isin(['G. 9-10', 'G. 9-12'])].to_numpy() # lower secondary only and secondary
df_up_sec = dfx.loc[df["gr_offer"].isin(['G. 11-12', 'G. 9-12'])].to_numpy() # upper secondary only and secondary

In [279]:
df_lwr_prim

array([['S0404110412', (3.5453330000000003, 39.048621999999995), 'G.1-4',
        '', ''],
       ['S0404110152', (3.5511968, 39.04677), 'G.1-4', '', ''],
       ['S0404110292', (3.5563872, 39.045596999999994), 'G.1-4', '', ''],
       ...,
       ['S0104020402', (14.594949, 39.533885999999995), 'G.1-4', '', ''],
       ['S0104020552', (14.644032500000002, 39.516804), 'G.1-4', '', ''],
       ['S0103040532', (14.6949, 37.831542999999996), 'G.1-4', '', '']],
      dtype=object)

In [280]:
df_lwr_prim[1]

array(['S0404110152', (3.5511968, 39.04677), 'G.1-4', '', ''],
      dtype=object)

In [297]:
# two variables for iteration. 

import time

def calculate_distance(df1,df2):

    startTime = time.time()
    
    for i in df1:
        distance = []
        for j in range(len(df2)):
            distance.append(haversine(i[1], df2[:,1][j], Unit.KILOMETERS))
        min_distance = np.min(distance)
        min_school_code = df2[np.argmin(distance)][0]
        i[-2] = min_distance
        i[-1] = min_school_code
        distance = []
        
    executionTime = (time.time() - startTime)
    print('Execution time in seconds: ' + str(executionTime))
    
    return df1

Execution time in seconds: 303.1739547252655


In [298]:
df_lwr_prim2 = calculate_distance(df_lwr_prim, df_up_prim)

array([['S0404110412', (3.5453330000000003, 39.048621999999995), 'G.1-4',
        1.437070541796174, 'S0404110282'],
       ['S0404110152', (3.5511968, 39.04677), 'G.1-4',
        1.2493032269743682, 'S0404110222'],
       ['S0404110292', (3.5563872, 39.045596999999994), 'G.1-4',
        0.6600941515642944, 'S0404110222'],
       ...,
       ['S0104020402', (14.594949, 39.533885999999995), 'G.1-4',
        1.832135394336647, 'S0104020252'],
       ['S0104020552', (14.644032500000002, 39.516804), 'G.1-4',
        1.7668975084234932, 'S0104020042'],
       ['S0103040532', (14.6949, 37.831542999999996), 'G.1-4',
        4.2102282379411164, 'S0103040032']], dtype=object)

In [300]:
df_up_prim2 = calculate_distance(df_up_prim, df_lwr_sec)

Execution time in seconds: 91.50277638435364


In [299]:
df_lwr_sec2 = calculate_distance(df_lwr_sec, df_up_sec)

Execution time in seconds: 3.309664726257324


In [323]:
df_lwr_prim = pd.DataFrame(df_lwr_prim2, columns = ['school_code', 'point', 'gr_offer',\
                                                   'nearest_up_prim', 'nearest_sch_code'])
df_up_prim = pd.DataFrame(df_up_prim2, columns = ['school_code', 'point', 'gr_offer',\
                                                   'nearest_lwr_sec', 'nearest_sch_code'])
df_lwr_sec = pd.DataFrame(df_lwr_sec2, columns = ['school_code', 'point', 'gr_offer',\
                                                   'nearest_up_sec', 'nearest_sch_code'])

In [324]:
x = df_lwr_prim.merge(df_up_prim, how='outer')
x = x.merge(df_lwr_sec, how='outer')

In [325]:
df = df.merge(x, how='outer')

In [326]:
df.to_csv('data/clean_dataset.csv', index=False, encoding = 'utf-8')