In [21]:
import numpy as np
import pandas as pd
from geopy.geocoders import Nominatim
from sklearn.neighbors import KNeighborsClassifier
from geopy.distance import geodesic
import folium
from geopy import distance
from sklearn.cluster import KMeans
import matplotlib.cm as cm
import matplotlib.colors as colors
import usaddress

In [22]:
# Read dataset
data = pd.read_csv('new_data.csv')

In [23]:
data.head()

Unnamed: 0,NatEmis,Province,Official_Institution_Name,STATUS,Sector,Phase_PED,Specialisation,GIS_Long,GIS_Lat,District Mun Name,...,Ward_ID,Town_City,StreetAddress,Telephone,Section21,School Fees,Urban_Rural,Town_Suburb,Towns/suburb,suburb_coordinates
0,700400770,GT,LYNNWOOD MANOR PRIVATE SCHOOL,OPEN,INDEPENDENT,PRIMARY SCHOOL,ORDINARY,28.290307,-25.75658,CITY OF TSHWANE METROPOLITAN MUNICIPALITY,...,79900046,LYNNWOOD MANOR,"99, BRAMPTON STREET, LYNNWOOD MANOR, PRETORIA",123489998,NO,Yes,URBAN,LYNNWOOD MANOR,Lynnwood Manor,"(-25.7630556, 28.2811111)"
1,700401072,GT,ADVANCED COLLEGE BROOKLYN,OPEN,INDEPENDENT,COMBINED SCHOOL,ORDINARY,28.248905,-25.768852,CITY OF TSHWANE METROPOLITAN MUNICIPALITY,...,79900082,BROOKLY,"415, MARAIS ROAD, BROOKLYN, PRETORIA",120301031,NO,Yes,URBAN,BROOKLYN,Brooklyn,"(-25.7646, 28.2393)"
2,700400322,GT,NOOITGEDACHT AKADEMIE SKOOL,OPEN,INDEPENDENT,INTERMEDIATE SCHOOL,ORDINARY,28.743439,-25.803851,CITY OF TSHWANE METROPOLITAN MUNICIPALITY,...,79900105,BRONKHORSPRUIT,"17, MARK STREET, NOOITGEDACHT, BRONKHORSPRUIT",732577542,NO,Yes,URBAN,BRONKHORSTSPRUIT,Bronkhorstspruit,"(-25.8084, 28.7081)"
3,700401082,GT,MELWOOD INSTITUTE OF TECHNOLOGY,OPEN,INDEPENDENT,SECONDARY SCHOOL,ORDINARY,28.191676,-25.742973,CITY OF TSHWANE METROPOLITAN MUNICIPALITY,...,79900058,PRETORIA CENTRAL,"274, JOHANNES RAMOKHOASE STREET, PRETORIA",123270294,NO,Yes,URBAN,PRETORIA CENTRAL,Pretoria Central,"(-25.7518426, 28.1899743)"
4,700400787,GT,BRITISH INTERNATIONAL COLLEGE-PRETORIA,OPEN,INDEPENDENT,SECONDARY SCHOOL,ORDINARY,28.218095,-25.744145,CITY OF TSHWANE METROPOLITAN MUNICIPALITY,...,79900092,PRETORIA,"774, CHURCH STREET, EASTWOOD, PRETORIA",117067775,NO,Yes,URBAN,EASTWOOD,Eastwood,"(-25.7411, 28.2198)"


In [24]:
data.columns

Index(['NatEmis', 'Province', 'Official_Institution_Name', 'STATUS', 'Sector',
       'Phase_PED', 'Specialisation', 'GIS_Long', 'GIS_Lat',
       'District Mun Name', 'Local MunName', 'Ward_ID', 'Town_City',
       'StreetAddress', 'Telephone', 'Section21', 'School Fees', 'Urban_Rural',
       'Town_Suburb', 'Towns/suburb', 'suburb_coordinates'],
      dtype='object')

In [25]:
data = data.apply(lambda x: x.str.lower() if x.dtype == "object" else x)  

In [26]:
data = data.apply(lambda x: x.str.title() if x.dtype == "object" else x)

In [27]:
data.head(5)

Unnamed: 0,NatEmis,Province,Official_Institution_Name,STATUS,Sector,Phase_PED,Specialisation,GIS_Long,GIS_Lat,District Mun Name,...,Ward_ID,Town_City,StreetAddress,Telephone,Section21,School Fees,Urban_Rural,Town_Suburb,Towns/suburb,suburb_coordinates
0,700400770,Gt,Lynnwood Manor Private School,Open,Independent,Primary School,Ordinary,28.290307,-25.75658,City Of Tshwane Metropolitan Municipality,...,79900046,Lynnwood Manor,"99, Brampton Street, Lynnwood Manor, Pretoria",123489998,No,Yes,Urban,Lynnwood Manor,Lynnwood Manor,"(-25.7630556, 28.2811111)"
1,700401072,Gt,Advanced College Brooklyn,Open,Independent,Combined School,Ordinary,28.248905,-25.768852,City Of Tshwane Metropolitan Municipality,...,79900082,Brookly,"415, Marais Road, Brooklyn, Pretoria",120301031,No,Yes,Urban,Brooklyn,Brooklyn,"(-25.7646, 28.2393)"
2,700400322,Gt,Nooitgedacht Akademie Skool,Open,Independent,Intermediate School,Ordinary,28.743439,-25.803851,City Of Tshwane Metropolitan Municipality,...,79900105,Bronkhorspruit,"17, Mark Street, Nooitgedacht, Bronkhorspruit",732577542,No,Yes,Urban,Bronkhorstspruit,Bronkhorstspruit,"(-25.8084, 28.7081)"
3,700401082,Gt,Melwood Institute Of Technology,Open,Independent,Secondary School,Ordinary,28.191676,-25.742973,City Of Tshwane Metropolitan Municipality,...,79900058,Pretoria Central,"274, Johannes Ramokhoase Street, Pretoria",123270294,No,Yes,Urban,Pretoria Central,Pretoria Central,"(-25.7518426, 28.1899743)"
4,700400787,Gt,British International College-Pretoria,Open,Independent,Secondary School,Ordinary,28.218095,-25.744145,City Of Tshwane Metropolitan Municipality,...,79900092,Pretoria,"774, Church Street, Eastwood, Pretoria",117067775,No,Yes,Urban,Eastwood,Eastwood,"(-25.7411, 28.2198)"


In [28]:
data.shape

(700, 21)

In [29]:
# Check for null values
data.isna().sum()

NatEmis                      0
Province                     0
Official_Institution_Name    0
STATUS                       0
Sector                       0
Phase_PED                    0
Specialisation               0
GIS_Long                     0
GIS_Lat                      0
District Mun Name            0
Local MunName                0
Ward_ID                      0
Town_City                    0
StreetAddress                0
Telephone                    0
Section21                    0
School Fees                  0
Urban_Rural                  0
Town_Suburb                  0
Towns/suburb                 0
suburb_coordinates           0
dtype: int64

In [30]:
# Check for duplicates
duplicate = data[data.duplicated()]
print("Duplicate Rows :")

Duplicate Rows :


In [31]:
# Split the suburb coordinates
data[['suburb_Latitude', 'suburb_longitude']] = data['suburb_coordinates'].str.split(', ', expand=True)

In [32]:
# remove suburb coordinates column
data = data.drop(['suburb_coordinates', 'Town_City', 'Town_Suburb'], axis=1)
data.columns

Index(['NatEmis', 'Province', 'Official_Institution_Name', 'STATUS', 'Sector',
       'Phase_PED', 'Specialisation', 'GIS_Long', 'GIS_Lat',
       'District Mun Name', 'Local MunName', 'Ward_ID', 'StreetAddress',
       'Telephone', 'Section21', 'School Fees', 'Urban_Rural', 'Towns/suburb',
       'suburb_Latitude', 'suburb_longitude'],
      dtype='object')

In [33]:
# Strip unwanted characters
data['suburb_Latitude'] = data['suburb_Latitude'].str[1:]
data['suburb_longitude'] = data['suburb_longitude'].str[:-1]
data.head()

Unnamed: 0,NatEmis,Province,Official_Institution_Name,STATUS,Sector,Phase_PED,Specialisation,GIS_Long,GIS_Lat,District Mun Name,Local MunName,Ward_ID,StreetAddress,Telephone,Section21,School Fees,Urban_Rural,Towns/suburb,suburb_Latitude,suburb_longitude
0,700400770,Gt,Lynnwood Manor Private School,Open,Independent,Primary School,Ordinary,28.290307,-25.75658,City Of Tshwane Metropolitan Municipality,City Of Tshwane Metropolitan Municipality,79900046,"99, Brampton Street, Lynnwood Manor, Pretoria",123489998,No,Yes,Urban,Lynnwood Manor,-25.7630556,28.2811111
1,700401072,Gt,Advanced College Brooklyn,Open,Independent,Combined School,Ordinary,28.248905,-25.768852,City Of Tshwane Metropolitan Municipality,City Of Tshwane Metropolitan Municipality,79900082,"415, Marais Road, Brooklyn, Pretoria",120301031,No,Yes,Urban,Brooklyn,-25.7646,28.2393
2,700400322,Gt,Nooitgedacht Akademie Skool,Open,Independent,Intermediate School,Ordinary,28.743439,-25.803851,City Of Tshwane Metropolitan Municipality,City Of Tshwane Metropolitan Municipality,79900105,"17, Mark Street, Nooitgedacht, Bronkhorspruit",732577542,No,Yes,Urban,Bronkhorstspruit,-25.8084,28.7081
3,700401082,Gt,Melwood Institute Of Technology,Open,Independent,Secondary School,Ordinary,28.191676,-25.742973,City Of Tshwane Metropolitan Municipality,City Of Tshwane Metropolitan Municipality,79900058,"274, Johannes Ramokhoase Street, Pretoria",123270294,No,Yes,Urban,Pretoria Central,-25.7518426,28.1899743
4,700400787,Gt,British International College-Pretoria,Open,Independent,Secondary School,Ordinary,28.218095,-25.744145,City Of Tshwane Metropolitan Municipality,City Of Tshwane Metropolitan Municipality,79900092,"774, Church Street, Eastwood, Pretoria",117067775,No,Yes,Urban,Eastwood,-25.7411,28.2198


In [34]:
# Check for datatypes
print(data.dtypes)

NatEmis                        int64
Province                      object
Official_Institution_Name     object
STATUS                        object
Sector                        object
Phase_PED                     object
Specialisation                object
GIS_Long                     float64
GIS_Lat                      float64
District Mun Name             object
Local MunName                 object
Ward_ID                        int64
StreetAddress                 object
Telephone                      int64
Section21                     object
School Fees                   object
Urban_Rural                   object
Towns/suburb                  object
suburb_Latitude               object
suburb_longitude              object
dtype: object


In [35]:
# Change the town coordinates datatype from object to float
data['suburb_Latitude'] = pd.to_numeric(data['suburb_Latitude'], errors='coerce')
data['suburb_longitude'] = pd.to_numeric(data['suburb_longitude'], errors='coerce')
print(data.dtypes)

NatEmis                        int64
Province                      object
Official_Institution_Name     object
STATUS                        object
Sector                        object
Phase_PED                     object
Specialisation                object
GIS_Long                     float64
GIS_Lat                      float64
District Mun Name             object
Local MunName                 object
Ward_ID                        int64
StreetAddress                 object
Telephone                      int64
Section21                     object
School Fees                   object
Urban_Rural                   object
Towns/suburb                  object
suburb_Latitude              float64
suburb_longitude             float64
dtype: object


In [36]:
# Check for null values
print(data.isna().sum())

NatEmis                      0
Province                     0
Official_Institution_Name    0
STATUS                       0
Sector                       0
Phase_PED                    0
Specialisation               0
GIS_Long                     0
GIS_Lat                      0
District Mun Name            0
Local MunName                0
Ward_ID                      0
StreetAddress                0
Telephone                    0
Section21                    0
School Fees                  0
Urban_Rural                  0
Towns/suburb                 0
suburb_Latitude              0
suburb_longitude             0
dtype: int64


In [37]:
suburbs_town_df = data[['Province', 'Towns/suburb','suburb_Latitude', 'suburb_longitude']].copy()
suburbs_town_df.head(2)

Unnamed: 0,Province,Towns/suburb,suburb_Latitude,suburb_longitude
0,Gt,Lynnwood Manor,-25.763056,28.281111
1,Gt,Brooklyn,-25.7646,28.2393


In [38]:
# Pretoria Coordinates
latitude = -25.7479
longitude = 28.2293

##### Map of Suburbs/Towns

In [39]:
suburb_data = suburbs_town_df
map_pretoria = folium.Map(location=[latitude, longitude], zoom_start=11)

# add markers to map
for lat, lng, label in zip(suburb_data['suburb_Latitude'], suburb_data['suburb_longitude'], suburb_data['Towns/suburb']):
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_pretoria)

map_pretoria

In [40]:
data.columns

Index(['NatEmis', 'Province', 'Official_Institution_Name', 'STATUS', 'Sector',
       'Phase_PED', 'Specialisation', 'GIS_Long', 'GIS_Lat',
       'District Mun Name', 'Local MunName', 'Ward_ID', 'StreetAddress',
       'Telephone', 'Section21', 'School Fees', 'Urban_Rural', 'Towns/suburb',
       'suburb_Latitude', 'suburb_longitude'],
      dtype='object')

In [41]:
school_data = data[['Province','Towns/suburb','suburb_Latitude','suburb_longitude','Official_Institution_Name', 'GIS_Lat', 'GIS_Long']].copy()

In [42]:
school_data.head()

Unnamed: 0,Province,Towns/suburb,suburb_Latitude,suburb_longitude,Official_Institution_Name,GIS_Lat,GIS_Long
0,Gt,Lynnwood Manor,-25.763056,28.281111,Lynnwood Manor Private School,-25.75658,28.290307
1,Gt,Brooklyn,-25.7646,28.2393,Advanced College Brooklyn,-25.768852,28.248905
2,Gt,Bronkhorstspruit,-25.8084,28.7081,Nooitgedacht Akademie Skool,-25.803851,28.743439
3,Gt,Pretoria Central,-25.751843,28.189974,Melwood Institute Of Technology,-25.742973,28.191676
4,Gt,Eastwood,-25.7411,28.2198,British International College-Pretoria,-25.744145,28.218095


##### Map of Schools in Pretoria

In [43]:
map_schools = folium.Map(location=[latitude, longitude], zoom_start=11)

# add markers to map
for lat, lng, label in zip(school_data['GIS_Lat'], school_data['GIS_Long'], school_data['Official_Institution_Name']):
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=2,
        popup=label,
        color='red',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_schools)  
    
map_schools

In [44]:
from sklearn.neighbors import BallTree

In [45]:
X = data[['GIS_Lat', 'GIS_Long']].values
tree = BallTree(X, leaf_size=2)

In [46]:
def find_nearest_schools(suburb_name, k=3):
    # Find the suburb latitude and longitude
    suburb = data[data['Towns/suburb'] == suburb_name].iloc[0]
    suburb_lat, suburb_lon = suburb['suburb_Latitude'], suburb['suburb_longitude']
    
    # Query the BallTree for the k nearest schools
    _, idxs = tree.query([[suburb_lat, suburb_lon]], k=5)
    nearest_schools = data.iloc[idxs[0]][['Official_Institution_Name']]
    
    # Concatenate the names of the nearest schools into a single string with line breaks
    return '\n'.join(f"{i+1}. {name}" for i, name in enumerate(nearest_schools['Official_Institution_Name']))

In [47]:
find_nearest_schools('Pretoria Central')

'1. Tshwane Secondary School\n2. Greenwood College\n3. St Aquinas-Pretoria Campus\n4. Hamilton Primary School\n5. Loreto Convent School'

In [48]:
data.columns

Index(['NatEmis', 'Province', 'Official_Institution_Name', 'STATUS', 'Sector',
       'Phase_PED', 'Specialisation', 'GIS_Long', 'GIS_Lat',
       'District Mun Name', 'Local MunName', 'Ward_ID', 'StreetAddress',
       'Telephone', 'Section21', 'School Fees', 'Urban_Rural', 'Towns/suburb',
       'suburb_Latitude', 'suburb_longitude'],
      dtype='object')

In [49]:
def get_closest_schools(suburb_name, data):
    # Filter the dataframe to include only the suburb of interest
    suburb_df = data[data['Towns/suburb'] == suburb_name]
    
    # Get the latitude and longitude of the suburb
    suburb_lat = suburb_df.iloc[0]['suburb_Latitude']
    suburb_lon = suburb_df.iloc[0]['suburb_longitude']
    
    # Create a list to hold the distances between the suburb and each school
    distances = []
    
    # Iterate over the schools in the dataframe
    for index, row in data.iterrows():
        # Get the latitude and longitude of the school
        school_lat = row['GIS_Lat']
        school_lon = row['GIS_Long']
        
        # Calculate the distance between the suburb and the school
        distance = geodesic((suburb_lat, suburb_lon), (school_lat, school_lon)).km
        
        # Add the distance to the list
        distances.append(distance)
    
    # Add a new column to the dataframe with the distances
    data['Distance'] = distances
    
    # Sort the dataframe by distance
    closest_schools = data.sort_values(by='Distance').head(5)
    
    # Create a map centered on the suburb
    map = folium.Map(location=[suburb_lat, suburb_lon], zoom_start=13)
    
    # Add a marker for the suburb
    folium.Marker(location=[suburb_lat, suburb_lon], popup=suburb_name).add_to(map)
    
    # Add markers for the closest schools
    for index, row in closest_schools.iterrows():
        folium.Marker(location=[row['GIS_Lat'], row['GIS_Long']], popup=row['Official_Institution_Name']).add_to(map)
    
    # Return the map
    return map

In [50]:
get_closest_schools('Brooklyn', data)