In [2]:
import pandas as pd
import numpy as np
import math
from geopy import distance

In [3]:
# https://open.canada.ca/data/en/dataset/00a331db-121b-445d-b119-35dbbe3eedd9 -- National Broadband Data
speeds = pd.read_csv("raw/PHH_Speeds_Current-PHH_Vitesses_Actuelles_BC.csv")

# Same source, map data with population information for each hex
phh = pd.read_csv("raw/PHH-BC.csv")
phhWithSpeeds = pd.merge(phh, speeds, on="PHH_ID")

# Filtering out hexes with no population
regionsWithPop = phhWithSpeeds[phhWithSpeeds["Pop2016"] > 0].copy()
regionsWithPop.columns

Index(['PHH_ID', 'Type', 'Pop2016', 'TDwell2016_TLog2016',
       'URDwell2016_RH2016', 'DBUID_Ididu', 'HEXUID_IdUHEX', 'Pruid_Pridu',
       'Latitude', 'Longitude', 'Combined_lt5_1_Combine',
       'Wired_lt5_1_Filaire', 'Wireless_lt5_Sans_fil', 'Combined_5_1_Combine',
       'Wired_5_1_Filaire', 'Wireless_5_1_Sans_fil', 'Combined_10_2_Combine',
       'Wired_10_2_Filaire', 'Wireless_10_2_Sans_fil', 'Combined_25_5_Combine',
       'Wired_25_5_Filaire', 'Wireless_25_5_Sans_fil',
       'Combined_50_10_Combine', 'Wired_50_10_Filaire',
       'Wireless_50_10_Sans_fil', 'Combined_Max_Threshold-Combine_Seuil_Max',
       'Wired_Max_Threshold-Filaire_Seuil_Max',
       'Wireless_Max_Threshold-Sans_fil_Seuil_Max', 'Avail_LTE_Mobile_Dispo'],
      dtype='object')

In [4]:
school_report = pd.read_csv("combined_school_report.csv")
school_report

Unnamed: 0.1,Unnamed: 0,SCHOOL_YEAR,SCHOOL_NUMBER,SCHOOL_NAME,DISTRICT_NUMBER,DISTRICT_NAME,PUBLIC_OR_INDEPENDENT,STREET_ADDRESS,PHYSICAL_ADDRESS_CITY,FACILTY_TYPE,...,LOCATION,ENROLMENT,PERCENT_ESL,PERCENT_SPECIAL_NEEDS,PERCENT_FRENCH_IMM,AVERAGE_EXAM_MARK,PERCENT_EXAMS_FAILED,PERCENT_GRAD,PERCENT_DELAYED,OVERALL
0,0,2019/2020,502001,Mount Baker Secondary,5,Southeast Kootenay,Public,1410 Baker St,Cranbrook,Standard School,...,Cranbrook,Gr 12 enrolment: 257,0.4,14.4,6.2,66.9,8.9,99.2,8.2,6.8
1,1,2019/2020,502001,Mount Baker Secondary,5,Southeast Kootenay,Public,1410 Baker St,Cranbrook,Standard School,...,Cranbrook,Gr 12 enrolment: 257,0.4,14.4,6.2,66.4,10.6,96.8,16.9,6.1
2,2,2019/2020,502001,Mount Baker Secondary,5,Southeast Kootenay,Public,1410 Baker St,Cranbrook,Standard School,...,Cranbrook,Gr 12 enrolment: 257,0.4,14.4,6.2,66.2,9.6,99.2,5.1,6.6
3,3,2019/2020,502001,Mount Baker Secondary,5,Southeast Kootenay,Public,1410 Baker St,Cranbrook,Standard School,...,Cranbrook,Gr 12 enrolment: 257,0.4,14.4,6.2,68.0,8.7,97.8,9.7,6.7
4,4,2019/2020,502001,Mount Baker Secondary,5,Southeast Kootenay,Public,1410 Baker St,Cranbrook,Standard School,...,Cranbrook,Gr 12 enrolment: 257,0.4,14.4,6.2,66.4,8.8,99.6,7.3,6.7
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1170,1170,2019/2020,9191024,Nechako Valley Secondary,91,Nechako Lakes,Public,PO Box 950,Vanderhoof,Standard School,...,Vanderhoof,Gr 12 enrolment: 81,0.0,16.6,4.1,66.8,11.1,97.6,9.1,5.8
1171,1171,2019/2020,9191024,Nechako Valley Secondary,91,Nechako Lakes,Public,PO Box 950,Vanderhoof,Standard School,...,Vanderhoof,Gr 12 enrolment: 81,0.0,16.6,4.1,65.6,13.2,96.2,8.3,5.1
1172,1172,2019/2020,9191024,Nechako Valley Secondary,91,Nechako Lakes,Public,PO Box 950,Vanderhoof,Standard School,...,Vanderhoof,Gr 12 enrolment: 81,0.0,16.6,4.1,67.0,12.6,98.0,3.3,6.4
1173,1173,2019/2020,9191024,Nechako Valley Secondary,91,Nechako Lakes,Public,PO Box 950,Vanderhoof,Standard School,...,Vanderhoof,Gr 12 enrolment: 81,0.0,16.6,4.1,71.7,1.6,98.4,7.8,6.9


In [6]:
school_representative = school_report.groupby("SCHOOL_NUMBER").first().reset_index()
regionsWithPop["DISTANCE"] = -1
regionsWithPop["SCHOOL_NUMBER"] = -1
for index, hexagon in regionsWithPop.iterrows():
    hexagon_coord = (hexagon["Latitude"], hexagon["Longitude"])
    closest_school_num = None
    closest_distance = None
    for i, school in school_representative.iterrows():
        dist = distance.distance(hexagon_coord, (school["SCHOOL_LATITUDE"], school["SCHOOL_LONGITUDE"])).km
        if (closest_distance is None or closest_distance > dist):
            closest_distance = dist
            closest_school_num = school["SCHOOL_NUMBER"]
    regionsWithPop.loc[index, "DISTANCE"] = closest_distance
    regionsWithPop.loc[index, "SCHOOL_NUMBER"] = closest_school_num

regionsWithPop.to_csv("pop_regions_with_school.csv")
regionsWithPop.groupby("SCHOOL_NUMBER").size().reset_index(name='counts')

Unnamed: 0,SCHOOL_NUMBER,counts
0,502001,4296
1,505033,352
2,505034,2180
3,505035,659
4,603008,2635
...,...,...
230,8585026,1821
231,9156002,1515
232,9156007,1048
233,9191023,1679


In [5]:
linked = pd.read_csv("pop_regions_with_school.csv")

# Discard hexes where the closest school is over 25km away -- arbitrary assumption that students aren't traveling that far for school
discard_hex_too_far_away = linked[linked["DISTANCE"] > 25]
print("Number of hexes discarded: %d" % (linked.size - discard_hex_too_far_away.size))
discard_hex_too_far_away

Number of hexes discarded: 11974816


Unnamed: 0.1,Unnamed: 0,PHH_ID,Type,Pop2016,TDwell2016_TLog2016,URDwell2016_RH2016,DBUID_Ididu,HEXUID_IdUHEX,Pruid_Pridu,Latitude,...,Wireless_25_5_Sans_fil,Combined_50_10_Combine,Wired_50_10_Filaire,Wireless_50_10_Sans_fil,Combined_Max_Threshold-Combine_Seuil_Max,Wired_Max_Threshold-Filaire_Seuil_Max,Wireless_Max_Threshold-Sans_fil_Seuil_Max,Avail_LTE_Mobile_Dispo,DISTANCE,SCHOOL_NUMBER
25,13004,5226091,5,6.428571,5.571429,3.000000,59331492007,BC50901209,59,50.883726,...,0,0,0,0,,,,0,45.347990,7324055
26,13005,5226252,6,17.500000,24.500000,9.500000,59331492084,BC50901209,59,50.883931,...,0,0,0,0,,,,0,45.335320,7324055
27,13006,5367797,5,7.000000,2.700000,2.200000,59490109021,BC55171291,59,55.194293,...,0,0,0,0,,,,1,80.264716,8288024
28,13007,5367762,6,14.000000,3.500000,3.500000,59490109014,BC55171291,59,55.194556,...,0,0,0,0,,,,1,80.295323,8288024
63,13049,5199801,5,4.000000,2.400000,2.000000,59310142014,BC50691219,59,50.693891,...,1,0,0,0,25_5,25_5,25_5,1,73.756027,4848021
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
420587,661349,5500964,2,0.542017,0.390756,0.277311,59030243029,BC49901182,59,49.879670,...,1,0,0,0,25_5,,25_5,1,66.480871,807025
420588,661350,5500965,2,0.180672,0.130252,0.092437,59030243029,BC49931181,59,49.914920,...,0,0,0,0,<5_1,,<5_1,1,66.881717,807025
420589,661351,5500966,2,5.075000,2.887500,2.275000,59030137019,BC49931174,59,49.904697,...,0,0,0,0,,,,1,45.302126,807013
420590,661353,5500968,2,4.068966,4.068966,1.908046,59070135043,BC49611204,59,49.618459,...,0,0,0,0,<5_1,,<5_1,0,49.891046,6777002
