In [None]:
import json
from collections import defaultdict
import geopandas as gpd
import pandas as pd
import re

# 1. Africa

### Turning polygon schools into points

Since some of the OSM schools are stored as polygons, we still need to turn them into points as well, so that they can seamlessly integrate with the point-based schools.

In [None]:
## Loading the multipolygon GeoJSON
multipolygon_file = "Africa/schools_full_polygons.geojson"
gdf_multi = gpd.read_file(multipolygon_file)

## Checking the current CRS
print(f"Original CRS: {gdf_multi.crs}")

Original CRS: EPSG:4326


In [None]:
## Converting to a projected CRS (EPSG:3857) so that the centroids are computed on planar
## coordinates instead of on raw longitude/latitude degrees
projected_gdf = gdf_multi.to_crs("EPSG:3857")

## Computing the centroids in projected coordinates; this overwrites each polygon geometry
## with a single point
projected_gdf["geometry"] = projected_gdf["geometry"].centroid

## Converting back to geographic CRS (EPSG:4326) for saving; from here on gdf_multi holds the
## centroid points rather than the polygons loaded above
gdf_multi = projected_gdf.to_crs("EPSG:4326")

## Saving as GeoJSON (this path is read back in as centroids_file further down)
centroid_geojson = "Africa/schools_centroids.geojson"
gdf_multi.to_file(centroid_geojson, driver="GeoJSON")

### Loading centroid and points file for cleaning and merging

In [34]:
## File paths
nodes_file = "Africa/schools_full_nodes.geojson"
centroids_file = "Africa/schools_centroids.geojson"

In [35]:
## Loading both GeoJSON files
with open(centroids_file, "r", encoding="utf-8") as f:
    centroids_data = json.load(f)

with open(nodes_file, "r", encoding="utf-8") as f:
    nodes_data = json.load(f)

### Expanding the other_tags column in the centroids file

The centroids file, which contains the center points of the schools captured as polygons in OSM, has a large number of tags collapsed into a single `other_tags` column. In the nodes file, which contains the schools captured as points in OSM, each of these tags has a separate column. This step expands the tags into separate columns for the centroids file as well.

In [36]:
## Creating function to extract tags

# One hstore pair looks like "key"=>"value". Inside the quotes a backslash escapes the next
# character, so an escaped quote must not be treated as the end of the key or value.
# DOTALL lets an escaped character be a newline as well.
_HSTORE_PAIR = re.compile(r'"((?:[^"\\]|\\.)*)"=>"((?:[^"\\]|\\.)*)"', re.DOTALL)
_HSTORE_ESCAPE = re.compile(r"\\(.)", re.DOTALL)


def _unescape_hstore(text):
    """Remove the escaping backslashes from an hstore key or value."""
    # Skipping the regex for the common case of a string without any backslash.
    return _HSTORE_ESCAPE.sub(r"\1", text) if "\\" in text else text


def extract_other_tags(properties):
    """Extract key-value pairs from 'other_tags' and add them as separate columns.

    'other_tags' is expected to be an hstore-style string such as
    '"isced:level"=>"1","operator"=>"Some Org"'. Quotes and backslashes inside a key
    or value are assumed to be backslash-escaped (the convention of the GDAL/OGR OSM
    driver -- NOTE(review): confirm the files were exported that way); they are
    unescaped before being stored.

    Parameters
    ----------
    properties : dict
        Properties of a single GeoJSON feature. The dict is modified in place.

    Returns
    -------
    dict
        The same dict, with every extracted tag added as its own key (overwriting an
        existing key of the same name) and the 'other_tags' key removed.
    """
    tag_str = properties.get("other_tags")
    if tag_str:
        #Extracting key-value pairs and merging them into properties
        properties.update(
            (_unescape_hstore(key), _unescape_hstore(value))
            for key, value in _HSTORE_PAIR.findall(tag_str)
        )

    #Removing the "other_tags" column since it's now expanded
    properties.pop("other_tags", None)
    return properties

In [37]:
## Applying extraction function
for feature in centroids_data["features"]:
    feature["properties"] = extract_other_tags(feature["properties"])

### Preparing centroid and node datasets for merging

The two datasets have slightly different structures. Here, we are cleaning and processing them so that they can be merged.

In [38]:
## Creating function to clean None values from properties
def clean_properties(properties):
    """Return a new dict holding only the entries of `properties` whose value is not None.

    Other "empty" values such as "" or 0 are kept; the input dict is left untouched.
    """
    cleaned = {}
    for key, value in properties.items():
        if value is None:
            continue
        cleaned[key] = value
    return cleaned

In [39]:
## Processing centroids: removing None values & ensuring "name" exists (if not, fallback to OSM ID).
## Many polygon features are identified by "osm_way_id" rather than "osm_id" (see the column
## coverage further down), so the fallback checks both ID columns before giving up with "Unknown".
for feature in centroids_data["features"]:
    centroid_props = clean_properties(feature["properties"])

    if not centroid_props.get("name"):
        osm_ref = centroid_props.get("osm_id") or centroid_props.get("osm_way_id") or "Unknown"
        centroid_props["name"] = f"Unnamed School (OSM ID: {osm_ref})"

    feature["properties"] = centroid_props


In [40]:
## Processing nodes: dropping None-valued properties and making sure every school has a "name"
## (unnamed schools get a placeholder built from their OSM ID)
for node_feature in nodes_data["features"]:
    node_props = clean_properties(node_feature["properties"])

    if not node_props.get("name"):
        fallback_id = node_props.get("osm_id", "Unknown")
        node_props["name"] = f"Unnamed School (OSM ID: {fallback_id})"

    node_feature["properties"] = node_props

    

### Merging the two datasets

In [41]:
## Merging
merged_features = centroids_data["features"] + nodes_data["features"]

In [42]:
## Checking the number of schools included:
len(merged_features)

172290

### Checking columns

There is a strong variety in detail for each school: sometimes a lot of detail is provided, other times only the name. Here we are checking what the most commonly provided details are.

In [43]:
## Creating function to count non-null values in each column
def count_non_nulls(features):
    """Count, per property name, how many features carry a non-empty value for it.

    A value counts as empty when it is None or the empty string.

    Returns a tuple `(counts, total)`: `counts` is a plain dict mapping each property
    name to its number of non-empty occurrences (in first-seen order), and `total` is
    the number of features.
    """
    total_features = len(features)
    tallies = {}

    for feature in features:
        for column, value in feature["properties"].items():
            if value in (None, ""):
                continue
            tallies[column] = tallies.get(column, 0) + 1

    return tallies, total_features

In [44]:
## Counting non-null values
counts, total = count_non_nulls(merged_features)

In [45]:
## Converting counts to dataframe for easy sorting
merged_df = pd.DataFrame(list(counts.items()), columns=["Column", "Non-Null Count"])
merged_df["Total Features"] = total
merged_df["Coverage (%)"] = (merged_df["Non-Null Count"] / total) * 100
merged_df = merged_df.sort_values(by="Non-Null Count", ascending=False)


In [46]:
## Printing sorted column coverage
print("Columns Sorted by Observations:")
merged_df.head(10)

Columns Sorted by Observations:


Unnamed: 0,Column,Non-Null Count,Total Features,Coverage (%)
1,name,172290,172290,100.0
3,amenity,172290,172290,100.0
135,osm_way_id,63506,172290,36.859945
690,source,59760,172290,34.685704
11,addr:city,58358,172290,33.87196
14,isced:level,53881,172290,31.273434
8,operator:type,45279,172290,26.28069
70,addr:district,44084,172290,25.587092
35,source:date,25340,172290,14.70776
33,name:en,21746,172290,12.621742


### Saving the full dataset as GeoJSON

We won't use this dataset for the index calculation since it contains many unneeded columns. We save it anyway in case it is useful later.

In [47]:
## Creating the merged GeoJSON
merged_data = {
    "type": "FeatureCollection",
    "features": merged_features
}

In [None]:
## Saving the cleaned and merged GeoJSON
with open("Africa/schools_merged_allcolumns.geojson", "w", encoding="utf-8") as f:
    json.dump(merged_data, f, indent=2)

### Saving sub-selection of columns as GeoJSON for further analysis

In [49]:
columns_to_keep = {"name", "amenity", "isced:level", "grades"}

In [50]:
## Retaining only the selected columns for each feature.
## A new FeatureCollection is built from copies of the features instead of editing them in place,
## so merged_features keeps all of its properties and the column-coverage / all-columns cells
## above are not stripped as a side effect of running this cell.
merged_data = {
    **merged_data,
    "features": [
        {
            **feature,
            "properties": {k: v for k, v in feature["properties"].items() if k in columns_to_keep},
        }
        for feature in merged_data["features"]
    ],
}

In [None]:
## Saving the filtered dataset as GeoJSON
with open("Africa/schools_merged_finalcolumns.geojson", "w", encoding="utf-8") as f:
    json.dump(merged_data, f, indent=2)

# 2. Asia

### Turning polygon schools into points

Since some of the OSM schools are stored as polygons, we still need to turn them into points as well, so that they can seamlessly integrate with the point-based schools.

In [52]:
## Loading the multipolygon GeoJSON
multipolygon_file = "Asia/schools_full_polygons.geojson"
gdf_multi = gpd.read_file(multipolygon_file)

## Checking the current CRS
print(f"Original CRS: {gdf_multi.crs}")

Original CRS: EPSG:4326


In [None]:
## Converting to a projected CRS (EPSG:3857) so that the centroids are computed on planar
## coordinates instead of on raw longitude/latitude degrees
projected_gdf = gdf_multi.to_crs("EPSG:3857")

## Computing the centroids in projected coordinates; this overwrites each polygon geometry
## with a single point
projected_gdf["geometry"] = projected_gdf["geometry"].centroid

## Converting back to geographic CRS (EPSG:4326) for saving; from here on gdf_multi holds the
## centroid points rather than the polygons loaded above
gdf_multi = projected_gdf.to_crs("EPSG:4326")

## Saving as GeoJSON (this path is read back in as centroids_file further down)
centroid_geojson = "Asia/schools_centroids.geojson"
gdf_multi.to_file(centroid_geojson, driver="GeoJSON")

### Loading centroid and points file for cleaning and merging

In [54]:
## File paths
nodes_file = "Asia/schools_full_nodes.geojson"
centroids_file = "Asia/schools_centroids.geojson"

In [55]:
## Loading both GeoJSON files
with open(centroids_file, "r", encoding="utf-8") as f:
    centroids_data = json.load(f)

with open(nodes_file, "r", encoding="utf-8") as f:
    nodes_data = json.load(f)

### Expanding the other_tags column in the centroids file

The centroids file, which contains the center points of the schools captured as polygons in OSM, has a large number of tags collapsed into a single `other_tags` column. In the nodes file, which contains the schools captured as points in OSM, each of these tags has a separate column. This step expands the tags into separate columns for the centroids file as well.

In [56]:
## Applying extract_other_tags function defined earlier when first running it for Africa

for feature in centroids_data["features"]:
    feature["properties"] = extract_other_tags(feature["properties"])

### Preparing centroid and node datasets for merging

The two datasets have slightly different structures. Here, we are cleaning and processing them so that they can be merged.

In [57]:
## Processing centroids: removing None values using the clean_properties function defined when first
## running it for Africa & ensuring "name" exists (if not, fallback to OSM ID).
## Many polygon features are identified by "osm_way_id" rather than "osm_id" (see the column
## coverage further down), so the fallback checks both ID columns before giving up with "Unknown".

for feature in centroids_data["features"]:
    centroid_props = clean_properties(feature["properties"])

    if not centroid_props.get("name"):
        osm_ref = centroid_props.get("osm_id") or centroid_props.get("osm_way_id") or "Unknown"
        centroid_props["name"] = f"Unnamed School (OSM ID: {osm_ref})"

    feature["properties"] = centroid_props


In [58]:
## Processing nodes: dropping None-valued properties (clean_properties is defined in the Africa
## section) and making sure every school has a "name" (unnamed schools get a placeholder built
## from their OSM ID)

for node_feature in nodes_data["features"]:
    node_props = clean_properties(node_feature["properties"])

    if not node_props.get("name"):
        fallback_id = node_props.get("osm_id", "Unknown")
        node_props["name"] = f"Unnamed School (OSM ID: {fallback_id})"

    node_feature["properties"] = node_props

    

### Merging the two datasets

In [59]:
## Merging
merged_features = centroids_data["features"] + nodes_data["features"]

In [60]:
## Checking the number of schools included:
len(merged_features)

430214

### Checking columns

There is a strong variety in detail for each school: sometimes a lot of detail is provided, other times only the name. Here we are checking what the most commonly provided details are.

In [61]:
## Counting non-null values
counts, total = count_non_nulls(merged_features)

In [62]:
## Converting counts to dataframe for easy sorting
merged_df = pd.DataFrame(list(counts.items()), columns=["Column", "Non-Null Count"])
merged_df["Total Features"] = total
merged_df["Coverage (%)"] = (merged_df["Non-Null Count"] / total) * 100
merged_df = merged_df.sort_values(by="Non-Null Count", ascending=False)


In [63]:
## Showing top 10 columns in terms of coverage
print("Top 10 Columns Sorted by Observations")
merged_df.head(10)

Top 10 Columns Sorted by Observations


Unnamed: 0,Column,Non-Null Count,Total Features,Coverage (%)
1,name,430214,430214,100.0
3,amenity,430214,430214,100.0
608,osm_way_id,292525,430214,67.99523
36,operator:type,72222,430214,16.787459
63,isced:level,70263,430214,16.332104
28,operator,67019,430214,15.578061
8,addr:city,57102,430214,13.272929
24,name:en,54767,430214,12.730176
69,addr:province,53815,430214,12.508891
6,addr:street,49672,430214,11.545882


### Saving the full dataset as GeoJSON

We won't use this dataset for the index calculation since it contains many unneeded columns. We save it anyway in case it is useful later.

In [64]:
## Creating the merged GeoJSON
merged_data = {
    "type": "FeatureCollection",
    "features": merged_features
}

In [None]:
## Saving the cleaned and merged GeoJSON
with open("Asia/schools_merged_allcolumns.geojson", "w", encoding="utf-8") as f:
    json.dump(merged_data, f, indent=2)

### Saving sub-selection of columns as GeoJSON for further analysis

In [66]:
columns_to_keep = {"name", "amenity", "isced:level", "grades"}

In [67]:
## Retaining only the selected columns for each feature.
## A new FeatureCollection is built from copies of the features instead of editing them in place,
## so merged_features keeps all of its properties and the column-coverage / all-columns cells
## above are not stripped as a side effect of running this cell.
merged_data = {
    **merged_data,
    "features": [
        {
            **feature,
            "properties": {k: v for k, v in feature["properties"].items() if k in columns_to_keep},
        }
        for feature in merged_data["features"]
    ],
}

In [None]:
## Saving the filtered dataset as GeoJSON
with open("Asia/schools_merged_finalcolumns.geojson", "w", encoding="utf-8") as f:
    json.dump(merged_data, f, indent=2)

# 3. Europe

### Turning polygon schools into points

Since some of the OSM schools are stored as polygons, we still need to turn them into points as well, so that they can seamlessly integrate with the point-based schools.

In [68]:
## Loading the multipolygon GeoJSON
multipolygon_file = "Europe/schools_full_polygons.geojson"
gdf_multi = gpd.read_file(multipolygon_file)

## Checking the current CRS
print(f"Original CRS: {gdf_multi.crs}")

Original CRS: EPSG:4326


In [None]:
## Converting to a projected CRS (EPSG:3857) so that the centroids are computed on planar
## coordinates instead of on raw longitude/latitude degrees
projected_gdf = gdf_multi.to_crs("EPSG:3857")

## Computing the centroids in projected coordinates; this overwrites each polygon geometry
## with a single point
projected_gdf["geometry"] = projected_gdf["geometry"].centroid

## Converting back to geographic CRS (EPSG:4326) for saving; from here on gdf_multi holds the
## centroid points rather than the polygons loaded above
gdf_multi = projected_gdf.to_crs("EPSG:4326")

## Saving as GeoJSON (this path is read back in as centroids_file further down)
centroid_geojson = "Europe/schools_centroids.geojson"
gdf_multi.to_file(centroid_geojson, driver="GeoJSON")

### Loading centroid and points file for cleaning and merging

In [70]:
## File paths
nodes_file = "Europe/schools_full_nodes.geojson"
centroids_file = "Europe/schools_centroids.geojson"

In [71]:
## Loading both GeoJSON files
with open(centroids_file, "r", encoding="utf-8") as f:
    centroids_data = json.load(f)

with open(nodes_file, "r", encoding="utf-8") as f:
    nodes_data = json.load(f)

### Expanding the other_tags column in the centroids file

The centroids file, which contains the center points of the schools captured as polygons in OSM, has a large number of tags collapsed into a single `other_tags` column. In the nodes file, which contains the schools captured as points in OSM, each of these tags has a separate column. This step expands the tags into separate columns for the centroids file as well.

In [72]:
## Applying extract_other_tags function defined earlier when first running it for Africa

for feature in centroids_data["features"]:
    feature["properties"] = extract_other_tags(feature["properties"])

### Preparing centroid and node datasets for merging

The two datasets have slightly different structures. Here, we are cleaning and processing them so that they can be merged.

In [73]:
## Processing centroids: removing None values using the clean_properties function defined when first
## running it for Africa & ensuring "name" exists (if not, fallback to OSM ID).
## Many polygon features are identified by "osm_way_id" rather than "osm_id" (see the column
## coverage further down), so the fallback checks both ID columns before giving up with "Unknown".

for feature in centroids_data["features"]:
    centroid_props = clean_properties(feature["properties"])

    if not centroid_props.get("name"):
        osm_ref = centroid_props.get("osm_id") or centroid_props.get("osm_way_id") or "Unknown"
        centroid_props["name"] = f"Unnamed School (OSM ID: {osm_ref})"

    feature["properties"] = centroid_props


In [74]:
## Processing nodes: dropping None-valued properties (clean_properties is defined in the Africa
## section) and making sure every school has a "name" (unnamed schools get a placeholder built
## from their OSM ID)

for node_feature in nodes_data["features"]:
    node_props = clean_properties(node_feature["properties"])

    if not node_props.get("name"):
        fallback_id = node_props.get("osm_id", "Unknown")
        node_props["name"] = f"Unnamed School (OSM ID: {fallback_id})"

    node_feature["properties"] = node_props

    

### Merging the two datasets

In [75]:
## Merging
merged_features = centroids_data["features"] + nodes_data["features"]

In [76]:
## Checking the number of schools included:
len(merged_features)

330206

### Checking columns

There is a strong variety in detail for each school: sometimes a lot of detail is provided, other times only the name. Here we are checking what the most commonly provided details are.

In [77]:
## Counting non-null values
counts, total = count_non_nulls(merged_features)

In [78]:
## Converting counts to dataframe for easy sorting
merged_df = pd.DataFrame(list(counts.items()), columns=["Column", "Non-Null Count"])
merged_df["Total Features"] = total
merged_df["Coverage (%)"] = (merged_df["Non-Null Count"] / total) * 100
merged_df = merged_df.sort_values(by="Non-Null Count", ascending=False)


In [79]:
## Showing top 10 columns in terms of coverage
print("Top 10 Columns Sorted by Observations")
merged_df.head(10)

Top 10 Columns Sorted by Observations


Unnamed: 0,Column,Non-Null Count,Total Features,Coverage (%)
3,amenity,330206,330206,100.0
1,name,330206,330206,100.0
820,osm_way_id,253698,330206,76.830221
7,addr:street,115333,330206,34.927591
6,addr:postcode,109116,330206,33.044827
5,addr:city,103737,330206,31.415843
10,website,88575,330206,26.824164
8,addr:housenumber,86403,330206,26.166393
46,operator:type,76505,330206,23.16887
21,phone,69436,330206,21.028085


### Saving the full dataset as GeoJSON

We won't use this dataset for the index calculation since it contains many unneeded columns. We save it anyway in case it is useful later.

In [80]:
## Creating the merged GeoJSON
merged_data = {
    "type": "FeatureCollection",
    "features": merged_features
}

In [None]:
## Saving the cleaned and merged GeoJSON
with open("Europe/schools_merged_allcolumns.geojson", "w", encoding="utf-8") as f:
    json.dump(merged_data, f, indent=2)

### Saving sub-selection of columns as GeoJSON for further analysis

In [81]:
columns_to_keep = {"name", "amenity", "isced:level", "grades"}

In [82]:
## Retaining only the selected columns for each feature.
## A new FeatureCollection is built from copies of the features instead of editing them in place,
## so merged_features keeps all of its properties and the column-coverage / all-columns cells
## above are not stripped as a side effect of running this cell.
merged_data = {
    **merged_data,
    "features": [
        {
            **feature,
            "properties": {k: v for k, v in feature["properties"].items() if k in columns_to_keep},
        }
        for feature in merged_data["features"]
    ],
}

In [None]:
## Saving the filtered dataset as GeoJSON
with open("Europe/schools_merged_finalcolumns.geojson", "w", encoding="utf-8") as f:
    json.dump(merged_data, f, indent=2)

# 4. Australia & Oceania

### Turning polygon schools into points

Since some of the OSM schools are stored as polygons, we still need to turn them into points as well, so that they can seamlessly integrate with the point-based schools.

In [83]:
## Loading the multipolygon GeoJSON
multipolygon_file = "Australia-Oceania/schools_full_polygons.geojson"
gdf_multi = gpd.read_file(multipolygon_file)

## Checking the current CRS
print(f"Original CRS: {gdf_multi.crs}")

Original CRS: EPSG:4326


In [None]:
## Converting to a projected CRS (EPSG:3857) so that the centroids are computed on planar
## coordinates instead of on raw longitude/latitude degrees
projected_gdf = gdf_multi.to_crs("EPSG:3857")

## Computing the centroids in projected coordinates; this overwrites each polygon geometry
## with a single point
projected_gdf["geometry"] = projected_gdf["geometry"].centroid

## Converting back to geographic CRS (EPSG:4326) for saving; from here on gdf_multi holds the
## centroid points rather than the polygons loaded above
gdf_multi = projected_gdf.to_crs("EPSG:4326")

## Saving as GeoJSON (this path is read back in as centroids_file further down)
centroid_geojson = "Australia-Oceania/schools_centroids.geojson"
gdf_multi.to_file(centroid_geojson, driver="GeoJSON")

### Loading centroid and points file for cleaning and merging

In [85]:
## File paths
nodes_file = "Australia-Oceania/schools_full_nodes.geojson"
centroids_file = "Australia-Oceania/schools_centroids.geojson"

In [86]:
## Loading both GeoJSON files
with open(centroids_file, "r", encoding="utf-8") as f:
    centroids_data = json.load(f)

with open(nodes_file, "r", encoding="utf-8") as f:
    nodes_data = json.load(f)

### Expanding the other_tags column in the centroids file

The centroids file, which contains the center points of the schools captured as polygons in OSM, has a large number of tags collapsed into a single `other_tags` column. In the nodes file, which contains the schools captured as points in OSM, each of these tags has a separate column. This step expands the tags into separate columns for the centroids file as well.

In [87]:
## Applying extract_other_tags function defined earlier when first running it for Africa

for feature in centroids_data["features"]:
    feature["properties"] = extract_other_tags(feature["properties"])

### Preparing centroid and node datasets for merging

The two datasets have slightly different structures. Here, we are cleaning and processing them so that they can be merged.

In [88]:
## Processing centroids: removing None values using the clean_properties function defined when first
## running it for Africa & ensuring "name" exists (if not, fallback to OSM ID).
## Many polygon features are identified by "osm_way_id" rather than "osm_id" (see the column
## coverage further down), so the fallback checks both ID columns before giving up with "Unknown".

for feature in centroids_data["features"]:
    centroid_props = clean_properties(feature["properties"])

    if not centroid_props.get("name"):
        osm_ref = centroid_props.get("osm_id") or centroid_props.get("osm_way_id") or "Unknown"
        centroid_props["name"] = f"Unnamed School (OSM ID: {osm_ref})"

    feature["properties"] = centroid_props


In [89]:
## Processing nodes: dropping None-valued properties (clean_properties is defined in the Africa
## section) and making sure every school has a "name" (unnamed schools get a placeholder built
## from their OSM ID)

for node_feature in nodes_data["features"]:
    node_props = clean_properties(node_feature["properties"])

    if not node_props.get("name"):
        fallback_id = node_props.get("osm_id", "Unknown")
        node_props["name"] = f"Unnamed School (OSM ID: {fallback_id})"

    node_feature["properties"] = node_props

    

### Merging the two datasets

In [90]:
## Merging
merged_features = centroids_data["features"] + nodes_data["features"]

In [91]:
## Checking the number of schools included:
len(merged_features)

18430

### Checking columns

There is a strong variety in detail for each school: sometimes a lot of detail is provided, other times only the name. Here we are checking what the most commonly provided details are.

In [92]:
## Counting non-null values
counts, total = count_non_nulls(merged_features)

In [93]:
## Converting counts to dataframe for easy sorting
merged_df = pd.DataFrame(list(counts.items()), columns=["Column", "Non-Null Count"])
merged_df["Total Features"] = total
merged_df["Coverage (%)"] = (merged_df["Non-Null Count"] / total) * 100
merged_df = merged_df.sort_values(by="Non-Null Count", ascending=False)


In [94]:
## Showing top 10 columns in terms of coverage
print("Top 10 Columns Sorted by Observations")
merged_df.head(10)

Top 10 Columns Sorted by Observations


Unnamed: 0,Column,Non-Null Count,Total Features,Coverage (%)
3,amenity,18430,18430,100.0
1,name,18430,18430,100.0
97,osm_way_id,16451,18430,89.262073
13,operator:type,7783,18430,42.23006
9,addr:street,7678,18430,41.660336
16,website,7670,18430,41.616929
12,operator,7625,18430,41.372762
11,grades,7608,18430,41.280521
20,addr:housenumber,6439,18430,34.937602
14,operator:wikidata,5439,18430,29.511666


### Saving the full dataset as GeoJSON

We won't use this dataset for the index calculation since it contains many unneeded columns. We save it anyway in case it is useful later.

In [95]:
## Creating the merged GeoJSON
merged_data = {
    "type": "FeatureCollection",
    "features": merged_features
}

In [None]:
## Saving the cleaned and merged GeoJSON
with open("Australia-Oceania/schools_merged_allcolumns.geojson", "w", encoding="utf-8") as f:
    json.dump(merged_data, f, indent=2)

### Saving sub-selection of columns as GeoJSON for further analysis

In [96]:
columns_to_keep = {"name", "amenity", "isced:level", "grades"}

In [97]:
## Retaining only the selected columns for each feature.
## A new FeatureCollection is built from copies of the features instead of editing them in place,
## so merged_features keeps all of its properties and the column-coverage / all-columns cells
## above are not stripped as a side effect of running this cell.
merged_data = {
    **merged_data,
    "features": [
        {
            **feature,
            "properties": {k: v for k, v in feature["properties"].items() if k in columns_to_keep},
        }
        for feature in merged_data["features"]
    ],
}

In [None]:
## Saving the filtered dataset as GeoJSON
with open("Australia-Oceania/schools_merged_finalcolumns.geojson", "w", encoding="utf-8") as f:
    json.dump(merged_data, f, indent=2)

# 5. South America

### Turning polygon schools into points

Since some of the OSM schools are stored as polygons, we still need to turn them into points as well, so that they can seamlessly integrate with the point-based schools.

In [98]:
## Loading the multipolygon GeoJSON
multipolygon_file = "South America/schools_full_polygons.geojson"
gdf_multi = gpd.read_file(multipolygon_file)

## Checking the current CRS
print(f"Original CRS: {gdf_multi.crs}")

Original CRS: EPSG:4326


In [None]:
## Converting to a projected CRS (EPSG:3857) so that the centroids are computed on planar
## coordinates instead of on raw longitude/latitude degrees
projected_gdf = gdf_multi.to_crs("EPSG:3857")

## Computing the centroids in projected coordinates; this overwrites each polygon geometry
## with a single point
projected_gdf["geometry"] = projected_gdf["geometry"].centroid

## Converting back to geographic CRS (EPSG:4326) for saving; from here on gdf_multi holds the
## centroid points rather than the polygons loaded above
gdf_multi = projected_gdf.to_crs("EPSG:4326")

## Saving as GeoJSON (this path is read back in as centroids_file further down)
centroid_geojson = "South America/schools_centroids.geojson"
gdf_multi.to_file(centroid_geojson, driver="GeoJSON")

### Loading centroid and points file for cleaning and merging

In [100]:
## File paths
nodes_file = "South America/schools_full_nodes.geojson"
centroids_file = "South America/schools_centroids.geojson"

In [101]:
## Loading both GeoJSON files
with open(centroids_file, "r", encoding="utf-8") as f:
    centroids_data = json.load(f)

with open(nodes_file, "r", encoding="utf-8") as f:
    nodes_data = json.load(f)

### Expanding the other_tags column in the centroids file

The centroids file, which contains the center points of the schools captured as polygons in OSM, has a large number of tags collapsed into a single `other_tags` column. In the nodes file, which contains the schools captured as points in OSM, each of these tags has a separate column. This step expands the tags into separate columns for the centroids file as well.

In [102]:
## Applying extract_other_tags function defined earlier when first running it for Africa

for feature in centroids_data["features"]:
    feature["properties"] = extract_other_tags(feature["properties"])

### Preparing centroid and node datasets for merging

The two datasets have slightly different structures. Here, we are cleaning and processing them so that they can be merged.

In [103]:
## Processing centroids: removing None values using the clean_properties function defined when first
## running it for Africa & ensuring "name" exists (if not, fallback to OSM ID).
## Many polygon features are identified by "osm_way_id" rather than "osm_id" (see the column
## coverage further down), so the fallback checks both ID columns before giving up with "Unknown".

for feature in centroids_data["features"]:
    centroid_props = clean_properties(feature["properties"])

    if not centroid_props.get("name"):
        osm_ref = centroid_props.get("osm_id") or centroid_props.get("osm_way_id") or "Unknown"
        centroid_props["name"] = f"Unnamed School (OSM ID: {osm_ref})"

    feature["properties"] = centroid_props


In [104]:
## Processing nodes: dropping None-valued properties (clean_properties is defined in the Africa
## section) and making sure every school has a "name" (unnamed schools get a placeholder built
## from their OSM ID)

for node_feature in nodes_data["features"]:
    node_props = clean_properties(node_feature["properties"])

    if not node_props.get("name"):
        fallback_id = node_props.get("osm_id", "Unknown")
        node_props["name"] = f"Unnamed School (OSM ID: {fallback_id})"

    node_feature["properties"] = node_props

    

### Merging the two datasets

In [105]:
## Merging
merged_features = centroids_data["features"] + nodes_data["features"]

In [106]:
## Checking the number of schools included:
len(merged_features)

207733

### Checking columns

There is a strong variety in detail for each school: sometimes a lot of detail is provided, other times only the name. Here we are checking what the most commonly provided details are.

In [107]:
## Counting non-null values
counts, total = count_non_nulls(merged_features)

In [108]:
## Converting counts to dataframe for easy sorting
merged_df = pd.DataFrame(list(counts.items()), columns=["Column", "Non-Null Count"])
merged_df["Total Features"] = total
merged_df["Coverage (%)"] = (merged_df["Non-Null Count"] / total) * 100
merged_df = merged_df.sort_values(by="Non-Null Count", ascending=False)


In [109]:
## Showing top 10 columns in terms of coverage
print("Top 10 Columns Sorted by Observations")
merged_df.head(10)

Top 10 Columns Sorted by Observations


Unnamed: 0,Column,Non-Null Count,Total Features,Coverage (%)
1,name,207733,207733,100.0
3,amenity,207733,207733,100.0
7,addr:city,80496,207733,38.749741
616,source,74536,207733,35.880674
199,osm_way_id,71865,207733,34.594889
44,isced:level,57995,207733,27.918049
13,addr:street,55822,207733,26.871994
32,operator:type,51034,207733,24.567113
37,ref,46225,207733,22.252122
49,addr:full,45483,207733,21.894932


### Saving the full dataset as GeoJSON

We won't use this dataset for the index calculation since it contains many unneeded columns. We save it anyway in case it is useful later.

In [110]:
## Creating the merged GeoJSON
merged_data = {
    "type": "FeatureCollection",
    "features": merged_features
}

In [None]:
## Saving the cleaned and merged GeoJSON
with open("South America/schools_merged_allcolumns.geojson", "w", encoding="utf-8") as f:
    json.dump(merged_data, f, indent=2)

### Saving sub-selection of columns as GeoJSON for further analysis

In [111]:
columns_to_keep = {"name", "amenity", "isced:level", "grades"}

In [112]:
## Retaining only the selected columns for each feature.
## A new FeatureCollection is built from copies of the features instead of editing them in place,
## so merged_features keeps all of its properties and the column-coverage / all-columns cells
## above are not stripped as a side effect of running this cell.
merged_data = {
    **merged_data,
    "features": [
        {
            **feature,
            "properties": {k: v for k, v in feature["properties"].items() if k in columns_to_keep},
        }
        for feature in merged_data["features"]
    ],
}

In [None]:
## Saving the filtered dataset as GeoJSON
with open("South America/schools_merged_finalcolumns.geojson", "w", encoding="utf-8") as f:
    json.dump(merged_data, f, indent=2)

# 6. North America

### Turning polygon schools into points

Since some of the OSM schools are stored as polygons, we still need to turn them into points as well, so that they can seamlessly integrate with the point-based schools.

In [113]:
## Loading the multipolygon GeoJSON
multipolygon_file = "North America/schools_full_polygons.geojson"
gdf_multi = gpd.read_file(multipolygon_file)

## Checking the current CRS
print(f"Original CRS: {gdf_multi.crs}")

Original CRS: EPSG:4326


In [None]:
## Converting to a projected CRS (EPSG:3857) so that the centroids are computed on planar
## coordinates instead of on raw longitude/latitude degrees
projected_gdf = gdf_multi.to_crs("EPSG:3857")

## Computing the centroids in projected coordinates; this overwrites each polygon geometry
## with a single point
projected_gdf["geometry"] = projected_gdf["geometry"].centroid

## Converting back to geographic CRS (EPSG:4326) for saving; from here on gdf_multi holds the
## centroid points rather than the polygons loaded above
gdf_multi = projected_gdf.to_crs("EPSG:4326")

## Saving as GeoJSON (this path is read back in as centroids_file further down)
centroid_geojson = "North America/schools_centroids.geojson"
gdf_multi.to_file(centroid_geojson, driver="GeoJSON")

### Loading centroid and points file for cleaning and merging

In [115]:
## File paths
nodes_file = "North America/schools_full_nodes.geojson"
centroids_file = "North America/schools_centroids.geojson"

In [116]:
## Loading both GeoJSON files
with open(centroids_file, "r", encoding="utf-8") as f:
    centroids_data = json.load(f)

with open(nodes_file, "r", encoding="utf-8") as f:
    nodes_data = json.load(f)

### Expanding the other_tags column in the centroids file

The centroids file, which contains the center points of the schools captured as polygons in OSM, has a large number of tags collapsed into a single `other_tags` column. In the nodes file, which contains the schools captured as points in OSM, each of these tags has a separate column. This step expands the tags into separate columns for the centroids file as well.

In [117]:
## Applying extract_other_tags function defined earlier when first running it for Africa

for feature in centroids_data["features"]:
    feature["properties"] = extract_other_tags(feature["properties"])

### Preparing centroid and node datasets for merging

The two datasets have slightly different structures. Here, we are cleaning and processing them so that they can be merged.

In [118]:
## Processing centroids: removing None values using the clean_properties function defined when first
## running it for Africa & ensuring "name" exists (if not, fallback to OSM ID).
## Many polygon features are identified by "osm_way_id" rather than "osm_id" (see the column
## coverage further down), so the fallback checks both ID columns before giving up with "Unknown".

for feature in centroids_data["features"]:
    centroid_props = clean_properties(feature["properties"])

    if not centroid_props.get("name"):
        osm_ref = centroid_props.get("osm_id") or centroid_props.get("osm_way_id") or "Unknown"
        centroid_props["name"] = f"Unnamed School (OSM ID: {osm_ref})"

    feature["properties"] = centroid_props


In [119]:
## Processing nodes: dropping None-valued properties (clean_properties is defined in the Africa
## section) and making sure every school has a "name" (unnamed schools get a placeholder built
## from their OSM ID)

for node_feature in nodes_data["features"]:
    node_props = clean_properties(node_feature["properties"])

    if not node_props.get("name"):
        fallback_id = node_props.get("osm_id", "Unknown")
        node_props["name"] = f"Unnamed School (OSM ID: {fallback_id})"

    node_feature["properties"] = node_props

    

### Merging the two datasets

In [120]:
## Merging
merged_features = centroids_data["features"] + nodes_data["features"]

In [121]:
## Checking the number of schools included:
len(merged_features)

197348

### Checking columns

The level of detail varies widely between schools: sometimes a lot of detail is provided, other times only the name. Here we check which details are most commonly provided.

In [122]:
## Counting non-null values per column with count_non_nulls (defined earlier in the notebook, not in this
## section). The result is unpacked into a column -> count mapping (`counts`) and a total (`total`);
## the next cell turns both into a coverage table.
counts, total = count_non_nulls(merged_features)

In [123]:
## Building the coverage table in one chain: one row per column, plus the total and the share of
## features that have a value for it, ordered from most to least populated
merged_df = (
    pd.DataFrame(list(counts.items()), columns=["Column", "Non-Null Count"])
    .assign(
        **{
            "Total Features": total,
            "Coverage (%)": lambda frame: (frame["Non-Null Count"] / total) * 100,
        }
    )
    .sort_values(by="Non-Null Count", ascending=False)
)


In [124]:
## Showing the 10 columns with the highest non-null counts
## (merged_df was sorted by "Non-Null Count" in descending order in the previous cell)
print("Top 10 Columns Sorted by Observations")
merged_df.head(10)

Top 10 Columns Sorted by Observations


Unnamed: 0,Column,Non-Null Count,Total Features,Coverage (%)
1,name,197348,197348,100.0
3,amenity,197348,197348,100.0
297,osm_way_id,106244,197348,53.835864
19,gnis:feature_id,88429,197348,44.808663
9,addr:street,63511,197348,32.182236
7,addr:postcode,58513,197348,29.649654
954,ele,57771,197348,29.273669
6,addr:housenumber,47941,197348,24.29262
22,operator,46787,197348,23.707866
5,addr:city,43655,197348,22.120822


### Saving the full dataset as GeoJSON

We won't use this dataset for the index calculation since it contains many unneeded columns. We save it anyway in case it is of use later.

In [125]:
## Wrapping the merged features in a GeoJSON FeatureCollection object
merged_data = dict(type="FeatureCollection", features=merged_features)

In [None]:
## Writing the cleaned and merged FeatureCollection (all columns) to disk as indented JSON
with open("North America/schools_merged_allcolumns.geojson", mode="w", encoding="utf-8") as geojson_out:
    json.dump(merged_data, geojson_out, indent=2)

### Saving sub-selection of columns as GeoJSON for further analysis

In [126]:
## Properties retained in the slimmed-down dataset; every other property is dropped in the next cell
columns_to_keep = {
    "name",
    "amenity",
    "isced:level",
    "grades",
}

In [127]:
## Building a new FeatureCollection whose features retain only the selected columns.
## New feature dicts are created instead of overwriting feature["properties"] in place: the feature
## objects are shared with merged_features / centroids_data / nodes_data, so in-place filtering would
## strip those as well and the "allcolumns" save cell above would write filtered data if re-run.
## Key order inside each feature is preserved, so the saved JSON is unchanged.
merged_data = {
    **merged_data,
    "features": [
        {
            **feature,
            "properties": {k: v for k, v in feature["properties"].items() if k in columns_to_keep},
        }
        for feature in merged_data["features"]
    ],
}

In [None]:
## Writing the column-filtered FeatureCollection to disk as indented JSON
with open("North America/schools_merged_finalcolumns.geojson", mode="w", encoding="utf-8") as geojson_out:
    json.dump(merged_data, geojson_out, indent=2)

# 7. Central America

### Turning polygon schools into points

Since some of the OSM schools are stored as polygons, we still need to turn them into points as well, so that they can seamlessly integrate with the point-based schools.

In [128]:
## Loading the multipolygon GeoJSON (schools mapped as areas in OSM) into a GeoDataFrame
multipolygon_file = "Central America/schools_full_polygons.geojson"
gdf_multi = gpd.read_file(multipolygon_file)

## Checking the current CRS; the next cell reprojects before computing centroids
print(f"Original CRS: {gdf_multi.crs}")

Original CRS: EPSG:4326


In [None]:
## Reprojecting to a projected CRS (EPSG:3857) and replacing each polygon by its centroid,
## so the centroid is computed in projected rather than geographic coordinates
projected_gdf = gdf_multi.to_crs("EPSG:3857")
projected_gdf["geometry"] = projected_gdf["geometry"].centroid

## Output location for the centroid version of the polygon schools
centroid_geojson = "Central America/schools_centroids.geojson"

## Converting back to the geographic CRS (EPSG:4326) and saving as GeoJSON
gdf_multi = projected_gdf.to_crs("EPSG:4326")
gdf_multi.to_file(centroid_geojson, driver="GeoJSON")

### Loading centroid and points file for cleaning and merging

In [130]:
## Input files for the merge: polygon centroids (written by the cell above) and the point-based schools
centroids_file = "Central America/schools_centroids.geojson"
nodes_file = "Central America/schools_full_nodes.geojson"

In [131]:
## Reading both GeoJSON files as plain Python dicts (not GeoDataFrames)
with open(centroids_file, encoding="utf-8") as centroids_in:
    centroids_data = json.load(centroids_in)

with open(nodes_file, encoding="utf-8") as nodes_in:
    nodes_data = json.load(nodes_in)

### Expanding the other_tags column in the centroids file

The centroids file, which contains the center points of the schools captured as polygons in OSM, has a large number of tags collapsed into a single column. In the nodes file, which contains the schools captured as points in OSM, each of these tags has its own separate column. This step expands the collapsed tags so that the centroids file has the same structure.

In [132]:
## Expanding the collapsed "other_tags" entry of every centroid feature into individual properties,
## reusing the extract_other_tags helper defined in the Africa section

for feature in centroids_data["features"]:
    expanded_properties = extract_other_tags(feature["properties"])
    feature["properties"] = expanded_properties

### Preparing centroid and node datasets for merging

The two datasets have slightly different structures. Here, we are cleaning and processing them so that they can be merged.

In [133]:
## Processing centroids: removing None values with the clean_properties function (defined in the Africa
## section) and ensuring a non-empty "name" exists; otherwise a placeholder built from the OSM ID is used.
## The placeholder tries "osm_id" first and then "osm_way_id" before giving up with "Unknown".
## NOTE(review): assumes polygon features that come from closed ways carry their ID in "osm_way_id"
## rather than "osm_id" (GDAL OSM driver convention) -- confirm against the input file.

for feature in centroids_data["features"]:
    props = clean_properties(feature["properties"])

    if not props.get("name"):
        osm_identifier = props.get("osm_id") or props.get("osm_way_id") or "Unknown"
        props["name"] = f"Unnamed School (OSM ID: {osm_identifier})"

    feature["properties"] = props


In [134]:
## Processing nodes: dropping None values via clean_properties (defined in the Africa section), then
## guaranteeing a non-empty "name" -- features without one get a placeholder built from their OSM ID

for feature in nodes_data["features"]:
    props = clean_properties(feature["properties"])

    if not props.get("name"):
        props["name"] = f"Unnamed School (OSM ID: {props.get('osm_id', 'Unknown')})"

    feature["properties"] = props

    

### Merging the two datasets

In [135]:
## Merging: one flat list with the polygon centroids first, followed by the point nodes
merged_features = [*centroids_data["features"], *nodes_data["features"]]

In [136]:
## Checking the number of schools included (polygon centroids + point nodes):
len(merged_features)

25547

### Checking columns

The level of detail varies widely between schools: sometimes a lot of detail is provided, other times only the name. Here we check which details are most commonly provided.

In [137]:
## Counting non-null values per column with count_non_nulls (defined earlier in the notebook, not in this
## section). The result is unpacked into a column -> count mapping (`counts`) and a total (`total`);
## the next cell turns both into a coverage table.
counts, total = count_non_nulls(merged_features)

In [138]:
## Building the coverage table in one chain: one row per column, plus the total and the share of
## features that have a value for it, ordered from most to least populated
merged_df = (
    pd.DataFrame(list(counts.items()), columns=["Column", "Non-Null Count"])
    .assign(
        **{
            "Total Features": total,
            "Coverage (%)": lambda frame: (frame["Non-Null Count"] / total) * 100,
        }
    )
    .sort_values(by="Non-Null Count", ascending=False)
)


In [139]:
## Showing the 10 columns with the highest non-null counts
## (merged_df was sorted by "Non-Null Count" in descending order in the previous cell)
print("Top 10 Columns Sorted by Observations")
merged_df.head(10)

Top 10 Columns Sorted by Observations


Unnamed: 0,Column,Non-Null Count,Total Features,Coverage (%)
3,amenity,25547,25547,100.0
1,name,25547,25547,100.0
74,osm_way_id,14680,25547,57.462716
9,addr:city,7793,25547,30.50456
4,operator,6963,25547,27.255646
10,addr:street,6047,25547,23.670098
5,operator:type,5330,25547,20.863506
407,source,4927,25547,19.286022
11,grades,2373,25547,9.288762
89,operational_status,2197,25547,8.599836


### Saving the full dataset as GeoJSON

We won't use this dataset for the index calculation since it contains many unneeded columns. We save it anyway in case it is of use later.

In [140]:
## Wrapping the merged features in a GeoJSON FeatureCollection object
merged_data = dict(type="FeatureCollection", features=merged_features)

In [None]:
## Writing the cleaned and merged FeatureCollection (all columns) to disk as indented JSON
with open("Central America/schools_merged_allcolumns.geojson", mode="w", encoding="utf-8") as geojson_out:
    json.dump(merged_data, geojson_out, indent=2)

### Saving sub-selection of columns as GeoJSON for further analysis

In [141]:
## Properties retained in the slimmed-down dataset; every other property is dropped in the next cell
columns_to_keep = {
    "name",
    "amenity",
    "isced:level",
    "grades",
}

In [142]:
## Building a new FeatureCollection whose features retain only the selected columns.
## New feature dicts are created instead of overwriting feature["properties"] in place: the feature
## objects are shared with merged_features / centroids_data / nodes_data, so in-place filtering would
## strip those as well and the "allcolumns" save cell above would write filtered data if re-run.
## Key order inside each feature is preserved, so the saved JSON is unchanged.
merged_data = {
    **merged_data,
    "features": [
        {
            **feature,
            "properties": {k: v for k, v in feature["properties"].items() if k in columns_to_keep},
        }
        for feature in merged_data["features"]
    ],
}

In [None]:
## Writing the column-filtered FeatureCollection to disk as indented JSON
with open("Central America/schools_merged_finalcolumns.geojson", mode="w", encoding="utf-8") as geojson_out:
    json.dump(merged_data, geojson_out, indent=2)

# 8. MERGING TO ONE GLOBAL FILE

### Loading the files created in the previous steps

In [143]:
## Relative paths to the per-region "final columns" GeoJSON files;
## every region folder uses the same file name
FINAL_COLUMNS_FILENAME = "schools_merged_finalcolumns.geojson"

Africa = f"Africa/{FINAL_COLUMNS_FILENAME}"
Asia = f"Asia/{FINAL_COLUMNS_FILENAME}"
Europe = f"Europe/{FINAL_COLUMNS_FILENAME}"
AustraliaOceania = f"Australia-Oceania/{FINAL_COLUMNS_FILENAME}"
SouthAmerica = f"South America/{FINAL_COLUMNS_FILENAME}"
NorthAmerica = f"North America/{FINAL_COLUMNS_FILENAME}"
CentralAmerica = f"Central America/{FINAL_COLUMNS_FILENAME}"

In [163]:
## Reading the Africa file as a plain GeoJSON dict
with open(Africa, encoding="utf-8") as region_file:
    Africa_data = json.load(region_file)

In [164]:
## Reading the Asia file as a plain GeoJSON dict
with open(Asia, encoding="utf-8") as region_file:
    Asia_data = json.load(region_file)

In [165]:
## Reading the Europe file as a plain GeoJSON dict
with open(Europe, encoding="utf-8") as region_file:
    Europe_data = json.load(region_file)

In [166]:
## Reading the Australia-Oceania file as a plain GeoJSON dict
with open(AustraliaOceania, encoding="utf-8") as region_file:
    AustraliaOceania_data = json.load(region_file)

In [167]:
## Reading the South America file as a plain GeoJSON dict
with open(SouthAmerica, encoding="utf-8") as region_file:
    SouthAmerica_data = json.load(region_file)

In [168]:
## Reading the North America file as a plain GeoJSON dict
with open(NorthAmerica, encoding="utf-8") as region_file:
    NorthAmerica_data = json.load(region_file)

In [169]:
## Reading the Central America file as a plain GeoJSON dict
with open(CentralAmerica, encoding="utf-8") as region_file:
    CentralAmerica_data = json.load(region_file)

### Merging the files

In [172]:
## Merging: concatenating the feature lists of all seven regions into a single list, in the order
## Africa, Asia, Europe, Australia-Oceania, South America, North America, Central America
merged_continents = [
    *Africa_data["features"],
    *Asia_data["features"],
    *Europe_data["features"],
    *AustraliaOceania_data["features"],
    *SouthAmerica_data["features"],
    *NorthAmerica_data["features"],
    *CentralAmerica_data["features"],
]

In [174]:
## Checking the number of schools included across all regions:
len(merged_continents)

1381768

### Saving the global dataset as GeoJSON

In [None]:
## Wrapping the global feature list in a GeoJSON FeatureCollection object
merged_data = dict(type="FeatureCollection", features=merged_continents)

In [None]:
## Writing the global FeatureCollection to disk as indented JSON
with open("00_GLOBAL FINAL/schools_global_unedited/schools_global_unedited.geojson", mode="w", encoding="utf-8") as geojson_out:
    json.dump(merged_data, geojson_out, indent=2)

### Saving the global dataset as Shapefile

In [None]:
## Loading the global GeoJSON written in the previous step into a GeoDataFrame,
## so that it can be exported to another format with geopandas
gdf = gpd.read_file("00_GLOBAL FINAL/schools_global_unedited/schools_global_unedited.geojson")

In [None]:
## Saving as Shapefile (no driver is passed; the .shp extension is relied on to select the format)
## NOTE(review): Shapefile attribute names are limited to 10 characters, so a column such as
## "isced:level" would be truncated on write -- confirm downstream readers expect the truncated name.
gdf.to_file("00_GLOBAL FINAL/schools_global_unedited/schools_global_unedited.shp")

  gdf.to_file("00_GLOBAL/schools_global_shp/schools_global.shp")
