In [1]:
import sys
import os
import re
sys.path.insert(0, os.path.abspath(os.path.join(os.getcwd(), '..')))
import pandas as pd
import plotly.express as px
import numpy as np
import geopandas as gpd
from pyproj import Transformer
from scripts.utils import split_column_at
from scripts.accent_cleaner import AccentCleaner
#from deep_translator import GoogleTranslator

In [2]:
# Read the GeoJSON file
gdf = gpd.read_file('../data/raw/spanish_schools.geojson')

# Ensure that the geometry column contains points
# Extract latitude and longitude from the geometry column
gdf['latitude'] = gdf.geometry.y
gdf['longitude'] = gdf.geometry.x

# Now drop the geometry column if you don't need it anymore
df_schools = gdf.drop(columns='geometry')

df_schools.rename(columns={"Gestion": "public_private",
                            "OBJECTID": "school_id", 
                            "Gestion": "public_private",
                            "Nombre": "name",
                            "Tipo": "type"},
                            inplace=True)

df_schools.drop(["Direccion", "CodigoPostal", "Fuente", "Municipio"], axis=1,  inplace=True)

# I dropped Municipio because we don't have the code, and maybe not needed as we can just calculate distance

df_schools.head()

Unnamed: 0,school_id,public_private,type,name,latitude,longitude
0,1,Público,COLEGIO PÚBLICO DE EDUCACIÓN INFANTIL Y PRIMAR...,CPEIP Abarzuza Zumadia HLHIP,42.72629,-2.022176
1,2,Público,ESCUELA PÚBLICA DE MÚSICA,Esc. Mús. Púb. Ablitas,41.975297,-1.640062
2,3,Público,ESCUELA INFANTIL,EI Ablitas Mi Cole,41.974719,-1.641089
3,4,Público,COLEGIO DE EDUCACIÓN INFANTIL Y PRIMARIA,CPEIP Ablitas S. Babil HLHIP,41.974047,-1.641728
4,5,Público,COLEGIO DE EDUCACIÓN INFANTIL Y PRIMARIA,CPEIP Aibar G. V. Casamayor HLHIP,42.590753,-1.361802


In [3]:
df_schools["public_private"].unique()

array(['Público', 'Concertado', 'Privado', None], dtype=object)

In [4]:
df_schools

Unnamed: 0,school_id,public_private,type,name,latitude,longitude
0,1,Público,COLEGIO PÚBLICO DE EDUCACIÓN INFANTIL Y PRIMAR...,CPEIP Abarzuza Zumadia HLHIP,42.726290,-2.022176
1,2,Público,ESCUELA PÚBLICA DE MÚSICA,Esc. Mús. Púb. Ablitas,41.975297,-1.640062
2,3,Público,ESCUELA INFANTIL,EI Ablitas Mi Cole,41.974719,-1.641089
3,4,Público,COLEGIO DE EDUCACIÓN INFANTIL Y PRIMARIA,CPEIP Ablitas S. Babil HLHIP,41.974047,-1.641728
4,5,Público,COLEGIO DE EDUCACIÓN INFANTIL Y PRIMARIA,CPEIP Aibar G. V. Casamayor HLHIP,42.590753,-1.361802
...,...,...,...,...,...,...
26601,26602,Público,AULA DE EDUCACIÓN PERMANENTE DE ADULTOS,A.E.P.A. Herrera del Duque,39.168134,-5.048491
26602,26603,Público,AULA DE EDUCACIÓN PERMANENTE DE ADULTOS,A.E.P.A. Madrigalejo,39.135204,-5.626350
26603,26604,Público,CENTRO DE EDUCACIÓN PERMANENTE DE ADULTOS,C.E.P.A. Coria,39.987626,-6.538688
26604,26605,Público,CENTRO DE EDUCACIÓN PERMANENTE DE ADULTOS,C.E.P.A. Talayuela,39.982802,-5.607226


In [5]:
translation_map = {
    'Público': 'Public',
    'Concertado': 'Charter',
    'Privado': 'Private'
}

# Replace values in the column
df_schools["public_private"] = df_schools["public_private"].replace(translation_map)

# Print unique values to confirm the change
print(df_schools["public_private"].unique())


['Public' 'Charter' 'Private' None]


In [6]:
df_schools["type"].unique()


array(['COLEGIO PÚBLICO DE EDUCACIÓN INFANTIL Y PRIMARIA/HAUR ETA LEHEN HEZKUNTZAKO IKASTETXE PUBLIKOA',
       'ESCUELA PÚBLICA DE MÚSICA', 'ESCUELA INFANTIL',
       'COLEGIO DE EDUCACIÓN INFANTIL Y PRIMARIA',
       'INSTITUTO DE EDUCACIÓN SECUNDARIA/BIGARREN HEZKUNTZAKO INSTITUTUA',
       'ESCUELA PÚBLICA DE MÚSICA Y DANZA',
       'AULA DE EDUCACIÓN DE ADULTOS', 'CENTRO INTEGRADO POLITÉCNICO',
       'CENTRO PRIVADO DE EDUCACIÓN INFANTIL Y PRIMARIA',
       'CENTRO PRIVADO DE EDUCACIÓN SECUNDARIA OBLIGATORIA',
       'CENTRO PRIVADO DE EDUCACIÓN INFANTIL',
       'INSTITUTO DE EDUCACIÓN SECUNDARIA OBLIGATORIA/DERRIGORREZKO BIGARREN HEZKUNTZAKO INSTITUTUA',
       'INSTITUTO DE EDUCACIÓN SECUNDARIA OBLIGATORIA',
       'INSTITUTO DE EDUCACIÓN SECUNDARIA',
       'CENTRO PRIVADO DE EDUCACIÓN INFANTIL, PRIMARIA Y ESO', ' ',
       'CENTRO INTEGRADO',
       'CENTRO PRIVADO DE EDUCACIÓN INFANTIL PRIMARIA Y SECUNDARIA',
       'ESCUELA PRIVADA DE MÚSICA',
       'CENTRO PRIVADO DE FOR

In [7]:
df_schools['type'] = df_schools['type'].str.capitalize()

In [8]:
df_schools

Unnamed: 0,school_id,public_private,type,name,latitude,longitude
0,1,Public,Colegio público de educación infantil y primar...,CPEIP Abarzuza Zumadia HLHIP,42.726290,-2.022176
1,2,Public,Escuela pública de música,Esc. Mús. Púb. Ablitas,41.975297,-1.640062
2,3,Public,Escuela infantil,EI Ablitas Mi Cole,41.974719,-1.641089
3,4,Public,Colegio de educación infantil y primaria,CPEIP Ablitas S. Babil HLHIP,41.974047,-1.641728
4,5,Public,Colegio de educación infantil y primaria,CPEIP Aibar G. V. Casamayor HLHIP,42.590753,-1.361802
...,...,...,...,...,...,...
26601,26602,Public,Aula de educación permanente de adultos,A.E.P.A. Herrera del Duque,39.168134,-5.048491
26602,26603,Public,Aula de educación permanente de adultos,A.E.P.A. Madrigalejo,39.135204,-5.626350
26603,26604,Public,Centro de educación permanente de adultos,C.E.P.A. Coria,39.987626,-6.538688
26604,26605,Public,Centro de educación permanente de adultos,C.E.P.A. Talayuela,39.982802,-5.607226


In [9]:
df_schools.dropna()

Unnamed: 0,school_id,public_private,type,name,latitude,longitude
0,1,Public,Colegio público de educación infantil y primar...,CPEIP Abarzuza Zumadia HLHIP,42.726290,-2.022176
1,2,Public,Escuela pública de música,Esc. Mús. Púb. Ablitas,41.975297,-1.640062
2,3,Public,Escuela infantil,EI Ablitas Mi Cole,41.974719,-1.641089
3,4,Public,Colegio de educación infantil y primaria,CPEIP Ablitas S. Babil HLHIP,41.974047,-1.641728
4,5,Public,Colegio de educación infantil y primaria,CPEIP Aibar G. V. Casamayor HLHIP,42.590753,-1.361802
...,...,...,...,...,...,...
26601,26602,Public,Aula de educación permanente de adultos,A.E.P.A. Herrera del Duque,39.168134,-5.048491
26602,26603,Public,Aula de educación permanente de adultos,A.E.P.A. Madrigalejo,39.135204,-5.626350
26603,26604,Public,Centro de educación permanente de adultos,C.E.P.A. Coria,39.987626,-6.538688
26604,26605,Public,Centro de educación permanente de adultos,C.E.P.A. Talayuela,39.982802,-5.607226


In [10]:
df_schools.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 26606 entries, 0 to 26605
Data columns (total 6 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   school_id       26606 non-null  int32  
 1   public_private  25028 non-null  object 
 2   type            26606 non-null  object 
 3   name            26606 non-null  object 
 4   latitude        26592 non-null  float64
 5   longitude       26592 non-null  float64
dtypes: float64(2), int32(1), object(3)
memory usage: 1.1+ MB


In [11]:
df_schools.to_csv("../data/processed/filtered_files/filtered_schools.csv", index=False)

In [15]:
title = "Schools in Spain"

lats = df_schools.latitude
lons = df_schools.longitude

fig = px.scatter_map(df_schools, 
                     lat=lats, 
                     lon=lons,
                     color_continuous_scale=px.colors.carto.Aggrnyl,
                     zoom=5,
                     size_max=50  # Increase max size of markers
                     )

# Adjust the size reference to make small points more visible
fig.update_traces(marker=dict(sizeref=1000))  # Decrease this value to make points larger

fig.update_geos(fitbounds="locations")
fig.update_layout(height=1000, width=1000)
fig.update_layout(mapbox_style="open-street-map")
fig.update_layout(margin={"r":50,"t":50,"l":50,"b":50})
fig.update_layout(
    coloraxis_colorbar=dict(title='asdf')
)
fig.update_layout(title="Schools in Spain ")

fig.show()

### looks like there might be missing data in CLM