In [14]:
import pandas as pd
import plotly.express as px
import numpy as np
import geopandas as gpd
from pyproj import Transformer
from deep_translator import GoogleTranslator

### Airports

In [15]:
# Read the GeoJSON file
gdf = gpd.read_file('../data/raw/spanish_airports.geojson')
# Ensure that the geometry column contains points
# Extract latitude and longitude from the geometry column
gdf['latitude'] = gdf.geometry.y
gdf['longitude'] = gdf.geometry.x

# Now drop the geometry column if you don't need it anymore
df_airports = gdf.drop(columns='geometry')

df_airports.rename(columns={"Texto": "airport_name", "OBJECTID": "airport_id", "lat": "latitude", "lon": "longitude"}, inplace=True)
df_airports.head()

Unnamed: 0,airport_id,airport_name,latitude,longitude
0,1,Aeropuerto de Jerez,36.743828,-6.062658
1,2,Aeropuerto de Vigo,42.22574,-8.630491
2,3,Aeropuerto de Badajoz,38.889845,-6.821121
3,4,Aeropuerto de Granada - Jaén F.G.L.,37.186826,-3.778613
4,5,Aeropuerto de Almería,36.844781,-2.371234


In [16]:
title = "Airports in Spain"

lats = df_airports.latitude
lons = df_airports.longitude

fig = px.scatter_map(df_airports, 
                     lat=lats, 
                     lon=lons,
                     color_continuous_scale=px.colors.carto.Aggrnyl,
                     zoom=5,
                     size_max=50  # Increase max size of markers
                     )

# Adjust the size reference to make small points more visible
fig.update_traces(marker=dict(sizeref=1000))  # Decrease this value to make points larger

fig.update_geos(fitbounds="locations")
fig.update_layout(height=1000, width=1000)
fig.update_layout(mapbox_style="open-street-map")
fig.update_layout(margin={"r":50,"t":50,"l":50,"b":50})
fig.update_layout(
    coloraxis_colorbar=dict(title='asdf')
)
fig.update_layout(title="Airports in Spain ")

fig.show()

In [17]:
df_airports.to_csv("../data/processed/filtered_airports.csv", index=False)

### Connectivity

In [18]:
df_connectivity = pd.read_csv("../data/raw/conectivity_municipality.csv", encoding="ISO-8859-1", sep=";", decimal=",")

df_connectivity.drop(["Community", "Province"], axis=1, inplace = True)
df_connectivity.rename(columns={"housing": "number_of_houses"}, inplace=True)



df_connectivity

Unnamed: 0,municipality_code,municipality,population,number_of_houses,VDSL_30Mbps,fixed_wireless,FTTH,HFC,reception_30Mbps,reception_100Mbps,reception_1Gbps,4G,5G,"5G_3,5GHz"
0,1001,Alegría-Dulantzi,2.960,1.275,"29,18%","100,00%","98,59%","0,00%","99,98%","80,71%","14,90%","100,00%","0,90%","0,00%"
1,1002,Amurrio,10.281,4.937,"10,07%","99,82%","91,84%","0,28%","93,49%","86,98%","89,39%","100,00%","95,61%","0,00%"
2,1003,Aramaio,1.431,738,"20,97%","93,50%","84,55%","0,00%","89,21%","78,42%","83,87%","99,96%","59,65%","0,00%"
3,1004,Artziniega,1.810,1.099,"10,51%","99,73%","86,53%","0,00%","97,68%","95,37%","97,08%","100,00%","84,00%","0,00%"
4,1006,Armiñón,235,180,"0,00%","52,78%","87,22%","0,00%","98,88%","83,80%","89,50%","96,32%","0,15%","0,00%"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8126,14902,La Guijarrosa,1.341,643,"0,00%","95,33%","78,07%","0,00%","89,34%","89,34%","86,46%","100,00%","4,87%","0,00%"
8127,18077,Fornes,519,357,"0,00%","100,00%","95,24%","0,00%","99,16%","95,22%","95,24%","100,00%","30,71%","0,00%"
8128,18916,Torrenueva Costa,3.038,5.583,"0,00%","99,98%","97,85%","0,00%","99,89%","97,83%","96,69%","100,00%","100,00%","100,00%"
8129,21902,La Zarza-Perrunal,1.230,1.086,"0,00%","100,00%","100,00%","0,00%","100,00%","100,00%","87,86%","100,00%","100,00%","0,00%"


In [19]:
df_connectivity.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8131 entries, 0 to 8130
Data columns (total 14 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   municipality_code  8131 non-null   int64 
 1   municipality       8131 non-null   object
 2   population         8131 non-null   object
 3   number_of_houses   8131 non-null   object
 4   VDSL_30Mbps        8131 non-null   object
 5   fixed_wireless     8131 non-null   object
 6   FTTH               8131 non-null   object
 7   HFC                8131 non-null   object
 8   reception_30Mbps   8131 non-null   object
 9   reception_100Mbps  8131 non-null   object
 10  reception_1Gbps    8131 non-null   object
 11  4G                 8131 non-null   object
 12  5G                 8131 non-null   object
 13  5G_3,5GHz          8131 non-null   object
dtypes: int64(1), object(13)
memory usage: 889.5+ KB


In [20]:
df_connectivity.to_csv("../data/processed/filtered_connectivity.csv", index=False)

### Schools

In [21]:
# Read the GeoJSON file
gdf = gpd.read_file('../data/raw/spanish_schools.geojson')

# Ensure that the geometry column contains points
# Extract latitude and longitude from the geometry column
gdf['latitude'] = gdf.geometry.y
gdf['longitude'] = gdf.geometry.x

# Now drop the geometry column if you don't need it anymore
df_schools = gdf.drop(columns='geometry')

df_schools.rename(columns={"Gestion": "public_private",
                            "OBJECTID": "school_id", 
                            "Gestion": "public_private",
                            "Nombre": "name",
                            "Tipo": "type"},
                            inplace=True)

df_schools.drop(["Direccion", "CodigoPostal", "Fuente", "Municipio"], axis=1,  inplace=True)

# I dropped Municipio because we don't have the code, and maybe not needed as we can just calculate distance

df_schools.head()

Unnamed: 0,school_id,public_private,type,name,latitude,longitude
0,1,Público,COLEGIO PÚBLICO DE EDUCACIÓN INFANTIL Y PRIMAR...,CPEIP Abarzuza Zumadia HLHIP,42.72629,-2.022176
1,2,Público,ESCUELA PÚBLICA DE MÚSICA,Esc. Mús. Púb. Ablitas,41.975297,-1.640062
2,3,Público,ESCUELA INFANTIL,EI Ablitas Mi Cole,41.974719,-1.641089
3,4,Público,COLEGIO DE EDUCACIÓN INFANTIL Y PRIMARIA,CPEIP Ablitas S. Babil HLHIP,41.974047,-1.641728
4,5,Público,COLEGIO DE EDUCACIÓN INFANTIL Y PRIMARIA,CPEIP Aibar G. V. Casamayor HLHIP,42.590753,-1.361802


In [22]:
df_schools["public_private"].unique()

array(['Público', 'Concertado', 'Privado', None], dtype=object)

In [23]:
translation_map = {
    'Público': 'Public',
    'Concertado': 'Charter',
    'Privado': 'Private'
}

# Replace values in the column
df_schools["public_private"] = df_schools["public_private"].replace(translation_map)

# Print unique values to confirm the change
print(df_schools["public_private"].unique())


['Public' 'Charter' 'Private' None]


In [24]:
df_schools["type"].unique()

array(['COLEGIO PÚBLICO DE EDUCACIÓN INFANTIL Y PRIMARIA/HAUR ETA LEHEN HEZKUNTZAKO IKASTETXE PUBLIKOA',
       'ESCUELA PÚBLICA DE MÚSICA', 'ESCUELA INFANTIL',
       'COLEGIO DE EDUCACIÓN INFANTIL Y PRIMARIA',
       'INSTITUTO DE EDUCACIÓN SECUNDARIA/BIGARREN HEZKUNTZAKO INSTITUTUA',
       'ESCUELA PÚBLICA DE MÚSICA Y DANZA',
       'AULA DE EDUCACIÓN DE ADULTOS', 'CENTRO INTEGRADO POLITÉCNICO',
       'CENTRO PRIVADO DE EDUCACIÓN INFANTIL Y PRIMARIA',
       'CENTRO PRIVADO DE EDUCACIÓN SECUNDARIA OBLIGATORIA',
       'CENTRO PRIVADO DE EDUCACIÓN INFANTIL',
       'INSTITUTO DE EDUCACIÓN SECUNDARIA OBLIGATORIA/DERRIGORREZKO BIGARREN HEZKUNTZAKO INSTITUTUA',
       'INSTITUTO DE EDUCACIÓN SECUNDARIA OBLIGATORIA',
       'INSTITUTO DE EDUCACIÓN SECUNDARIA',
       'CENTRO PRIVADO DE EDUCACIÓN INFANTIL, PRIMARIA Y ESO', ' ',
       'CENTRO INTEGRADO',
       'CENTRO PRIVADO DE EDUCACIÓN INFANTIL PRIMARIA Y SECUNDARIA',
       'ESCUELA PRIVADA DE MÚSICA',
       'CENTRO PRIVADO DE FOR

In [13]:
# Translate function
def translate_text(text):
    if pd.isna(text):  # Handle NaN values
        return text
    return GoogleTranslator(source='auto', target='en').translate(text)

# Apply translation to unique values
df_schools["type"] = df_schools["type"].apply(translate_text)

# Check unique values after translation
print(df_schools["type"].unique())


NameError: name 'df_schools' is not defined