In [1]:
import sys
import os
import re
# Put the parent of the current working directory at the front of the module
# search path so the `scripts.*` imports below can be resolved
# (assumes the notebook is launched from a sub-folder of the project root — TODO confirm).
sys.path.insert(0, os.path.abspath(os.path.join(os.getcwd(), '..')))
import pandas as pd
import plotly.express as px
import numpy as np
import geopandas as gpd
from pyproj import Transformer
from scripts.utils import split_column_at
from scripts.accent_cleaner import AccentCleaner
#from deep_translator import GoogleTranslator

### Airports

In [2]:
# Load the airport layer from GeoJSON. The coordinate accessors used below
# (`.x` / `.y`) are only defined for point geometries, so a non-point feature
# makes this cell fail loudly instead of producing bad coordinates.
gdf = gpd.read_file('../data/raw/spanish_airports.geojson')

# Expose the point coordinates as plain numeric columns.
gdf['latitude'] = gdf.geometry.y
gdf['longitude'] = gdf.geometry.x

# Drop the geometry (no longer needed once lat/lon are extracted) and give the
# source columns descriptive names. Only columns that actually exist in the
# file are renamed; `latitude` / `longitude` already carry their final names.
df_airports = (
    gdf.drop(columns='geometry')
       .rename(columns={"Texto": "airport_name", "OBJECTID": "airport_id"})
)
df_airports.head()

Unnamed: 0,airport_id,airport_name,latitude,longitude
0,1,Aeropuerto de Jerez,36.743828,-6.062658
1,2,Aeropuerto de Vigo,42.22574,-8.630491
2,3,Aeropuerto de Badajoz,38.889845,-6.821121
3,4,Aeropuerto de Granada - Jaén F.G.L.,37.186826,-3.778613
4,5,Aeropuerto de Almería,36.844781,-2.371234


In [3]:
title = "Airports in Spain"

# Plot every airport as a marker on an interactive tile map.
# `map_style` (not `mapbox_style`) is the setting that scatter_map figures
# read, so the OpenStreetMap tiles are requested here directly.
fig = px.scatter_map(
    df_airports,
    lat="latitude",
    lon="longitude",
    zoom=5,
    map_style="open-street-map",
    title=title,
    height=1000,
    width=1000,
)

# Uniform 50 px margin around the map.
fig.update_layout(margin={"r": 50, "t": 50, "l": 50, "b": 50})

fig.show()

In [4]:
# Write the airport table to the processed-data folder, omitting the index.
df_airports.to_csv(
    path_or_buf="../data/processed/filtered_airports.csv",
    index=False,
)

### Trains

In [5]:
# Read the semicolon-separated station list (comma as decimal mark), discard
# the address-related columns and switch to English snake_case column names.
# Note: the source header for the station code starts with a space.
df_trains = (
    pd.read_csv("../data/raw/listado_completo_av_ld_md.csv", sep=";", decimal=",")
    .drop(columns=["PAIS", "C.P.", "DIRECCIÓN"])
    .rename(
        columns={
            " CÓDIGO": "station_code",
            "DESCRIPCION": "station_name",
            "POBLACION": "municipality",
            "PROVINCIA": "province",
            "LATITUD": "latitude",
            "LONGITUD": "longitude",
        }
    )
)

In [6]:
# Per-column count of missing values in the station table.
df_trains.isna().sum()

station_code     0
station_name     0
latitude         0
longitude        0
municipality     0
province        43
dtype: int64

In [7]:
# Run the project's AccentCleaner over the province and municipality columns.
# NOTE(review): judging by the displayed result, this appears to add lower-cased,
# accent-free `province_clean` / `municipality_clean` columns to df_trains in
# place — confirm against scripts/accent_cleaner.py.
cleaner = AccentCleaner([df_trains], columns=["province", "municipality"])
cleaner.cleanAccents()
# Display the frame to inspect the result of the cleaning step.
df_trains

Unnamed: 0,station_code,station_name,latitude,longitude,municipality,province,province_clean,municipality_clean
0,11208,VITORIA/GASTEIZ,42.841528,-2.672665,Vitoria-Gasteiz,Araba/Álava,araba/alava,vitoria-gasteiz
1,11212,AGURAIN/SALVATIERRA DE ALAVA,42.846437,-2.389227,Salvatierra/Agurain,Araba/Álava,araba/alava,salvatierra/agurain
2,11213,ARAIA,42.869471,-2.306802,Arraia-Maeztu,Araba/Álava,araba/alava,arraia-maeztu
3,11203,MANZANOS,42.742875,-2.867530,Ribera Baja/Erribera Beitia,Araba/Álava,araba/alava,ribera baja/erribera beitia
4,11205,NANCLARES-LANGRAIZ,42.816234,-2.804934,Iruña Oka/Iruña de Oca,Araba/Álava,araba/alava,iruna oka/iruna de oca
...,...,...,...,...,...,...,...,...
641,94536,MANGUALDE,40.586500,-7.760260,MANGUALDE,,,mangualde
642,94543,CELORICO-BEIRA,40.665200,-7.382460,CELORICO DA BEIRA,,,celorico da beira
643,94551,GUARDA,40.552710,-7.239840,GUARDA,,,guarda
644,94563,VILAR FORMOSO,40.606450,-6.829100,VILAR FORMOSO,,,vilar formoso


In [8]:
def _pick_name_variant(value):
    """Reduce a "/"-separated list of alternative names to a single name.

    A value without "/" is returned unchanged; otherwise the second
    alternative is kept (e.g. "araba/alava" -> "alava"). Anything that is not
    a string (such as NaN for a missing province) is passed through untouched
    so that missing values do not raise.
    """
    if not isinstance(value, str):
        return value
    variants = value.split("/")
    return variants[0] if len(variants) == 1 else variants[1]


# Apply the same rule to both cleaned name columns.
for name_column in ("municipality_clean", "province_clean"):
    df_trains[name_column] = df_trains[name_column].map(_pick_name_variant)




In [9]:
# Spot-check 20 random rows; the fixed seed keeps the displayed sample
# identical across Restart & Run All.
df_trains.sample(20, random_state=42)

Unnamed: 0,station_code,station_name,latitude,longitude,municipality,province,province_clean,municipality_clean
103,37305,ALMADENEJOS-ALMADEN,38.740577,-4.730331,Almadén,Ciudad Real,ciudad real,almaden
636,94428,ENTRONCAMENTO,39.46144,-8.47307,ENTRONCAMENTO,,,entroncamento
359,11006,QUINTANA DEL PUENTE,42.080031,-4.198594,Quintana del Puente,Palencia,palencia,quintana del puente
395,22401,TUI,42.056554,-8.643011,Tui,Pontevedra,pontevedra,tui
235,20111,BEMBIBRE,42.609579,-6.422201,Bembibre,León,leon,bembibre
57,37600,ALJUCEN,38.936409,-6.407883,Aljucén,Badajoz,badajoz,aljucen
143,23013,OSEBE,42.825301,-8.615838,Teo,"Coruña, A","coruna, a",teo
266,78406,BELLPUIG,41.632559,1.012974,Bellpuig,Lleida,lleida,bellpuig
227,56004,JODAR-UBEDA,37.913342,-3.350202,Jódar,Jaén,jaen,jodar
556,70500,ARIZA,41.309662,-2.059545,Ariza,Zaragoza,zaragoza,ariza


In [10]:
# Station names are shown in Title Case ...
df_trains['station_name'] = df_trains['station_name'].str.title()

# ... while the raw municipality / province labels are lower-cased.
for text_column in ('municipality', 'province'):
    df_trains[text_column] = df_trains[text_column].str.lower()

In [11]:
# Keep only the normalised `*_clean` name columns. `errors="ignore"` makes the
# cell safe to re-run: once the raw columns are gone the drop is a no-op
# instead of raising KeyError.
df_trains = df_trains.drop(columns=["province", "municipality"], errors="ignore")

df_trains

Unnamed: 0,station_code,station_name,latitude,longitude,province_clean,municipality_clean
0,11208,Vitoria/Gasteiz,42.841528,-2.672665,alava,vitoria-gasteiz
1,11212,Agurain/Salvatierra De Alava,42.846437,-2.389227,alava,agurain
2,11213,Araia,42.869471,-2.306802,alava,arraia-maeztu
3,11203,Manzanos,42.742875,-2.867530,alava,erribera beitia
4,11205,Nanclares-Langraiz,42.816234,-2.804934,alava,iruna de oca
...,...,...,...,...,...,...
641,94536,Mangualde,40.586500,-7.760260,,mangualde
642,94543,Celorico-Beira,40.665200,-7.382460,,celorico da beira
643,94551,Guarda,40.552710,-7.239840,,guarda
644,94563,Vilar Formoso,40.606450,-6.829100,,vilar formoso


In [12]:
title = "Train Stations (LD, MD, High Speed) in Spain"

# Plot every station as a marker on an interactive tile map; hovering a marker
# shows its municipality and station name.
# `map_style` (not `mapbox_style`) is the setting that scatter_map figures
# read, so the OpenStreetMap tiles are requested here directly.
fig = px.scatter_map(
    df_trains,
    lat="latitude",
    lon="longitude",
    hover_data=["municipality_clean", "station_name"],
    zoom=5,
    map_style="open-street-map",
    title=title,
    height=1000,
    width=1000,
)

# Uniform 50 px margin around the map.
fig.update_layout(margin={"r": 50, "t": 50, "l": 50, "b": 50})

fig.show()

In [13]:
# Write the station table to the processed-data folder, omitting the index.
df_trains.to_csv(
    path_or_buf="../data/processed/filtered_trains.csv",
    index=False,
)