In [196]:
import sys
import os
import re
sys.path.insert(0, os.path.abspath(os.path.join(os.getcwd(), '..')))
import pandas as pd
import plotly.express as px
import numpy as np
import geopandas as gpd
from pyproj import Transformer
from scripts.utils import split_column_at
from scripts.accent_cleaner import AccentCleaner
#from deep_translator import GoogleTranslator

### Airports

In [207]:
# Read the GeoJSON file
gdf = gpd.read_file('../data/raw/spanish_airports.geojson')
# Ensure that the geometry column contains points
# Extract latitude and longitude from the geometry column
gdf['latitude'] = gdf.geometry.y
gdf['longitude'] = gdf.geometry.x

# Now drop the geometry column if you don't need it anymore
df_airports = gdf.drop(columns='geometry')

df_airports.rename(columns={"Texto": "airport_name", "OBJECTID": "airport_id", "lat": "latitude", "lon": "longitude"}, inplace=True)
df_airports

Unnamed: 0,airport_id,airport_name,latitude,longitude
0,1,Aeropuerto de Jerez,36.743828,-6.062658
1,2,Aeropuerto de Vigo,42.22574,-8.630491
2,3,Aeropuerto de Badajoz,38.889845,-6.821121
3,4,Aeropuerto de Granada - Jaén F.G.L.,37.186826,-3.778613
4,5,Aeropuerto de Almería,36.844781,-2.371234
5,6,Aeropuerto de Salamanca,40.950757,-5.502955
6,7,Aeropuerto de Ibiza,38.874352,1.372669
7,8,Aeropuerto de Lanzarote,28.951045,-13.60502
8,9,Aeropuerto de La Palma,28.622084,-17.753653
9,10,Aeropuerto de Tenerife Norte,28.485811,-16.347639


In [208]:
df_airports["airport_name"] = df_airports["airport_name"].str.replace(r"^Aeropuerto de ", "", regex=True)
df_airports.rename(columns={"airport_name": "airport"}, inplace=True)
df_airports["airport"] = df_airports["airport"].str.replace("-", " - ", regex=False)

df_airports.drop("airport_id", axis=1, inplace=True)
df_airports.head()

Unnamed: 0,airport,latitude,longitude
0,Jerez,36.743828,-6.062658
1,Vigo,42.22574,-8.630491
2,Badajoz,38.889845,-6.821121
3,Granada - Jaén F.G.L.,37.186826,-3.778613
4,Almería,36.844781,-2.371234


In [209]:
airports_info = pd.read_csv("../data/raw/airports_info.csv")
airports_info.drop(["web-scraper-order", "web-scraper-start-url", "runways"], axis = 1, inplace = True)

In [210]:
airports_info.head()

Unnamed: 0,airport,y_passengers
0,Adolfo Suárez Madrid-Barajas,60 220 984
1,Albacete[3]​,2 644
2,Alicante-Elche Miguel Hernández,15 747 678
3,Almería,775 393
4,Asturias,1 974 850


In [None]:
airports_info["airport"] = airports_info["airport"].str.replace(r"\[.*?\]", "", regex=True).str.strip()
airports_info["airport"] = airports_info["airport"].str.replace("-", "  -  ", regex=False)


In [213]:
airports_info["y_passengers"] = airports_info["y_passengers"].str.replace(r" ", "", regex=True)
airports_info["y_passengers"].astype(int)

0     60220984
1         2644
2     15747678
3       775393
4      1974850
5        80181
6     49220984
7      6336441
8         4053
9         5938
10     1039429
11     1586463
12    13961638
13     6020413
14     1252022
15     8212943
16         276
17     8931598
18      904000
19       63442
20       16728
21        1956
22    22344373
23      501069
24     4045215
25      877796
26    31105987
27      197509
28     1045419
29        6207
30       21083
31      482662
32     3537445
33     8071524
34     1242089
35       10061
36     6120550
37    12337244
38     1368821
39      113318
40      301241
41     9948141
42      208923
43     1136157
44      309929
45      685690
Name: y_passengers, dtype: int32

In [218]:
airports_info["airport"] = airports_info["airport"].replace("La Coruña", "A Coruña")


In [222]:
airports_info.sort_values(by = "airport", ascending = True).reset_index(inplace=True)
airports_info.head()

Unnamed: 0,airport,y_passengers
0,Adolfo Suárez Madrid - Barajas,60220984
1,Albacete​,2644
2,Alicante - Elche Miguel Hernández,15747678
3,Almería,775393
4,Asturias,1974850


In [223]:
df_airports= df_airports.sort_values(by = "airport", ascending = True)
df_airports


Unnamed: 0,airport,latitude,longitude
20,A Coruña,43.302718,-8.377502
21,Adolfo Suárez Madrid - Barajas,40.49208,-3.576319
22,Albacete,38.949634,-1.863164
24,Alicante - Elche,38.285826,-0.563349
4,Almería,36.844781,-2.371234
14,Asturias,43.562196,-6.032618
2,Badajoz,38.889845,-6.821121
25,Barcelona - El prat,41.296302,2.085025
18,Bilbao,43.302507,-2.908358
26,Burgos,42.35433,-3.621685


In [225]:
# Show all rows and columns
pd.set_option("display.max_rows", None)  # Show all rows
pd.set_option("display.max_columns", None)  # Show all columns
airports = pd.merge(left = df_airports, right = airports_info, on= "airport", how="outer")
airports.to_csv("../data/marina.csv")

In [None]:
title = "Airports in Spain"

lats = df_airports.latitude
lons = df_airports.longitude

fig = px.scatter_map(df_airports, 
                     lat=lats, 
                     lon=lons,
                     color_continuous_scale=px.colors.carto.Aggrnyl,
                     zoom=5,
                     size_max=50  # Increase max size of markers
                     )

# Adjust the size reference to make small points more visible
fig.update_traces(marker=dict(sizeref=1000))  # Decrease this value to make points larger

fig.update_geos(fitbounds="locations")
fig.update_layout(height=1000, width=1000)
fig.update_layout(mapbox_style="open-street-map")
fig.update_layout(margin={"r":50,"t":50,"l":50,"b":50})
fig.update_layout(
    coloraxis_colorbar=dict(title='asdf')
)
fig.update_layout(title="Airports in Spain ")

fig.show()

In [None]:
df_airports.to_csv("../data/processed/filtered_airports.csv", index=False)

### Trains

In [None]:
df_trains = pd.read_csv("../data/raw/listado_completo_av_ld_md.csv", sep=";", decimal=",")
df_trains = df_trains.drop(["PAIS", "C.P.", "DIRECCIÓN"], axis = 1)
df_trains.rename(columns={" CÓDIGO": "station_code",
                          "DESCRIPCION": "station_name",
                          "LATITUD": "latitude",
                          "POBLACION": "municipality",
                          "PROVINCIA": "province",
                          "LONGITUD": "longitude"}, inplace=True)

In [None]:
df_trains.isnull().sum()

In [None]:
cleaner = AccentCleaner([df_trains], columns=["province", "municipality"])
cleaner.cleanAccents()
df_trains

In [None]:
df_trains["municipality_clean"] = df_trains["municipality_clean"].str.split("/", expand=False)
df_trains["province_clean"] = df_trains["province_clean"].str.split("/", expand=False)

df_trains["municipality_clean"] = df_trains["municipality_clean"].apply(
    lambda x: x[0] if len(x) == 1 else x[1]
)
df_trains["province_clean"] = df_trains["province_clean"].apply(
    lambda x: x[0] if len(x) == 1 else x[1]
)




In [None]:
df_trains.sample(20)

In [None]:
df_trains['station_name'] = df_trains['station_name'].str.title()
df_trains['municipality'] = df_trains['municipality'].str.lower()
df_trains['province'] = df_trains['province'].str.lower()

In [None]:
if len(df_trains["province"]):
    df_trains.drop(columns=["province", "municipality"], inplace=True)

df_trains

In [None]:
title = "Train Stations (LD, MD, High Speed) in Spain"

lats = df_trains.latitude
lons = df_trains.longitude

fig = px.scatter_map(df_trains, 
                     lat=lats, 
                     lon=lons,
                     hover_data=["municipality_clean", "station_name"], 
                     color_continuous_scale=px.colors.carto.Aggrnyl,
                     zoom=5,
                     size_max=50  # Increase max size of markers
                     )

# Adjust the size reference to make small points more visible
fig.update_traces(marker=dict(sizeref=1000))  # Decrease this value to make points larger

fig.update_geos(fitbounds="locations")
fig.update_layout(height=1000, width=1000)
fig.update_layout(mapbox_style="open-street-map")
fig.update_layout(margin={"r":50,"t":50,"l":50,"b":50})
fig.update_layout(
    coloraxis_colorbar=dict(title='asdf')
)
fig.update_layout(title="Train Stations in Spain ")

fig.show()

In [None]:
df_trains.to_csv("../data/processed/filtered_trains.csv", index=False)