In [1]:
import sys
import os
import re
sys.path.insert(0, os.path.abspath(os.path.join(os.getcwd(), '..')))
import pandas as pd
import plotly.express as px
import numpy as np
import geopandas as gpd
from pyproj import Transformer
from scripts.utils import split_column_at
from scripts.accent_cleaner import AccentCleaner
#from deep_translator import GoogleTranslator

### Airports

In [2]:
# Read the GeoJSON file
gdf = gpd.read_file('../data/raw/spanish_airports.geojson')
# Ensure that the geometry column contains points
# Extract latitude and longitude from the geometry column
gdf['latitude'] = gdf.geometry.y
gdf['longitude'] = gdf.geometry.x

# Now drop the geometry column if you don't need it anymore
df_airports = gdf.drop(columns='geometry')

df_airports.rename(columns={"Texto": "airport_name", "OBJECTID": "airport_id", "lat": "latitude", "lon": "longitude"}, inplace=True)
df_airports.head()

Unnamed: 0,airport_id,airport_name,latitude,longitude
0,1,Aeropuerto de Jerez,36.743828,-6.062658
1,2,Aeropuerto de Vigo,42.22574,-8.630491
2,3,Aeropuerto de Badajoz,38.889845,-6.821121
3,4,Aeropuerto de Granada - Jaén F.G.L.,37.186826,-3.778613
4,5,Aeropuerto de Almería,36.844781,-2.371234


In [3]:
df_airports["airport_name"] = df_airports["airport_name"].str.replace(r"^Aeropuerto de ", "", regex=True)
df_airports.rename(columns={"airport_name": "airport"}, inplace=True)
df_airports.head()

Unnamed: 0,airport_id,airport,latitude,longitude
0,1,Jerez,36.743828,-6.062658
1,2,Vigo,42.22574,-8.630491
2,3,Badajoz,38.889845,-6.821121
3,4,Granada - Jaén F.G.L.,37.186826,-3.778613
4,5,Almería,36.844781,-2.371234


In [4]:
df_airports["airport"] = df_airports["airport"].replace("Girona - Costa Brava", "Gerona - Costa Brava")
df_airports["airport"] = df_airports["airport"].replace("Lanzarote", "César Manrique - Lanzarote")
df_airports["airport"] = df_airports["airport"].replace("Barcelona - El prat", "Josep Tarradellas Barcelona-El Prat")
df_airports["airport"] = df_airports["airport"].replace("Granada - Jaén F.G.L.", "Federico García Lorca Granada-Jaén")
df_airports["airport"] = df_airports["airport"].replace("Murcia - San Javier", "Internacional Región de Murcia")
df_airports["airport"] = df_airports["airport"].replace("Huesca Pirineos", "Huesca-Pirineos")
df_airports["airport"] = df_airports["airport"].replace("Tenerife Norte ", "Tenerife Norte-Ciudad de La Laguna")
df_airports["airport"] = df_airports["airport"].replace("Santiago", "Santiago-Rosalía de Castro")
df_airports["airport"] = df_airports["airport"].replace("Santander-Seve Ballesteros", "Severiano Ballesteros-Santander")
df_airports["airport"] = df_airports["airport"].replace("A Coruña", "La Coruña")
df_airports["airport"] = df_airports["airport"].replace("Alicante - Elche", "Alicante-Elche Miguel Hernández")

In [5]:
df_airports["airport_clean"] = df_airports["airport"].str.replace(" ", "", regex=False)
df_airports["airport_clean"] = df_airports["airport_clean"].astype(str)  # Ensure string type
df_airports["airport_clean"] = df_airports["airport_clean"].str.strip()  # Remove leading/trailing spaces
df_airports["airport_clean"] = df_airports["airport_clean"].str.replace(r"\s+", " ", regex=True)  # Remove extra spaces
df_airports["airport_clean"] = df_airports["airport_clean"].str.replace(r"[\u200b\u2060]", "", regex=True)  # Remove zero-width spaces
df_airports["airport_clean"] = df_airports["airport_clean"].str.lower()


df_airports.drop("airport_id", axis=1, inplace=True)
df_airports.head()

Unnamed: 0,airport,latitude,longitude,airport_clean
0,Jerez,36.743828,-6.062658,jerez
1,Vigo,42.22574,-8.630491,vigo
2,Badajoz,38.889845,-6.821121,badajoz
3,Federico García Lorca Granada-Jaén,37.186826,-3.778613,federicogarcíalorcagranada-jaén
4,Almería,36.844781,-2.371234,almería


In [6]:
airports_info = pd.read_csv("../data/raw/airports_info.csv")
airports_info.drop(["web-scraper-order", "web-scraper-start-url", "runways"], axis = 1, inplace = True)

In [7]:
airports_info.head()

Unnamed: 0,airport,y_passengers
0,Adolfo Suárez Madrid-Barajas,60 220 984
1,Albacete[3]​,2 644
2,Alicante-Elche Miguel Hernández,15 747 678
3,Almería,775 393
4,Asturias,1 974 850


In [8]:
airports_info["airport"] = airports_info["airport"].str.replace(r"\[.*?\]", "", regex=True).str.strip()
airports_info["airport_clean"] = airports_info["airport"].str.replace(" ", "", regex=False)
airports_info["airport_clean"] = airports_info["airport_clean"].astype(str)  # Ensure string type
airports_info["airport_clean"] = airports_info["airport_clean"].str.strip()  # Remove leading/trailing spaces
airports_info["airport_clean"] = airports_info["airport_clean"].str.replace(r"\s+", " ", regex=True)  # Remove extra spaces
airports_info["airport_clean"] = airports_info["airport_clean"].str.replace(r"[\u200b\u2060]", "", regex=True)  # Remove zero-width spaces
airports_info["airport_clean"] = airports_info["airport_clean"].str.lower()
airports_info.head()

Unnamed: 0,airport,y_passengers,airport_clean
0,Adolfo Suárez Madrid-Barajas,60 220 984,adolfosuárezmadrid-barajas
1,Albacete​,2 644,albacete
2,Alicante-Elche Miguel Hernández,15 747 678,alicante-elchemiguelhernández
3,Almería,775 393,almería
4,Asturias,1 974 850,asturias


In [9]:
airports_info["y_passengers"] = airports_info["y_passengers"].str.replace(r" ", "", regex=True)
airports_info["y_passengers"].astype(int)
airports_info.head(1)

Unnamed: 0,airport,y_passengers,airport_clean
0,Adolfo Suárez Madrid-Barajas,60220984,adolfosuárezmadrid-barajas


In [10]:
airports_info = airports_info.sort_values(by="airport_clean", ascending=True).reset_index(drop=True)

In [11]:
df_airports = df_airports.sort_values(by="airport_clean", ascending=True).reset_index(drop=True)


In [12]:
# Show all rows and columns
pd.set_option("display.max_rows", None)  # Show all rows
pd.set_option("display.max_columns", None)  # Show all columns
airports = pd.merge(left = df_airports, right = airports_info, on= "airport_clean", how="outer")
airports.isna().sum()

airport_x        0
latitude         0
longitude        0
airport_clean    0
airport_y        2
y_passengers     2
dtype: int64

In [13]:
airports = airports[airports["airport_x"] != "Helipuerto de Algeciras"]
airports = airports[airports["airport_x"] != "Helipuerto de Ceuta"]


In [14]:
airports = airports.drop(["airport_clean", "airport_y"], axis =1)

In [15]:
airports.head()

Unnamed: 0,airport_x,latitude,longitude,y_passengers
0,Adolfo Suárez Madrid - Barajas,40.49208,-3.576319,60220984
1,Albacete,38.949634,-1.863164,2644
2,Alicante-Elche Miguel Hernández,38.285826,-0.563349,15747678
3,Almería,36.844781,-2.371234,775393
4,Asturias,43.562196,-6.032618,1974850


In [16]:
airports.rename(columns={"airport_x": "airport"}, inplace=True)

In [17]:
# Convert 'y_passengers' to numeric, forcing errors to NaN if needed
airports["y_passengers"] = pd.to_numeric(airports["y_passengers"], errors="coerce")


In [18]:
airports.to_csv("../data/processed/filtered_files/filtered_airports.csv", index=False)

In [98]:
lats = airports.latitude
lons = airports.longitude

airports["scaled_size"] = airports["y_passengers"] / 80

fig = px.scatter_mapbox(airports, 
                        lat="latitude", 
                        lon="longitude",
                        size="scaled_size",  
                        hover_name="airport",  # Main hover title
                        hover_data={"y_passengers": True,  # Show passenger count
                                    "latitude": False,  # Hide latitude
                                    "longitude": False},  # Hide longitude
                        color_continuous_scale=px.colors.carto.Aggrnyl,
                        zoom=5,
                        size_max=5)


# Adjust the size reference to make small points more visible
fig.update_traces(marker=dict(sizeref=1000))  # Decrease this value to make points larger

fig.update_geos(fitbounds="locations")
fig.update_layout(height=600, width=1000)
fig.update_layout(mapbox_style="open-street-map")
fig.update_layout(margin={"r":50,"t":50,"l":50,"b":50})
fig.update_layout(
    coloraxis_colorbar=dict(title='Population')
)
fig.update_layout(title="Airports in Spain and traffic")

fig.show()


*scatter_mapbox* is deprecated! Use *scatter_map* instead. Learn more at: https://plotly.com/python/mapbox-to-maplibre/



### Trains - Medium Distance, Long Distance, High Speed

In [100]:
df_trains = pd.read_csv("../data/raw/listado_completo_av_ld_md.csv", sep=";", decimal=",")

In [101]:
df_trains.shape

(646, 9)

In [102]:
## get rid of unnecesary columns in trains
df_trains = df_trains.drop(["C.P.", "DIRECCIÓN"], axis = 1)

## rename to english
df_trains.rename(columns={" CÓDIGO": "station_code",
                          "DESCRIPCION": "station_name",
                          "LATITUD": "latitude",
                          "POBLACION": "municipality",
                          "PROVINCIA": "province",
                          "LONGITUD": "longitude"}, inplace=True)

In [103]:
df_trains = df_trains.drop(df_trains[df_trains["PAIS"] == "Francia"].index)
df_trains = df_trains.drop(df_trains[df_trains["PAIS"] == "Portugal"].index)
df_trains = df_trains.drop(["PAIS"], axis = 1)


In [104]:
df_trains['station_name'] = df_trains['station_name'].str.title()
df_trains['municipality'] = df_trains['municipality'].str.title()
df_trains['province'] = df_trains['province'].str.lower()

In [105]:
# df_trains["municipality_clean"] = df_trains["municipality_clean"].str.split("/", expand=False)
# df_trains["province_clean"] = df_trains["province_clean"].str.split("/", expand=False)

# df_trains["municipality_clean"] = df_trains["municipality_clean"].apply(
#     lambda x: x[0] if len(x) == 1 else x[1]
# )
# df_trains["province_clean"] = df_trains["province_clean"].apply(
#     lambda x: x[0] if len(x) == 1 else x[1]
# )

In [106]:
# if len(df_trains["province"]):
#     df_trains.drop(columns=["province", "municipality"], inplace=True)

df_trains.head()

Unnamed: 0,station_code,station_name,latitude,longitude,municipality,province
0,11208,Vitoria/Gasteiz,42.841528,-2.672665,Vitoria-Gasteiz,araba/álava
1,11212,Agurain/Salvatierra De Alava,42.846437,-2.389227,Salvatierra/Agurain,araba/álava
2,11213,Araia,42.869471,-2.306802,Arraia-Maeztu,araba/álava
3,11203,Manzanos,42.742875,-2.86753,Ribera Baja/Erribera Beitia,araba/álava
4,11205,Nanclares-Langraiz,42.816234,-2.804934,Iruña Oka/Iruña De Oca,araba/álava


In [107]:
title = "Train Stations (LD, MD, High Speed) in Spain"

lats = df_trains.latitude
lons = df_trains.longitude

fig = px.scatter_map(df_trains, 
                     lat=lats, 
                     lon=lons,
                     color_continuous_scale=px.colors.carto.Aggrnyl,
                     zoom=5,
                     size_max=50  # Increase max size of markers
                     )

# Adjust the size reference to make small points more visible
fig.update_traces(marker=dict(sizeref=1000))  # Decrease this value to make points larger

fig.update_geos(fitbounds="locations")
fig.update_layout(height=1000, width=1000)
fig.update_layout(mapbox_style="open-street-map")
fig.update_layout(margin={"r":50,"t":50,"l":50,"b":50})
fig.update_layout(
    coloraxis_colorbar=dict(title='asdf')
)
fig.update_layout(title="Train Stations in Spain ")

fig.show()

In [108]:
df_trains.to_csv("../data/processed/filtered_files/filtered_trains.csv", index=False)

### Trains - Regional

In [109]:
df_trains_regional = pd.read_csv("../data/raw/listado-estaciones-completo-act.csv",encoding="ISO-8859-1", sep=";", decimal=",")

In [110]:
df_trains_regional.head()

Unnamed: 0,CODIGO,DESCRIPCION,LATITUD,LONGITUD,DIRECION,CP,POBLACION,PROVINCIA,PAIS,CERCANIAS,FEVE,COMUN
0,1001,EL SORBITO (APD-CGD),37.208475,-5.706642,,,ALCALÁ DE GUADAÍRA,SEVILLA,ESPAÑA,NO,NO,
1,1002,LA TRINIDAD (APT-CGD),,,,,ALCALÁ DE GUADAÍRA,SEVILLA,ESPAÑA,NO,NO,
2,1003,ARAHAL,37.268081,-5.548514,"Calle Virgen de los Dolores, S/N",41600.0,ARAHAL,SEVILLA,ESPAÑA,NO,NO,
3,1004,PARADAS (APD-CGD),,,,,PARADAS,SEVILLA,ESPAÑA,NO,NO,
4,1005,MARCHENA,37.334282,-5.425519,"Avenida Maestro Santos Ruano, 8",41620.0,MARCHENA,SEVILLA,ESPAÑA,NO,NO,


In [111]:
df_trains_regional.isna().sum()

CODIGO            0
DESCRIPCION       2
LATITUD         597
LONGITUD        597
DIRECION       1233
CP             1268
POBLACION        21
PROVINCIA       154
PAIS              3
CERCANIAS         0
FEVE              0
COMUN          2704
dtype: int64

In [112]:
df_trains_regional = df_trains_regional.drop(["DIRECION", "PROVINCIA", "FEVE", "COMUN", "CP"], axis=1)
df_trains_regional.rename(columns={"CODIGO": "station_code",
                          "DESCRIPCION": "station_name",
                          "LATITUD": "latitude",
                          "POBLACION": "municipality",
                          "LONGITUD": "longitude",
                          "CERCANIAS": "suburban_train" }, inplace=True)

In [113]:
df_trains_regional["PAIS"].unique()

array(['ESPAÑA', nan, 'ITALIA', 'SUIZA', 'FRANCIA', 'PORTUGAL'],
      dtype=object)

In [114]:
df_trains_regional = df_trains_regional.drop(df_trains_regional[df_trains_regional["PAIS"] == "FRANCIA"].index)
df_trains_regional = df_trains_regional.drop(df_trains_regional[df_trains_regional["PAIS"] == "PORTUGAL"].index)
df_trains_regional = df_trains_regional.drop(df_trains_regional[df_trains_regional["PAIS"] == "SUIZA"].index)

df_trains_regional = df_trains_regional.drop(["PAIS"], axis = 1)

In [115]:
# I will remove these ones for now, to not overcomplicate. the municipality could be figured out at some point in the future. 
df_trains_regional = df_trains_regional.dropna(subset=["municipality"])

In [116]:
df_trains_regional['station_name'] = df_trains_regional['station_name'].str.title()
df_trains_regional['municipality'] = df_trains_regional['municipality'].str.title()

In [117]:
df_trains_regional.isna().sum()

station_code        0
station_name        2
latitude          532
longitude         532
municipality        0
suburban_train      0
dtype: int64

In [118]:
df_trains_regional[['municipality_1', 'municipality_2']] = df_trains_regional['municipality'].str.split('/', n=1, expand=True)

In [119]:
municipalities_to_change = ['Karrantza Harana',
       'Iruña Oka', 'Olazti', 'Altsasu', 'Donostia',
       'Peníscola',  'Borriana', 'Hiriberri']

# Assign "change" to the new column where language=True and municipality_1 is in the list
df_trains_regional.loc[
    (df_trains_regional["municipality_1"].isin(municipalities_to_change)), 
    "change"
] = "change"

In [120]:
df_trains_regional.loc[df_trains_regional["change"] == "change", ["municipality_1", "municipality_2"]] = df_trains_regional.loc[df_trains_regional["change"] == "change", ["municipality_2", "municipality_1"]].values

In [121]:
df_trains_regional.isna().sum()

station_code         0
station_name         2
latitude           532
longitude          532
municipality         0
suburban_train       0
municipality_1       0
municipality_2    2562
change            2626
dtype: int64

In [122]:
municipality_coordinates = pd.read_csv("../data/processed/filtered_files/filtered_municipalities.csv")
municipality_coordinates.head()

Unnamed: 0,cmun,municipality,municipality_1,municipality_2,province,autonomous_community,latitude,longitude,altitude
0,1001,Alegría-Dulantzi,alegriadulantzi,,Álava,País Vasco,42.84149,-2.513507,561.6857
1,1002,Amurrio,amurrio,,Álava,País Vasco,43.05265,-3.001022,219.691
2,1003,Aramaio,aramaio,,Álava,País Vasco,43.054,-2.566,381.8797
3,1004,Artziniega,artziniega,,Álava,País Vasco,43.1222,-3.128209,196.9808
4,1006,Armiñón,arminon,,Álava,País Vasco,42.72305,-2.872574,463.5815


In [123]:
from unidecode import unidecode

# Function to clean a column (remove accents, normalize case)
def clean_column(column):
    return column.apply(lambda x: unidecode(str(x)).strip().lower() if pd.notna(x) else x)

# Apply cleaning to df_municipalities
df_trains_regional["municipality_1"] = clean_column(df_trains_regional["municipality_1"])
df_trains_regional["municipality_2"] = clean_column(df_trains_regional["municipality_2"])

In [124]:
df_trains_regional["municipality_1"] = df_trains_regional["municipality_1"].str.strip()  # Remove leading/trailing spaces
df_trains_regional["municipality_1"] = df_trains_regional["municipality_1"].str.replace(r"\s+", " ", regex=True)  # Remove extra spaces
df_trains_regional["municipality_1"] = df_trains_regional["municipality_1"].str.replace(r"[\u200b\u2060]", "", regex=True)  # Remove zero-width spaces
df_trains_regional["municipality_1"] = df_trains_regional["municipality_1"].str.replace(" ", "", regex=True)  # Remove zero-width spaces
df_trains_regional['municipality_1'] = df_trains_regional['municipality_1'].str.replace(r'/.*', '', regex=True)
df_trains_regional['municipality_1'] = df_trains_regional['municipality_1'].str.replace(",","")
df_trains_regional['municipality_1'] = df_trains_regional['municipality_1'].str.replace("(","")
df_trains_regional['municipality_1'] = df_trains_regional['municipality_1'].str.replace(")","")

df_trains_regional["municipality_2"] = df_trains_regional["municipality_2"].str.strip()  # Remove leading/trailing spaces
df_trains_regional["municipality_2"] = df_trains_regional["municipality_2"].str.replace(r"\s+", " ", regex=True)  # Remove extra spaces
df_trains_regional["municipality_2"] = df_trains_regional["municipality_2"].str.replace(r"[\u200b\u2060]", "", regex=True)  # Remove zero-width spaces
df_trains_regional["municipality_2"] = df_trains_regional["municipality_2"].str.replace(" ", "", regex=True)  # Remove zero-width spaces
df_trains_regional['municipality_2'] = df_trains_regional['municipality_2'].str.replace(r'/.*', '', regex=True)
df_trains_regional['municipality_2'] = df_trains_regional['municipality_2'].str.replace(",","")
df_trains_regional['municipality_2'] = df_trains_regional['municipality_2'].str.replace("(","")
df_trains_regional['municipality_2'] = df_trains_regional['municipality_2'].str.replace(")","")

df_trains_regional.head()

Unnamed: 0,station_code,station_name,latitude,longitude,municipality,suburban_train,municipality_1,municipality_2,change
0,1001,El Sorbito (Apd-Cgd),37.208475,-5.706642,Alcalá De Guadaíra,NO,alcaladeguadaira,,
1,1002,La Trinidad (Apt-Cgd),,,Alcalá De Guadaíra,NO,alcaladeguadaira,,
2,1003,Arahal,37.268081,-5.548514,Arahal,NO,arahal,,
3,1004,Paradas (Apd-Cgd),,,Paradas,NO,paradas,,
4,1005,Marchena,37.334282,-5.425519,Marchena,NO,marchena,,


In [125]:
df_trains_regional.shape

(2643, 9)

In [126]:
df_trains_regional.isna().sum()

station_code         0
station_name         2
latitude           532
longitude          532
municipality         0
suburban_train       0
municipality_1       0
municipality_2    2562
change            2626
dtype: int64

---

In [127]:
merge_1 = df_trains_regional.merge(municipality_coordinates, on = "municipality_1", how="left")

In [128]:
merge_1.isna().sum()

station_code               0
station_name               2
latitude_x               532
longitude_x              532
municipality_x             0
suburban_train             0
municipality_1             0
municipality_2_x        2562
change                  2626
cmun                      98
municipality_y            98
municipality_2_y        2591
province                  98
autonomous_community      98
latitude_y                98
longitude_y               98
altitude                  98
dtype: int64

In [129]:
merge_1 = merge_1.dropna(subset=['latitude_y'])

In [130]:
merge_2 = df_trains_regional.merge(municipality_coordinates, left_on="municipality_1", right_on="municipality_2", how="left")

In [131]:
merge_2 = merge_2.dropna(subset=['latitude_y'])

In [132]:
merged_df = pd.concat([merge_1, merge_2])

In [133]:
merged_df.shape

(2547, 19)

In [134]:
merge_3 = df_trains_regional.merge(municipality_coordinates, left_on="municipality_2", right_on="municipality_1", how="left")

In [135]:
merge_3 = merge_3.dropna(subset=['latitude_y'])

In [136]:
merged_df = pd.concat([merged_df, merge_3])

In [137]:
merged_df.shape

(2557, 19)

In [138]:
# Fill missing values in municipality_2 with "none" (as a string)
df_trains_regional["municipality_2"] = df_trains_regional["municipality_2"].fillna("none")
# Fill missing values in municipality_2 with "none" (as a string)
municipality_coordinates["municipality_2"] = municipality_coordinates["municipality_2"].fillna("none")

In [139]:
df_trains_regional.head()

Unnamed: 0,station_code,station_name,latitude,longitude,municipality,suburban_train,municipality_1,municipality_2,change
0,1001,El Sorbito (Apd-Cgd),37.208475,-5.706642,Alcalá De Guadaíra,NO,alcaladeguadaira,none,
1,1002,La Trinidad (Apt-Cgd),,,Alcalá De Guadaíra,NO,alcaladeguadaira,none,
2,1003,Arahal,37.268081,-5.548514,Arahal,NO,arahal,none,
3,1004,Paradas (Apd-Cgd),,,Paradas,NO,paradas,none,
4,1005,Marchena,37.334282,-5.425519,Marchena,NO,marchena,none,


In [140]:
municipality_coordinates = municipality_coordinates[municipality_coordinates["municipality_2"] != "none"]
df_trains_regional = df_trains_regional[df_trains_regional["municipality_2"] != "none"]

In [141]:
# Perform merges on all combinations
merge_4 = df_trains_regional.merge(municipality_coordinates, left_on="municipality_2", right_on="municipality_2", how="left")

In [142]:
merge_4.isna().sum()

station_code             0
station_name             0
latitude_x              28
longitude_x             28
municipality_x           0
suburban_train           0
municipality_1_x         0
municipality_2           0
change                  64
cmun                    31
municipality_y          31
municipality_1_y        31
province                31
autonomous_community    31
latitude_y              31
longitude_y             31
altitude                31
dtype: int64

In [143]:
merge_4 = merge_4.dropna(subset=['latitude_y'])

In [144]:
# Concatenate results and drop duplicates
merged_df = pd.concat([merged_df, merge_4])

In [145]:
merged_df.isna().sum()

station_code               0
station_name               1
latitude_x               541
longitude_x              541
municipality_x             0
suburban_train             0
municipality_1            62
municipality_2_x        2531
change                  2595
cmun                       0
municipality_y             0
municipality_2_y        2553
province                   0
autonomous_community       0
latitude_y                 0
longitude_y                0
altitude                   0
municipality_1_x        2545
municipality_1_y        2545
municipality_2          2557
dtype: int64

In [146]:
# Replacing the NAs by the municipality coordinates. Not as accurate, but accurate enough. 
merged_df['latitude_x'].fillna(merged_df['latitude_y'], inplace=True)
merged_df['longitude_x'].fillna(merged_df['longitude_y'], inplace=True)


A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method.
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.




A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method.
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.





In [147]:
merged_df.isna().sum()

station_code               0
station_name               1
latitude_x                 0
longitude_x                0
municipality_x             0
suburban_train             0
municipality_1            62
municipality_2_x        2531
change                  2595
cmun                       0
municipality_y             0
municipality_2_y        2553
province                   0
autonomous_community       0
latitude_y                 0
longitude_y                0
altitude                   0
municipality_1_x        2545
municipality_1_y        2545
municipality_2          2557
dtype: int64

In [148]:
merged_df = merged_df[["cmun", "municipality_x", "latitude_x", "longitude_x", "station_code", "suburban_train", "station_name"]].drop_duplicates(keep="first")

merged_df.sample(10)

Unnamed: 0,cmun,municipality_x,latitude_x,longitude_x,station_code,suburban_train,station_name
1437,46131.0,Gandia,38.96902,-0.185187,43960,NO,Final Playa Gandia (Bus)
2568,15037.0,Fisterra,42.90507,-9.264338,99119,NO,Fisterra-Bus
1031,37156.0,Guijuelo,40.55514,-5.673656,30016,NO,Guijuelo Y Campillo
309,33055.0,Ribadedeva,43.38272,-4.55709,5579,NO,Colombres
483,47085.0,Medina Del Campo,41.317459,-4.910071,8240,NO,Medina Del Campo Alta Velocidad
1352,21010.0,Ayamonte,37.21466,-7.409819,42107,NO,Ayamonte (Apd)
1710,3066.0,Elda,38.484879,-0.80109,60905,NO,Elda-Petrer
993,36002.0,Barro,42.517801,-8.654609,23005,NO,Portela
501,28047.0,Collado Villalba,40.6265067,-4.0081871,10200,SI,Villalba De Guadarrama
1232,28079.0,Madrid,40.3666963,-3.7046002,35703,SI,Orcasitas


---

In [149]:
df_transformed = merged_df.groupby(["cmun", "municipality_x"]).agg(
    count_of_stations=("station_code", "count"),
    suburban_train=("suburban_train", lambda x: "yes" if "SI" in x.values else "no")
).reset_index()


In [150]:
df_transformed.head()

Unnamed: 0,cmun,municipality_x,count_of_stations,suburban_train
0,1002.0,Amurrio,5,yes
1,1009.0,Asparrena,1,no
2,1010.0,Ayala/Aiara,1,yes
3,1036.0,Laudio/Llodio,3,yes
4,1046.0,Erriberagoitia/Ribera Alta,2,yes


In [151]:
df_transformed["cmun"] = df_transformed["cmun"].astype("Int64")

In [152]:
df_transformed.rename(columns={"municipality_x": "municipality"}, inplace=True)

In [155]:
df_transformed.head()

Unnamed: 0,cmun,municipality,count_of_stations,suburban_train
0,1002,Amurrio,5,1
1,1009,Asparrena,1,0
2,1010,Ayala/Aiara,1,1
3,1036,Laudio/Llodio,3,1
4,1046,Erriberagoitia/Ribera Alta,2,1


In [154]:
df_transformed['suburban_train'] = df_transformed['suburban_train'].str.lower().map({'yes': 1, 'no': 0})


In [156]:
df_transformed.to_csv("../data/processed/filtered_files/filtered_regional_trains.csv", index=False)