In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import plotly.express as px
import matplotlib.pyplot as plt
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler, MinMaxScaler
# Never truncate the column list when a DataFrame is rendered.
pd.options.display.max_columns = None
# eye candy plots (the stylesheet is fetched from GitHub, so this line needs network access)
plt.style.use(
    'https://github.com/dhaitz/matplotlib-stylesheets/raw/master/pitayasmoothie-light.mplstyle'
)

In [None]:
# Read the processed CSV into `df`; the cell output is its (rows, columns) shape.
df = pd.read_csv(filepath_or_buffer="../../data/processed/2_aggregated_pueblos.csv")
df.shape

In [None]:
# Exploratory look-ups: neither is the last expression of the cell, so nothing
# is rendered for them when the cell runs.
df['province'].unique()
df[df['towns_in_vicinity'].isna()]

# Keep only the rows that have a value in `towns_in_vicinity`.
df = df[df['towns_in_vicinity'].notna()]


In [None]:
# Lookup from province name to the autonomous community it belongs to.
# The entries appear to follow the INE province-code order (TODO confirm), which
# is why 'Las Palmas' sits between Palencia and Pontevedra. It was missing from
# the original table, so towns in that province ended up with no community.
# Ceuta and Melilla are not listed; a province name absent from this dict yields
# NaN when the dict is used with Series.map.
province_to_autonomous_community = {
    'Álava': 'País Vasco',
    'Albacete': 'Castilla-La Mancha',
    'Alicante': 'Comunidad Valenciana',
    'Almería': 'Andalucía',
    'Ávila': 'Castilla y León',
    'Badajoz': 'Extremadura',
    'Illes Balears': 'Illes Balears',
    'Barcelona': 'Cataluña',
    'Burgos': 'Castilla y León',
    'Cáceres': 'Extremadura',
    'Cádiz': 'Andalucía',
    'Castellón': 'Comunidad Valenciana',
    'Ciudad Real': 'Castilla-La Mancha',
    'Córdoba': 'Andalucía',
    'A Coruña': 'Galicia',
    'Cuenca': 'Castilla-La Mancha',
    'Girona': 'Cataluña',
    'Granada': 'Andalucía',
    'Guadalajara': 'Castilla-La Mancha',
    'Gipuzkoa': 'País Vasco',
    'Huelva': 'Andalucía',
    'Huesca': 'Aragón',
    'Jaén': 'Andalucía',
    'León': 'Castilla y León',
    'Lleida': 'Cataluña',
    'La Rioja': 'La Rioja',
    'Lugo': 'Galicia',
    'Madrid': 'Comunidad de Madrid',
    'Málaga': 'Andalucía',
    'Murcia': 'Murcia',
    'Navarra': 'Navarra',
    'Ourense': 'Galicia',
    'Asturias': 'Asturias',
    'Palencia': 'Castilla y León',
    'Las Palmas': 'Canarias',
    'Pontevedra': 'Galicia',
    'Salamanca': 'Castilla y León',
    'Santa Cruz de Tenerife': 'Canarias',
    'Cantabria': 'Cantabria',
    'Segovia': 'Castilla y León',
    'Sevilla': 'Andalucía',
    'Soria': 'Castilla y León',
    'Tarragona': 'Cataluña',
    'Teruel': 'Aragón',
    'Toledo': 'Castilla-La Mancha',
    'Valencia': 'Comunidad Valenciana',
    'Valladolid': 'Castilla y León',
    'Bizkaia': 'País Vasco',
    'Zamora': 'Castilla y León',
    'Zaragoza': 'Aragón'
}

In [None]:
# Derive each town's autonomous community from its province; provinces that are
# not keys of the lookup dict come out as NaN.
df = df.assign(autonomous_community=df['province'].map(province_to_autonomous_community))

In [None]:
# Remove every column whose name starts with 'n_', then order the rows by
# latitude and longitude, both descending.
df = (
    df.drop(columns=[name for name in df.columns if name.startswith('n_')])
      .sort_values(by=['latitude', 'longitude'], ascending=False)
)

## Scoring Hospitals and Schools

In [None]:
# Replace the index left over from filtering/sorting with a fresh 0..n-1 index
# (the old index is discarded), then display the frame.
df = df.reset_index(drop=True)
df

In [None]:


# Scoring rule: cut each "closest distance" column into categories and give
# nearer facilities a higher score.
#
# define bins for the hospital / school distance (adjust thresholds as needed).
# The outermost edges are open-ended (-inf / +inf) instead of the observed
# min()/max(): pd.cut requires monotonically increasing edges, so using
# df[...].max() as the last edge fails with "bins must increase monotonically"
# as soon as no town is farther than 30, and min() - 0.001 fails the same way
# when every town is farther than 5. For data that spans both thresholds the
# resulting categories are identical to the min/max version.
hospital_bins = [-np.inf, 5, 10, 15, 20, 25, 30, np.inf]
school_bins = [-np.inf, 5, 10, 15, 20, 25, 30, np.inf]
labels = ['<5', '5-10', '10-15', '15-20', '20-25', '25-30', '30+']

# bin the values into categories (pd.cut intervals are right-closed by default,
# so a distance of exactly 5 is labelled '<5')
df['closest_distance_hospital_bin'] = pd.cut(df['closest_distance_hospital'], bins=hospital_bins, labels=labels, include_lowest=True)
df['closest_distance_school_bin'] = pd.cut(df['closest_distance_school'], bins=school_bins, labels=labels, include_lowest=True)

# apply score based on the bins (10 is best)
score_mapping = {
    '<5': 10,
    '5-10': 8,
    '10-15': 6,
    '15-20': 4,
    '20-25': 3,
    '25-30': 2,
    '30+': 1
}

# Rows with a missing distance have no bin and therefore no score (NaN).
df['hospital_distance_score'] = df['closest_distance_hospital_bin'].map(score_mapping)
df['school_distance_score'] = df['closest_distance_school_bin'].map(score_mapping)

# second step: blend each town's own score with the scores of the towns around it
def _parse_vicinity(raw):
    """Turn one `towns_in_vicinity` cell into a list of town identifiers.

    The on-disk format of that column is not visible in this notebook, so the
    parser is deliberately lenient. It accepts an actual list-like, or a string
    such as "[1001, 1002]", "['01001', '01002']", "1001,1002" or "1001 1002".
    Missing or empty values give an empty list.
    """
    if isinstance(raw, str):
        inner = raw.strip().strip("[](){}")
        # Comma-separated when commas are present, otherwise whitespace-separated.
        separator = "," if "," in inner else None
        tokens = (token.strip().strip("'\"") for token in inner.split(separator))
        return [token for token in tokens if token]
    if isinstance(raw, (list, tuple, set, frozenset, np.ndarray, pd.Series, pd.Index)):
        return [item for item in raw if not pd.isna(item)]
    if raw is None or pd.isna(raw):
        return []
    return [raw]


def combine_scores(row, column, vicinity_column="towns_in_vicinity", id_column="cmun", frame=None):
    """Average a town's own score with the mean score of its neighbouring towns.

    The previous version read the neighbour list from ``row[column]`` (the score
    itself), so the lookup compared town ids with score values and the result
    was effectively ``base_score / 2``. The neighbours are now taken from
    ``vicinity_column``.

    Parameters
    ----------
    row : pandas.Series
        One row of the towns table (as passed by ``DataFrame.apply(axis=1)``).
    column : str
        Name of the score column to combine.
    vicinity_column : str, default "towns_in_vicinity"
        Column holding the identifiers of nearby towns. Assumes the entries are
        values of ``id_column`` -- TODO confirm against the upstream step.
    id_column : str, default "cmun"
        Column whose values identify a town.
    frame : pandas.DataFrame, optional
        Table in which neighbours are looked up; defaults to the notebook-level
        ``df``.

    Returns
    -------
    The town's own score when it has no neighbours with a usable score,
    otherwise ``(own score + mean neighbour score) / 2``.
    """
    data = df if frame is None else frame
    base_score = row[column]

    neighbor_ids = _parse_vicinity(row[vicinity_column])
    if not neighbor_ids:
        return base_score

    # Bring the parsed ids to the same kind (numeric vs. text) as the id column,
    # otherwise isin() would never match e.g. "1001" against 1001.
    wanted = pd.Series(neighbor_ids, dtype=object)
    if pd.api.types.is_numeric_dtype(data[id_column]):
        wanted = pd.to_numeric(wanted, errors="coerce").dropna()
    else:
        wanted = wanted.astype(str)

    # astype(float): the score columns can be of category dtype (they come from
    # mapping a categorical), and categoricals do not support mean().
    neighbor_scores = data.loc[data[id_column].isin(wanted), column].astype(float).dropna()
    if neighbor_scores.empty:
        # No neighbour has a score: keep the town's own score instead of
        # averaging it with an artificial 0.
        return base_score
    return (base_score + neighbor_scores.mean()) / 2

# Row-wise: compute the area-level hospital and school scores by passing each
# row, together with the score column to use, to combine_scores.
df["hospital_score_area"] = df.apply(combine_scores, axis=1, column="hospital_distance_score")
df["school_score_area"] = df.apply(combine_scores, axis=1, column="school_distance_score")

In [None]:
# The categorical bin columns were only needed to derive the scores.
df = df.drop(columns=["closest_distance_hospital_bin", "closest_distance_school_bin"])

In [None]:
# Score applied to the train-distance categories (10 is best).
score_mapping = {
    '<5': 10,
    '5-10': 8,
    '10-15': 6,
    '15-20': 4,
    '20-25': 3,
    '25-30': 2,
    '30+': 1
}

# Airports use wider distance categories, hence their own score table.
airport_score_mapping = {
    '<10': 10,
    '10-15': 8,
    '15-30': 6,
    '30-50': 4,
    '50-100': 3,
    '100+': 1
}

# Not used further in the visible cells; kept so later code that may rely on it still works.
df_transport = df[['closest_distance_train', 'closest_distance_airport' ]]

# The outermost edges are open-ended (-inf / +inf) instead of the observed
# min()/max(): pd.cut requires monotonically increasing edges, so an edge list
# ending in df[...].max() fails with "bins must increase monotonically" when no
# town is farther than the last threshold (30 for trains, 100 for airports), and
# one starting at min() - 0.001 fails when every town is beyond the first one.
train_bins = [-np.inf, 5, 10, 15, 20, 25, 30, np.inf]
train_labels = ['<5', '5-10', '10-15', '15-20', '20-25', '25-30', '30+']

airport_bins = [-np.inf, 10, 15, 30, 50, 100, np.inf]
airport_labels = ['<10', '10-15', '15-30', '30-50', '50-100', '100+']

# bin the train and airport distances using the provided bins and labels
df['train_distance_bin'] = pd.cut(df['closest_distance_train'], bins=train_bins, labels=train_labels, include_lowest=True)
df['airport_distance_bin'] = pd.cut(df['closest_distance_airport'], bins=airport_bins, labels=airport_labels, include_lowest=True)

# map the binned categories to scores using the score mappings
df['train_distance_score'] = df['train_distance_bin'].map(score_mapping)
df['airport_distance_score'] = df['airport_distance_bin'].map(airport_score_mapping)

# compute a weighted transportation score (equal weight is used here; adjust weights if needed).
# pd.to_numeric turns the mapped scores into plain numbers before they are averaged.
df['transport_score'] = (pd.to_numeric(df['train_distance_score']) + pd.to_numeric(df['airport_distance_score'])) / 2



In [None]:
# Spot-check six random rows (no seed is set, so the selection changes between runs).
df.sample(n=6)

In [None]:
# Remove every helper column whose name ends in '_bin' before exporting.
df = df.drop(columns=[name for name in df.columns if name.endswith('_bin')])

In [151]:
# Write the scored table to the processed-data folder, without the index column.
df.to_csv(path_or_buf="../../data/processed/3_aggregated_pueblos.csv", index=False)