## Basic EDA for first incoming data

In [288]:
import pandas as pd
import plotly.express as px
import numpy as np

In [399]:
df_communities = pd.read_csv("../data/population_towns.csv", sep=";")
df_coordinates = pd.read_csv(
    "../data/coordinates_towns_spain.csv",
    encoding="ISO-8859-1",  # or "latin1"
    sep=";",
    decimal=","
)
df_hospitals = pd.read_csv("../data/hospitals_spain.csv")
df_hospitals.columns

Index(['OBJECTID', 'CODCNH', 'NOMBRE', 'DIRECCION', 'TELEFONO', 'TELEFONO2',
       'TELEFAX', 'CODMU', 'MUNICIPIOS', 'CODPROV', 'PROVINCIAS', 'CODAUTO',
       'COMUNIDADES', 'CODPOSTAL', 'NCAMAS', 'CODFI', 'FINALIDAD_ASISITENCIAL',
       'CODPAT', 'DEPENDENCIA_PATRIMONIAL', 'CODFU', 'DEPENDENCIA_FUNCIONAL',
       'ACREDOCENT', 'ESCOMPLE', 'FORCOMPLE', 'CODIDCOM', 'ALTA', 'CERRADO',
       'CAPITAL', 'CIERREFECH', 'CONCIERTO', 'EMAIL', 'TAC', 'RM', 'GAM',
       'HEM', 'ASD', 'LIT', 'BCO', 'ALI', 'SPECT', 'PET', 'MAMOS', 'DO',
       'DIAL', 'X', 'Y', 'CalidadGeocodificacion'],
      dtype='object')

The column names in the hospitals dataset are:

- **Identification**
    - `OBJECTID`: Object identifier
    - `CODCNH`: Hospital code
    - `NOMBRE`: Hospital name
    - `CODIDCOM`: Community identifier code

- **Contact Information**
    - `DIRECCION`: Address
    - `TELEFONO`: Phone
    - `TELEFONO2`: Secondary phone
    - `TELEFAX`: Fax
    - `EMAIL`: Email address
    - `CODPOSTAL`: Postal code

- **Location**
    - `CODMU`: Municipality code
    - `MUNICIPIOS`: Municipality name 
    - `CODPROV`: Province code
    - `PROVINCIAS`: Province name
    - `CODAUTO`: Autonomous community code 
    - `COMUNIDADES`: Autonomous community name
    - `X`: Longitude coordinate
    - `Y`: Latitude coordinate
    - `CalidadGeocodificacion`: Geocoding quality
    - `CAPITAL`: Capital city indicator

- **Hospital Information**
    - `NCAMAS`: Number of beds
    - `CODFI`: Healthcare purpose code
    - `FINALIDAD_ASISITENCIAL`: Healthcare purpose description
    - `CODPAT`: Asset ownership code
    - `DEPENDENCIA_PATRIMONIAL`: Asset ownership description
    - `CODFU`: Functional dependency code
    - `DEPENDENCIA_FUNCIONAL`: Functional dependency description

- **Status & Certifications**
    - `ACREDOCENT`: Teaching accreditation
    - `ESCOMPLE`: Complementary center
    - `FORCOMPLE`: Complementary form
    - `ALTA`: Active status
    - `CERRADO`: Closed status
    - `CIERREFECH`: Closing date
    - `CONCIERTO`: Agreement status

- **Equipment & Services**
    - `TAC`: CT scanner
    - `RM`: MRI
    - `GAM`: Gamma camera
    - `HEM`: Hemodynamics
    - `ASD`: Assistant diagnostic systems
    - `LIT`: Lithotripsy
    - `BCO`: Blood bank
    - `ALI`: Other
    - `SPECT`: SPECT scanner
    - `PET`: PET scanner
    - `MAMOS`: Mammography
    - `DO`: Other diagnostic
    - `DIAL`: Dialysis

In [None]:
hospital_dict = {
    'NOMBRE': 1,
    'CODMU': 1,
    'MUNICIPIOS': 1,
    'CODAUTO': 1,
    'COMUNIDADES': 1,
    'NCAMAS': 1,
    'FINALIDAD_ASISITENCIAL': 1,
    'DEPENDENCIA_PATRIMONIAL': 1,
    'X': 1,
    'Y': 1,
}

{'OBJECTID': 1,
 'CODCNH': 1,
 'NOMBRE': 1,
 'DIRECCION': 1,
 'TELEFONO': 1,
 'TELEFONO2': 1,
 'TELEFAX': 1,
 'CODMU': 1,
 'MUNICIPIOS': 1,
 'CODPROV': 1,
 'PROVINCIAS': 1,
 'CODAUTO': 1,
 'COMUNIDADES': 1,
 'CODPOSTAL': 1,
 'NCAMAS': 1,
 'CODFI': 1,
 'FINALIDAD_ASISITENCIAL': 1,
 'CODPAT': 1,
 'DEPENDENCIA_PATRIMONIAL': 1,
 'CODFU': 1,
 'DEPENDENCIA_FUNCIONAL': 1,
 'ACREDOCENT': 1,
 'ESCOMPLE': 1,
 'FORCOMPLE': 1,
 'CODIDCOM': 1,
 'ALTA': 1,
 'CERRADO': 1,
 'CAPITAL': 1,
 'CIERREFECH': 1,
 'CONCIERTO': 1,
 'EMAIL': 1,
 'TAC': 1,
 'RM': 1,
 'GAM': 1,
 'HEM': 1,
 'ASD': 1,
 'LIT': 1,
 'BCO': 1,
 'ALI': 1,
 'SPECT': 1,
 'PET': 1,
 'MAMOS': 1,
 'DO': 1,
 'DIAL': 1,
 'X': 1,
 'Y': 1,
 'CalidadGeocodificacion': 1}

In [400]:
df_hospitals

Unnamed: 0,OBJECTID,CODCNH,NOMBRE,DIRECCION,TELEFONO,TELEFONO2,TELEFAX,CODMU,MUNICIPIOS,CODPROV,...,BCO,ALI,SPECT,PET,MAMOS,DO,DIAL,X,Y,CalidadGeocodificacion
0,1,10035,RED DE SALUD MENTAL DE ARABA (HOSPITAL PSIQUIÁ...,"ÁLAVA, 43",945006555.0,,945006587.0,10590,Vitoria-Gasteiz,1,...,0,0,0,0,0,0,0,-2.678612,42.835656,Manual
1,2,10040,HOSPITAL SAN JOSÉ,"BEATO TOMÁS DE ZUMÁRRAGA, 10",945140900.0,,945145709.0,10590,Vitoria-Gasteiz,1,...,0,0,0,0,1,1,1,-2.676640,42.849661,PointAddress
2,3,10053,HOSPITAL QUIRÓNSALUD VITORIA,"ESPERANZA, 3",945252500.0,,945279260.0,10590,Vitoria-Gasteiz,1,...,0,0,0,0,1,1,0,-2.668035,42.849761,PointAddress
3,4,10066,"HOSPITAL DE CUIDADOS SAN ONOFRE, S.L. (HOSPITA...","SALVATIERRABIDE, 9",945142100.0,,945143318.0,10590,Vitoria-Gasteiz,1,...,0,0,0,0,0,0,0,-2.680386,42.837582,PointAddress
4,5,10088,HOSPITAL DE LEZA,"CTRA. VITORIA - LOGROÑO, KM. 59",945006900.0,945006876.0,945006901.0,10318,Laguardia,1,...,0,0,0,0,0,0,0,-2.638635,42.575989,Manual
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
919,920,500195,HOSPITAL DE REHABILITACION PSIQUIATRICA PRISMA,"TORRE DEL CEREZO, Nº 17 Bº. DE SAN JUAN DE MOZ...",976151152.0,,976151153.0,502973,Zaragoza,50,...,0,0,0,0,0,0,0,-0.841290,41.711449,Postal
920,921,500200,CENTRO SANITARIO CINCO VILLAS,"ANTONIO MACHADO, S/N",976677978.0,976677973.0,976677972.0,500956,Ejea de los Caballeros,50,...,0,0,0,0,1,0,0,-1.125290,42.126721,Manual
921,922,500218,HOSPITAL VIAMED MONTECANAL,"C/ FRANZ SCHUBERT, 2",876241818.0,,876241822.0,502973,Zaragoza,50,...,0,0,0,0,1,0,0,-0.948738,41.639042,PointAddress
922,923,510039,HOSPITAL UNIVERSITARIO DE CEUTA,"LOMA COLMENAR, S/N",856907000.0,,856907066.0,510013,Ceuta,51,...,0,0,0,0,1,1,0,-5.341853,35.880332,Manual


In [401]:
is_null = df_hospitals.isnull().sum().sort_values(ascending=False)
is_null


CIERREFECH                 923
CODIDCOM                   809
TELEFONO2                  574
EMAIL                      156
TELEFAX                     66
TELEFONO                     2
LIT                          0
CAPITAL                      0
CONCIERTO                    0
TAC                          0
RM                           0
GAM                          0
HEM                          0
ASD                          0
OBJECTID                     0
BCO                          0
CERRADO                      0
SPECT                        0
PET                          0
MAMOS                        0
DO                           0
DIAL                         0
X                            0
Y                            0
ALI                          0
FORCOMPLE                    0
ALTA                         0
CODPOSTAL                    0
NOMBRE                       0
DIRECCION                    0
CODMU                        0
MUNICIPIOS                   0
CODPROV 

In [291]:
columns_communities_english = ['province_code', 'province', 'municipality_code', 'municipality_name', 'population', 'male', 'female']
columns_coordinates_english = ['community', 'province', 'municipality_name', 'latitude', 'longitude', 'altitude',
    'population', 'male', 'female']


def map_cols_es_en(es: list, en: list):
    dict_cols = dict(zip(es, en))
    return dict_cols.values()


df_communities.columns = map_cols_es_en(df_communities.columns.to_list(), columns_communities_english)
df_coordinates.columns = map_cols_es_en(df_coordinates.columns.to_list(), columns_coordinates_english)

In [292]:
df_communities[df_communities['municipality_name'] == "Galapagar"]

Unnamed: 0,province_code,province,municipality_code,municipality_name,population,male,female
4350,28,Madrid,61,Galapagar,36184,17561,18623


In [293]:
df_coordinates[df_coordinates['municipality_name'] == "Galapagar"]

Unnamed: 0,community,province,municipality_name,latitude,longitude,altitude,population,male,female
6883,Madrid,Madrid,Galapagar,40.57736,-4.00357,884.4304,31820,15936,15884


In [356]:
df = pd.merge(df_communities, df_coordinates[["latitude", "longitude", "altitude", "municipality_name"]], 
              on='municipality_name', 
              how='outer')

In [362]:
df.dropna(thresh=4)

Unnamed: 0,province_code,province,municipality_code,municipality_name,population,male,female,latitude,longitude,altitude
0,44.0,Teruel,1.0,Ababuj,74.0,44.0,30.0,40.54846,-0.807783,1368.1570
1,40.0,Segovia,1.0,Abades,859.0,422.0,437.0,40.91547,-4.269130,972.9816
2,48.0,Bizkaia,1.0,Abadiño,7742.0,3858.0,3884.0,43.15000,-2.610278,139.5298
3,10.0,Cáceres,1.0,Abadía,331.0,183.0,148.0,40.26001,-5.977147,452.6463
4,27.0,Lugo,1.0,Abadín,2217.0,1103.0,1114.0,43.36666,-7.483333,475.1587
...,...,...,...,...,...,...,...,...,...,...
8902,8.0,Barcelona,153.0,Òrrius,805.0,405.0,400.0,41.55499,2.355113,254.5491
8903,42.0,Soria,134.0,Ólvega,3838.0,2034.0,1804.0,41.77938,-1.985618,1040.0870
8904,18.0,Granada,147.0,Órgiva,5674.0,2802.0,2872.0,36.90224,-3.423990,465.8732
8905,23.0,Jaén,92.0,Úbeda,33674.0,16380.0,17294.0,38.00809,-3.368519,737.4617


In [338]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8160 entries, 0 to 8159
Data columns (total 10 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   province_code      8160 non-null   int64  
 1   province           8160 non-null   object 
 2   municipality_code  8160 non-null   int64  
 3   municipality_name  8160 non-null   object 
 4   population         8160 non-null   int64  
 5   male               8160 non-null   int64  
 6   female             8160 non-null   int64  
 7   latitude           7393 non-null   float64
 8   longitude          7393 non-null   float64
 9   altitude           7393 non-null   float64
dtypes: float64(3), int64(5), object(2)
memory usage: 637.6+ KB


In [None]:
# Display rows with any NaN values and count them
nan_rows = df[df.isna().any(axis=1)]
nan_rows

Unnamed: 0,province_code,province,municipality_code,municipality_name,population,male,female,latitude,longitude,altitude
35,,,,Abárzuza,,,,42.72641,-2.022548,568.4764
36,31.0,Navarra,2.0,Abárzuza/Abartzuza,518.0,258.0,260.0,,,
37,,,,Acebeda (La),,,,41.08697,-3.624634,1266.5420
38,28.0,Madrid,1.0,"Acebeda, La",68.0,37.0,31.0,,,
41,,,,Acebrón (El),,,,39.90840,-2.984723,765.3969
...,...,...,...,...,...,...,...,...,...,...
8894,37.0,Salamanca,118.0,Éjeme,144.0,82.0,62.0,,,
8895,46.0,Valencia/València,119.0,"Énova, l'",924.0,477.0,447.0,,,
8897,4.0,Almería,54.0,Íllar,460.0,237.0,223.0,,,
8898,18.0,Granada,102.0,Íllora,9918.0,5012.0,4906.0,,,


In [355]:
title = "Communities in Spain"

lats = df.latitude
lons = df.longitude

fig = px.scatter_map(df, 
                     lat=lats, 
                     lon=lons,
                     hover_data=["municipality_name", "altitude"], 
                     size='population',
                     color='population',
                     color_continuous_scale=px.colors.carto.Aggrnyl,
                     zoom=5,
                     size_max=50  # Increase max size of markers
                     )

# Adjust the size reference to make small points more visible
fig.update_traces(marker=dict(sizeref=1000))  # Decrease this value to make points larger

fig.update_geos(fitbounds="locations")
fig.update_layout(height=1000, width=1000)
fig.update_layout(mapbox_style="open-street-map")
fig.update_layout(margin={"r":50,"t":50,"l":50,"b":50})
fig.update_layout(
    coloraxis_colorbar=dict(title='Population')
)
fig.update_layout(title="Communities in Spain by population size")

fig.show()

In [354]:
df[df["municipality_name"] == "València"]

Unnamed: 0,province_code,province,municipality_code,municipality_name,population,male,female,latitude,longitude,altitude
7259,46,Valencia/València,250,València,825948,391970,433978,,,


In [352]:
df_communities[df_communities["municipality_name"] == "València"]

Unnamed: 0,province_code,province,municipality_code,municipality_name,population,male,female
7234,46,Valencia/València,250,València,825948,391970,433978


In [353]:
df_coordinates[df_coordinates["municipality_name"] == "Valencia"]

Unnamed: 0,community,province,municipality_name,latitude,longitude,altitude,population,male,female
8094,Valencia,Valencia/València,Valencia,39.47024,-0.376805,23.3349,814208,392300,421908
