In [1]:
#IMPORTS
import json
import geopandas as gpd
import numpy as numpy
import pandas as pd


# Setting up barrios_gdf for cleaning

In this section, we read `barrios_caracteristicas_final.geojson` from `datos/datos_out` and load it into a GeoDataFrame using the geopandas library.

In [2]:
# Load the barrio (neighbourhood) characteristics GeoJSON into a GeoDataFrame.

# Location of the GeoJSON file, relative to this notebook.
path = '../datos/datos_out/barrios_caracteristicas_final.geojson'

# Read the raw GeoJSON. The GeoJSON specification (RFC 7946) mandates UTF-8,
# so decode explicitly instead of relying on the platform default encoding.
with open(path, encoding='utf-8') as json_file:
    json_data = json.load(json_file)

# Keep every feature (geometry plus properties) of the collection.
barrios_json = list(json_data['features'])

# Build the GeoDataFrame and declare EPSG:4326 as its CRS, the projection
# used for GPS (longitude/latitude) coordinates.
barrios_gdf = gpd.GeoDataFrame.from_features(barrios_json, crs='epsg:4326')

In [3]:
# Convert the GeoDataFrame into a plain pandas DataFrame for the cleaning steps.
# The geometry column is carried over as a regular column.

barrios_df = pd.DataFrame(barrios_gdf)

# Display the resulting frame.
barrios_df


Unnamed: 0,geometry,coddistrit,gis_gis_barrios_area,object_id_barrio,linkid,codbarrio,coddistbar,geo_point_2d,nombre_barrio,last_edited_user,...,num_colegios,id_caract_num_colegios,num_chargestations,id_caract_num_chargestations,pm25,id_caract_pm25,num_contenedores,id_caract_num_contenedores,num_transporte,id_caract_num_transporte
0,"POLYGON ((-0.33459 39.45478, -0.33326 39.45487...",11,917112.56250,62,0,5,115,"[39.44628964870906, -0.3326600366971329]",NATZARET,,...,6.0,2,,8,10.0,5,184.0,7,11.0,1
1,"POLYGON ((-0.38124 39.45463, -0.38281 39.44951...",9,374887.53125,63,0,3,093,"[39.45082750748332, -0.3853982961775011]",LA CREU COBERTA,,...,1.0,2,2.0,8,21.0,5,162.0,7,8.0,1
2,"POLYGON ((-0.34709 39.47548, -0.34144 39.47379...",13,,76,0,2,132,"[39.47198431553791, -0.3450427029615464]",CIUTAT JARDI,,...,5.0,2,,8,12.0,5,238.0,7,10.0,1
3,"POLYGON ((-0.28767 39.55682, -0.28829 39.55654...",17,,601,18,5,175,"[39.55880504953868, -0.3031699565111558]",RAFALELL-VISTABELLA,,...,,2,,8,12.0,5,,7,,1
4,"POLYGON ((-0.33151 39.48561, -0.33254 39.48046...",13,,137,0,5,135,"[39.48035805242704, -0.3410905938386986]",LA CARRASCA,,...,10.0,2,,8,12.0,5,237.0,7,21.0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
83,"POLYGON ((-0.39047 39.47825, -0.38996 39.47754...",3,496668.75000,28,0,3,033,"[39.473565513491664, -0.39098170775459257]",LA PETXINA,,...,11.0,2,,8,12.0,5,279.0,7,14.0,1
84,"POLYGON ((-0.37567 39.47381, -0.37423 39.47351...",1,438658.31250,37,0,6,016,"[39.47010643870186, -0.3760489821791233]",SANT FRANCESC,,...,2.0,2,2.0,8,12.0,5,580.0,7,27.0,1
85,"POLYGON ((-0.35018 39.42223, -0.35011 39.42227...",19,,249,0,3,193,"[39.406290326215334, -0.33967854582733203]",PINEDO,,...,2.0,2,,8,10.0,5,185.0,7,14.0,1
86,"POLYGON ((-0.36720 39.50425, -0.36767 39.50338...",15,,201,0,1,151,"[39.494312122911424, -0.3699310988383476]",ELS ORRIOLS,,...,9.0,2,,8,,5,214.0,7,21.0,1


In [4]:
# List every column currently present in barrios_df.
barrios_df.columns

Index(['geometry', 'coddistrit', 'gis_gis_barrios_area', 'object_id_barrio',
       'linkid', 'codbarrio', 'coddistbar', 'geo_point_2d', 'nombre_barrio',
       'last_edited_user', 'last_edited_date', '%_zona_verde',
       'id_caract_%_zona_verde', 'nivel_acustico', 'id_caract_nivel_acustico',
       'num_hospitales', 'id_caract_num_hospitales', 'num_colegios',
       'id_caract_num_colegios', 'num_chargestations',
       'id_caract_num_chargestations', 'pm25', 'id_caract_pm25',
       'num_contenedores', 'id_caract_num_contenedores', 'num_transporte',
       'id_caract_num_transporte'],
      dtype='object')

In [5]:
# Drop the columns we do not need. object_id_barrio is kept as the unique
# identifier of each barrio, so the other code columns (coddistrit, linkid,
# codbarrio, coddistbar) and the edit-tracking columns (last_edited_user,
# last_edited_date) carry no relevant information.
# The area column is dropped as well because the area is computed with Python.
barrios_df = barrios_df.drop(
    columns=[
        'coddistrit',
        'linkid',
        'codbarrio',
        'coddistbar',
        'last_edited_user',
        'last_edited_date',
        'gis_gis_barrios_area',
    ]
)

# Check that the columns were removed correctly.
barrios_df.columns


Index(['geometry', 'object_id_barrio', 'geo_point_2d', 'nombre_barrio',
       '%_zona_verde', 'id_caract_%_zona_verde', 'nivel_acustico',
       'id_caract_nivel_acustico', 'num_hospitales',
       'id_caract_num_hospitales', 'num_colegios', 'id_caract_num_colegios',
       'num_chargestations', 'id_caract_num_chargestations', 'pm25',
       'id_caract_pm25', 'num_contenedores', 'id_caract_num_contenedores',
       'num_transporte', 'id_caract_num_transporte'],
      dtype='object')

In [6]:
# Remove the rows of the barrios that do not belong to Valencia's urban core:
# the chosen datasets are inconsistent and concentrate their data in the
# urban core. The excluded barrios are the ones named in `blacklist`.
blacklist = [
    'EL PALMAR',
    'EL PERELLONET',
    'EL SALER',
    'PINEDO',
    'LES CASES DE BARCENA',
    'MASSARROJOS',
    'MAHUELLA-TAULADELLA',
    'RAFALELL-VISTABELLA',
]

# Drop, by index label, every row whose name appears in the blacklist.
barrios_df = barrios_df.drop(
    index=barrios_df.index[barrios_df['nombre_barrio'].isin(blacklist)]
)

barrios_df

Unnamed: 0,geometry,object_id_barrio,geo_point_2d,nombre_barrio,%_zona_verde,id_caract_%_zona_verde,nivel_acustico,id_caract_nivel_acustico,num_hospitales,id_caract_num_hospitales,num_colegios,id_caract_num_colegios,num_chargestations,id_caract_num_chargestations,pm25,id_caract_pm25,num_contenedores,id_caract_num_contenedores,num_transporte,id_caract_num_transporte
0,"POLYGON ((-0.33459 39.45478, -0.33326 39.45487...",62,"[39.44628964870906, -0.3326600366971329]",NATZARET,0.118869,3,3.0,6,1.0,4,6.0,2,,8,10.0,5,184.0,7,11.0,1
1,"POLYGON ((-0.38124 39.45463, -0.38281 39.44951...",63,"[39.45082750748332, -0.3853982961775011]",LA CREU COBERTA,0.037566,3,3.5,6,,4,1.0,2,2.0,8,21.0,5,162.0,7,8.0,1
2,"POLYGON ((-0.34709 39.47548, -0.34144 39.47379...",76,"[39.47198431553791, -0.3450427029615464]",CIUTAT JARDI,0.205871,3,3.5,6,,4,5.0,2,,8,12.0,5,238.0,7,10.0,1
4,"POLYGON ((-0.33151 39.48561, -0.33254 39.48046...",137,"[39.48035805242704, -0.3410905938386986]",LA CARRASCA,0.187377,3,3.5,6,,4,10.0,2,,8,12.0,5,237.0,7,21.0,1
7,"POLYGON ((-0.35762 39.49410, -0.35762 39.49409...",202,"[39.49635509681184, -0.36282126402044845]",SANT LLORENS,0.076189,3,3.0,6,,4,8.0,2,,8,,5,270.0,7,11.0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
82,"POLYGON ((-0.38139 39.49868, -0.38120 39.49693...",9,"[39.493005640921645, -0.3910022045676008]",BENICALAP,0.154520,3,3.0,6,3.0,4,18.0,2,,8,26.0,5,831.0,7,43.0,1
83,"POLYGON ((-0.39047 39.47825, -0.38996 39.47754...",28,"[39.473565513491664, -0.39098170775459257]",LA PETXINA,0.087284,3,4.0,6,1.0,4,11.0,2,,8,12.0,5,279.0,7,14.0,1
84,"POLYGON ((-0.37567 39.47381, -0.37423 39.47351...",37,"[39.47010643870186, -0.3760489821791233]",SANT FRANCESC,0.134529,3,4.0,6,,4,2.0,2,2.0,8,12.0,5,580.0,7,27.0,1
86,"POLYGON ((-0.36720 39.50425, -0.36767 39.50338...",201,"[39.494312122911424, -0.3699310988383476]",ELS ORRIOLS,0.153242,3,3.0,6,1.0,4,9.0,2,,8,,5,214.0,7,21.0,1


In [7]:
# Summarise barrios_df: index range, non-null count and dtype of every column.
barrios_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 80 entries, 0 to 87
Data columns (total 20 columns):
 #   Column                        Non-Null Count  Dtype   
---  ------                        --------------  -----   
 0   geometry                      80 non-null     geometry
 1   object_id_barrio              80 non-null     int64   
 2   geo_point_2d                  80 non-null     object  
 3   nombre_barrio                 80 non-null     object  
 4   %_zona_verde                  76 non-null     float64 
 5   id_caract_%_zona_verde        80 non-null     int64   
 6   nivel_acustico                80 non-null     float64 
 7   id_caract_nivel_acustico      80 non-null     int64   
 8   num_hospitales                44 non-null     float64 
 9   id_caract_num_hospitales      80 non-null     int64   
 10  num_colegios                  76 non-null     float64 
 11  id_caract_num_colegios        80 non-null     int64   
 12  num_chargestations            15 non-null     float6

In [8]:
# Fill the missing values (NaN) with 0 for every quantitative feature
# (%_zona_verde, num_hospitales, num_colegios, num_chargestations,
# num_contenedores, num_transporte).
myList = ['%_zona_verde', 'num_hospitales', 'num_colegios', 'num_chargestations', 'num_contenedores', 'num_transporte']

barrios_df[myList] = barrios_df[myList].fillna(0)

# Verify the fill. The previous check was a bare filter expression in the
# middle of the cell, so its result was discarded and nothing was verified.
assert barrios_df[myList].notna().all().all(), 'NaN values remain after fillna(0)'

barrios_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 80 entries, 0 to 87
Data columns (total 20 columns):
 #   Column                        Non-Null Count  Dtype   
---  ------                        --------------  -----   
 0   geometry                      80 non-null     geometry
 1   object_id_barrio              80 non-null     int64   
 2   geo_point_2d                  80 non-null     object  
 3   nombre_barrio                 80 non-null     object  
 4   %_zona_verde                  80 non-null     float64 
 5   id_caract_%_zona_verde        80 non-null     int64   
 6   nivel_acustico                80 non-null     float64 
 7   id_caract_nivel_acustico      80 non-null     int64   
 8   num_hospitales                80 non-null     float64 
 9   id_caract_num_hospitales      80 non-null     int64   
 10  num_colegios                  80 non-null     float64 
 11  id_caract_num_colegios        80 non-null     int64   
 12  num_chargestations            80 non-null     float6

In [9]:
# CHANGING DATA TYPES
# Hospitals, schools, containers and transport stations are discrete counts
# that arrive as float64 in our frame; cast them to int64 in a single call.
myList = ['num_hospitales', 'num_colegios', 'num_contenedores', 'num_transporte']

barrios_df = barrios_df.astype({column: 'int64' for column in myList})

# Charging stations are present in very few barrios, so what matters is which
# barrios have any. The column is renamed from num_chargestations to
# chargestations (it no longer holds a number) and cast to bool: True when the
# barrio has charging stations, False when it has none.
barrios_df = barrios_df.rename(columns={'num_chargestations': 'chargestations'})
barrios_df['chargestations'] = barrios_df['chargestations'].astype('bool')

barrios_df.info()



<class 'pandas.core.frame.DataFrame'>
Int64Index: 80 entries, 0 to 87
Data columns (total 20 columns):
 #   Column                        Non-Null Count  Dtype   
---  ------                        --------------  -----   
 0   geometry                      80 non-null     geometry
 1   object_id_barrio              80 non-null     int64   
 2   geo_point_2d                  80 non-null     object  
 3   nombre_barrio                 80 non-null     object  
 4   %_zona_verde                  80 non-null     float64 
 5   id_caract_%_zona_verde        80 non-null     int64   
 6   nivel_acustico                80 non-null     float64 
 7   id_caract_nivel_acustico      80 non-null     int64   
 8   num_hospitales                80 non-null     int64   
 9   id_caract_num_hospitales      80 non-null     int64   
 10  num_colegios                  80 non-null     int64   
 11  id_caract_num_colegios        80 non-null     int64   
 12  chargestations                80 non-null     bool  

In [10]:
# Inspect the values of the chargestations column (renamed from num_chargestations):
barrios_df['chargestations']

# As expected, a barrio with charging stations is stored as True; otherwise it is False.

0     False
1      True
2     False
4     False
7     False
      ...  
82    False
83    False
84     True
86    False
87    False
Name: chargestations, Length: 80, dtype: bool

In [190]:
# Last step: deal with the nulls in the pm25 (pollution) index.
# Show the first barrios whose pm25 value is null.

barrios_df[barrios_df['pm25'].isna()].head()


Unnamed: 0,geometry,object_id_barrio,geo_point_2d,nombre_barrio,%_zona_verde,id_caract_%_zona_verde,nivel_acustico,id_caract_nivel_acustico,num_hospitales,id_caract_num_hospitales,num_colegios,id_caract_num_colegios,chargestations,id_caract_num_chargestations,pm25,id_caract_pm25,num_contenedores,id_caract_num_contenedores,num_transporte,id_caract_num_transporte
7,"POLYGON ((-0.35762 39.49410, -0.35762 39.49409...",202,"[39.49635509681184, -0.36282126402044845]",SANT LLORENS,0.076189,3,3.0,6,0,4,8,2,False,8,,5,270,7,11,1
10,"POLYGON ((-0.36720 39.51203, -0.36734 39.51182...",617,"[39.51492937567956, -0.37685442512193834]",CARPESA,0.0,3,2.0,6,1,4,0,2,False,8,,5,81,7,6,1
12,"POLYGON ((-0.37639 39.52831, -0.37667 39.52737...",5,"[39.525644468218076, -0.3846207009279064]",BENIFARAIG,0.0,3,2.5,6,1,4,2,2,False,8,,5,76,7,8,1
14,"POLYGON ((-0.38105 39.48223, -0.38023 39.48212...",22,"[39.47904419159028, -0.3799855271302711]",EL CARME,0.228795,3,3.5,6,0,4,5,2,False,8,,5,183,7,4,1
15,"POLYGON ((-0.36513 39.47901, -0.36339 39.47851...",25,"[39.47613782154427, -0.3636903649528479]",EXPOSICIO,0.260577,3,4.0,6,1,4,6,2,True,8,,5,227,7,12,1
16,"POLYGON ((-0.35822 39.47700, -0.35389 39.47585...",31,"[39.47098032023145, -0.3588449910815803]",MESTALLA,0.230933,3,3.0,6,1,4,12,2,True,8,,5,397,7,30,1
25,"POLYGON ((-0.36229 39.48483, -0.36274 39.48389...",21,"[39.48174281412897, -0.3630732531416211]",JAUME ROIG,0.145308,3,4.0,6,0,4,2,2,False,8,,5,159,7,6,1
27,"POLYGON ((-0.38582 39.44784, -0.38631 39.44640...",66,"[39.44497017390368, -0.3900252260136304]",SANT MARCEL.LI,0.2069,3,3.0,6,1,4,8,2,False,8,,5,178,7,9,1
29,"POLYGON ((-0.37059 39.47721, -0.37162 39.47676...",26,"[39.47662037964931, -0.3751184490096385]",LA SEU,0.077031,3,3.0,6,0,4,3,2,False,8,,5,149,7,4,1
35,"POLYGON ((-0.37122 39.49839, -0.37180 39.49703...",10,"[39.49519800261453, -0.3769320951602514]",TORREFIEL,0.160518,3,3.0,6,2,4,14,2,False,8,,5,444,7,29,1


In [191]:
# Fill the null pm25 values with the mean pm25 of the barrios that do have a
# value, so that every barrio ends up with a defined pollution index.
barrios_df = barrios_df.assign(
    pm25=lambda frame: frame['pm25'].fillna(frame['pm25'].mean())
)

barrios_df


Unnamed: 0,geometry,object_id_barrio,geo_point_2d,nombre_barrio,%_zona_verde,id_caract_%_zona_verde,nivel_acustico,id_caract_nivel_acustico,num_hospitales,id_caract_num_hospitales,num_colegios,id_caract_num_colegios,chargestations,id_caract_num_chargestations,pm25,id_caract_pm25,num_contenedores,id_caract_num_contenedores,num_transporte,id_caract_num_transporte
0,"POLYGON ((-0.33459 39.45478, -0.33326 39.45487...",62,"[39.44628964870906, -0.3326600366971329]",NATZARET,0.118869,3,3.0,6,1,4,6,2,False,8,10.000000,5,184,7,11,1
1,"POLYGON ((-0.38124 39.45463, -0.38281 39.44951...",63,"[39.45082750748332, -0.3853982961775011]",LA CREU COBERTA,0.037566,3,3.5,6,0,4,1,2,True,8,21.000000,5,162,7,8,1
2,"POLYGON ((-0.34709 39.47548, -0.34144 39.47379...",76,"[39.47198431553791, -0.3450427029615464]",CIUTAT JARDI,0.205871,3,3.5,6,0,4,5,2,False,8,12.000000,5,238,7,10,1
4,"POLYGON ((-0.33151 39.48561, -0.33254 39.48046...",137,"[39.48035805242704, -0.3410905938386986]",LA CARRASCA,0.187377,3,3.5,6,0,4,10,2,False,8,12.000000,5,237,7,21,1
7,"POLYGON ((-0.35762 39.49410, -0.35762 39.49409...",202,"[39.49635509681184, -0.36282126402044845]",SANT LLORENS,0.076189,3,3.0,6,0,4,8,2,False,8,15.641509,5,270,7,11,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
82,"POLYGON ((-0.38139 39.49868, -0.38120 39.49693...",9,"[39.493005640921645, -0.3910022045676008]",BENICALAP,0.154520,3,3.0,6,3,4,18,2,False,8,26.000000,5,831,7,43,1
83,"POLYGON ((-0.39047 39.47825, -0.38996 39.47754...",28,"[39.473565513491664, -0.39098170775459257]",LA PETXINA,0.087284,3,4.0,6,1,4,11,2,False,8,12.000000,5,279,7,14,1
84,"POLYGON ((-0.37567 39.47381, -0.37423 39.47351...",37,"[39.47010643870186, -0.3760489821791233]",SANT FRANCESC,0.134529,3,4.0,6,0,4,2,2,True,8,12.000000,5,580,7,27,1
86,"POLYGON ((-0.36720 39.50425, -0.36767 39.50338...",201,"[39.494312122911424, -0.3699310988383476]",ELS ORRIOLS,0.153242,3,3.0,6,1,4,9,2,False,8,15.641509,5,214,7,21,1


In [194]:
# Final check: count the nulls remaining in each column of the table.

barrios_df.isna().sum()

geometry                        0
object_id_barrio                0
geo_point_2d                    0
nombre_barrio                   0
%_zona_verde                    0
id_caract_%_zona_verde          0
nivel_acustico                  0
id_caract_nivel_acustico        0
num_hospitales                  0
id_caract_num_hospitales        0
num_colegios                    0
id_caract_num_colegios          0
chargestations                  0
id_caract_num_chargestations    0
pm25                            0
id_caract_pm25                  0
num_contenedores                0
id_caract_num_contenedores      0
num_transporte                  0
id_caract_num_transporte        0
dtype: int64