In [1]:
# Import necessary libraries
import pandas as pd
import numpy as np
from pycaret.classification import *
import json

# Visualization
import seaborn as sns
import matplotlib.pyplot as plt
plt.style.use("fivethirtyeight")

from sklearn.manifold import TSNE
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA

from sklearn.metrics import f1_score
from sklearn.linear_model import LogisticRegression, RidgeClassifier, Lasso, ElasticNet
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder

from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_squared_error, r2_score
from pycaret.regression import *
from sklearn.metrics import silhouette_samples, silhouette_score
import math
import random
import matplotlib.ticker as mtick
import re

from sklearn.decomposition import PCA
from sklearn.preprocessing import LabelEncoder

from neighborhoods import admin_wards
from neighborhoods import district_neighborhoods

In [2]:
# Seed every global RNG the notebook's libraries can draw from. pandas and
# scikit-learn use NumPy's global generator, which random.seed() alone does
# not touch.
RANDOM_SEED = 42
random.seed(RANDOM_SEED)
np.random.seed(RANDOM_SEED)

In [3]:
# Read the scraped listings; the trailing expression shows (rows, columns).
df = pd.read_csv(filepath_or_buffer='../data/raw/scraped_data_Emanuela.csv')
df.shape

(1066, 9)

In [4]:
# Neighborhood names in the scraped listings that are not an exact member of
# admin_wards. The list was previously created as `not_fount`, so the append
# below raised NameError on a fresh kernel.
not_found = [
    item
    for item in set(df['neighborhood'].values)
    if item not in admin_wards
]

# Display the result; a later cell rebinds not_found for the sports data.
not_found

In [5]:
def replace_str_values(df, column, input_tuple_list):
    """Apply a sequence of literal substring replacements to a string column.

    Each pair is applied in order with ``Series.str.replace``, so every
    occurrence of ``old`` inside a value is replaced (not only whole-value
    matches) and a later pair sees the result of the earlier ones.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the column. It is modified in place and also returned.
    column : str
        Name of the string column to rewrite.
    input_tuple_list : iterable of (old, new)
        Replacement pairs; ``item[0]`` is the text to find and ``item[1]``
        the text to put in its place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in.
    """
    for item in input_tuple_list:
        # regex=False: treat the search text literally. Names such as
        # 'Casco h.vallecas' contain '.', which pandas < 2.0 would otherwise
        # interpret as a regular-expression wildcard.
        df[column] = df[column].str.replace(item[0], item[1], regex=False)
    return df


# Sports Centers

In [6]:
# Read the sports-centers file, keep only the BARRIO column and discard rows
# where it is missing.
df_sports = pd.read_csv('../data/raw/sports_centers.csv', encoding='utf-8')[['BARRIO']].dropna()
# str.capitalize(): first character upper-cased, the rest lower-cased.
df_sports['BARRIO'] = df_sports['BARRIO'].map(lambda name: name.capitalize())

In [7]:
# Distinct BARRIO spellings that are not an exact member of admin_wards.
not_found = [
    barrio
    for barrio in set(df_sports['BARRIO'].values)
    if barrio not in admin_wards
]

len(not_found)

37

In [8]:
# Lower-cased ward names, index-aligned with admin_wards.
admin_wards_lower = [ward.lower() for ward in admin_wards]

# First ward for each lower-cased spelling (same choice list.index would make).
ward_by_lower = {}
for lowered, ward in zip(admin_wards_lower, admin_wards):
    ward_by_lower.setdefault(lowered, ward)

# Unmatched names that differ from a ward only by letter case are rewritten
# to that ward's exact spelling.
for barrio in not_found:
    canonical = ward_by_lower.get(barrio.lower())
    if canonical is not None:
        df_sports['BARRIO'] = df_sports['BARRIO'].replace(barrio, canonical)

In [9]:
# Re-run the membership check after the case-insensitive pass.
not_found = [
    barrio
    for barrio in set(df_sports['BARRIO'].values)
    if barrio not in admin_wards
]

len(not_found)

20

In [10]:
# (old, new) pairs for replace_str_values: BARRIO spellings from the
# sports-centers file and the text to substitute for each.
# The pairs are applied in list order via Series.str.replace, i.e. as
# substring replacements, so a short `old` text also rewrites any longer value
# that contains it.
# Most pairs only restore accents/capitalisation. 'Ambroz' and
# 'Palos de la frontera' are mapped to differently named wards — presumably
# alternative names for the same area; confirm against the ward list.
input_tuple_list = [('Casco h.vallecas', 'Casco Histórico de Vallecas'), 
                    ('Casco h.vicalvaro', 'Casco Histórico de Vicálvaro'), 
                    ('Los jeronimos', 'Los Jerónimos'), 
                    ('Los angeles', 'Los Ángeles'),
                    ('Ciudad jardin', 'Ciudad Jardín'),
                    ('El pilar', 'Pilar'),
                    ('Villaverde alto c.h.', 'Villaverde Alto'),
                    ('Moscardo', 'Moscardó'),
                    ('Puerta del angel', 'Puerta del Ángel'),
                    ('Fontarron', 'Fontarrón'),
                    ('Las aguilas', 'Las Águilas'),
                    ('Hispanoamerica', 'Hispanoamérica'),
                    ('Pacifico', 'Pacífico'),
                    ('El salvador', 'Salvador'),
                    ('Concepcion', 'Concepción'),
                    ('Niño jesus', 'Niño Jesús'),
                    ('Arguelles', 'Argüelles'),
                    ('Peña grande', 'Peñagrande'),
                    ('Entrevias', 'Entrevías'),
                    ('Rios rosas', 'Ríos Rosas'),
                    ('Apostol santiago', 'Apóstol Santiago'),
                    ('Ambroz', 'Casco Histórico de Vicálvaro'),
                    ('Los carmenes', 'Los Cármenes'),
                    ('El plantio', 'El Plantío'),
                    ('San andres', 'San Andrés'),
                    ('Zofio', 'Zofío'),
                    ('Timon', 'Timón'),
                    ('Hellin', 'Hellín'),
                    ('Valdemarin', 'Valdemarín'),
                    ('Casco h.vicálvaro', 'Casco Histórico de Vicálvaro'),
                    ('Casco h.barajas', 'Casco Histórico de Barajas'),
                    ('Palos de la frontera', 'Palos de Moguer'),
                    ('San fermin', 'San Fermín'),
                    ('San cristobal', 'San Cristóbal'),
                    ('Justicia/ centro', 'Justicia')
                    ]

In [11]:
# List the BARRIO spellings still missing from admin_wards, for comparison
# with the replacement pairs.
not_found = [
    barrio
    for barrio in set(df_sports['BARRIO'].values)
    if barrio not in admin_wards
]

not_found

['Peña grande',
 'Entrevias',
 'El pilar',
 'Casco h.vicalvaro',
 'Apostol santiago',
 'Pacifico',
 'Casco h.vallecas',
 'Justicia/ centro',
 'San andres',
 'Los carmenes',
 'Zofio',
 'San cristobal',
 'Ciudad jardin',
 'San fermin',
 'Los jeronimos',
 'Hellin',
 'Moscardo',
 'Las aguilas',
 'Ambroz',
 'Concepcion']

In [12]:
# Rewrite the remaining BARRIO spellings using the (old, new) pairs.
df_sports = replace_str_values(
    df=df_sports,
    column='BARRIO',
    input_tuple_list=input_tuple_list,
)

In [13]:
# Final check: an empty list means every BARRIO value is in admin_wards.
not_found = [
    barrio
    for barrio in set(df_sports['BARRIO'].values)
    if barrio not in admin_wards
]

not_found

[]

In [14]:
# One row per sports center: tag each with 1 and add the tags up per barrio.
df_sports['sports_centers'] = 1
# Select the column explicitly before summing. The previous
# `.sum('sports_centers')` passed the column name positionally as the
# `numeric_only` argument of GroupBy.sum and only worked because a non-empty
# string is truthy.
df_sports = (
    df_sports
    .groupby('BARRIO', as_index=False)['sports_centers']
    .sum()
    .rename(columns={'BARRIO': 'neighborhood'})
)
# The groupby leaves one row per neighborhood; `validate` makes the merge fail
# loudly rather than duplicate listings if that ever stops being true.
df = df.merge(df_sports, on='neighborhood', how='left', validate='many_to_one')
# Listings in neighborhoods absent from the sports data get a count of 0.
df['sports_centers'] = df['sports_centers'].fillna(0)

# Wellness Indices

In [15]:
# Read the wellness index file and keep only the rows whose 'Fecha datos'
# equals the largest value found in that column.
df_wellness = pd.read_csv('../data/raw/wellness_index.csv', encoding='utf-8')
max_year = df_wellness['Fecha datos'].max()
is_latest = df_wellness['Fecha datos'].eq(max_year)
df_wellness = df_wellness.loc[is_latest]

In [16]:
# Keep the neighborhood name plus the five vulnerability index columns.
wellness_columns = [
    'Nombre barrio',
    'Índice de Vulnerabilidad Bienestar Social e Igualdad',
    'Índice de Vulnerabilidad Medio Ambiente Urbano y Movilidad',
    'Índice de Vulnerabilidad Educación y Cultura',
    'Índice de Vulnerabilidad Economía y Empleo',
    'Índice de Vulnerabilidad Salud',
]
df_wellness = df_wellness[wellness_columns]

In [17]:
# Source headers -> short English names; 'neighborhood' is the merge key
# shared with the listings frame.
wellness_column_names = {
    'Nombre barrio': 'neighborhood',
    'Índice de Vulnerabilidad Bienestar Social e Igualdad': 'social_wellness_index',
    'Índice de Vulnerabilidad Medio Ambiente Urbano y Movilidad': 'urban_mobility_index',
    'Índice de Vulnerabilidad Educación y Cultura': 'education_index',
    'Índice de Vulnerabilidad Economía y Empleo': 'employment_index',
    'Índice de Vulnerabilidad Salud': 'health_index',
}
df_wellness = df_wellness.rename(columns=wellness_column_names)

In [18]:
# Wellness neighborhood names that are not an exact member of admin_wards.
not_found = [
    barrio
    for barrio in set(df_wellness['neighborhood'].values)
    if barrio not in admin_wards
]

not_found

['Casco histórico de Vicálvaro',
 'Apostol Santiago',
 'Palos de la Frontera',
 'Cármenes',
 'Rios Rosas',
 'Villaverde Alto, Casco Histórico de Villaverde',
 'Fuentelareina',
 'Águilas']

In [19]:
# (old, new) pairs for replace_str_values, covering the wellness neighborhood
# names that the membership check against admin_wards reported as missing.
# NOTE(review): replace_str_values substitutes substrings, so pairs such as
# 'Águilas' -> 'Las Águilas' or 'Cármenes' -> 'Los Cármenes' would double the
# article on a value that already carries it; the follow-up check returned an
# empty list for this file, but confirm before reusing on other data.
input_tuple_list = [('Apostol Santiago', 'Apóstol Santiago'), 
                    ('Fuentelareina', 'Fuentelarreina'),
                    ('Villaverde Alto, Casco Histórico de Villaverde', 'Villaverde Alto'),
                    ('Águilas','Las Águilas'),
                    ('Cármenes', 'Los Cármenes'),
                    ('Casco histórico de Vicálvaro', 'Casco Histórico de Vicálvaro'),
                    ('Palos de la Frontera', 'Palos de Moguer'),
                    ('Rios Rosas', 'Ríos Rosas')
                    ]

In [20]:
# Rewrite the wellness neighborhood names using the (old, new) pairs.
df_wellness = replace_str_values(
    df=df_wellness,
    column='neighborhood',
    input_tuple_list=input_tuple_list,
)

In [21]:
# Final check: an empty list means every wellness neighborhood is in admin_wards.
not_found = [
    barrio
    for barrio in set(df_wellness['neighborhood'].values)
    if barrio not in admin_wards
]

not_found

[]

In [22]:
# Left join: every listing is kept; the index columns are NaN where the
# neighborhood has no wellness row.
df = pd.merge(df, df_wellness, how='left', on='neighborhood')

In [23]:
# Keep listings with m2 strictly below 200. Rows with a missing m2 fail the
# comparison and are dropped as well.
is_below_200 = df['m2'].lt(200)
df = df.loc[is_below_200]

In [24]:
# Write the enriched listings to the processed folder, without the index.
df.to_csv(
    path_or_buf='../data/processed/scraped_data_cleaned_ER.csv',
    index=False,
)