In [5]:
from os import listdir, makedirs
from os.path import exists
import pandas as pd
import shutil

# Workspace roots, relative to this notebook. Both keep their trailing slash
# because later cells build paths by plain string concatenation.
data_folder = "../data_original/"   # raw, coded input files
out_folder = "../data/"             # cleaned / translated output files


# 1. Remapping
The folder 'data_original', which should be stored locally at the root of the repository, contains all the raw data. The files from the Ajuntament de Barcelona are coded (for example, sex is stored as 1 and 2, which correspond to female and male), although a translation file is provided. 

This code recovers the original values, and also drops some columns and translates the column names to make the data easier to work with. The filtered dataset is stored in a folder 'data' at the same level as 'data_original'.

In [6]:
# Translation table shipped with the raw data. Codes are kept as text so that
# integer and string codes compare equal when the data columns are remapped.
codes_df = (
    pd.read_csv(f'{data_folder}Codigo_valores/pad_dimensions.csv')
    .astype({'Codi_Valor': str})
)

# One lookup Series per dimension name: code (as text) -> 'Desc_Valor_EN'.
mappings = dict(
    (dimension, group.set_index('Codi_Valor')['Desc_Valor_EN'])
    for dimension, group in codes_df.groupby('Desc_Dimensio')
)


## 1.1. Data Lloc Naix Regio

In [7]:
def convert_birthPlaceRegion_sex(file, data_dir, out_dir, mappings):
    """Decode and translate one 'birth place region x sex' CSV file.

    Reads ``data_dir + file``, drops the district/neighbourhood code columns,
    replaces coded values with their descriptions, renames the columns to
    English, reduces the reference date to its year and writes the result to
    ``{out_dir}{year}_birthPlaceRegion_sex.csv``.

    Parameters
    ----------
    file : str
        Name of the CSV file inside ``data_dir``.
    data_dir : str
        Input directory. It is concatenated directly with ``file``, so it
        must end with a path separator.
    out_dir : str
        Output directory, also ending with a path separator. The function
        does not create it.
    mappings : dict
        Maps a column name to a lookup (anything ``Series.map`` accepts) from
        the stringified code to its description.

    Returns
    -------
    None

    Raises
    ------
    ValueError
        If no value of 'Data_Referencia' can be parsed as a date.
    """
    df = pd.read_csv(data_dir + file)

    # drop unnecessary columns
    df = df.drop(columns=['Codi_Districte', 'Codi_Barri'])

    # Decode every column that has a lookup table. Codes are compared as
    # strings; a code that is missing from the lookup becomes NaN.
    for col in df.columns:
        if col in mappings:
            df[col] = df[col].astype(str).map(mappings[col])

    # translate column names
    df = df.rename(columns={'Data_Referencia': 'Year_Reference',
                            'Nom_Districte': 'District',
                            'Nom_Barri': 'Neighborhood',
                            'Valor': 'Value',
                            'LLOC_NAIX_REGIO': 'Birth_Place_Region',
                            'SEXE': 'Sex'})

    # Unparseable dates are coerced to NaT, which turns the year column into
    # floats. Take the year for the file name from the first valid row and
    # cast it to int, so the name is '2023_...' and never '2023.0_...' or
    # 'nan_...'.
    df['Year_Reference'] = pd.to_datetime(df['Year_Reference'], errors='coerce').dt.year
    valid_years = df['Year_Reference'].dropna()
    if valid_years.empty:
        raise ValueError(f'no parseable reference date in {file!r}')
    year = int(valid_years.iloc[0])

    # save to output
    df.to_csv(f'{out_dir}{year}_birthPlaceRegion_sex.csv', index=False)


In [8]:
# Convert every raw 'birth place region x sex' file into the output folder.
data_dir = f'{data_folder}Data_Lloc_naix_regio/'
# listdir() also returns hidden files and folders (.DS_Store,
# .ipynb_checkpoints, ...) in arbitrary order, and read_csv would choke on
# them. Keep only the CSV files, in a deterministic order.
files = sorted(name for name in listdir(data_dir) if name.lower().endswith('.csv'))

out_dir = f'{out_folder}birthPlaceRegion/'
makedirs(out_dir, exist_ok=True)   # safe to re-run: no error if it already exists

for file in files:
    convert_birthPlaceRegion_sex(file, data_dir, out_dir, mappings)



## 1.2. Data Lloc Naix Regio (Spain v Outside)

In [9]:
def convert_sp(file, data_dir, out_dir, mappings):
    """Decode and translate one 'birth place (Spain vs outside)' CSV file.

    Reads ``data_dir + file``, drops the 'Codi_Districte', 'Codi_Barri' and
    'AEB' columns, replaces coded values with their descriptions, renames the
    columns to English, reduces the reference date to its year and writes the
    result to ``{out_dir}{year}_birthPlace_spain_v_outside.csv``.

    Parameters
    ----------
    file : str
        Name of the CSV file inside ``data_dir``.
    data_dir : str
        Input directory. It is concatenated directly with ``file``, so it
        must end with a path separator.
    out_dir : str
        Output directory, also ending with a path separator. The function
        does not create it.
    mappings : dict
        Maps a column name to a lookup (anything ``Series.map`` accepts) from
        the stringified code to its description.

    Returns
    -------
    None

    Raises
    ------
    ValueError
        If no value of 'Data_Referencia' can be parsed as a date.
    """
    df = pd.read_csv(data_dir + file)

    # drop unnecessary columns
    df = df.drop(columns=['Codi_Districte', 'Codi_Barri', 'AEB'])

    # Decode every column that has a lookup table. Codes are compared as
    # strings; a code that is missing from the lookup becomes NaN.
    for col in df.columns:
        if col in mappings:
            df[col] = df[col].astype(str).map(mappings[col])

    # translate column names
    df = df.rename(columns={'Data_Referencia': 'Year_Reference',
                            'Nom_Districte': 'District',
                            'Nom_Barri': 'Neighborhood',
                            'Seccio_Censal': 'Census_Section',
                            'Valor': 'Value',
                            'LLOC_NAIX': 'Birth_Place',
                            'SEXE': 'Sex'})

    # Unparseable dates are coerced to NaT, which turns the year column into
    # floats. Take the year for the file name from the first valid row and
    # cast it to int, so the name is '2023_...' and never '2023.0_...' or
    # 'nan_...'.
    df['Year_Reference'] = pd.to_datetime(df['Year_Reference'], errors='coerce').dt.year
    valid_years = df['Year_Reference'].dropna()
    if valid_years.empty:
        raise ValueError(f'no parseable reference date in {file!r}')
    year = int(valid_years.iloc[0])

    # save to output
    df.to_csv(f'{out_dir}{year}_birthPlace_spain_v_outside.csv', index=False)



In [10]:
# Convert every raw 'birth place (Spain vs outside)' file into the output folder.
data_dir = f'{data_folder}Data_Lloc_naix(esp_vs_fuera)/'
# listdir() also returns hidden files and folders (.DS_Store,
# .ipynb_checkpoints, ...) in arbitrary order, and read_csv would choke on
# them. Keep only the CSV files, in a deterministic order.
files = sorted(name for name in listdir(data_dir) if name.lower().endswith('.csv'))

out_dir = f'{out_folder}birthPlace_spain_v_outside/'
makedirs(out_dir, exist_ok=True)   # safe to re-run: no error if it already exists

for file in files:
    convert_sp(file, data_dir, out_dir, mappings)



## 1.3. Renda

In [11]:
def convert_rent(file, data_dir, out_dir, mappings):
    """Decode and translate one 'Renda' CSV file.

    Reads ``data_dir + file``, drops the district/neighbourhood code columns,
    replaces coded values with their descriptions, renames the known columns
    to English (all other columns keep their original names) and writes the
    result to ``{out_dir}{year}_rent.csv``, where ``year`` comes from the
    'Any' column.

    NOTE(review): 'Renda' is Catalan for income rather than rent. The 'rent'
    naming is kept because output paths depend on it; confirm before renaming.

    Parameters
    ----------
    file : str
        Name of the CSV file inside ``data_dir``.
    data_dir : str
        Input directory. It is concatenated directly with ``file``, so it
        must end with a path separator.
    out_dir : str
        Output directory, also ending with a path separator. The function
        does not create it.
    mappings : dict
        Maps a column name to a lookup (anything ``Series.map`` accepts) from
        the stringified code to its description.

    Returns
    -------
    None

    Raises
    ------
    ValueError
        If the 'Any' column holds no non-missing value.
    """
    df = pd.read_csv(data_dir + file)

    # drop unnecessary columns
    df = df.drop(columns=['Codi_Districte', 'Codi_Barri'])

    # Decode every column that has a lookup table. Codes are compared as
    # strings; a code that is missing from the lookup becomes NaN.
    for col in df.columns:
        if col in mappings:
            df[col] = df[col].astype(str).map(mappings[col])

    # translate column names
    df = df.rename(columns={'Any': 'Year_Reference',
                            'Nom_Districte': 'District',
                            'Nom_Barri': 'Neighborhood',
                            'Seccio_Censal': 'Census_Section'})

    # A single missing year makes pandas read the whole column as float, so
    # the first row alone could name the file '2021.0_rent.csv' or
    # 'nan_rent.csv'. Use the first non-missing year and turn an integral
    # float back into an int; any other value is used unchanged.
    years = df['Year_Reference'].dropna()
    if years.empty:
        raise ValueError(f'no reference year found in {file!r}')
    year = years.iloc[0]
    if isinstance(year, float) and year.is_integer():
        year = int(year)

    # save to output
    df.to_csv(f'{out_dir}{year}_rent.csv', index=False)



In [12]:
# Convert every raw 'Renda' file into the output folder.
data_dir = f'{data_folder}Renda/'
# listdir() also returns hidden files and folders (.DS_Store,
# .ipynb_checkpoints, ...) in arbitrary order, and read_csv would choke on
# them. Keep only the CSV files, in a deterministic order.
files = sorted(name for name in listdir(data_dir) if name.lower().endswith('.csv'))

out_dir = f'{out_folder}rent/'
makedirs(out_dir, exist_ok=True)   # safe to re-run: no error if it already exists

for file in files:
    convert_rent(file, data_dir, out_dir, mappings)


## 1.4. Nivel Educativo

In [13]:
def convert_education_level(file, data_dir, out_dir, mappings):
    """Decode and translate one 'education level' CSV file.

    Reads ``data_dir + file``, drops the district/neighbourhood code columns,
    replaces coded values with their descriptions, renames the columns to
    English, reduces the reference date to its year and writes the result to
    ``{out_dir}{year}_education_level.csv``.

    Parameters
    ----------
    file : str
        Name of the CSV file inside ``data_dir``.
    data_dir : str
        Input directory. It is concatenated directly with ``file``, so it
        must end with a path separator.
    out_dir : str
        Output directory, also ending with a path separator. The function
        does not create it.
    mappings : dict
        Maps a column name to a lookup (anything ``Series.map`` accepts) from
        the stringified code to its description.

    Returns
    -------
    None

    Raises
    ------
    ValueError
        If no value of 'Data_Referencia' can be parsed as a date.
    """
    df = pd.read_csv(data_dir + file)

    # drop unnecessary columns
    df = df.drop(columns=['Codi_Districte', 'Codi_Barri'])

    # Decode every column that has a lookup table. Codes are compared as
    # strings; a code that is missing from the lookup becomes NaN.
    for col in df.columns:
        if col in mappings:
            df[col] = df[col].astype(str).map(mappings[col])

    # translate column names
    df = df.rename(columns={'Data_Referencia': 'Year_Reference',
                            'Nom_Districte': 'District',
                            'Nom_Barri': 'Neighborhood',
                            'NIV_EDUCA_esta': 'Education_Level',
                            'LLOC_NAIX': 'Birth_Place',
                            'Valor': 'Value'})

    # Unparseable dates are coerced to NaT, which turns the year column into
    # floats. Take the year for the file name from the first valid row and
    # cast it to int, so the name is '2023_...' and never '2023.0_...' or
    # 'nan_...'.
    df['Year_Reference'] = pd.to_datetime(df['Year_Reference'], errors='coerce').dt.year
    valid_years = df['Year_Reference'].dropna()
    if valid_years.empty:
        raise ValueError(f'no parseable reference date in {file!r}')
    year = int(valid_years.iloc[0])

    # save to output
    df.to_csv(f'{out_dir}{year}_education_level.csv', index=False)


In [14]:
# Convert every raw 'education level' file into the output folder.
data_dir = f'{data_folder}nivel_educativo/'
# listdir() also returns hidden files and folders (.DS_Store,
# .ipynb_checkpoints, ...) in arbitrary order, and read_csv would choke on
# them. Keep only the CSV files, in a deterministic order.
files = sorted(name for name in listdir(data_dir) if name.lower().endswith('.csv'))

out_dir = f'{out_folder}education_level/'
makedirs(out_dir, exist_ok=True)   # safe to re-run: no error if it already exists

for file in files:
    convert_education_level(file, data_dir, out_dir, mappings)


# 2. Geometry
Plotting maps requires geometry data. This code unifies the format of the metadata and removes unnecessary columns.

## 2.1. Districts

In [15]:
# District geometry: load, drop the code and WGS84 columns, translate the
# remaining headers.
geometry_file = f'{data_folder}Codigo_valores/BarcelonaCiutat_Districtes.csv'
geometry_df = (
    pd.read_csv(geometry_file)
    .drop(columns=['Codi_Districte', 'geometria_wgs84'])
    .rename(columns={
        'nom_districte': 'District',
        'geometria_etrs89': 'Geometry_etrs89',
    })
)

# Written at the top level of the output folder.
geometry_df.to_csv(f'{out_folder}districts_geometry.csv', index=False)


In [16]:
# Neighbourhood geometry: load, drop the code and WGS84 columns, translate the
# remaining headers.
geometry_file = f'{data_folder}Codigo_valores/BarcelonaCiutat_Barris.csv'
geometry_df = (
    pd.read_csv(geometry_file)
    .drop(columns=['codi_barri', 'codi_districte', 'geometria_wgs84'])
    .rename(columns={
        'nom_barri': 'Neighborhood',
        'nom_districte': 'District',
        'geometria_etrs89': 'Geometry_etrs89',
    })
)

# Written at the top level of the output folder.
geometry_df.to_csv(f'{out_folder}neighborhoods_geometry.csv', index=False)


## 2.2. Neighborhoods

# 3. Other files
Files that require no preprocessing are copied unchanged into the same 'data' folder. Since the files are small, duplicating them is cheap and keeps everything in one place.

In [17]:
def clone(data_dir, out_dir):
    """Copy the directory tree ``data_dir`` into ``out_dir``.

    Files that already exist in ``out_dir`` are overwritten by their
    counterparts from ``data_dir``; other existing content is left alone.
    Errors are not handled here and propagate to the caller.

    Parameters
    ----------
    data_dir : str
        Directory to copy from.
    out_dir : str
        Directory to copy into; created (with missing parents) if needed.

    Returns
    -------
    None
    """
    # copytree creates ``out_dir`` itself and, with dirs_exist_ok=True, merges
    # into it when it already exists, so no separate exists()/makedirs() step
    # is needed. Not pre-creating it also avoids leaving an empty output
    # folder behind when ``data_dir`` is missing.
    shutil.copytree(data_dir, out_dir, dirs_exist_ok=True)
    

In [18]:
# Socio-economic indicators ("Indicadores Socioeconómicos"): copied unchanged.
data_dir = data_folder + 'indicadores_socioeconomicos/'
out_dir = out_folder + 'indicadores_socioeconomicos/'
clone(data_dir, out_dir)

# Unemployment (currently disabled)
# data_dir = f'{data_folder}unemployment_spain/atur_provincies/'
# out_dir = f'{out_folder}unemployment_spain/atur_provincies/'
# clone(data_dir, out_dir)
