# Formatted zone

In [1]:
import pandas as pd

import os, sys
module_path = os.path.abspath(os.path.join('../..'))
if module_path not in sys.path:
    sys.path.append(module_path)

import utils

## Delete previosly created duckdb files

In [2]:
utils.clear_database(module_path)

## Repositories

### Household repository

In [3]:
household_columns = ['section','delete','delete','single_women_aged_16_to_64','single_men_aged_16_to_64','single_women_aged_65_or_over','single_men_aged_65_or_over',
    'adult_women_with_one_or_more_minors','adult_men_with_one_or_more_minors','two_adults_from_16_to_64_and_without_minors',
    'two_adults_one_at_least_65_and_without_minors','two_adults_and_one_minor','two_adults_and_two_minors','two_adults_and_three_or_more_minors',
    'two_adults_over_35_and_one_adult_from_16_to_34','two_adults_over_35_and_one_adult_from_16_to_34_and_one_minor',
    'two_adults_over_35_and_one_adult_from_16_to_34_and_two_minors','three_adults_and_0_or_more_minors','two_adults_over_35_and_two_adults_from_16_to_34',
    'two_adults_over_35_and_two_adults_from_16_to_34_and_one_minor','two_adults_over_35_and_two_adults_from_16_to_34_and_two_or_more_minors',
    'four_adults_and_0_or_more_minors','five_adults_and_0_or_more_minors','fifteen_or_more_inhabitants','only_minors']

def read_household(file):
    """
    Prepares the Household excel file, 
    Returns the data in DataFrame format
    """
    path = f'../1.Landing zone/persistent/{file}'
    df = pd.read_excel(path,sheet_name='Composicion del hogar', header=[5], names=household_columns)

    # Formatting the excel format to dataframe
    df['section'].fillna(df['delete'],inplace=True)
    df.drop(df.filter(like='delete'),axis=1, inplace=True)
    df.dropna(inplace = True)

    # Removes the total rows 
    newDF = df[pd.to_numeric(df['section'], errors='coerce').notnull()]
    assert df.shape[0] - newDF.shape[0] == 22

    return newDF

### Nationalities repsitory

In [4]:

def read_nationalities(file):
    """
    Reads the Nationalities Excel
    Returns the data in DataFrame format
    """
    path = f'../1.Landing zone/persistent/{file}'
    df = pd.read_excel(path, sheet_name='Total', header=[7])

    # Formatting excel format to dataframe
    df.rename(columns = {'Unnamed: 0':'Madrid_section','Unnamed: 3':'Españoles'}, inplace = True)
    df.drop(df.filter(regex='Unname'),axis=1, inplace=True)
    df.dropna(inplace = True)

    # Removes the total columns
    df.drop(df.filter(like='Total'),axis=1, inplace=True)

    # Formatting the column name
    df.columns = df.columns.str.strip()
    df.columns = df.columns.str.replace(' ', '_')

    # Removes the total rows 
    newDF = df[df['Madrid_section'].apply(lambda x: len(x.strip()) == 9)]

    return newDF

## Main

In [5]:
def formatted_zone(data_src):
    """
    Stores the excel tables in a relational data base.
    Returns the relational databases
    """
    for file in os.listdir(f'{data_src}/persistent/'):  
        table = file.split('_')[1].split('.')[0]
        repo = ''.join(filter(str.isalpha,table))
        df = read_household(file) if repo == 'household' else read_nationalities(file)
        utils.df_to_DBtable(f'../{repo}.duckdb',df,table)
        print(f'    - ./persistent/{file} stored in {repo} DuckDB')
        


In [6]:
formatted_zone('../1.Landing zone')

    - ./persistent/2023-01-13_nationalities2019.xls stored in nationalities DuckDB
    - ./persistent/2022-12-30_nationalities2020.xls stored in nationalities DuckDB
    - ./persistent/2022-12-30_household2020.xls stored in household DuckDB
    - ./persistent/2023-01-13_household2019.xls stored in household DuckDB
    - ./persistent/2023-01-13_household2018.xls stored in household DuckDB
    - ./persistent/2023-01-13_nationalities2018.xls stored in nationalities DuckDB
    - ./persistent/2023-01-13_nationalities2020.xls stored in nationalities DuckDB
    - ./persistent/2022-12-30_household2019.xls stored in household DuckDB
    - ./persistent/2022-12-30_nationalities2018.xls stored in nationalities DuckDB
    - ./persistent/2022-12-30_household2018.xls stored in household DuckDB
    - ./persistent/2022-12-30_nationalities2019.xls stored in nationalities DuckDB
    - ./persistent/2023-01-13_household2020.xls stored in household DuckDB


## Test

In [7]:
# check if all tables in household.duckdb are created
household = utils.get_tables('../household.duckdb')
assert household.size == 3
household

0    household2019
1    household2020
2    household2018
Name: table_name, dtype: object

In [8]:
# check if all tables in nationalities.db are created
nationalities = utils.get_tables('../nationalities.duckdb')
assert nationalities.size == 3
nationalities

0    nationalities2020
1    nationalities2019
2    nationalities2018
Name: table_name, dtype: object

## Preview of all the formatted tables

In [9]:
h18 = utils.DBtable_to_df(DB='../household.duckdb',table ='household2018')
h19 = utils.DBtable_to_df(DB='../household.duckdb',table ='household2019')
h20 = utils.DBtable_to_df(DB='../household.duckdb',table ='household2020')

n18 = utils.DBtable_to_df(DB='../nationalities.duckdb',table ='nationalities2018')
n19 = utils.DBtable_to_df(DB='../nationalities.duckdb',table ='nationalities2019')
n20 = utils.DBtable_to_df(DB='../nationalities.duckdb',table ='nationalities2020')

### Hosehold

Household version 2018

In [10]:
h18.head(3)

Unnamed: 0,section,single_women_aged_16_to_64,single_men_aged_16_to_64,single_women_aged_65_or_over,single_men_aged_65_or_over,adult_women_with_one_or_more_minors,adult_men_with_one_or_more_minors,two_adults_from_16_to_64_and_without_minors,two_adults_one_at_least_65_and_without_minors,two_adults_and_one_minor,...,two_adults_over_35_and_one_adult_from_16_to_34_and_one_minor,two_adults_over_35_and_one_adult_from_16_to_34_and_two_minors,three_adults_and_0_or_more_minors,two_adults_over_35_and_two_adults_from_16_to_34,two_adults_over_35_and_two_adults_from_16_to_34_and_one_minor,two_adults_over_35_and_two_adults_from_16_to_34_and_two_or_more_minors,four_adults_and_0_or_more_minors,five_adults_and_0_or_more_minors,fifteen_or_more_inhabitants,only_minors
0,1001.0,72.0,86.0,63.0,16.0,10.0,4.0,82.0,63.0,20.0,...,4.0,0.0,30.0,13.0,2.0,2.0,13.0,18.0,0.0,0.0
1,1002.0,100.0,91.0,36.0,19.0,7.0,0.0,89.0,28.0,13.0,...,2.0,1.0,38.0,9.0,1.0,1.0,10.0,11.0,0.0,1.0
2,1003.0,154.0,179.0,74.0,40.0,13.0,2.0,141.0,77.0,23.0,...,10.0,3.0,54.0,18.0,4.0,0.0,23.0,30.0,0.0,1.0


Household version 2019

In [11]:
h19.head(3)

Unnamed: 0,section,single_women_aged_16_to_64,single_men_aged_16_to_64,single_women_aged_65_or_over,single_men_aged_65_or_over,adult_women_with_one_or_more_minors,adult_men_with_one_or_more_minors,two_adults_from_16_to_64_and_without_minors,two_adults_one_at_least_65_and_without_minors,two_adults_and_one_minor,...,two_adults_over_35_and_one_adult_from_16_to_34_and_one_minor,two_adults_over_35_and_one_adult_from_16_to_34_and_two_minors,three_adults_and_0_or_more_minors,two_adults_over_35_and_two_adults_from_16_to_34,two_adults_over_35_and_two_adults_from_16_to_34_and_one_minor,two_adults_over_35_and_two_adults_from_16_to_34_and_two_or_more_minors,four_adults_and_0_or_more_minors,five_adults_and_0_or_more_minors,fifteen_or_more_inhabitants,only_minors
0,1001.0,67.0,88.0,62.0,16.0,9.0,4.0,101.0,69.0,15.0,...,3.0,1.0,35.0,11.0,4.0,2.0,13.0,17.0,0.0,1.0
1,1002.0,99.0,89.0,36.0,15.0,5.0,3.0,84.0,29.0,11.0,...,4.0,0.0,41.0,11.0,1.0,0.0,13.0,12.0,0.0,0.0
2,1003.0,147.0,151.0,71.0,42.0,7.0,1.0,144.0,75.0,23.0,...,16.0,2.0,54.0,16.0,5.0,1.0,20.0,35.0,0.0,2.0


Household version 2020

In [12]:
h20.head(3)

Unnamed: 0,section,single_women_aged_16_to_64,single_men_aged_16_to_64,single_women_aged_65_or_over,single_men_aged_65_or_over,adult_women_with_one_or_more_minors,adult_men_with_one_or_more_minors,two_adults_from_16_to_64_and_without_minors,two_adults_one_at_least_65_and_without_minors,two_adults_and_one_minor,...,two_adults_over_35_and_one_adult_from_16_to_34_and_one_minor,two_adults_over_35_and_one_adult_from_16_to_34_and_two_minors,three_adults_and_0_or_more_minors,two_adults_over_35_and_two_adults_from_16_to_34,two_adults_over_35_and_two_adults_from_16_to_34_and_one_minor,two_adults_over_35_and_two_adults_from_16_to_34_and_two_or_more_minors,four_adults_and_0_or_more_minors,five_adults_and_0_or_more_minors,fifteen_or_more_inhabitants,only_minors
0,1001.0,75.0,93.0,52.0,17.0,6.0,5.0,97.0,74.0,14.0,...,3.0,2.0,36.0,11.0,3.0,0.0,20.0,20.0,0.0,0.0
1,1002.0,111.0,99.0,32.0,17.0,4.0,1.0,96.0,30.0,9.0,...,1.0,0.0,33.0,6.0,1.0,0.0,15.0,14.0,0.0,0.0
2,1003.0,137.0,164.0,68.0,43.0,7.0,2.0,148.0,80.0,18.0,...,10.0,3.0,50.0,16.0,7.0,0.0,28.0,37.0,0.0,0.0


### Nationalities

Nationalities version 2018

In [13]:
n18.head(3)

Unnamed: 0,Madrid_section,Españoles,Alemania,Austria,Bélgica,Bulgaria,Chipre,Croacia,Dinamarca,Eslovaquia,...,Vietnam,Yemen,Otros_Países_de_Asia,Australia,Fiji,Nueva_Zelanda,Papúa_Nueva_Guinea,Timor_Oriental,Vanuatu,Otros_Países_de_Oceanía
0,1401001,64.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2901001,1938.0,7.0,0.0,0.0,8.0,0.0,0.0,2.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,2901002,2015.0,1.0,0.0,1.0,5.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


Nationalities version 2019

In [14]:
n19.head(3)

Unnamed: 0,Madrid_section,Españoles,Alemania,Austria,Bélgica,Bulgaria,Chipre,Croacia,Dinamarca,Eslovaquia,...,Vietnam,Yemen,Otros_Países_de_Asia,Australia,Fiji,Nueva_Zelanda,Papúa_Nueva_Guinea,Samoa,Timor_Oriental,Otros_Países_de_Oceanía
0,1401001,83.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2901001,2008.0,6.0,0.0,0.0,5.0,0.0,0.0,2.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,2901002,2090.0,2.0,0.0,1.0,4.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


Nationalities version 2020

In [15]:
n20.head(3)

Unnamed: 0,Madrid_section,Españoles,Alemania,Austria,Bélgica,Bulgaria,Chipre,Croacia,Dinamarca,Eslovaquia,...,Uzbekistán,Vietnam,Yemen,Otros_Países_de_Asia,Australia,Fiji,Nueva_Zelanda,Timor_Oriental,Vanuatu,Otros_Países_de_Oceanía
0,1401001,67.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2901001,2008.0,10.0,0.0,0.0,4.0,0.0,0.0,2.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,2901002,2065.0,2.0,0.0,1.0,3.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
