# Formatted zone

In [19]:
import os
import requests

import pandas as pd
import duckdb 


In [20]:
household_columns = ['section','delete','total','single_women_aged_16_to_64','single_men_aged_16_to_64','single_women_aged_65_or_over','single_men_aged_65_or_over',
    'adult_women_with_one_or_more_minors','adult_men_with_one_or_more_minors','two_adults_from_16_to_64_and_without_minors',
    'two_adults_one_at_least_65_and_without_minors','two_adults_and_one_minor','two_adults_and_two_minors','two_adults_and_three_or_more_minors',
    'two_adults_over_35_and_one_adult_from_16_to_34','two_adults_over_35_and_one_adult_from_16_to_34_and_one_minor',
    'two_adults_over_35_and_one_adult_from_16_to_34_and_two_minors','three_adults_and_0_or_more_minors','two_adults_over_35_and_two_adults_from_16_to_34',
    'two_adults_over_35_and_two_adults_from_16_to_34_and_one_minor','two_adults_over_35_and_two_adults_from_16_to_34_and_two_or_more_minors',
    'four_adults_and_0_or_more_minors','five_adults_and_0_or_more_minors','fifteen_or_more_inhabitants','only_minors']

### Delete previosly created duckdb files

In [21]:

def clear_datebase():
    dbFileName = os.getcwd() + '/household.duckdb'
    if(os.path.exists(dbFileName)):
        os.remove(dbFileName)
    dbFileName = os.getcwd() + '/nationalities.duckdb'
    if(os.path.exists(dbFileName)):
        os.remove(dbFileName)

clear_datebase()

In [22]:
def add_yearCol(file,df):
    #adding year column
    year = file.split('_')[1].split('.')[0][-4:]
    df['Year'] = year
    display(df)

def prejoin_household(file):
    """
    Prepares the Household excel file, 
    Returns the data in DataFrame format
    """
    path = f'./landing/persistent/{file}'
    df = pd.read_excel(path,sheet_name='Composicion del hogar',header=[5],names=household_columns,na_values=None)
    df['section'].fillna(df['delete'],inplace=True)
    df.drop(labels='delete', axis=1, inplace=True)
    df.dropna(inplace = True)
    add_yearCol(file,df) # add column with corresponding year
    return df

def prejoin_nationalities(file):
    """
    Reads the Nationalities Excel
    Returns the data in DataFrame format
    """
    path = f'./landing/persistent/{file}'
    df = pd.read_excel(path,sheet_name='Total',header=[7],na_values=None)
    df.rename(columns = {'Unnamed: 0':'Madrid_section','Unnamed: 2':'Habitantes','Unnamed: 3':'Españoles','Unnamed: 4':'Extranjeros'}, inplace = True)
    df.drop('Unnamed: 1', axis=1, inplace=True)
    df.drop(df.filter(regex="Unname"),axis=1, inplace=True)
    df.drop(df.filter(like='Total'),axis=1, inplace=True)
    df.columns = df.columns.str.strip()
    df.columns = df.columns.str.replace(' ', '_')
    df.dropna(inplace = True)
    add_yearCol(file,df) # add column with corresponding year
    return df


def createTable(DB,df,table_name):
   # Creates a persistent table in DuckDB from the contents of the DataFrame
    con = duckdb.connect(DB)
    con.register(table_name,df)
    con.execute(f'CREATE TABLE {table_name} AS SELECT * FROM {table_name}')
    con.close()

In [23]:
# Formated _dod
def formatedZone():
    # current directory set to persistent folder
    persistent_dir = os.getcwd()+'/landing/persistent/'
    print(os.listdir(persistent_dir))
    for filename in os.listdir(persistent_dir):  
        if(filename.__contains__("household")):
            print(filename)
            df = prejoin_household(filename)
            #display(df)
            table_name = filename.split('_')[1].split('.')[0]
            createTable('household.duckdb',df,table_name)

        if(filename.__contains__("nationalities")):
            print(filename)
            df = prejoin_nationalities(filename)
            #display(df)
            table_name = filename.split('_')[1].split('.')[0]
            createTable('nationalities.duckdb',df,table_name)
            



In [24]:
formatedZone()


['2022-10-21_household2018.xls', '2022-10-21_nationalities2020.xls', '2022-10-21_nationalities2019.xls', '2022-10-21_nationalities2018.xls', '2022-10-21_household2019.xls', '2022-10-21_household2020.xls']
2022-10-21_household2018.xls


Unnamed: 0,section,total,single_women_aged_16_to_64,single_men_aged_16_to_64,single_women_aged_65_or_over,single_men_aged_65_or_over,adult_women_with_one_or_more_minors,adult_men_with_one_or_more_minors,two_adults_from_16_to_64_and_without_minors,two_adults_one_at_least_65_and_without_minors,...,two_adults_over_35_and_one_adult_from_16_to_34_and_two_minors,three_adults_and_0_or_more_minors,two_adults_over_35_and_two_adults_from_16_to_34,two_adults_over_35_and_two_adults_from_16_to_34_and_one_minor,two_adults_over_35_and_two_adults_from_16_to_34_and_two_or_more_minors,four_adults_and_0_or_more_minors,five_adults_and_0_or_more_minors,fifteen_or_more_inhabitants,only_minors,Year
1,Ciudad de Madrid,1278258.0,116921.0,114806.0,124815.0,35963.0,27460.0,5488.0,156118.0,173809.0,...,8792.0,102020.0,50724.0,7803.0,3361.0,45696.0,50463.0,513.0,342.0,2018
3,01. Centro,66411.0,11695.0,13210.0,5173.0,2142.0,970.0,191.0,12277.0,5164.0,...,191.0,4013.0,1060.0,183.0,79.0,1718.0,1854.0,43.0,18.0,2018
4,1001.0,542.0,72.0,86.0,63.0,16.0,10.0,4.0,82.0,63.0,...,0.0,30.0,13.0,2.0,2.0,13.0,18.0,0.0,0.0,2018
5,1002.0,481.0,100.0,91.0,36.0,19.0,7.0,0.0,89.0,28.0,...,1.0,38.0,9.0,1.0,1.0,10.0,11.0,0.0,1.0,2018
6,1003.0,884.0,154.0,179.0,74.0,40.0,13.0,2.0,141.0,77.0,...,3.0,54.0,18.0,4.0,0.0,23.0,30.0,0.0,1.0,2018
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2482,21029.0,519.0,32.0,29.0,6.0,5.0,20.0,12.0,45.0,19.0,...,10.0,29.0,49.0,7.0,2.0,8.0,12.0,0.0,0.0,2018
2483,21030.0,566.0,37.0,46.0,29.0,11.0,19.0,6.0,68.0,52.0,...,3.0,39.0,41.0,3.0,0.0,14.0,16.0,0.0,0.0,2018
2484,21031.0,887.0,88.0,91.0,15.0,6.0,39.0,9.0,156.0,31.0,...,6.0,28.0,21.0,6.0,2.0,10.0,16.0,0.0,0.0,2018
2485,21032.0,627.0,65.0,71.0,17.0,4.0,27.0,13.0,82.0,23.0,...,2.0,21.0,30.0,5.0,0.0,7.0,6.0,1.0,1.0,2018


2022-10-21_nationalities2020.xls


Unnamed: 0,Madrid_section,Habitantes,Españoles,Extranjeros,Alemania,Austria,Bélgica,Bulgaria,Chipre,Croacia,...,Vietnam,Yemen,Otros_Países_de_Asia,Australia,Fiji,Nueva_Zelanda,Timor_Oriental,Vanuatu,Otros_Países_de_Oceanía,Year
3,0014,68.0,67.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2020
4,001401,68.0,67.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2020
5,001401001,68.0,67.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2020
6,0029,4721.0,4073.0,648.0,12.0,0.0,1.0,7.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2020
7,002901,4721.0,4073.0,648.0,12.0,0.0,1.0,7.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2020
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4840,182101,262.0,235.0,27.0,0.0,0.0,0.0,4.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2020
4841,182101001,262.0,235.0,27.0,0.0,0.0,0.0,4.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2020
4842,1837,1734.0,1475.0,259.0,1.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2020
4843,183701,1734.0,1475.0,259.0,1.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2020


2022-10-21_nationalities2019.xls


Unnamed: 0,Madrid_section,Habitantes,Españoles,Extranjeros,Alemania,Austria,Bélgica,Bulgaria,Chipre,Croacia,...,Yemen,Otros_Países_de_Asia,Australia,Fiji,Nueva_Zelanda,Papúa_Nueva_Guinea,Samoa,Timor_Oriental,Otros_Países_de_Oceanía,Year
3,0014,84.0,83.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2019
4,001401,84.0,83.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2019
5,001401001,84.0,83.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2019
6,0029,4712.0,4098.0,614.0,8.0,0.0,1.0,9.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2019
7,002901,4712.0,4098.0,614.0,8.0,0.0,1.0,9.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2019
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4840,182101,253.0,226.0,27.0,0.0,0.0,1.0,5.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2019
4841,182101001,253.0,226.0,27.0,0.0,0.0,1.0,5.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2019
4842,1837,1658.0,1413.0,245.0,1.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2019
4843,183701,1658.0,1413.0,245.0,1.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2019


2022-10-21_nationalities2018.xls


Unnamed: 0,Madrid_section,Habitantes,Españoles,Extranjeros,Alemania,Austria,Bélgica,Bulgaria,Chipre,Croacia,...,Yemen,Otros_Países_de_Asia,Australia,Fiji,Nueva_Zelanda,Papúa_Nueva_Guinea,Timor_Oriental,Vanuatu,Otros_Países_de_Oceanía,Year
3,0014,65.0,64.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2018
4,001401,65.0,64.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2018
5,001401001,65.0,64.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2018
6,0029,4559.0,3953.0,606.0,8.0,0.0,1.0,13.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2018
7,002901,4559.0,3953.0,606.0,8.0,0.0,1.0,13.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2018
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4837,182101,258.0,229.0,29.0,0.0,0.0,0.0,11.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2018
4838,182101001,258.0,229.0,29.0,0.0,0.0,0.0,11.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2018
4839,1837,1620.0,1390.0,230.0,1.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2018
4840,183701,1620.0,1390.0,230.0,1.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2018


2022-10-21_household2019.xls


Unnamed: 0,section,total,single_women_aged_16_to_64,single_men_aged_16_to_64,single_women_aged_65_or_over,single_men_aged_65_or_over,adult_women_with_one_or_more_minors,adult_men_with_one_or_more_minors,two_adults_from_16_to_64_and_without_minors,two_adults_one_at_least_65_and_without_minors,...,two_adults_over_35_and_one_adult_from_16_to_34_and_two_minors,three_adults_and_0_or_more_minors,two_adults_over_35_and_two_adults_from_16_to_34,two_adults_over_35_and_two_adults_from_16_to_34_and_one_minor,two_adults_over_35_and_two_adults_from_16_to_34_and_two_or_more_minors,four_adults_and_0_or_more_minors,five_adults_and_0_or_more_minors,fifteen_or_more_inhabitants,only_minors,Year
1,Ciudad de Madrid,1290164.0,117454.0,115691.0,126044.0,36720.0,27171.0,5491.0,158487.0,174284.0,...,9057.0,104079.0,51419.0,8009.0,3438.0,46977.0,53887.0,552.0,405.0,2019
3,01. Centro,67296.0,11627.0,13352.0,5117.0,2171.0,922.0,194.0,12698.0,5244.0,...,214.0,4341.0,1062.0,184.0,84.0,1711.0,1934.0,44.0,33.0,2019
4,1001.0,559.0,67.0,88.0,62.0,16.0,9.0,4.0,101.0,69.0,...,1.0,35.0,11.0,4.0,2.0,13.0,17.0,0.0,1.0,2019
5,1002.0,471.0,99.0,89.0,36.0,15.0,5.0,3.0,84.0,29.0,...,0.0,41.0,11.0,1.0,0.0,13.0,12.0,0.0,0.0,2019
6,1003.0,855.0,147.0,151.0,71.0,42.0,7.0,1.0,144.0,75.0,...,2.0,54.0,16.0,5.0,1.0,20.0,35.0,0.0,2.0,2019
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2482,21029.0,526.0,35.0,33.0,7.0,5.0,21.0,9.0,36.0,24.0,...,11.0,25.0,51.0,8.0,2.0,9.0,16.0,0.0,0.0,2019
2483,21030.0,563.0,33.0,59.0,26.0,8.0,16.0,4.0,70.0,49.0,...,3.0,36.0,43.0,5.0,0.0,12.0,21.0,0.0,0.0,2019
2484,21031.0,948.0,99.0,94.0,14.0,7.0,41.0,8.0,155.0,35.0,...,6.0,29.0,25.0,6.0,0.0,17.0,23.0,0.0,0.0,2019
2485,21032.0,658.0,72.0,67.0,18.0,6.0,33.0,13.0,94.0,25.0,...,3.0,22.0,27.0,7.0,0.0,6.0,9.0,1.0,2.0,2019


2022-10-21_household2020.xls


Unnamed: 0,section,total,single_women_aged_16_to_64,single_men_aged_16_to_64,single_women_aged_65_or_over,single_men_aged_65_or_over,adult_women_with_one_or_more_minors,adult_men_with_one_or_more_minors,two_adults_from_16_to_64_and_without_minors,two_adults_one_at_least_65_and_without_minors,...,two_adults_over_35_and_one_adult_from_16_to_34_and_two_minors,three_adults_and_0_or_more_minors,two_adults_over_35_and_two_adults_from_16_to_34,two_adults_over_35_and_two_adults_from_16_to_34_and_one_minor,two_adults_over_35_and_two_adults_from_16_to_34_and_two_or_more_minors,four_adults_and_0_or_more_minors,five_adults_and_0_or_more_minors,fifteen_or_more_inhabitants,only_minors,Year
1,Ciudad de Madrid,1307682.0,117998.0,116863.0,126695.0,37538.0,26232.0,5328.0,163183.0,175581.0,...,9283.0,106779.0,51993.0,8131.0,3537.0,48740.0,59406.0,622.0,422.0,2020
3,01. Centro,69187.0,11695.0,13596.0,5057.0,2269.0,815.0,172.0,13563.0,5267.0,...,213.0,4719.0,1123.0,181.0,67.0,1898.0,2153.0,48.0,25.0,2020
4,1001.0,568.0,75.0,93.0,52.0,17.0,6.0,5.0,97.0,74.0,...,2.0,36.0,11.0,3.0,0.0,20.0,20.0,0.0,0.0,2020
5,1002.0,492.0,111.0,99.0,32.0,17.0,4.0,1.0,96.0,30.0,...,0.0,33.0,6.0,1.0,0.0,15.0,14.0,0.0,0.0,2020
6,1003.0,862.0,137.0,164.0,68.0,43.0,7.0,2.0,148.0,80.0,...,3.0,50.0,16.0,7.0,0.0,28.0,37.0,0.0,0.0,2020
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2482,21029.0,514.0,29.0,26.0,6.0,7.0,17.0,7.0,41.0,23.0,...,10.0,25.0,49.0,10.0,0.0,14.0,18.0,0.0,0.0,2020
2483,21030.0,561.0,35.0,59.0,27.0,8.0,17.0,5.0,61.0,50.0,...,4.0,44.0,43.0,5.0,0.0,13.0,17.0,0.0,0.0,2020
2484,21031.0,988.0,106.0,96.0,17.0,9.0,37.0,7.0,156.0,40.0,...,7.0,33.0,32.0,7.0,1.0,15.0,28.0,1.0,0.0,2020
2485,21032.0,683.0,71.0,65.0,16.0,8.0,31.0,17.0,105.0,27.0,...,7.0,23.0,30.0,6.0,0.0,8.0,7.0,1.0,0.0,2020


## Verify tables created

In [25]:
# check if all tables in household.duckdb are created
con = duckdb.connect('household.duckdb')
household_tables = con.execute(f'SELECT table_name FROM information_schema.tables').df()
con.close()
household_tables


Unnamed: 0,table_name
0,household2020
1,household2018
2,household2019


In [26]:
# check if all tables in nationalities.db are created
con = duckdb.connect('nationalities.duckdb')
nationalities_tables = con.execute(f'SELECT table_name FROM information_schema.tables').df()
con.close()
nationalities_tables


Unnamed: 0,table_name
0,nationalities2018
1,nationalities2020
2,nationalities2019
