# Trusted zone

In [11]:
# ! pip install duckdb --user --upgrade --quiet
# ! pip install pyarrow --user --upgrade --quiet

In [12]:
import duckdb
import pyarrow as pa
import numpy as np

In [13]:
def columns(name_DB):
    """Return the column names of the three yearly tables of a database.

    Opens ``{name_DB}.duckdb`` and, for each of the tables ``{name_DB}2018``,
    ``{name_DB}2019`` and ``{name_DB}2020``, reads the column names from
    ``information_schema.columns``.

    Args:
        name_DB: Base name shared by the database file and its yearly tables
            (e.g. ``'household'``).

    Returns:
        A 3-tuple ``(cols_2018, cols_2019, cols_2020)``; each element is a
        list of column names in table-definition order. A list is empty when
        the corresponding table does not exist.
    """
    # Callers slice these lists by position to drop the key column(s), so the
    # order must be made explicit instead of relying on the catalog scan order.
    query = (
        "SELECT column_name FROM information_schema.columns "
        "WHERE table_name = ? ORDER BY ordinal_position"
    )
    con = duckdb.connect(f'{name_DB}.duckdb')
    try:
        per_year = tuple(
            [row[0] for row in con.execute(query, [f'{name_DB}{year}']).fetchall()]
            for year in (2018, 2019, 2020)
        )
    finally:
        # Always release the database file, even if a lookup fails.
        con.close()
    return per_year

def join_table(cols1,cols2,cols3):
    """Build the SELECT-list fragment for the three-year join.

    Every column is qualified with its table alias (``v18``, ``v19``, ``v20``)
    and renamed with a year suffix so that same-named columns from different
    years do not collide in the joined table.

    Args:
        cols1: Column names taken from the 2018 table (alias ``v18``).
        cols2: Column names taken from the 2019 table (alias ``v19``).
        cols3: Column names taken from the 2020 table (alias ``v20``).

    Returns:
        A single string such as ``'v18.a AS a_2018, v19.a AS a_2019, ...'``;
        the empty string when all three lists are empty.
    """
    select_items = []
    for alias, year, names in (('v18', 2018, cols1),
                               ('v19', 2019, cols2),
                               ('v20', 2020, cols3)):
        select_items.extend(f'{alias}.{x} AS {x}_{year}' for x in names)
    return ', '.join(select_items)

def _join_yearly_tables(name_DB, key_cols):
    """Create table ``name_DB`` by full-outer-joining its 2018-2020 tables.

    Args:
        name_DB: Base name of the database file (``{name_DB}.duckdb``), of the
            yearly tables (``{name_DB}2018`` ...) and of the resulting table.
        key_cols: Leading columns shared by the yearly tables. They are kept
            once (without a year suffix); the first one is the join key.
    """
    cols1, cols2, cols3 = columns(name_DB)
    skip = len(key_cols)  # key columns come first and must not get a year suffix
    str_join = join_table(cols1[skip:], cols2[skip:], cols3[skip:])

    # With FULL OUTER JOINs a row may be missing from any year, so the key
    # columns have to be taken from whichever year actually has the row.
    keys = ', '.join(
        f"COALESCE(v18.{k}, v19.{k}, v20.{k}) AS {k}" for k in key_cols
    )
    join_col = key_cols[0]
    query = (
        # OR REPLACE keeps the notebook re-runnable (Restart & Run All).
        f"CREATE OR REPLACE TABLE {name_DB} AS SELECT {keys}, {str_join} "
        f"FROM {name_DB}2018 v18 "
        f"FULL OUTER JOIN {name_DB}2019 v19 ON v18.{join_col} = v19.{join_col} "
        # Match 2020 against the key of the 2018/2019 result; v19 alone is
        # NULL for rows that exist in 2018 but not in 2019.
        f"FULL OUTER JOIN {name_DB}2020 v20 "
        f"ON COALESCE(v18.{join_col}, v19.{join_col}) = v20.{join_col}"
    )

    con = duckdb.connect(f'{name_DB}.duckdb')
    try:
        con.execute(query)
    finally:
        # Release the database file even if the statement fails.
        con.close()

def join_household():
    """Create the ``household`` table in ``household.duckdb``.

    Joins ``household2018``, ``household2019`` and ``household2020`` on
    ``section``; every other column is suffixed with its year.
    """
    _join_yearly_tables('household', ['section'])

def join_nationalities():
    """Create the ``nationalities`` table in ``nationalities.duckdb``.

    Joins ``nationalities2018``, ``nationalities2019`` and
    ``nationalities2020`` on ``Code``; ``Code`` and ``Madrid`` are kept once
    and every other column is suffixed with its year.
    """
    _join_yearly_tables('nationalities', ['Code', 'Madrid'])

def trusted_zone():
    """Build the trusted zone by running each yearly join in turn."""
    for build_table in (join_household, join_nationalities):
        build_table()
    

In [14]:
# Run both joins: creates the `household` and `nationalities` tables in their DuckDB files.
trusted_zone()

## Verification of the joined data

### household

In [15]:
# Read the joined table back into a DataFrame to inspect the result.
con = duckdb.connect('household.duckdb')
try:
    household = con.execute("SELECT * FROM household").df()
finally:
    # Release the database file even if the query fails (e.g. table missing).
    con.close()

In [16]:
household  # joined 2018-2020 table, ready to be pre-processed

Unnamed: 0,section,total_2018,single_women_aged_16_to_64_2018,single_men_aged_16_to_64_2018,single_women_aged_65_or_over_2018,single_men_aged_65_or_over_2018,adult_women_with_one_or_more_minors_2018,adult_men_with_one_or_more_minors_2018,two_adults_from_16_to_64_and_without_minors_2018,two_adults_one_at_least_65_and_without_minors_2018,...,two_adults_over_35_and_one_adult_from_16_to_34_and_one_minor_2020,two_adults_over_35_and_one_adult_from_16_to_34_and_two_minors_2020,three_adults_and_0_or_more_minors_2020,two_adults_over_35_and_two_adults_from_16_to_34_2020,two_adults_over_35_and_two_adults_from_16_to_34_and_one_minor_2020,two_adults_over_35_and_two_adults_from_16_to_34_and_two_or_more_minors_2020,four_adults_and_0_or_more_minors_2020,five_adults_and_0_or_more_minors_2020,fifteen_or_more_inhabitants_2020,only_minors_2020
0,Ciudad de Madrid,1278258.0,116921.0,114806.0,124815.0,35963.0,27460.0,5488.0,156118.0,173809.0,...,25255.0,9283.0,106779.0,51993.0,8131.0,3537.0,48740.0,59406.0,622.0,422.0
1,01. Centro,66411.0,11695.0,13210.0,5173.0,2142.0,970.0,191.0,12277.0,5164.0,...,476.0,213.0,4719.0,1123.0,181.0,67.0,1898.0,2153.0,48.0,25.0
2,1001.0,542.0,72.0,86.0,63.0,16.0,10.0,4.0,82.0,63.0,...,3.0,2.0,36.0,11.0,3.0,0.0,20.0,20.0,0.0,0.0
3,1002.0,481.0,100.0,91.0,36.0,19.0,7.0,0.0,89.0,28.0,...,1.0,0.0,33.0,6.0,1.0,0.0,15.0,14.0,0.0,0.0
4,1003.0,884.0,154.0,179.0,74.0,40.0,13.0,2.0,141.0,77.0,...,10.0,3.0,50.0,16.0,7.0,0.0,28.0,37.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2460,20. San Blas-Canillejas,59462.0,4200.0,4747.0,5239.0,1493.0,1520.0,274.0,7074.0,7573.0,...,1932.0,459.0,4638.0,3215.0,433.0,149.0,2105.0,2508.0,20.0,18.0
2461,1096.0,533.0,101.0,100.0,37.0,22.0,6.0,3.0,97.0,29.0,...,4.0,0.0,46.0,10.0,1.0,0.0,18.0,16.0,0.0,1.0
2462,2091.0,486.0,44.0,40.0,47.0,9.0,10.0,3.0,64.0,77.0,...,4.0,1.0,42.0,31.0,0.0,1.0,17.0,10.0,0.0,0.0
2463,5007.0,642.0,32.0,27.0,68.0,18.0,5.0,4.0,50.0,104.0,...,9.0,9.0,86.0,36.0,2.0,6.0,42.0,50.0,1.0,0.0


### nationalities

In [17]:
# Read the joined table back into a DataFrame to inspect the result.
con = duckdb.connect('nationalities.duckdb')
try:
    nationalities = con.execute("SELECT * FROM nationalities").df()
finally:
    # Release the database file even if the query fails (e.g. table missing).
    con.close()

In [18]:
nationalities  # joined 2018-2020 table, ready to be pre-processed

Unnamed: 0,Code,Madrid,Habitantes_2018,Españoles_2018,Extranjeros_2018,Alemania_2018,Austria_2018,Bélgica_2018,Bulgaria_2018,Chipre_2018,...,Uzbekistán_2020,Vietnam_2020,Yemen_2020,Otros_Países_de_Asia_2020,Australia_2020,Fiji_2020,Nueva_Zelanda_2020,Timor_Oriental_2020,Vanuatu_2020,Otros_Países_de_Oceanía_2020
0,0014,Acebeda (La),65.0,64.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0029,Ajalvir,4559.0,3953.0,606.0,8.0,0.0,1.0,13.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0035,Alameda del Valle,195.0,192.0,3.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0040,Alamo (El),9353.0,8143.0,1210.0,1.0,0.0,1.0,12.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0053,Alcalá de Henares,193751.0,163933.0,29818.0,89.0,12.0,13.0,1351.0,0.0,...,3.0,13.0,11.0,2.0,7.0,0.0,2.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
414,07,Nordeste Comunidad,63572.0,55648.0,7924.0,53.0,3.0,19.0,461.0,0.0,...,3.0,2.0,0.0,11.0,13.0,0.0,3.0,0.0,0.0,0.0
415,08,Sudeste Comunidad,104366.0,92803.0,11563.0,65.0,3.0,23.0,233.0,0.0,...,7.0,11.0,2.0,25.0,22.0,0.0,1.0,0.0,0.0,1.0
416,09,Sudoeste Comunidad,138537.0,126136.0,12401.0,57.0,7.0,19.0,327.0,0.0,...,6.0,35.0,7.0,31.0,51.0,0.0,12.0,0.0,0.0,2.0
417,10,Sierra Sur,34970.0,29932.0,5038.0,28.0,2.0,8.0,97.0,0.0,...,6.0,108.0,37.0,64.0,34.0,0.0,10.0,0.0,1.0,6.0
