# Landing zone

In [1]:
# Uncomment to upgrade packages
# ! pip install duckdb --user --upgrade --quiet
# ! pip install pandas --user --upgrade --quiet
# ! pip install ipython-sql --user --upgrade --quiet
# ! pip install SQLAlchemy --user --upgrade --quiet
# ! pip install duckdb-engine --user --upgrade --quiet

In [2]:
import os
import requests
import shutil
from datetime import date

import pandas as pd
import duckdb 

In [3]:
# Source workbooks for the landing zone: each key is the local file name used
# under ./landing/temporal and each value is the URL it is downloaded from.
DATA = {
    'household18.xls':'https://datos.madrid.es/egob/catalogo/300438-9-hogares-tama%C3%B1o.xls',
    'household19.xls':'https://datos.madrid.es/egob/catalogo/300438-10-hogares-tama%C3%B1o.xls',
    'household20.xls':'https://datos.madrid.es/egob/catalogo/300438-11-hogares-tama%C3%B1o.xls'
}

## Temporal zone

In [4]:
def temporal_zone():
    """
    Creates the landing/temporal folder and downloads every file listed in
    DATA that does not already exist there.

    Raises:
        requests.HTTPError: if a download answers with an HTTP error status.
        requests.Timeout: if the server does not answer within the timeout.
    """
    temporal_dir = './landing/temporal'
    # exist_ok avoids the check-then-create race of a separate os.path.exists test.
    os.makedirs(temporal_dir, exist_ok=True)

    for file, url in DATA.items():
        target = f'{temporal_dir}/{file}'
        if os.path.exists(target):
            continue

        data_request = requests.get(url, timeout=60)
        # Fail loudly instead of saving an error page as an .xls file, which
        # would then be treated as already downloaded on every later run.
        data_request.raise_for_status()

        # The context manager guarantees the handle is flushed and closed.
        with open(target, 'wb') as out_file:
            out_file.write(data_request.content)

In [5]:
temporal_zone()

## Persistent zone

In [6]:
# Short English labels, presumably intended as replacement column names for the
# household tables.
# NOTE(review): not referenced by any cell visible in this notebook, and the
# 'delete' entry looks like a placeholder for a column to drop - confirm before use.
column_names = ['section','delete','total','alone_w_16-64','alone_m_16-64','alone_w_+65','alone_m_+65','adult_w_+1ch','adult_m_+1ch','2adults_16-64_0ch','2adults_ALO+65_0ch','2adults_1ch','2adults_2ch','2adults_+3ch','2adults_+35_1adult_16-34','2adults_+35_1adult_16-34_1ch','2adults_+35_1adult_16-34_2ch','3adults_+0ch','2adults_+35_2adults_16-34','2adults_+35_2adults_16-34_1ch','2adults_+35_2adults_16-34_+2ch','4adults_+0ch','5_adults_+0ch','+15ppl','only_childs']

In [7]:
def tmp2persistent(file):
    """
    Copies the temporary file to the persistent storage, prefixing the file
    name with today's date.
    Creates the landing/persistent folder if it does not exist yet.
    Returns the path of the persistent file created
    """
    persistent_dir = './landing/persistent'
    # Do not rely on the caller having created the destination folder.
    os.makedirs(persistent_dir, exist_ok=True)

    source = f'./landing/temporal/{file}'
    destination = f'{persistent_dir}/{date.today()}_{file}'
    shutil.copy(source, destination)
    return destination


def read_nationalities(src_file):
    """
    Reads the 'Composicion del hogar' (household composition) sheet of the Excel file.
    The function name is kept for existing callers even though the sheet is not
    about nationalities.
    Returns the data in DataFrame format, without rows that contain missing values
    """
    df = pd.read_excel(src_file,sheet_name='Composicion del hogar',header=[5],na_values=None)

    # The first column is either unnamed or titled 'Distrito / Sección';
    # both variants are normalised to 'Sección'.
    df = df.rename(columns={
        'Unnamed: 0': 'Sección',
        'Unnamed: 2': 'Total',
        'Distrito / Sección': 'Sección',
    })

    # Fill gaps in 'Sección' from the second column. Assigning the result back
    # (instead of a chained inplace fillna on the column) also works when
    # pandas Copy-on-Write is active, where the chained form is a silent no-op.
    df['Sección'] = df['Sección'].fillna(df['Unnamed: 1'])
    df = df.drop(columns='Unnamed: 1')

    # df.columns = df.columns.str.replace(' ', '_')
    return df.dropna()


def create_DB(df,table_name):
    """
    Creates a persistent table in the 'household.duckdb' DuckDB file from the
    contents of the DataFrame.
    A table that already exists under the same name is replaced, so the
    function can be run again against an existing database file.
    """
    # Register the DataFrame under a name different from the target table so the
    # SELECT source and the CREATE target cannot be confused with each other.
    view_name = '"' + f'__{table_name}_source'.replace('"', '""') + '"'
    quoted_table = '"' + table_name.replace('"', '""') + '"'

    con = duckdb.connect('household.duckdb')
    try:
        con.register(f'__{table_name}_source', df)
        con.execute(f'CREATE OR REPLACE TABLE {quoted_table} AS SELECT * FROM {view_name}')
    finally:
        # Always release the database file, even when the statement fails.
        con.close()

def persistent_zone():
    """
    Creates the landing/persistent folder if it does not exist.
    For every file in DATA: copies it to the persistent folder, reads it into a
    DataFrame and stores it in a DB persistent table named after the file
    (the part of the file name before the first dot).
    """
    # exist_ok avoids the check-then-create race of a separate os.path.exists test.
    os.makedirs('./landing/persistent', exist_ok=True)

    for file in DATA:
        path_pers = tmp2persistent(file)
        df = read_nationalities(path_pers)

        table_name = file.split('.')[0]
        print(table_name)
        create_DB(df,table_name)


In [8]:
persistent_zone()

household18
household19
household20


In [20]:
! pip install pyarrow

Defaulting to user installation because normal site-packages is not writeable
Collecting pyarrow
  Downloading pyarrow-9.0.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (35.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m35.3/35.3 MB[0m [31m1.7 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
Installing collected packages: pyarrow
Successfully installed pyarrow-9.0.0


In [21]:
import pyarrow as pa

In [24]:
# Re-open the DuckDB file and read the column names from the catalog as an Arrow table.
# NOTE(review): the query has no WHERE clause, so it returns the columns of
# every table in the database, not just one of them.
con =  duckdb.connect('household.duckdb')
columns = con.execute('SELECT Column_Name FROM INFORMATION_SCHEMA.COLUMNS').arrow()

columns
# The connection is left open here; the cell that queries household20 reuses `con`.
# con.close()

pyarrow.Table
column_name: string
----
column_name: [["Sección","Total","Una mujer sola de 16 a 64 años ","Un hombre solo de 16 a 64 años","Una mujer sola de 65 o más años",...,"Dos adultos de 35 años o más, dos  de 16 a 34 años y dos o más menores","Otro hogar de cuatro adultos, con o sin menores","Cinco o más adultos, con o sin menores","Hogar con 15 ó más habitantes","Hogares con menores solos"]]

In [26]:
columns[0][0]

<pyarrow.StringScalar: 'Sección'>

In [10]:
# Load the whole household20 table into a DataFrame for display.
# NOTE(review): this needs `con` to be an open DuckDB connection, but the cell's
# execution count is lower than that of the cell defining `con` - check that the
# notebook still works with Restart Kernel -> Run All.
con.execute("SELECT * FROM household20").df()

Unnamed: 0,Sección,Total,Una mujer sola de 16 a 64 años,Un hombre solo de 16 a 64 años,Una mujer sola de 65 o más años,Un hombre solo de 65 o más años,Una mujer adulta con uno o más menores,Un hombre adulto con uno o más menores,"Dos adultos de 16 a 64 años, sin menores","Dos adultos, uno al menos de 65 o más años, sin menores",...,"Dos adultos de 35 años o más, otro de 16 a 34 años y un menor","Dos adultos de 35 años o más, otro de 16 a 34 años y dos o más menores","Otro hogar de tres adultos, con o sin menores","Dos adultos de 35 años o más, dos de 16 a 34 años, sin menores","Dos adultos de 35 años o más, dos de 16 a 34 años y un menor","Dos adultos de 35 años o más, dos de 16 a 34 años y dos o más menores","Otro hogar de cuatro adultos, con o sin menores","Cinco o más adultos, con o sin menores",Hogar con 15 ó más habitantes,Hogares con menores solos
0,Ciudad de Madrid,1307682.0,117998.0,116863.0,126695.0,37538.0,26232.0,5328.0,163183.0,175581.0,...,25255.0,9283.0,106779.0,51993.0,8131.0,3537.0,48740.0,59406.0,622.0,422.0
1,01. Centro,69187.0,11695.0,13596.0,5057.0,2269.0,815.0,172.0,13563.0,5267.0,...,476.0,213.0,4719.0,1123.0,181.0,67.0,1898.0,2153.0,48.0,25.0
2,1001.0,568.0,75.0,93.0,52.0,17.0,6.0,5.0,97.0,74.0,...,3.0,2.0,36.0,11.0,3.0,0.0,20.0,20.0,0.0,0.0
3,1002.0,492.0,111.0,99.0,32.0,17.0,4.0,1.0,96.0,30.0,...,1.0,0.0,33.0,6.0,1.0,0.0,15.0,14.0,0.0,0.0
4,1003.0,862.0,137.0,164.0,68.0,43.0,7.0,2.0,148.0,80.0,...,10.0,3.0,50.0,16.0,7.0,0.0,28.0,37.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2460,21029.0,514.0,29.0,26.0,6.0,7.0,17.0,7.0,41.0,23.0,...,30.0,10.0,25.0,49.0,10.0,0.0,14.0,18.0,0.0,0.0
2461,21030.0,561.0,35.0,59.0,27.0,8.0,17.0,5.0,61.0,50.0,...,19.0,4.0,44.0,43.0,5.0,0.0,13.0,17.0,0.0,0.0
2462,21031.0,988.0,106.0,96.0,17.0,9.0,37.0,7.0,156.0,40.0,...,24.0,7.0,33.0,32.0,7.0,1.0,15.0,28.0,1.0,0.0
2463,21032.0,683.0,71.0,65.0,16.0,8.0,31.0,17.0,105.0,27.0,...,24.0,7.0,23.0,30.0,6.0,0.0,8.0,7.0,1.0,0.0
