# Landing zone

In [1]:
# Uncomment to upgrade packages
# ! pip install duckdb --user --upgrade --quiet
# ! pip install pandas --user --upgrade --quiet
# ! pip install ipython-sql --user --upgrade --quiet
# ! pip install SQLAlchemy --user --upgrade --quiet
# ! pip install duckdb-engine --user --upgrade --quiet

In [2]:
import os
import requests
import shutil
from datetime import date

import pandas as pd
import duckdb 

In [3]:
# Landing-zone sources, mapping the local file name to its download URL.
# The household entries must stay first: persistent_zone() separates the two
# groups by position in this dictionary.
_HOUSEHOLD_SOURCES = {
    'household18.xls': 'https://datos.madrid.es/egob/catalogo/300438-9-hogares-tama%C3%B1o.xls',
    'household19.xls': 'https://datos.madrid.es/egob/catalogo/300438-10-hogares-tama%C3%B1o.xls',
    'household20.xls': 'https://datos.madrid.es/egob/catalogo/300438-11-hogares-tama%C3%B1o.xls',
}
_NATIONALITIES_SOURCES = {
    'nationalities18.xls': 'https://www.madrid.org/iestadis/fijas/estructu/demograficas/padron/descarga/pc18t18_m5.xls',
    'nationalities19.xls': 'https://www.madrid.org/iestadis/fijas/estructu/demograficas/padron/descarga/pc19t19_m5.xls',
    'nationalities20.xls': 'https://www.madrid.org/iestadis/fijas/estructu/demograficas/padron/descarga/pc20t20_m5.xls',
}
DATA = {**_HOUSEHOLD_SOURCES, **_NATIONALITIES_SOURCES}

## Temporal zone

In [4]:
def temporal_zone():
    """
    Creates the landing/temporal folder and downloads the data if it does not exist.

    Every entry of DATA (file name -> URL) whose file is not yet present in
    ./landing/temporal is fetched and written there as raw bytes. Files that
    are already on disk are left untouched, so the function can be re-run
    safely.

    Raises:
        requests.HTTPError: if a download answers with an error status code.
        requests.RequestException: on connection problems or timeouts.
    """
    temporal_dir = './landing/temporal'
    os.makedirs(temporal_dir, exist_ok=True)

    for file, url in DATA.items():
        target = f'{temporal_dir}/{file}'
        if os.path.exists(target):
            continue

        # A timeout keeps a stalled server from hanging the notebook forever.
        data_request = requests.get(url, timeout=60)
        # Fail loudly instead of saving an HTML error page under an .xls name.
        data_request.raise_for_status()

        # The context manager guarantees the file is flushed and closed.
        with open(target, 'wb') as out_file:
            out_file.write(data_request.content)

In [5]:
# temporal_zone()

## Persistent zone

In [6]:
# Column names applied to the household sheet when it is read.
# 'delete' is a helper column: read_household() uses it to fill gaps in
# 'section' and then drops it.
household_columns = [
    'section',
    'delete',
    'total',
    'single_women_aged_16_to_64',
    'single_men_aged_16_to_64',
    'single_women_aged_65_or_over',
    'single_men_aged_65_or_over',
    'adult_women_with_one_or_more_minors',
    'adult_men_with_one_or_more_minors',
    'two_adults_from_16_to_64_and_without_minors',
    'two_adults_one_at_least_65_and_without_minors',
    'two_adults_and_one_minor',
    'two_adults_and_two_minors',
    'two_adults_and_three_or_more_minors',
    'two_adults_over_35_and_one_adult_from_16_to_34',
    'two_adults_over_35_and_one_adult_from_16_to_34_and_one_minor',
    'two_adults_over_35_and_one_adult_from_16_to_34_and_two_minors',
    'three_adults_and_0_or_more_minors',
    'two_adults_over_35_and_two_adults_from_16_to_34',
    'two_adults_over_35_and_two_adults_from_16_to_34_and_one_minor',
    'two_adults_over_35_and_two_adults_from_16_to_34_and_two_or_more_minors',
    'four_adults_and_0_or_more_minors',
    'five_adults_and_0_or_more_minors',
    'fifteen_or_more_inhabitants',
    'only_minors',
]


In [7]:
def tmp2persistent(file):
    """
    Copies the temporary file to the persistent storage.

    The copy is stored in ./landing/persistent and its name is prefixed with
    today's date (as rendered by date.today()), so copies made on different
    days do not overwrite each other.

    Parameters:
        file: name of the file inside ./landing/temporal.

    Returns:
        The path of the persistent file created.

    Raises:
        FileNotFoundError: if the file is missing from ./landing/temporal.
    """
    persistent_dir = './landing/persistent'
    # Create the target folder here so the helper also works when it is
    # called on its own, not only through persistent_zone().
    os.makedirs(persistent_dir, exist_ok=True)

    source = f'./landing/temporal/{file}'
    destination = f'{persistent_dir}/{date.today()}_{file}'
    shutil.copy(source, destination)
    return destination

def read_household(src_file):
    """
    Reads the Household Excel.

    Loads the 'Composicion del hogar' sheet, naming the columns after
    household_columns. Missing values in 'section' are filled from the helper
    column 'delete', which is then removed; finally every row that still
    contains a missing value is discarded.

    Parameters:
        src_file: path of the Excel file to read.

    Returns:
        The data in DataFrame format.
    """
    df = pd.read_excel(src_file,sheet_name='Composicion del hogar',header=[5],names=household_columns,na_values=None)
    # Assign the result back instead of calling fillna(inplace=True) on the
    # selected column: the chained in-place form is deprecated and does not
    # update the frame when pandas Copy-on-Write is active.
    df['section'] = df['section'].fillna(df['delete'])
    df = df.drop(columns='delete')
    return df.dropna()

def read_nationalities(src_file):
    """
    Loads the 'Total' sheet of a Nationalities Excel file.

    The first five unnamed columns receive explicit names, any other
    'Unnamed' column and every column whose name contains 'Total' is removed,
    column names are stripped and their spaces replaced by underscores, and
    rows with missing values are discarded.

    Parameters:
        src_file: path of the Excel file to read.

    Returns:
        The cleaned data in DataFrame format.
    """
    leading_names = {
        'Unnamed: 0':'Code',
        'Unnamed: 1':'Madrid',
        'Unnamed: 2':'Habitantes',
        'Unnamed: 3':'Españoles',
        'Unnamed: 4':'Extranjeros',
    }

    raw = pd.read_excel(src_file,sheet_name='Total',header=[7],na_values=None)
    table = raw.rename(columns=leading_names)

    # Order matters: the rename above protects the first five columns from
    # the 'Unname' clean-up below.
    unnamed_left = list(table.filter(regex="Unname").columns)
    table = table.drop(columns=unnamed_left)
    total_like = list(table.filter(like='Total').columns)
    table = table.drop(columns=total_like)

    table.columns = table.columns.str.strip().str.replace(' ', '_')
    return table.dropna()
    
def create_DB(DB,df,table_name):
    """
    Creates a persistent table in DuckDB from the contents of the DataFrame.

    If a table with the same name already exists it is replaced, so running
    the notebook again refreshes the table instead of failing.

    Parameters:
        DB: path of the DuckDB database file (created if missing).
        df: DataFrame whose contents populate the table.
        table_name: name of the table to create.
    """
    # Quote the identifier so an unusual table name cannot break the statement.
    quoted_table = '"' + table_name.replace('"', '""') + '"'
    # The DataFrame is exposed under its own view name so the SELECT can never
    # be confused with an existing table called like the target.
    source_view = '_create_db_source'

    con = duckdb.connect(DB)
    try:
        con.register(source_view, df)
        con.execute(f'CREATE OR REPLACE TABLE {quoted_table} AS SELECT * FROM {source_view}')
    finally:
        # Always release the database file, even if the statement fails.
        con.close()

def persistent_zone():
    """
    Builds the persistent zone of the landing area.

    Creates the landing/persistent folder if it does not exist, copies every
    file listed in DATA from the temporal zone and loads each one into its
    own DuckDB table (named after the file, without extension). The first
    three DATA entries go to household.duckdb, the rest to
    nationalities.duckdb. The name of each table is printed as it is created.
    """
    persistent_dir = './landing/persistent'
    if not os.path.exists(persistent_dir):
        os.makedirs(persistent_dir)

    file_names = list(DATA.keys())
    # (files, reader, database) for each kind of source, in processing order.
    stages = (
        (file_names[:3], read_household, 'household.duckdb'),
        (file_names[3:], read_nationalities, 'nationalities.duckdb'),
    )

    for group, reader, database in stages:
        for file_name in group:
            persistent_path = tmp2persistent(file_name)
            frame = reader(persistent_path)

            table = file_name.split('.')[0]
            print(table)
            create_DB(database, frame, table)
    


In [8]:
persistent_zone()

household18
household19
household20
nationalities18
nationalities19
nationalities20
