<h2>lnd_indicators_params</h2>
<p>Scrape the country indicator parameters from the IBGE Países API documentation page with BeautifulSoup.</p>
<p>Source: https://servicodados.ibge.gov.br/api/docs/paises<br>
Destination: s3://bucket/lnd/ibge/country_indicators.csv </p>

In [94]:
import configparser as cp
from datetime import datetime, timezone

import pandas as pd
import pandera as pa
import requests as re
from bs4 import BeautifulSoup

<h3>Process parameters</h3>


In [95]:
# Identification of the dataset handled by this notebook; the schema and name
# are later used as path segments of the sink location.
table = dict(
    schema='ibge',
    name='country_indicators',
    type='api',
)

# Short prefix of each storage layer, keyed by the layer's full name.
storage = dict(
    landing='lnd',
    raw='raw',
    trusted='trs',
    refined='rfn',
)

<h3>Load configuration file</h3>

In [63]:
# Path of the configuration file, relative to the notebook's working directory.
config_path = '../config.cfg'

cfg = cp.ConfigParser()

# ConfigParser.read() silently skips files it cannot open and returns only the
# names it actually parsed, so an empty result means the configuration was not
# loaded. Fail here instead of hitting a KeyError on cfg['s3'] further down.
loaded_files = cfg.read(config_path)
if not loaded_files:
    raise FileNotFoundError(f"Configuration file not found or unreadable: {config_path}")

# Echo the list of parsed files as the cell output.
loaded_files

['../config.cfg']

<h3>Define source and sink variables</h3>

In [64]:
# Web page to scrape (IBGE "paises" API documentation).
src = "https://servicodados.ibge.gov.br/api/docs/paises"

# Target CSV object: s3://<bucket>/<landing prefix>/<schema>/<table name>.csv
bucket_name = cfg['s3']['bucket_name']
landing_prefix = storage['landing']
sink = f"s3://{bucket_name}/{landing_prefix}/{table['schema']}/{table['name']}.csv"

<h3>Scrape web page</h3>

In [65]:
# NOTE: `re` is the alias given to `requests` in the import cell, not the
# standard-library regex module.
# Without a timeout the request can block forever if the server stops responding.
page = re.get(src, timeout=30)

# Abort on 4xx/5xx responses so an error page is never parsed as if it were
# the real documentation page.
page.raise_for_status()

sp = BeautifulSoup(page.text, 'html.parser')

In [66]:
# find() returns None when no element matches; check explicitly so a change in
# the page layout produces a clear error instead of an AttributeError on None.
container = sp.find(id='ACERVO-container')
if container is None:
    raise ValueError(
        "Element with id 'ACERVO-container' was not found in the scraped page; "
        "the page layout may have changed"
    )

# Every <label> inside the container is a candidate indicator entry.
indicators = container.find_all('label')

In [67]:
# Parallel lists: indicator_codes[i] belongs to indicator_names[i].
indicator_codes = []
indicator_names = []

for indicator in indicators:
    code = indicator.find('span')

    # A label without any <span> cannot carry an indicator code; skip it
    # instead of failing on None.
    if code is None:
        continue

    # get('class') yields None when the span has no class attribute; fall back
    # to a placeholder so the first-class comparison below is always safe.
    css_classes = code.get('class') or [None]
    if css_classes[0] != 'iden5':
        continue

    indicator_code = int(code.contents[0])

    # Drop the code span from the label; the label's first remaining child is
    # then read as the name text, minus the first ' - ' separator.
    code.decompose()
    indicator_name = indicator.contents[0].replace(' - ', '', 1)

    # Append both values only after both were parsed, keeping the lists aligned.
    indicator_codes.append(indicator_code)
    indicator_names.append(indicator_name)
    

<h3>Create, dedup and validate dataframe</h3>

In [68]:
# Pair every code with the name stored at the same position.
rows = [[code, indicator_names[position]] for position, code in enumerate(indicator_codes)]

# Build the frame and drop rows that are exact duplicates.
df = pd.DataFrame(rows, columns=['indicator_code', 'indicator_name']).drop_duplicates()

In [77]:
# Column rules: a non-null integer code and a non-null name of at most 90 characters.
code_column = pa.Column('int', checks=[], nullable=False)
name_column = pa.Column(
    'str',
    checks=[pa.Check.str_length(min_value=0, max_value=90)],
    nullable=False,
)

schema = pa.DataFrameSchema(
    {
        'indicator_code': code_column,
        'indicator_name': name_column,
    }
)

# validate() returns the validated frame, which replaces df.
df = schema.validate(df)

In [None]:
# Stamp every row with the current UTC time. datetime.utcnow() is deprecated
# since Python 3.12; taking an aware UTC "now" and stripping tzinfo yields the
# same naive UTC value, so the column contents and CSV format are unchanged.
df['dt_insercao_lnd'] = datetime.now(timezone.utc).replace(tzinfo=None)

<h3>Write to sink</h3>

In [79]:
# Credentials are taken from the [s3] section of the loaded configuration.
s3_credentials = {
    "key": cfg['s3']['access_key'],
    "secret": cfg['s3']['secret_key'],
}

# Write the frame as a ';'-separated UTF-8 CSV, without the index column.
df.to_csv(
    path_or_buf=sink,
    sep=';',
    index=False,
    encoding='UTF-8',
    storage_options=s3_credentials,
)