<h2>lnd_country_params</h2>
<p>Retrieve the country parameters from the IBGE Paises API documentation page using BeautifulSoup.</p>
<p>Source: https://servicodados.ibge.gov.br/api/docs/paises<br>
Destination: s3://bucket/lnd/ibge/country_params </p>

In [113]:
import configparser as cp
from datetime import datetime, timezone

import pandas as pd
import pandera as pa
import requests as re
from bs4 import BeautifulSoup

<h3>Process parameters</h3>


In [100]:
# Identity of the dataset handled by this notebook; 'schema' and 'name' are
# used further down to build the S3 destination key.
table = dict(
    schema='ibge',
    name='country_params',
    type='api',
)

# Short prefix used for each storage layer; only 'landing' is read by this
# notebook when the sink path is built.
storage = dict(
    landing='lnd',
    raw='raw',
    trusted='trs',
    refined='rfn',
)

<h3>Load configuration file</h3>

In [101]:
# Load the project configuration; later cells read the S3 bucket name and
# credentials from its [s3] section.
cfg = cp.ConfigParser()

# ConfigParser.read() silently skips files it cannot open and returns only the
# names it actually parsed, so fail here rather than with an opaque KeyError
# the first time cfg['s3'] is accessed.
loaded_files = cfg.read('../config.cfg')
if not loaded_files:
    raise FileNotFoundError(
        "Configuration file '../config.cfg' was not found or could not be read"
    )

# Last expression of the cell: shows which file(s) were loaded.
loaded_files

['../config.cfg']

<h3>Define source and sink variables</h3>

In [134]:
# IBGE countries API documentation page that is scraped below.
# (Plain string: there is nothing to interpolate, so no f-prefix.)
src = "https://servicodados.ibge.gov.br/api/docs/paises"

# One CSV object per run: the UTC timestamp suffix keeps successive loads from
# overwriting each other. datetime.now(timezone.utc) replaces the deprecated
# datetime.utcnow() and formats to the same UTC wall-clock value.
load_ts = datetime.now(timezone.utc).strftime('%y%m%d_%H%M%S')
sink = (
    f"s3://{cfg['s3']['bucket_name']}/{storage['landing']}"
    f"/{table['schema']}/{table['name']}_{load_ts}.csv"
)

<h3>Scrape web page</h3>

In [103]:
# Download the documentation page. NOTE: `re` is the `requests` alias from the
# import cell, not the standard-library regex module.
# requests applies no timeout by default, so set one to avoid hanging forever
# on a stalled connection.
page = re.get(src, timeout=30)

# Stop on an HTTP error (4xx/5xx) instead of parsing an error page as if it
# were the real content.
page.raise_for_status()

# Parse the returned HTML with Python's built-in parser backend.
sp = BeautifulSoup(page.text, 'html.parser')

In [104]:
# The candidate entries are the <label> elements inside the element whose id
# is 'ACERVO-container'.
container = sp.find(id='ACERVO-container')
if container is None:
    # find() returns None when nothing matches; report that explicitly instead
    # of failing with "'NoneType' object has no attribute 'find_all'".
    raise ValueError(
        "Element with id 'ACERVO-container' was not found in the page; "
        "the page layout may have changed"
    )

countries = container.find_all('label')

In [105]:
# Parallel lists filled in lockstep: initials[i] is the code for names[i].
initials = []
names = []

for country in countries:
    initial = country.find('span')

    # A label without a <span>, or whose span carries no class attribute,
    # cannot be one of the wanted entries; skip it rather than crash on None.
    if initial is None:
        continue
    span_classes = initial.get('class') or []

    # Only spans whose first CSS class is 'iden2' hold a country code.
    if not span_classes or span_classes[0] != 'iden2':
        continue

    initials.append(initial.contents[0])

    # Remove the span from the label so that the label's first child is the
    # remaining text, from which the ' - ' separator is stripped.
    initial.decompose()
    names.append(country.contents[0].replace(' - ', ''))

<h3>Create, dedup and validate dataframe</h3>

In [108]:
# Pair every code with its name (the two lists are filled in lockstep), build
# the two-column frame and drop repeated rows.
rows = [[code, name] for code, name in zip(initials, names)]

df = pd.DataFrame(rows, columns=['country_initial', 'country_name']).drop_duplicates()

In [129]:
# Validation rules: both columns are non-nullable strings; the code must be
# exactly 2 characters long and the name at most 40.
initial_column = pa.Column(
    'str',
    checks=[pa.Check.str_length(min_value=2, max_value=2)],
    nullable=False,
)
name_column = pa.Column(
    'str',
    checks=[pa.Check.str_length(min_value=0, max_value=40)],
    nullable=False,
)

schema = pa.DataFrameSchema(
    {
        'country_initial': initial_column,
        'country_name': name_column,
    }
)

# Run the checks; the validated frame replaces df.
df = schema.validate(df)

<h3>Write to sink</h3>

In [137]:
# Credentials handed to the storage backend, read from the [s3] config section.
s3_credentials = {
    "key": cfg['s3']['access_key'],
    "secret": cfg['s3']['secret_key'],
}

# Write the frame to the sink as a semicolon-separated UTF-8 CSV without the
# index column.
df.to_csv(
    path_or_buf=sink,
    encoding='UTF-8',
    sep=';',
    index=False,
    storage_options=s3_credentials,
)