# Imports workflow

This workflow is a little more complex because here we need to download the data from the Central Bank. For exports, the data was already downloaded by the procomer_chapter and procomer_country scripts.

## Libraries

We start by loading some libraries.

In [1]:
from pathlib import Path
import datetime as dt
import requests
import xml.etree.ElementTree as ET
import pandas as pd
import numpy as np
import re
import os

Some auxiliary functions

In [2]:
def string2number(x):
    """Return only the digit characters of *x*, concatenated in order.

    Every non-digit character (currency symbols, thousands and decimal
    separators, spaces, ...) is dropped, so ``"¢662,00"`` becomes ``"66200"``.
    The result is still a string; an input without digits yields ``""``.
    """
    # Raw string: "\d" in a normal literal is an invalid escape sequence.
    return "".join(re.findall(r"\d", x))

def get_correct_exr(x):
    """Convert a digits-only string into a numeric value.

    A 5-character input is converted and divided by 100; a 3-character
    input is converted as is.

    Raises:
        ValueError: if the length of *x* is neither 5 nor 3.
    """
    n_chars = len(x)
    # Reject unsupported lengths before attempting any conversion.
    if n_chars not in (3, 5):
        raise ValueError("Not correct len for string (must be 5 or 3)")
    amount = pd.to_numeric(x)
    return amount / 100 if n_chars == 5 else amount
    

In [3]:
# Quick check of the two helpers on a sample quote: strip everything but the
# digits, then turn the digit string into a number.
buy = "¢662,00"
buy2 = get_correct_exr(string2number(buy))
print(buy2)

662.0


Now we create the parameters to pass in the call to the BCCR API

In [4]:
# Endpoint of the BCCR economic-indicators web service.
base = "https://gee.bccr.fi.cr/Indicadores/Suscripciones/WS/wsindicadoreseconomicos.asmx/ObtenerIndicadoresEconomicos?"
# The requested range ends on 31 December of the current year (dd/mm/YYYY).
final_day = dt.date(dt.date.today().year, 12, 31).strftime("%d/%m/%Y")
# Query-string parameters; the credentials are read from the environment.
params_bccr = dict(
    Indicador="1993",
    FechaInicio="01/01/1999",
    FechaFinal=final_day,
    Nombre="Marlon",
    Subniveles="S",
    CorreoElectronico=os.getenv("BCCR_USER"),
    Token=os.getenv("BCCR_PASS"),
)

We make the call and check the status code: if it is 200, the request succeeded

In [5]:
# Call the BCCR web service. requests has no default timeout, so without one
# an unresponsive server would block this cell forever.
response = requests.get(base, params=params_bccr, timeout=60)
print(response.status_code)
# Fail here on an HTTP error instead of trying to parse an error body as data.
response.raise_for_status()

200


Let's convert the response content into a pandas DataFrame

In [6]:
# Parse the XML payload and collect, for each output column, the text of
# every descendant element carrying the matching tag.
root = ET.fromstring(response.content)
tag_by_column = {
    "cod": "COD_INDICADORINTERNO",
    "date": "DES_FECHA",
    "value": "NUM_VALOR",
}
rows = {
    column: [node.text for node in root.findall(f".//{tag}")]
    for column, tag in tag_by_column.items()
}
raw_df = pd.DataFrame(rows)


We create month and year columns

In [7]:
# Parse the date strings into datetimes. Plain column assignment replaces the
# column (and its dtype); `raw_df.loc[:, "date"] = ...` writes into the
# existing object column on pandas >= 2.0, which leaves the dtype as object
# and makes the `.dt` accessor below fail.
raw_df["date"] = pd.to_datetime(raw_df["date"])
# Calendar year and month of each observation, used for filtering/grouping.
raw_df["year"] = raw_df["date"].dt.year
raw_df["month"] = raw_df["date"].dt.month
raw_df

Unnamed: 0,cod,date,value,year,month
0,1993,1999-01-31 00:00:00-06:00,539.68200000,1999,1
1,1993,1999-02-28 00:00:00-06:00,1070.75189383,1999,2
2,1993,1999-03-31 00:00:00-06:00,1670.59579383,1999,3
3,1993,1999-04-30 00:00:00-06:00,2159.76099634,1999,4
4,1993,1999-05-31 00:00:00-06:00,2678.14828913,1999,5
...,...,...,...,...,...
275,1993,2021-12-31 00:00:00-06:00,18401.47801558,2021,12
276,1993,2022-01-31 00:00:00-06:00,1736.33610531,2022,1
277,1993,2022-02-28 00:00:00-06:00,3442.77220127,2022,2
278,1993,2022-03-31 00:00:00-06:00,5489.44863579,2022,3


Now we keep the rows for month 12 plus every row of the specific year of study (which is the current year)

In [8]:
current_year = dt.datetime.now().year
# Keep every month-12 row plus all rows of the current year.
agg_imp = raw_df.query("month == 12 | year == @current_year").copy(deep=True)
# Stamp every row with the time of this run. Column assignment replaces the
# parsed dates outright, so the column takes the dtype of the new timestamp
# instead of an in-place write into the existing date column.
agg_imp["date"] = dt.datetime.now()
agg_imp

Unnamed: 0,cod,date,value,year,month
11,1993,2022-06-07 20:19:31.462752,6354.59765207,1999,12
23,1993,2022-06-07 20:19:31.462752,5544.82180287,2000,12
35,1993,2022-06-07 20:19:31.462752,5492.92729326,2001,12
47,1993,2022-06-07 20:19:31.462752,5958.10518467,2002,12
59,1993,2022-06-07 20:19:31.462752,6362.64871927,2003,12
71,1993,2022-06-07 20:19:31.462752,6903.63167723,2004,12
83,1993,2022-06-07 20:19:31.462752,7989.87321559,2005,12
95,1993,2022-06-07 20:19:31.462752,9168.15301413,2006,12
107,1993,2022-06-07 20:19:31.462752,10706.6239966,2007,12
119,1993,2022-06-07 20:19:31.462752,13027.30821803,2008,12


Now we read the historical data. We convert the year to a number and coerce the value column to numeric (the value is rounded to 4 digits in a later step)

In [9]:
# Historical snapshots live in ../data relative to the working directory.
path_imp_hist = Path.cwd().parent.joinpath("data", "historical_imp_data_bccr.csv")
data_imp_hist = pd.read_csv(path_imp_hist, sep="|")

# Column assignment (not `.loc[:, col] = ...`) is used throughout so each
# conversion really replaces the column dtype; on pandas >= 2.0 the `.loc`
# form writes into the existing column and keeps its old dtype.
# NOTE(review): the file appears to mix timestamps with and without fractional
# seconds; on pandas >= 2.0 this may need format="ISO8601" - confirm.
data_imp_hist["time"] = pd.to_datetime(data_imp_hist["time"])

# Textual missing-value markers that must become real NaN before conversion.
nan_tokens = {"NaN": np.nan, "nan": np.nan}

# year: normalise to stripped text, map the NaN markers, then convert.
# No errors="coerce" here, so an unexpected non-numeric year still raises.
year_text = data_imp_hist["year"].astype("str").str.strip().replace(nan_tokens)
data_imp_hist["year"] = pd.to_numeric(year_text, downcast="integer").round(0)

# value: anything that cannot be parsed as a number becomes NaN.
data_imp_hist["value"] = pd.to_numeric(
    data_imp_hist["value"].replace(nan_tokens), errors="coerce"
)

Now it's time to filter by the last timestamp and group the data by year and month.

In [10]:
# Timestamp of the snapshot used as the reference for the comparison.
# NOTE(review): the name and the surrounding text say "last", but .min()
# selects the earliest timestamp in the file - confirm which is intended.
last_time = data_imp_hist.loc[:, "time"].min()
last_time

Timestamp('2021-08-29 17:04:02')

In [11]:
# Show the historical frame for visual inspection.
data_imp_hist

Unnamed: 0,time,value,year,month
0,2022-06-07 14:48:12.862689,6354.597652,1999,12.0
1,2022-06-07 14:48:12.862689,5544.821803,2000,12.0
2,2022-06-07 14:48:12.862689,5492.927293,2001,12.0
3,2022-06-07 14:48:12.862689,5958.105185,2002,12.0
4,2022-06-07 14:48:12.862689,6362.648719,2003,12.0
5,2022-06-07 14:48:12.862689,6903.631677,2004,12.0
6,2022-06-07 14:48:12.862689,7989.873216,2005,12.0
7,2022-06-07 14:48:12.862689,9168.153014,2006,12.0
8,2022-06-07 14:48:12.862689,10706.623997,2007,12.0
9,2022-06-07 14:48:12.862689,13027.308218,2008,12.0


In [12]:
# Keep only the rows of the reference snapshot.
data_imp_hist = data_imp_hist.query("time == @last_time").copy(deep=True)
# Total value per (year, month), stored as "old" for the comparison.
# The aggregation is named by string: passing the np.sum callable to agg is
# deprecated in recent pandas.
data_impagg_hist = (
    data_imp_hist.groupby(["year", "month"])
    .agg({"value": "sum"})
    .rename(columns={"value": "old"})
    .reset_index()
)
# Integer keys so they can be merged against the new data's year/month.
data_impagg_hist["year"] = data_impagg_hist["year"].astype(int)
data_impagg_hist["month"] = data_impagg_hist["month"].astype(int)
data_impagg_hist["old"] = data_impagg_hist["old"].round(4)

In [13]:
# Same row selection as for the new data: month 12 of any year, plus every
# month of the current year.
december_or_current = "month == 12 | year == @current_year"
data_impagg_hist = data_impagg_hist.query(december_or_current).copy(deep=True)
data_impagg_hist

Unnamed: 0,year,month,old
0,1999,12,6354.5977
1,2000,12,5544.8218
2,2001,12,5492.9273
3,2002,12,5958.1052
4,2003,12,6362.6487
5,2004,12,6903.6317
6,2005,12,7989.8732
7,2006,12,9168.153
8,2007,12,10706.624
9,2008,12,13027.3082


Now we set up the new data

In [14]:
# Total value per (year, month) of the freshly downloaded data, as "new".
# "value" still holds the text taken from the XML, so it is converted to
# numbers BEFORE summing: summing the strings would concatenate them whenever
# a group has more than one row. assign() works on a copy, so agg_imp itself
# is left untouched.
new_data = (
    agg_imp.assign(value=pd.to_numeric(agg_imp["value"]))
    .groupby(["year", "month"])
    .agg({"value": "sum"})
    .reset_index()
    .rename(columns={"value": "new"})
)
new_data["new"] = new_data["new"].round(4)
new_data

Unnamed: 0,year,month,new
0,1999,12,6354.5977
1,2000,12,5544.8218
2,2001,12,5492.9273
3,2002,12,5958.1052
4,2003,12,6362.6487
5,2004,12,6903.6317
6,2005,12,7989.8732
7,2006,12,9168.153
8,2007,12,10706.624
9,2008,12,13027.3082


We compare the new data with the historical data. We add a tolerance (`tol`) parameter

In [15]:
# Maximum absolute difference allowed between the old and new values for a
# row to count as matching.
tol = 0.001

In [16]:
# Left join: every (month, year) of the new data, with the historical value
# when one exists.
compare_data = new_data.merge(data_impagg_hist, how="left", on=["month", "year"])
# Missing values on either side are treated as 0.
for side in ("old", "new"):
    compare_data[side] = np.where(np.isnan(compare_data[side]), 0, compare_data[side])
# A row passes when old and new agree within the tolerance.
compare_data["check"] = (compare_data["old"] - compare_data["new"]).abs() <= tol


Now we write the data

In [17]:
# The comparison result is written to ../data/status_bccr.csv, pipe-separated
# and without the index.
path_imp_status = Path.cwd().parent / "data" / "status_bccr.csv"
compare_data.to_csv(path_imp_status, sep="|", index=False)

Now, if the historical data file is too heavy and causes a conflict on GitHub, it is better to keep only the last
values and restart the counter. We set a threshold of 40 MB for removing rows

In [18]:
def write_hist_imp(path, limit=40, file2write=None):
    """Write the import history file, dropping old rows once it is too large.

    The new rows come from the module-level ``agg_imp`` (without its ``cod``
    column and with ``date`` renamed to ``time``). While the file at *path*
    is at most *limit* megabytes, the rows of the module-level
    ``data_imp_hist`` are appended after them; above the limit only the new
    rows are written.

    Args:
        path: Existing history file whose size is checked.
        limit: Size threshold in megabytes (1 MB = 1,000,000 bytes).
        file2write: File name to write inside ``../data`` (relative to the
            working directory). When ``None`` the file at *path* is
            overwritten; previously ``None`` raised a TypeError.

    Raises:
        OSError: if *path* does not exist (from ``os.stat``).
    """
    mb_hist = os.stat(path).st_size / 1000000
    # Built once; it is needed in both branches.
    new_rows = agg_imp.drop(columns=["cod"]).rename(columns={"date": "time"})
    if mb_hist <= limit:
        out = pd.concat([new_rows, data_imp_hist])
    else:
        out = new_rows
    if file2write is None:
        path2hist = Path(path)
    else:
        path2hist = Path.cwd().parent.joinpath("data", file2write)
    out.to_csv(path2hist, sep="|", index=False)
    

In [19]:
# Write the history back under the same file name it was read from.
write_hist_imp(path=path_imp_hist, file2write="historical_imp_data_bccr.csv")