# Imports workflow

This workflow is a little more complex because here we need to download the data from the Central Bank. For exports, the data was already downloaded by the procomer_chapter and procomer_country scripts.

In [None]:
# Print the installed selenium version.
import selenium as sl
print(sl.__version__)

## Libraries

We start by loading some libraries.

In [None]:
from pathlib import Path
import datetime as dt
import requests
import xml.etree.ElementTree as ET
import pandas as pd
import numpy as np
import re
import os

Some auxiliary functions

In [None]:
def string2number(x):
    """Return the decimal digits found in *x*, joined in their original order.

    Every non-digit character (currency symbols, separators, spaces, ...)
    is dropped, e.g. ``"¢662,00"`` becomes ``"66200"``.

    Parameters
    ----------
    x : str
        Text that contains digits mixed with other characters.

    Returns
    -------
    str
        The digits only; an empty string when *x* has no digits.
    """
    # Raw string: "\d" inside a normal literal is an invalid escape sequence.
    return "".join(re.findall(r"\d", x))

def get_correct_exr(x):
    """Convert a digits-only string into a number.

    A 5-character string is read as a value carrying two trailing decimal
    digits, so it is divided by 100; a 3-character string is returned as
    the plain number.

    Parameters
    ----------
    x : str
        Digits-only text of length 3 or 5.

    Returns
    -------
    number
        The numeric value produced by ``pd.to_numeric``.

    Raises
    ------
    ValueError
        If *x* is neither 3 nor 5 characters long.
    """
    n_chars = len(x)
    if n_chars not in (3, 5):
        raise ValueError("Not correct len for string (must be 5 or 3)")
    number = pd.to_numeric(x)
    return number / 100 if n_chars == 5 else number
    

In [None]:
# Quick check of the helpers: keep only the digits of the sample text,
# then turn them into a number.
buy = "¢662,00"
buy2 = get_correct_exr(string2number(buy))
print(buy2)

Now we create the parameters to pass in the call to the BCCR API

In [None]:
# Endpoint of the BCCR web service that is queried.
base = "https://gee.bccr.fi.cr/Indicadores/Suscripciones/WS/wsindicadoreseconomicos.asmx/ObtenerIndicadoresEconomicos?"
# The requested range ends on 31 December of the current year (dd/mm/YYYY).
final_day = dt.date.today().replace(month=12, day=31).strftime("%d/%m/%Y")
# Query-string parameters; the credentials are read from the environment
# variables BCCR_USER and BCCR_PASS.
params_bccr = dict(
    Indicador="1993",
    FechaInicio="01/01/1999",
    FechaFinal=final_day,
    Nombre="Marlon",
    Subniveles="S",
    CorreoElectronico=os.getenv("BCCR_USER"),
    Token=os.getenv("BCCR_PASS"),
)

We make the call and check the status code: if it is 200, the request succeeded.

In [None]:
# Call the BCCR web service. Without a timeout, requests waits indefinitely
# when the server does not answer, which would hang the whole workflow.
response = requests.get(base, params=params_bccr, timeout=60)
# 200 means the request succeeded.
print(response.status_code)

Let's convert the response content into a pandas dataframe

In [None]:
# Parse the XML payload and collect, for each output column, the text of
# every descendant element with the matching tag.
root = ET.fromstring(response.content)
tag_by_column = {
    "cod": "COD_INDICADORINTERNO",
    "date": "DES_FECHA",
    "value": "NUM_VALOR",
}
rows = {
    column: [node.text for node in root.findall(f".//{tag}")]
    for column, tag in tag_by_column.items()
}
raw_df = pd.DataFrame(rows)


We create a month and year col

In [None]:
# Assign whole columns instead of using `.loc[:, col] = ...`: in pandas 2.x the
# `.loc` form writes into the existing (object) column and keeps its dtype, so
# the `.dt` accessor used below would not be available.
raw_df["date"] = pd.to_datetime(raw_df["date"])
raw_df["year"] = raw_df["date"].dt.year
raw_df["month"] = raw_df["date"].dt.month
raw_df

Now we keep the rows for month 12 or for the year under study (which is the current year)

In [None]:
current_year = dt.datetime.now().year
# Keep every row of month 12, plus every row of the current year.
agg_imp = raw_df.query("month == 12 | year == @current_year").copy(deep=True)
# Replace the date column with the time of this run. Plain column assignment
# swaps the column out, whereas `.loc[:, "date"] = ...` writes into the
# existing column and is constrained by its current dtype.
agg_imp["date"] = dt.datetime.now()
agg_imp

Now we read the historical data. We convert the year column to a whole number and the value column to numeric (the value is rounded to 4 digits later, after aggregating)

In [None]:
# Historical file kept in the sibling "data" folder, "|"-separated.
path_imp_hist = Path.cwd().parent.joinpath("data", "historical_imp_data_bccr.csv")
data_imp_hist = pd.read_csv(path_imp_hist, sep="|")

# Columns are assigned directly rather than through `.loc[:, col] = ...`:
# in pandas 2.x the `.loc` form writes into the existing column and keeps its
# dtype, so the conversions below would not change the column type.
data_imp_hist["time"] = pd.to_datetime(data_imp_hist["time"])

# Year: strip surrounding whitespace, map the textual NaN markers to real
# missing values, convert to a number and round to zero decimals.
year_text = (
    data_imp_hist["year"]
    .astype("str")
    .str.strip()
    .replace({"NaN": np.nan, "nan": np.nan})
)
data_imp_hist["year"] = pd.to_numeric(year_text, downcast="integer").round(0)

# Value: same NaN markers; anything that is not a number becomes NaN.
data_imp_hist["value"] = pd.to_numeric(
    data_imp_hist["value"].replace({"NaN": np.nan, "nan": np.nan}),
    errors="coerce",
)

Now it's time to filter by the last timestamp and group the data by year and month.

In [None]:
# The last (most recent) timestamp in the history is the maximum, not the
# minimum, of the time column.
last_time = data_imp_hist["time"].max()
last_time

In [None]:
# Display the historical table for inspection.
data_imp_hist

In [None]:
# Keep only the historical rows whose time equals last_time.
data_imp_hist = data_imp_hist.query("time == @last_time").copy(deep=True)
# Total value per (year, month), stored under the name "old". The string
# "sum" is used because passing np.sum to agg is deprecated in recent pandas.
data_impagg_hist = (
    data_imp_hist.groupby(["year", "month"])
    .agg({"value": "sum"})
    .rename(columns={"value": "old"})
    .reset_index()
)
data_impagg_hist["year"] = data_impagg_hist["year"].astype(int)
data_impagg_hist["month"] = data_impagg_hist["month"].astype(int)
data_impagg_hist["old"] = data_impagg_hist["old"].round(4)

In [None]:
# Keep the month-12 rows of any year plus every row of the current year.
is_december = data_impagg_hist["month"] == 12
is_current_year = data_impagg_hist["year"] == current_year
data_impagg_hist = data_impagg_hist[is_december | is_current_year].copy(deep=True)
data_impagg_hist

Now we set up the new data

In [None]:
# The values come from the XML as text, so they are converted to numbers
# BEFORE aggregating: summing strings concatenates them instead of adding.
# "sum" is passed as a string because np.sum as an agg callable is deprecated.
new_data = (
    agg_imp.assign(value=lambda x: pd.to_numeric(x["value"]))
    .groupby(["year", "month"])
    .agg({"value": "sum"})
    .reset_index()
    .rename(columns={"value": "new"})
)
new_data["new"] = new_data["new"].round(4)
new_data

We compare the new data with the historical data. We add a tolerance (tol) parameter

In [None]:
# Largest absolute difference between old and new values that still counts as a match.
tol = 0.001

In [None]:
# Left-join the historical totals onto the new totals; a missing value on
# either side is treated as 0. `fillna` is used instead of
# np.where(np.isnan(...)) because np.isnan raises on object-dtype columns.
# `check` is True when old and new differ by at most `tol`.
compare_data = (
    pd.merge(new_data, data_impagg_hist, how="left", on=["month", "year"])
    .assign(
        old=lambda x: x["old"].fillna(0),
        new=lambda x: x["new"].fillna(0),
    )
    .assign(check=lambda x: (x["old"] - x["new"]).abs() <= tol)
)


Now we write the data

In [None]:
# Save the comparison table as a "|"-separated file in the sibling data folder.
path_imp_status = Path.cwd().parent / "data" / "status_bccr.csv"
compare_data.to_csv(path_imp_status, index=False, sep='|')

Now, if the historical data is too heavy and causes a conflict on GitHub, it is better to keep only the last
values and restart the counter. We set a threshold of 40 MB for removing rows

In [None]:
def write_hist(path, limit=40, *, file2write):
    """Write the historical file, dropping old rows when it has grown too big.

    The rows to store come from the module-level ``agg_imp`` frame (without
    its ``cod`` column and with ``date`` renamed to ``time``). When the file
    at *path* is at most *limit* megabytes, those rows are written together
    with the module-level ``data_imp_hist`` rows; otherwise only the new rows
    are written.

    Parameters
    ----------
    path : path-like
        Existing file whose size decides whether history is kept.
    limit : int or float, default 40
        Size threshold in megabytes (1 MB = 1,000,000 bytes).
    file2write : str
        File name to write inside the ``data`` folder next to the current
        working directory. Keyword-only: a parameter without a default
        cannot follow ``limit=40`` positionally.

    Raises
    ------
    OSError
        If *path* cannot be inspected with ``os.stat``.
    """
    mb_hist = os.stat(path).st_size / 1000000
    new_rows = agg_imp.drop(columns=["cod"]).rename(columns={"date": "time"})
    if mb_hist <= limit:
        out = pd.concat([new_rows, data_imp_hist])
    else:
        # File is over the limit: start over with only the new rows.
        out = new_rows
    path2hist = Path.cwd().parent.joinpath("data", file2write)
    out.to_csv(path2hist, sep='|', index=False)
    

In [None]:
# Write the history file, using the size of the current historical file to decide what to keep.
write_hist(path=path_imp_hist, file2write="historical_imp_data_bccr.csv")