In [1]:
import os
from dotenv import load_dotenv
# Step up one directory so the `src` package imported in the next cell resolves
# (presumably the notebook lives one level below the project root — confirm).
# The guard keeps the cell idempotent: re-running it no longer climbs one
# directory higher each time. `expanduser` was dropped because it only expands
# a leading "~" and therefore never changed "..".
if not os.path.isdir("src"):
  os.chdir("..")

In [2]:
from src.collect.collect import SDMXCollector
import pandas as pd
from io import StringIO

In [3]:
from sqlalchemy import create_engine, URL

### Get the balance of payments

In [4]:
# Load settings from a .env file into the process environment.
load_dotenv()

# Indexing os.environ (rather than .get) fails fast with KeyError when a
# setting is missing.
pg_user = os.environ["POSTGRES_USER"]
pg_db = os.environ["POSTGRES_DB"]

# Only the user and database are supplied; host, port and password are not set here.
url = URL.create(drivername="postgresql+psycopg2", username=pg_user, database=pg_db)

# `con` is the SQLAlchemy engine handed to every DataFrame.to_sql call below.
con = create_engine(url)

In [5]:
def sample_to_pandas(sample,
                     parse_dates: list[str] = None,
                     engine: str = "pyarrow"):
  """Parse CSV text into a DataFrame.

  Args:
    sample: CSV content as a single string.
    parse_dates: Column names to parse as dates; forwarded unchanged to
      ``pd.read_csv`` (``None`` by default).
    engine: Parser engine forwarded to ``pd.read_csv``. Defaults to
      ``"pyarrow"``, the value that used to be hard-coded; pass ``"c"`` or
      ``"python"`` when pyarrow is not installed.

  Returns:
    The parsed ``pd.DataFrame``.
  """
  # StringIO lets read_csv treat the in-memory text like a file.
  return pd.read_csv(StringIO(sample),
                     parse_dates=parse_dates,
                     engine=engine)

In [6]:
def factorize(df: pd.DataFrame):
  """Replace every text column of ``df`` with integer codes.

  Columns with ``object`` or pandas string dtype are encoded with
  ``pd.factorize``; all other columns are passed through untouched.
  Missing values receive the code ``-1`` (``pd.factorize`` default).

  Args:
    df: Input frame. It is not modified.

  Returns:
    A tuple ``(encoded, factor_array)`` where ``encoded`` is a copy of ``df``
    with the text columns replaced by integer codes, and ``factor_array`` is
    a list of ``(column_name, uniques)`` pairs; ``uniques[code]`` is the
    original label for a given code.
  """
  # Work on a copy: the caller's frame may be a filtered slice, and writing
  # into it in place triggers SettingWithCopyWarning and surprises on re-run.
  encoded = df.copy()
  factor_array = []

  for col, dtype in df.dtypes.items():
    is_text = dtype == object or isinstance(dtype, pd.StringDtype)
    if not is_text:
      continue

    indices, factors = pd.factorize(df[col])
    # Plain column assignment swaps the whole column, so it takes the integer
    # dtype of the codes; `.loc[:, col] = ...` may instead write into the
    # existing object column and keep its dtype.
    encoded[col] = indices
    factor_array.append((col, factors))

  return encoded, factor_array

In [7]:
# Client for the SDMX service hosted at sdmx.oecd.org.
collector = SDMXCollector("sdmx.oecd.org/public", "rest")

# Dataflow reference for the balance-of-payments request. The third entry is
# left empty here — presumably the dataflow version; confirm against SDMXCollector.
flow_ref = ["OECD.SDD.TPS", "DSD_BOP@DF_BOP", ""]
# NOTE(review): n_args is interpreted by SDMXCollector.get; its meaning is not
# visible in this notebook.
n_args = 8

# Ask for the response in CSV form.
sample = collector.get(
  flow_ref,
  n_args=n_args,
  params={"format": "csv"},
)

In [8]:
# Parse the CSV payload and drop the DATAFLOW column in a single chain,
# instead of mutating the freshly built frame in place.
df = (
  sample_to_pandas(sample, parse_dates=["TIME_PERIOD"])
  .drop(columns="DATAFLOW")
)
df.head()

  df = pd.read_csv(StringIO(sample),


Unnamed: 0,REF_AREA,COUNTERPART_AREA,MEASURE,ACCOUNTING_ENTRY,FS_ENTRY,FREQ,UNIT_MEASURE,ADJUSTMENT,TIME_PERIOD,OBS_VALUE,OBS_STATUS,UNIT_MULT,CURRENCY,DECIMALS
0,NLD,WXD,S,C,T,Q,XDC,Y,2003-04-01,16209.56,A,6,EUR,2
1,NLD,WXD,S,C,T,Q,XDC,Y,2003-07-01,16491.17,A,6,EUR,2
2,NLD,WXD,S,C,T,Q,XDC,Y,2003-10-01,16172.25,A,6,EUR,2
3,NLD,WXD,S,C,T,Q,XDC,Y,2004-01-01,16250.9,A,6,EUR,2
4,NLD,WXD,S,C,T,Q,XDC,Y,2004-04-01,16740.05,A,6,EUR,2


In [9]:
# Encode the text columns as integer codes. `factor_array` keeps the
# (column, labels) pairs that are turned into the "bop" dimension tables
# at the end of the notebook.
df, factor_array = factorize(df)
df.head()

Unnamed: 0,REF_AREA,COUNTERPART_AREA,MEASURE,ACCOUNTING_ENTRY,FS_ENTRY,FREQ,UNIT_MEASURE,ADJUSTMENT,TIME_PERIOD,OBS_VALUE,OBS_STATUS,UNIT_MULT,CURRENCY,DECIMALS
0,0,0,0,0,0,0,0,0,2003-04-01,16209.56,0,6,0,2
1,0,0,0,0,0,0,0,0,2003-07-01,16491.17,0,6,0,2
2,0,0,0,0,0,0,0,0,2003-10-01,16172.25,0,6,0,2
3,0,0,0,0,0,0,0,0,2004-01-01,16250.9,0,6,0,2
4,0,0,0,0,0,0,0,0,2004-04-01,16740.05,0,6,0,2


In [20]:
# Store the encoded balance-of-payments frame, replacing any existing table.
# With to_sql's default index=True the DataFrame index is written as a column too.
df.to_sql(name="balance_of_pay", con=con, if_exists='replace')

693

### Get interest rates 

In [10]:
# Dataflow reference for this request (agency, dataflow id, and what looks like
# a version string — confirm against SDMXCollector). Reuses the `collector`
# created in the balance-of-payments section.
flow_ref = ["OECD.SDD.STES", "DSD_KEI@DF_KEI", "4.0"]
# NOTE(review): n_args is interpreted by SDMXCollector.get; its meaning is not
# visible in this notebook.
n_args = 7

# Ask for the response in CSV form.
sample = collector.get(
  flow_ref,
  n_args=n_args,
  params={"format": "csv"},
)

In [11]:
# Parse the payload and drop the DATAFLOW column in a single chain.
df = (
  sample_to_pandas(sample, parse_dates=["TIME_PERIOD"])
  .drop(columns="DATAFLOW")
)

# NOTE(review): the rendered preview shows TIME_PERIOD as 0, 1, 2, ..., so the
# column apparently was not parsed to datetimes and gets encoded here as well —
# confirm that this is intended.
df, factor_array_ir = factorize(df)
df.head()

Unnamed: 0,REF_AREA,FREQ,MEASURE,UNIT_MEASURE,ACTIVITY,ADJUSTMENT,TRANSFORMATION,TIME_PERIOD,OBS_VALUE,OBS_STATUS,UNIT_MULT,DECIMALS,BASE_PER
0,0,0,0,0,0,0,0,0,0.962752,0,0,1,
1,0,0,0,0,0,0,0,1,2.519469,0,0,1,
2,0,0,0,0,0,0,0,2,4.938903,0,0,1,
3,0,0,0,0,0,0,0,3,5.057707,0,0,1,
4,0,0,0,0,0,0,0,4,5.473367,0,0,1,


In [31]:
# Store the encoded frame as "interest_rate", replacing any existing table.
# With to_sql's default index=True the DataFrame index is written as a column too.
df.to_sql(name="interest_rate", con=con, if_exists='replace')

250

### Get exchange rates

In [13]:
# Rebind `collector` to the ECB data API; from here on it no longer points at
# the OECD service used in the sections above.
collector = SDMXCollector("data-api.ecb.europa.eu", "service")
# "EXR" is the flow requested from the ECB (see the URL rendered below).
flow_ref = "EXR"

# Display the request URL as a sanity check before downloading.
collector.make_url(flow_ref, params={"format": "csvdata"})

'https://data-api.ecb.europa.eu/service/data/EXR?format=csvdata'

In [14]:
# Download the whole EXR flow as CSV text.
sample = collector.get(flow_ref, params={"format": "csvdata"})

# First parse without date parsing, so TIME_PERIOD can be sliced by its
# leading characters when the year mask is built further down.
df = sample_to_pandas(sample)
df.head(3)

Unnamed: 0,KEY,FREQ,CURRENCY,CURRENCY_DENOM,EXR_TYPE,EXR_SUFFIX,TIME_PERIOD,OBS_VALUE,OBS_STATUS,OBS_CONF,...,COMPILATION,COVERAGE,DECIMALS,NAT_TITLE,SOURCE_AGENCY,SOURCE_PUB,TITLE,TITLE_COMPL,UNIT,UNIT_MULT
0,EXR.A.ARS.EUR.SP00.A,A,ARS,EUR,SP00,A,2000,0.91995,A,,...,,,5,,4F0,,Argentine peso/Euro,"Indicative exchange rate, Argentine peso/Euro,...",ARS,0
1,EXR.A.ARS.EUR.SP00.A,A,ARS,EUR,SP00,A,2001,0.895263,A,,...,,,5,,4F0,,Argentine peso/Euro,"Indicative exchange rate, Argentine peso/Euro,...",ARS,0
2,EXR.A.ARS.EUR.SP00.A,A,ARS,EUR,SP00,A,2002,3.15561,A,,...,,,5,,4F0,,Argentine peso/Euro,"Indicative exchange rate, Argentine peso/Euro,...",ARS,0


In [8]:
# (rows, columns) of the frame before any filtering.
df.shape

(3334169, 32)

In [15]:
# Select observations whose TIME_PERIOD starts with a year after 2015
# (i.e. 2016 onwards). The vectorised .str accessor and Series.sum avoid one
# Python-level call per row, which matters on a frame with millions of rows.
mask = df["TIME_PERIOD"].str[:4].astype(int) > 2015
# Number of rows kept by the mask.
int(mask.sum())

924997

In [16]:
# Parse the same payload again, this time with TIME_PERIOD handed to the date
# parser, and drop the KEY column in the same chain.
df = (
  sample_to_pandas(sample, parse_dates=["TIME_PERIOD"])
  .drop(columns="KEY")
)

In [17]:
# Keep only the rows selected by `mask`. The mask was computed on an earlier
# parse of the same `sample`, so this relies on both frames sharing one row index.
# NOTE(review): the filtered frame keeps its original, now non-contiguous index
# (see the preview below), and to_sql will store it by default.
df = df[mask]
# Encode the text columns; `factor_array_exr` feeds the "ex_rates" dimension tables.
df, factor_array_exr = factorize(df)

df.head()

Unnamed: 0,FREQ,CURRENCY,CURRENCY_DENOM,EXR_TYPE,EXR_SUFFIX,TIME_PERIOD,OBS_VALUE,OBS_STATUS,OBS_CONF,OBS_PRE_BREAK,...,COMPILATION,COVERAGE,DECIMALS,NAT_TITLE,SOURCE_AGENCY,SOURCE_PUB,TITLE,TITLE_COMPL,UNIT,UNIT_MULT
16,0,0,0,0,0,0,16.340145,0,-1,,...,-1,,5,,0,,0,0,0,0
17,0,0,0,0,0,1,18.74169,0,-1,,...,-1,,5,,0,,0,0,0,0
18,0,0,0,0,0,2,32.911487,0,-1,,...,-1,,5,,0,,0,0,0,0
19,0,0,0,0,0,3,53.820035,0,-1,,...,-1,,5,,0,,0,0,0,0
36,0,0,0,0,1,0,16.7918,0,-1,,...,-1,,5,,0,,0,0,0,0


In [21]:
# Store the filtered, encoded exchange-rate frame, replacing any existing table.
# With to_sql's default index=True the DataFrame index is written as a column too.
df.to_sql(name="exchange_rates", con=con, if_exists='replace')

697

In [None]:
def index_to_df(index_names: list[str]) -> pd.DataFrame:
  """Build a two-column lookup table from a sequence of labels.

  Args:
    index_names: Labels in code order; the label at position ``i`` gets id ``i``.

  Returns:
    A DataFrame with an ``id`` column (0, 1, 2, ...) and a ``name`` column
    holding the labels in the order given.
  """
  row_count = len(index_names)
  return pd.DataFrame({"id": [*range(row_count)], "name": index_names})

In [26]:
def factor_arr_to_df_list(name: str, arr: list[tuple[str, pd.Index]]) -> list[tuple[str, pd.DataFrame]]:
  """Turn ``(column, labels)`` pairs into named lookup tables.

  Args:
    name: Prefix for every table name.
    arr: ``(column_name, labels)`` pairs.

  Returns:
    One ``(table_name, table)`` pair per input pair, in the same order.
    ``table_name`` is ``"<name>_<column_name>"`` lower-cased and ``table`` is
    the result of ``index_to_df(labels)``.
  """
  named_tables = []
  for col, index in arr:
    table_name = f"{name}_{col}".lower()
    named_tables.append((table_name, index_to_df(index)))
  return named_tables

In [30]:
# Collect the lookup tables of all three datasets in one list, in the order
# balance of payments -> interest rates -> exchange rates.
dimension_tables = [
  *factor_arr_to_df_list("bop", factor_array),
  *factor_arr_to_df_list("int_rates", factor_array_ir),
  *factor_arr_to_df_list("ex_rates", factor_array_exr),
]

In [32]:
# Write every lookup table, replacing any existing one. Each table already
# carries an explicit "id" column, so index=False keeps the default RangeIndex
# from being stored as a redundant "index" column.
for tbl_name, tbl in dimension_tables:
  tbl.to_sql(name=tbl_name, con=con, if_exists="replace", index=False)