In [12]:
from pathlib import Path
import pandas as pd
import numpy as np
import sys
sys.path.append('../')
from src.data import cbk_data

## Cleaning and exporting the data

We have four sources of data that we need to clean and consolidate into a single dataset, so that it can later be added to a possible S3 file.

In [13]:
# Maps each data source name to the file extension of its raw extract.
# Only the two CSV sources are active; the variant that also loads the
# Excel sources ("m" and "x") is kept here for reference:
# files_types = {"cbk" : "csv", "fred" : "csv", "m" : "xls", "x" : "xls"}
files_types = dict(cbk="csv", fred="csv")

We read the data

In [14]:
# Load every raw extract from ../data/raw_<source>.<extension>, resolved
# relative to the current working directory.
data_dir = Path.cwd().parent.joinpath("data")
raw_data = {}
for source, extension in files_types.items():
    # Excel extracts go through read_excel; everything else is parsed as CSV.
    loader = pd.read_excel if extension == "xls" else pd.read_csv
    raw_data[source] = loader(data_dir.joinpath("raw_" + source + "." + extension))

We explore the data

In [15]:
# Preview the raw CBK extract (the rendered table shows only the first and last rows).
raw_data["cbk"]

Unnamed: 0,cod,date,value,var
0,1992,1999-01-31T00:00:00-06:00,528.559625,exp
1,1992,1999-02-28T00:00:00-06:00,1101.014371,exp
2,1992,1999-03-31T00:00:00-06:00,1686.540258,exp
3,1992,1999-04-30T00:00:00-06:00,2315.256577,exp
4,1992,1999-05-31T00:00:00-06:00,2869.498299,exp
...,...,...,...,...
40487,3799,2022-09-10T00:00:00-06:00,,mil_count
40488,3799,2022-09-11T00:00:00-06:00,,mil_count
40489,3799,2022-09-12T00:00:00-06:00,17.000000,mil_count
40490,3799,2022-09-13T00:00:00-06:00,15.000000,mil_count


In [16]:
# Preview the raw FRED extract (the rendered table shows only the first and last rows).
raw_data["fred"]

Unnamed: 0,date,value,var
0,1986-01-02,25.56,crude_oil_wti
1,1986-01-03,26.00,crude_oil_wti
2,1986-01-06,26.53,crude_oil_wti
3,1986-01-07,25.85,crude_oil_wti
4,1986-01-08,25.87,crude_oil_wti
...,...,...,...
106187,2022-05-01,340385.00,usa_m
106188,2022-06-01,339642.00,usa_m
106189,2022-07-01,330040.00,usa_m
106190,2022-08-01,326472.00,usa_m


Let's group the data by year and month. As the data for X and M are available since 2000, we filter all the variables to roughly that period (note that the code below actually keeps the years 1999 through 2022).

In [17]:
# Average every variable by (year, month) for each source, keeping only the
# years 1999 through 2022.
grouped_data = {}
for source, frame in raw_data.items():
    # Parse the date column once and reuse it for both calendar fields.
    dates = pd.DatetimeIndex(frame["date"])
    grouped_data[source] = (
        frame.assign(year=dates.year, month=dates.month)
        .query("year >= 1999 & year <= 2022")
        .groupby(["year", "month", "var"])
        # The "mean" string alias is used instead of np.mean: passing NumPy
        # callables to groupby.agg is deprecated and raises a FutureWarning.
        .agg({"value": "mean"})
        .reset_index()
    )
                 

The CBK data for Exports and Imports is accumulated month by month, so we need to transform it into non-accumulated values. To do that, we extract those variables and transform them separately.

In [18]:
# Split the accumulated trade flows (exports and imports) out of the CBK data.
# query() returns a slice of grouped_data["cbk"]; the explicit .copy() gives
# cbk_data.xm_notacc an independent frame to add columns to, which avoids the
# SettingWithCopyWarning that pandas emits when writing into such a slice.
flowsvars = ["exp", "imp"]
x_data = grouped_data["cbk"].query("var == 'exp'").copy()
m_data = grouped_data["cbk"].query("var == 'imp'").copy()
# Shallow dict copy: the other sources are reused as-is and only the "cbk"
# entry is replaced by the frame without the flow variables.
grouped_data2 = grouped_data.copy()
grouped_data2["cbk"] = grouped_data["cbk"].query("var not in @flowsvars")

Transforming the X and M data

In [19]:
# Run the project helper on the export frame first, then on the import frame.
x_data2, m_data2 = (cbk_data.xm_notacc(flow) for flow in (x_data, m_data))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.loc[:, "value_lag"] = df.groupby("year")["value"].shift()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.loc[:, "value_lag"] = df.loc[:, "value_lag"].replace({np.nan: 0})
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.loc[:, "valnotacc"] = df.loc[:, "value"] - df.loc[:, "value_lag"]
A valu

Now, we join the datasets

In [20]:
# First stack the per-source monthly frames, then append the transformed
# export and import frames; the row index is renumbered at each step.
monthly_frames = pd.concat(grouped_data2, axis=0, ignore_index=True)
final_dataset = pd.concat(
    [monthly_frames, x_data2, m_data2],
    axis=0,
    ignore_index=True,
)

In [21]:
# Show the first rows of the consolidated dataset as a quick sanity check.
final_dataset.head()

Unnamed: 0,year,month,var,value
0,1999,1,imae,52.19728
1,1999,2,imae,48.747353
2,1999,3,imae,50.386924
3,1999,4,imae,47.016367
4,1999,5,imae,48.617007


Saving

In [22]:
# Write the consolidated dataset to ../data/cleaned_data.csv, resolved
# relative to the current working directory.
# The path is passed straight to to_csv: using a pathlib.Path as a context
# manager is deprecated and no longer supported from Python 3.13 onwards.
cleaned_path = Path.cwd().parent.joinpath("data", "cleaned_data.csv")
final_dataset.to_csv(cleaned_path, index=False)