In [None]:
import pandas as pd

In [None]:
import os
# Exploratory check: list the entries of the 'data' directory (the path is
# relative, so it is resolved against the current working directory).
os.listdir('data')

In [None]:
# Input location. INPATH keeps its trailing slash because file paths in this
# notebook are built by plain string concatenation (INPATH + INFILE1).
INPATH: str = 'data/'
INFILE1: str = '202005_petroleo_cnmc.xlsx'

In [None]:
# Output location and the two workbooks written at the end of the notebook:
# OUTFILE1 receives the combined frame, OUTFILE2 the provincial prices.
OUTPATH: str = "data/"
OUTFILE1: str = "cnmc_goa_gna.xlsx"
OUTFILE2: str = "cnmc_prov_month_prices.xlsx"

In [None]:
# Inclusive date window (ISO strings) used to slice the date-indexed frames.
START_DT: str = "2003-01-01"
END_DT: str = "2020-03-01"

In [None]:
# Spanish month names in calendar order; a name's position in the list plus
# one is its month number.
month_parser = (
    "enero febrero marzo abril mayo junio "
    "julio agosto septiembre octubre noviembre diciembre"
).split()

In [None]:
def create_date(yr, mon, day=1):
    """Build timestamp(s) from numeric year, month and day components.

    Parameters
    ----------
    yr, mon : scalar or array-like
        Year(s) and month number(s). Plain Python numbers, NumPy scalars,
        lists, arrays and pandas Series are all accepted.
    day : scalar or array-like, default 1
        Day of the month. The value is honoured rather than being silently
        replaced by 1.

    Returns
    -------
    Whatever ``pd.to_datetime`` returns for the input container: a
    ``Timestamp`` for scalar input, a datetime ``Series`` for Series input.

    Raises
    ------
    ValueError
        If the components do not form a valid calendar date.
    """
    # pd.to_numeric passes numeric scalars through and turns lists into
    # arrays, so the arithmetic below works without requiring `.astype`.
    yyyymmdd = (
        10000 * pd.to_numeric(yr)
        + 100 * pd.to_numeric(mon)
        + pd.to_numeric(day)
    )
    return pd.to_datetime(yyyymmdd, format='%Y%m%d')

# Load Data

In [None]:
# Source column header -> short consumption column name.
cons_renaming = {
    'GASÓLEO A': 'cons_GOA',
    'GASOLINA  AUTO. S/PB 95 I.O.': 'cons_GNA95',
    'GASOLINA  AUTO. S/PB 98 I.O.': 'cons_GNA98',
}

# Columns kept after renaming: exactly the renamed ones, in mapping order.
cons_select_cols = list(cons_renaming.values())

In [None]:
# Consumption data (sheet 'Con'), indexed by month-start date.
cons = pd.read_excel(INPATH + INFILE1, sheet_name='Con')

# Drop the 'ANUAL' rows; .copy() makes the column assignments below act on an
# independent frame instead of a slice of the raw one.
cons = cons[cons['MES'] != 'ANUAL'].copy()

# Spanish month name -> month number (1-based position in month_parser).
month_to_num = dict(zip(month_parser, range(1, len(month_parser) + 1)))
cons['MES'] = cons['MES'].replace(month_to_num).astype(int)

# One vectorised call over the whole columns. The former row-wise apply used
# x[0] / x[1] on a label-indexed row, which relies on the deprecated
# positional fallback of Series.__getitem__.
cons['Date'] = create_date(yr=cons['AÑO'], mon=cons['MES'], day=1)

cons = (
    cons.rename(columns=cons_renaming)
        .set_index('Date')
        .sort_index()  # label slicing below needs an ascending date index
        .loc[START_DT:END_DT, cons_select_cols]
        .copy()
)  # kts (unit note kept from the original; presumably kilotonnes - confirm)
pd.concat([cons.head(3), cons.tail(3)], axis=0)

In [None]:
# Source column header -> short spot-quotation column name.
cot_renaming = {
    'GASÓLEO A': 'spot_GOA',
    'GASOLINA': 'spot_GNA',
    'BRENT': 'spot_Brent',
}

# Columns kept after renaming: exactly the renamed ones, in mapping order.
cot_select_cols = list(cot_renaming.values())

In [None]:
# Spot quotations (sheet 'Cot'). The sheet's 'MES' column is renamed to
# 'Date' and used directly as the index before slicing the date window.
cot = (
    pd.read_excel(INPATH + INFILE1, sheet_name='Cot')
      .rename(columns=cot_renaming)
      .rename(columns={'MES': 'Date'})
      .set_index('Date')
      .loc[START_DT:END_DT, cot_select_cols]
      .copy()
)
cot.head()

In [None]:
# Source column header -> short retail-price column name.
prices_renaming = {
    'GASÓLEO A': 'price_GOA',
    'GASOLINA  AUTO. S/PB 95 I.O.': 'price_GNA95',
    'GASOLINA  AUTO. S/PB 98 I.O.': 'price_GNA98',
}

# Columns kept after renaming: exactly the renamed ones, in mapping order.
prices_select_cols = list(prices_renaming.values())

In [None]:
# Prices (sheet 'PVP'), indexed by month-start date.
prices = pd.read_excel(INPATH + INFILE1, sheet_name='PVP')

# Spanish month name -> month number (1-based position in month_parser).
month_to_num = dict(zip(month_parser, range(1, len(month_parser) + 1)))
prices['MES'] = prices['MES'].replace(month_to_num).astype(int)

# One vectorised call over the whole columns. The former row-wise apply used
# x[0] / x[1] on a label-indexed row, which relies on the deprecated
# positional fallback of Series.__getitem__.
prices['Date'] = create_date(yr=prices['AÑO'], mon=prices['MES'], day=1)

prices = (
    prices.rename(columns=prices_renaming)
          .set_index('Date')
          .sort_index()  # label slicing below needs an ascending date index
          .loc[START_DT:END_DT, prices_select_cols]
          .copy()
)
pd.concat([prices.head(3), prices.tail(3)], axis=0)

In [None]:
# Align the three date-indexed frames side by side and keep only the rows
# for which every column has a value.
comb = pd.concat((cons, cot, prices), axis='columns').dropna(axis='index', how='any')
pd.concat([comb.head(3), comb.tail(3)], axis=0)

In [None]:
# Display the consumption rename mapping (inspection only; nothing is assigned).
cons_renaming

In [None]:
# Per-province prices (sheet 'PVP_Prov'), using the same column names as the
# national price table.
prices_prov = (
    pd.read_excel(INPATH + INFILE1, sheet_name='PVP_Prov')
      .rename(columns=prices_renaming)
)

# Drop the 'TOTAL' rows; .copy() so the assignments below act on an
# independent frame.
prices_prov = prices_prov[prices_prov['PROVINCIA'] != 'TOTAL'].copy()

# Spanish month name -> month number (1-based position in month_parser).
month_to_num = dict(zip(month_parser, range(1, len(month_parser) + 1)))
prices_prov['MES'] = prices_prov['MES'].replace(month_to_num).astype(int)

# One vectorised call over the whole columns. The former row-wise apply used
# x[0] / x[1] on a label-indexed row, which relies on the deprecated
# positional fallback of Series.__getitem__.
prices_prov['Date'] = create_date(yr=prices_prov['AÑO'], mon=prices_prov['MES'], day=1)

# Train/Test split

In [None]:
# Name of the column that will carry the 'train' / 'test' label.
split_col: str = "split"
# Size of the hold-out: number of trailing rows of `comb`, and number of
# months subtracted from the last date of `prices_prov`.
test_size: int = 12

In [None]:
# Hold-out split by position: the last `test_size` rows of `comb` form the
# test set and everything before them is the training set.
idx_time = comb.index

# Use an explicit cut position. With test_size == 0 the negative-slice form
# breaks: idx_time[:-0] is empty and idx_time[-0:] is the whole index.
n_train = max(len(idx_time) - test_size, 0)
idx_train, idx_test = idx_time[:n_train], idx_time[n_train:]

In [None]:
# Flag the rows whose index label belongs to the hold-out set, default every
# row to 'train', then overwrite the flagged rows with 'test'.
in_test = comb.index.isin(idx_test)
comb[split_col] = 'train'
comb.loc[in_test, split_col] = 'test'
comb[split_col].value_counts()

In [None]:
# Latest date present in the provincial price table.
last_date = prices_prov['Date'].max()
# Training data ends `test_size` months before that date.
TRAIN_END_DT = last_date + pd.DateOffset(months=-test_size)
TRAIN_END_DT

In [None]:
# Rows dated strictly after TRAIN_END_DT are the hold-out; all others train.
is_holdout = prices_prov['Date'].gt(TRAIN_END_DT)
prices_prov[split_col] = 'train'
prices_prov.loc[is_holdout, split_col] = 'test'
prices_prov[split_col].value_counts()

In [None]:
# Write the combined, split-labelled frame (date index included) to Excel.
comb.to_excel(f'{OUTPATH}{OUTFILE1}')

In [None]:
# Write the split-labelled provincial price table to Excel.
prices_prov.to_excel(f'{OUTPATH}{OUTFILE2}')