In [49]:
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

In [50]:
df_brent = pd.read_csv("./processed_tables/brent.csv")
df_precios = pd.read_csv("./processed_tables/precios.csv")
df_precipitacion = pd.read_csv("./processed_tables/precipitacion.csv")
df_temperatura = pd.read_csv("./processed_tables/temperatura.csv")
df_trm = pd.read_csv("./processed_tables/TRM.csv")

In [51]:
df_brent = df_brent.rename(columns={"Fecha": "Date"})
df_precipitacion = df_precipitacion.rename(columns={"date": "Date"})
df_temperatura = df_temperatura.rename(columns={"date": "Date"})
df_trm = df_trm.rename(columns={"vigenciadesde": "Date"})

# Perform left joins
df_merged = pd.merge(df_precios, df_brent, on="Date", how="left")
df_merged = pd.merge(df_merged, df_precipitacion, on="Date", how="left")
df_merged = pd.merge(df_merged, df_temperatura, on="Date", how="left")
df_merged = pd.merge(df_merged, df_trm, on="Date", how="left")

In [52]:
df_merged.columns

Index(['Date', 'energy_price', 'brent_value', 'precipitacion_unidadmedida',
       'precipitacion_amazonas', 'precipitacion_antioquia',
       'precipitacion_arauca', 'precipitacion_atlantico',
       'precipitacion_bogota', 'precipitacion_bolivar', 'precipitacion_boyaca',
       'precipitacion_caldas', 'precipitacion_caqueta',
       'precipitacion_casanare', 'precipitacion_cauca', 'precipitacion_cesar',
       'precipitacion_choco', 'precipitacion_cordoba',
       'precipitacion_cundinamarca', 'precipitacion_guainia',
       'precipitacion_guaviare', 'precipitacion_huila',
       'precipitacion_la guajira', 'precipitacion_magdalena',
       'precipitacion_meta', 'precipitacion_narino',
       'precipitacion_norte de santander', 'precipitacion_putumayo',
       'precipitacion_quindio', 'precipitacion_risaralda',
       'precipitacion_san andres providencia', 'precipitacion_santander',
       'precipitacion_sucre', 'precipitacion_tolima',
       'precipitacion_valle del cauca', 'precip

In [53]:
df_merged.shape

(644, 72)

In [54]:
print(f"Fecha mínima: {df_merged.loc[0]['Date']}")

Fecha mínima: 2021-10-01


In [55]:
print(f"Fecha máxima: {df_merged.loc[df_merged.shape[0] - 1]['Date']}")

Fecha máxima: 2023-09-30


In [56]:
df_merged.describe()

Unnamed: 0,energy_price,brent_value,precipitacion_amazonas,precipitacion_antioquia,precipitacion_arauca,precipitacion_atlantico,precipitacion_bogota,precipitacion_bolivar,precipitacion_boyaca,precipitacion_caldas,...,temp_PUTUMAYO,temp_QUINDIO,temp_RISARALDA,temp_SAN ANDRES PROVIDENCIA,temp_SANTANDER,temp_SUCRE,temp_TOLIMA,temp_VALLE DEL CAUCA,temp_VICHADA,TRM
count,644.0,644.0,616.0,616.0,616.0,616.0,616.0,616.0,616.0,616.0,...,614.0,614.0,614.0,614.0,614.0,614.0,614.0,614.0,614.0,644.0
mean,322.121164,89.611661,6.270266,67.294067,9.849675,40.643075,10.77211,48.039756,106.116841,76.70224,...,22.912678,21.348549,21.506559,28.793119,19.625029,28.744456,21.675063,22.371571,27.284182,4313.503168
std,241.61842,12.144518,11.016419,73.369778,16.005523,94.15633,19.197051,77.675716,117.254883,89.132885,...,3.764467,1.840646,1.700905,1.082577,2.349299,1.673011,1.451251,1.33394,1.325667,374.0971
min,87.255265,70.93,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,9.975887,15.225,13.45,25.65,13.125,23.05,13.95,17.591995,20.6,3706.95
25%,133.920626,80.80875,0.09,11.4876,0.0,4.8525,0.3975,4.915,28.39,9.61,...,22.08125,20.147299,20.483344,28.05,18.078045,27.620833,20.917243,21.632185,26.75,3972.9025
50%,239.984244,85.955,1.837,41.5885,2.3,14.847,3.535,16.2,64.2265,43.2,...,24.2,21.114753,21.503617,28.725,18.973446,28.720833,21.736357,22.474095,27.3,4303.34
75%,435.65813,95.925,8.6795,101.03075,15.0,35.1825,12.27,66.0,140.482,113.15,...,25.275,22.651657,22.583292,29.55,21.769216,29.75,22.5491,23.327473,28.11875,4636.83
max,1061.527174,123.44,102.08,432.219,152.1,1120.19,188.7,701.92,733.292,486.5,...,27.75,26.2,26.325,31.45,26.7,34.15,26.8,25.147642,30.55,5061.21


In [57]:
precipitacion_columns = df_merged.filter(like='precipitacion_')
temp_columns = df_merged.filter(like='temp_')
specific_columns_df = df_merged[["brent_value", "TRM"]]
exogen_data = pd.concat([precipitacion_columns, temp_columns, specific_columns_df], axis=1)
exogen_data.drop("precipitacion_unidadmedida", axis=1, inplace=True)

## Estandarización de la data

Para empezar a modelar la data, se debe estandarizar los datos, para ello se utilizará la librería `sklearn.preprocessing`, la cual nos permite estandarizar los datos de una manera sencilla.

In [58]:
scaler = StandardScaler()
scaled_data = scaler.fit_transform(exogen_data)
scaled_data

array([[-0.48678438,  0.91324172,  0.77225603, ..., -0.40328169,
        -0.91731214, -1.33955165],
       [-0.56963718,  0.81779798,  1.62265328, ...,  1.59734222,
        -0.91731214, -1.42360583],
       [-0.51603723,  0.41374779,  1.56012407, ...,  1.59734222,
        -0.91731214, -1.42360583],
       ...,
       [-0.46970507, -0.77061673, -0.14692334, ...,  0.97450648,
         0.54497666, -0.58828069],
       [ 3.27411492, -0.61374906, -0.54085736, ...,  0.97450648,
         0.491825  , -0.60976239],
       [ 2.92889493, -0.65194293, -0.61589241, ...,  1.29536125,
         0.491825  , -0.69485988]])

In [59]:
pca = PCA(n_components=2)  # n_components is the number of components to keep
pca_result = pca.fit_transform(scaled_data)

ValueError: Input X contains NaN.
PCA does not accept missing values encoded as NaN natively. For supervised learning, you might want to consider sklearn.ensemble.HistGradientBoostingClassifier and Regressor which accept missing values encoded as NaNs natively. Alternatively, it is possible to preprocess the data, for instance by using an imputer transformer in a pipeline or drop samples with missing values. See https://scikit-learn.org/stable/modules/impute.html You can find a list of all estimators that handle NaN values at the following page: https://scikit-learn.org/stable/modules/impute.html#estimators-that-handle-nan-values

In [None]:
scaled_data.cov()

AttributeError: 'numpy.ndarray' object has no attribute 'cov'