# Leer Entidades de Azure Table Storage


### Librerías


In [1]:
import pandas as pd
from azure.data.tables import TableClient
import os
from dotenv import load_dotenv, find_dotenv

In [2]:
load_dotenv(find_dotenv())

True

In [3]:
connection_string = os.getenv("AZ_CONNECTION_STRING")

### Funciones necesarias


In [4]:
## Definiendo funciones:


# Crear conexión de una tabla específica dentro del servicio de Azure Table Storage
def set_table_service(connection_string, table):
    """Crear servicio de conexión a Azure Table Storage"""
    return TableClient.from_connection_string(
        conn_str=connection_string, table_name=table
    )


# Obtener datos de Table Storage
def get_data_from_table_storage_table(table_service, filter_query):
    """Recuperar datos de Table Storage"""
    for record in table_service.query_entities(filter_query):
        yield record


# Crear DataFrame con los datos de la tabla consultada
def get_dataframe_from_table_storage_table(table_service, filter_query):
    """Crear un DataFrame con la data del Table Storage"""
    return pd.DataFrame(get_data_from_table_storage_table(table_service, filter_query))

### Creando DataFrame con Data de Az Table Storage


#### TRM Data

In [5]:
table_name_trm = "TRMCurated"
filterQuery = "PartitionKey ne 'random'"

In [6]:
# Creando DataFrame con TRM Data
table_service_trm = set_table_service(connection_string, table_name_trm)
df_trm = get_dataframe_from_table_storage_table(table_service_trm, filterQuery)

print(f"Shape: {df_trm.shape}")
print(f"dtypes: {df_trm.dtypes}")
df_trm.head()

Shape: (689, 5)
dtypes: PartitionKey      object
RowKey            object
valor            float64
vigenciadesde     object
vigenciahasta     object
dtype: object


Unnamed: 0,PartitionKey,RowKey,valor,vigenciadesde,vigenciahasta
0,TRM_20210101_20231020.csv,2021-01-05T00:00:00.000,3420.78,2021-01-05T00:00:00.000,2021-01-05T00:00:00.000
1,TRM_20210101_20231020.csv,2021-01-06T00:00:00.000,3450.74,2021-01-06T00:00:00.000,2021-01-06T00:00:00.000
2,TRM_20210101_20231020.csv,2021-01-07T00:00:00.000,3428.04,2021-01-07T00:00:00.000,2021-01-07T00:00:00.000
3,TRM_20210101_20231020.csv,2021-01-08T00:00:00.000,3459.39,2021-01-08T00:00:00.000,2021-01-08T00:00:00.000
4,TRM_20210101_20231020.csv,2021-01-09T00:00:00.000,3478.11,2021-01-09T00:00:00.000,2021-01-12T00:00:00.000


In [7]:
conteo_por_grupo = df_trm.groupby(['vigenciadesde']).size().reset_index(name='Conteo')

# Muestra los grupos que tienen más de un registro (si hay alguno)
duplicados = conteo_por_grupo[conteo_por_grupo['Conteo'] > 1]

In [8]:
duplicados.head()

Unnamed: 0,vigenciadesde,Conteo
666,2023-10-24T00:00:00.000,2
675,2023-11-04T00:00:00.000,2


In [9]:
duplicados = df_trm[df_trm.duplicated()]

# Si hay registros duplicados, verifica que todos tengan el mismo valor en 'Valor'
for indice, grupo in duplicados.groupby(['vigenciadesde', 'vigenciahasta']):
    if grupo['Valor'].nunique() > 1:
        print(f"Registros duplicados con diferentes valores en 'Valor' para Departamento {indice[0]} y Fecha {indice[1]}:")
        print(grupo)

#todos los registros duplicados tienen el mismo valor

Se identifica que la tabla TRM contiene todos los registros duplicados, por lo que se procede a hacer la depuración

In [10]:
df_sin_duplicados = df_trm.drop_duplicates(subset=['valor', 'vigenciadesde','vigenciahasta'])

# Imprime el DataFrame resultante
print(f"Shape: {df_sin_duplicados.shape}")
print(f"dtypes: {df_sin_duplicados.dtypes}")
df_sin_duplicados.head()

Shape: (687, 5)
dtypes: PartitionKey      object
RowKey            object
valor            float64
vigenciadesde     object
vigenciahasta     object
dtype: object


Unnamed: 0,PartitionKey,RowKey,valor,vigenciadesde,vigenciahasta
0,TRM_20210101_20231020.csv,2021-01-05T00:00:00.000,3420.78,2021-01-05T00:00:00.000,2021-01-05T00:00:00.000
1,TRM_20210101_20231020.csv,2021-01-06T00:00:00.000,3450.74,2021-01-06T00:00:00.000,2021-01-06T00:00:00.000
2,TRM_20210101_20231020.csv,2021-01-07T00:00:00.000,3428.04,2021-01-07T00:00:00.000,2021-01-07T00:00:00.000
3,TRM_20210101_20231020.csv,2021-01-08T00:00:00.000,3459.39,2021-01-08T00:00:00.000,2021-01-08T00:00:00.000
4,TRM_20210101_20231020.csv,2021-01-09T00:00:00.000,3478.11,2021-01-09T00:00:00.000,2021-01-12T00:00:00.000


In [11]:
df_sin_duplicados_fecha = df_sin_duplicados.copy()

df_sin_duplicados_fecha['vigenciadesde'] = pd.to_datetime(df_sin_duplicados_fecha['vigenciadesde'], errors='coerce')
df_sin_duplicados_fecha['vigenciadesde'] = df_sin_duplicados_fecha['vigenciadesde'].dt.strftime('%Y-%m-%d')
df_sin_duplicados_fecha['vigenciahasta'] = pd.to_datetime(df_sin_duplicados_fecha['vigenciahasta'], errors='coerce')
df_sin_duplicados_fecha['vigenciahasta'] = df_sin_duplicados_fecha['vigenciahasta'].dt.strftime('%Y-%m-%d')

print(f"Shape: {df_sin_duplicados_fecha.shape}")
print(f"dtypes: {df_sin_duplicados_fecha.dtypes}")
df_sin_duplicados_fecha.head()

Shape: (687, 5)
dtypes: PartitionKey      object
RowKey            object
valor            float64
vigenciadesde     object
vigenciahasta     object
dtype: object


Unnamed: 0,PartitionKey,RowKey,valor,vigenciadesde,vigenciahasta
0,TRM_20210101_20231020.csv,2021-01-05T00:00:00.000,3420.78,2021-01-05,2021-01-05
1,TRM_20210101_20231020.csv,2021-01-06T00:00:00.000,3450.74,2021-01-06,2021-01-06
2,TRM_20210101_20231020.csv,2021-01-07T00:00:00.000,3428.04,2021-01-07,2021-01-07
3,TRM_20210101_20231020.csv,2021-01-08T00:00:00.000,3459.39,2021-01-08,2021-01-08
4,TRM_20210101_20231020.csv,2021-01-09T00:00:00.000,3478.11,2021-01-09,2021-01-12


#### Precipitación Data

In [12]:
table_name_precipitacion = "PRECIPITACIONCurated"
filterQuery = "PartitionKey ne 'random'"

In [13]:
table_service_precipitacion = set_table_service(connection_string, table_name_precipitacion)
df_precipitacion = get_dataframe_from_table_storage_table(table_service_precipitacion, filterQuery)
print(f"Shape: {df_precipitacion.shape}")
print(f"dtypes: {df_precipitacion.dtypes}")
df_precipitacion.head()

Shape: (261362, 15)
dtypes: PartitionKey            object
RowKey                  object
TimeStamp                int64
codigoestacion          object
codigosensor            object
date                    object
departamento            object
descripcionsensor       object
latitud                 object
longitud                object
municipio               object
nombreestacion          object
precipitacion_total    float64
unidadmedida            object
zonahidrografica        object
dtype: object


Unnamed: 0,PartitionKey,RowKey,TimeStamp,codigoestacion,codigosensor,date,departamento,descripcionsensor,latitud,longitud,municipio,nombreestacion,precipitacion_total,unidadmedida,zonahidrografica
0,DefaultPartitionKey,00003699-9864-405f-83fb-afd64270a798,20231026,42077020,240,2021-07-14T00:00:00.000,VAUPÉS,Precipitacion,1.26,-70.239,MITÚ,MITU,10.0,mm,VAUPES
1,DefaultPartitionKey,0000400d-4c13-40cd-88de-51c6d305d1ef,20231026,12015110,240,2021-12-12T00:00:00.000,ANTIOQUIA,Precipitacion,7.671138889,-76.69405556,CHIGORODÓ,CHIGORODO - AUT,0.1,mm,CARIBE - LITORAL
2,DefaultPartitionKey,00007b0b-ec72-4c5e-8a80-aaf73c645a67,20231026,3526500201,240,2023-01-17T00:00:00.000,VICHADA,Precipitacion,5.48088889,-70.42130556,LA PRIMAVERA,LA PRIMAVERA,0.0,mm,ORINOCO
3,DefaultPartitionKey,0000ca4d-4c22-41a8-a599-d20fc79fd83a,20231026,24025030,240,2021-11-15T00:00:00.000,BOYACÁ,Precipitacion,5.966388889,-73.16389139,PAIPA,LA SIERRA - AUT,5.0,mm,SOGAMOSO
4,DefaultPartitionKey,0000d2d6-8c7b-4120-9078-177e8855eef2,20231026,35025110,240,2023-01-02T00:00:00.000,META,Precipitacion,4.057361111,-73.46791667,VILLAVICENCIO,LA LIBERTAD - AUT,0.5,mm,META


In [14]:
df_precipitacion_sin_duplicados = df_precipitacion.drop_duplicates(subset=['codigoestacion','date','municipio','departamento'])

# Imprime el DataFrame resultante
print(f"Shape: {df_precipitacion_sin_duplicados.shape}")

Shape: (257538, 15)


In [15]:
df_precipitacion_sin_duplicados_fecha = df_precipitacion_sin_duplicados.copy()

df_precipitacion_sin_duplicados_fecha['date'] = pd.to_datetime(df_precipitacion_sin_duplicados_fecha['date'], errors='coerce')
df_precipitacion_sin_duplicados_fecha['date'] = df_precipitacion_sin_duplicados_fecha['date'].dt.strftime('%Y-%m-%d')

print(f"Shape: {df_precipitacion_sin_duplicados_fecha.shape}")
df_precipitacion_sin_duplicados_fecha.head(2)

Shape: (257538, 15)


Unnamed: 0,PartitionKey,RowKey,TimeStamp,codigoestacion,codigosensor,date,departamento,descripcionsensor,latitud,longitud,municipio,nombreestacion,precipitacion_total,unidadmedida,zonahidrografica
0,DefaultPartitionKey,00003699-9864-405f-83fb-afd64270a798,20231026,42077020,240,2021-07-14,VAUPÉS,Precipitacion,1.26,-70.239,MITÚ,MITU,10.0,mm,VAUPES
1,DefaultPartitionKey,0000400d-4c13-40cd-88de-51c6d305d1ef,20231026,12015110,240,2021-12-12,ANTIOQUIA,Precipitacion,7.671138889,-76.69405556,CHIGORODÓ,CHIGORODO - AUT,0.1,mm,CARIBE - LITORAL


#### Temperatura Data

In [16]:
table_name_temperatura = "TEMPERATURACurated"
filterQuery = "PartitionKey ne 'random'"

In [17]:
table_name_temperatura = set_table_service(connection_string, table_name_temperatura)
df_temperatura = get_dataframe_from_table_storage_table(table_name_temperatura, filterQuery)
print(f"Shape: {df_temperatura.shape}")
print(f"dtypes: {df_temperatura.dtypes}")
df_temperatura.head()

Shape: (182761, 16)
dtypes: PartitionKey          object
RowKey                object
TimeStamp              int64
codigoestacion        object
codigosensor          object
date                  object
departamento          object
descripcionsensor     object
latitud               object
longitud              object
municipio             object
nombreestacion        object
temp_max             float64
temp_min             float64
unidadmedida          object
zonahidrografica      object
dtype: object


Unnamed: 0,PartitionKey,RowKey,TimeStamp,codigoestacion,codigosensor,date,departamento,descripcionsensor,latitud,longitud,municipio,nombreestacion,temp_max,temp_min,unidadmedida,zonahidrografica
0,DefaultPartitionKey,00004632-074c-49c5-ada2-0d80acc76753,20231103,23025502,68,2023-02-13T00:00:00.000,CALDAS,Temp Aire 2 m,5.044997222,-75.33221944,MANIZALES,ALMACAFE LETRAS - AUT,11.76996,3.194234,°C,MEDIO MAGDALENA
1,DefaultPartitionKey,00006165-52f2-4ddb-86bc-928e0e7d5270,20231104,24025050,68,2021-02-16T00:00:00.000,SANTANDER,Temp Aire 2 m,6.274,-73.151,CHARALÁ,CHARALA - ISAGEN - Estacion en Pruebas,28.7,16.9,°C,SOGAMOSO
2,DefaultPartitionKey,0000c881-f422-45e6-b3a7-d80ed727431b,20231104,16025503,68,2022-08-03T00:00:00.000,NORTE DE SANTANDER,Temp Aire 2 m,7.923163889,-72.83375556,GRAMALOTE,GRAMALOTE - AUT,22.89349,15.37709,°C,CATATUMBO
3,DefaultPartitionKey,0000f7fd-1700-4e21-aacf-884faf159ef5,20231104,26125502,68,2022-07-23T00:00:00.000,VALLE DEL CAUCA,Temp Aire 2 m,4.297558333,-75.86396111,CAICEDONIA,CAICEDONIA - AUT,25.46039,17.62243,°C,CAUCA
4,DefaultPartitionKey,0001598f-192a-4d55-bf47-fda975ba6f14,20231104,35080050,68,2022-04-04T00:00:00.000,BOYACÁ,Temp Aire 2 m,5.096,-73.053,PÁEZ,PAEZ,23.45,17.94,°C,META


In [18]:
df_temperatura_sin_duplicados = df_temperatura.drop_duplicates(subset=['codigoestacion','date','municipio','departamento'])

# Imprime el DataFrame resultante
print(f"Shape: {df_temperatura_sin_duplicados.shape}")


Shape: (182504, 16)


In [19]:
df_temperatura_sin_duplicados_fecha = df_temperatura_sin_duplicados.copy()

df_temperatura_sin_duplicados_fecha['date'] = pd.to_datetime(df_temperatura_sin_duplicados_fecha['date'], errors='coerce')
df_temperatura_sin_duplicados_fecha['date'] = df_temperatura_sin_duplicados_fecha['date'].dt.strftime('%Y-%m-%d')

print(f"Shape: {df_precipitacion_sin_duplicados_fecha.shape}")
df_temperatura_sin_duplicados_fecha.head(2)

Shape: (257538, 15)


Unnamed: 0,PartitionKey,RowKey,TimeStamp,codigoestacion,codigosensor,date,departamento,descripcionsensor,latitud,longitud,municipio,nombreestacion,temp_max,temp_min,unidadmedida,zonahidrografica
0,DefaultPartitionKey,00004632-074c-49c5-ada2-0d80acc76753,20231103,23025502,68,2023-02-13,CALDAS,Temp Aire 2 m,5.044997222,-75.33221944,MANIZALES,ALMACAFE LETRAS - AUT,11.76996,3.194234,°C,MEDIO MAGDALENA
1,DefaultPartitionKey,00006165-52f2-4ddb-86bc-928e0e7d5270,20231104,24025050,68,2021-02-16,SANTANDER,Temp Aire 2 m,6.274,-73.151,CHARALÁ,CHARALA - ISAGEN - Estacion en Pruebas,28.7,16.9,°C,SOGAMOSO


#### Brent

In [20]:
table_name_brent = "BRENTCurated"
filterQuery = "PartitionKey ne 'random'"

In [21]:
table_name_brent = set_table_service(connection_string, table_name_brent)
df_brent = get_dataframe_from_table_storage_table(table_name_brent, filterQuery)
print(f"Shape: {df_brent.shape}")
print(f"dtypes: {df_brent.dtypes}")
df_brent.head()

Shape: (532, 9)
dtypes: PartitionKey    object
RowKey          object
Apertura        object
Fecha           object
Máximo          object
Mínimo          object
Vol             object
var             object
Último          object
dtype: object


Unnamed: 0,PartitionKey,RowKey,Apertura,Fecha,Máximo,Mínimo,Vol,var,Último
0,DefaultPartitionKey,000b65fc-8b3b-4e0b-9ba3-5f1842b1ecb0,8487,13.01.2022,8510,8379,"265,20K","-0,24%",8447
1,DefaultPartitionKey,001c0a9d-9441-4aff-94db-6ba29f786e96,7949,19.12.2022,8119,7902,"164,92K","0,82%",8014
2,DefaultPartitionKey,00604e4e-31f7-4de6-9a39-15ff4466953f,9979,31.08.2022,10046,9544,"10,07K","-1,38%",9649
3,DefaultPartitionKey,00666023-1642-4f3f-86c1-7f4a33c75a7e,8288,29.12.2022,8314,8128,"10,99K","-1,20%",8226
4,DefaultPartitionKey,007881a7-7471-40a4-aa2b-02720a06d866,7547,23.12.2021,7700,7478,"161,24K","2,07%",7685


In [22]:
df_brent_sin_duplicados = df_brent.drop_duplicates(subset=['Apertura','Fecha','Máximo','Mínimo'])

# Imprime el DataFrame resultante
print(f"Shape: {df_brent_sin_duplicados.shape}")

Shape: (532, 9)


Brent no tiene duplicados

In [23]:
df_brent_sin_duplicados_fecha = df_brent_sin_duplicados.copy()

df_brent_sin_duplicados_fecha['Fecha'] = pd.to_datetime(df_brent_sin_duplicados_fecha['Fecha'], errors='coerce', dayfirst=True)
df_brent_sin_duplicados_fecha['Fecha'] = df_brent_sin_duplicados_fecha['Fecha'].dt.strftime('%Y-%m-%d')

print(f"Shape: {df_brent_sin_duplicados_fecha.shape}")
df_brent_sin_duplicados_fecha.head(2)

Shape: (532, 9)


Unnamed: 0,PartitionKey,RowKey,Apertura,Fecha,Máximo,Mínimo,Vol,var,Último
0,DefaultPartitionKey,000b65fc-8b3b-4e0b-9ba3-5f1842b1ecb0,8487,2022-01-13,8510,8379,"265,20K","-0,24%",8447
1,DefaultPartitionKey,001c0a9d-9441-4aff-94db-6ba29f786e96,7949,2022-12-19,8119,7902,"164,92K","0,82%",8014


#### Represas

Se revisó las represas en colombia y se encuntran los sigueintes registros por departamento

In [24]:
table_name_represa = "InventarioRepresasColombia"
filterQuery = "PartitionKey ne 'random'"

In [25]:
table_name_represa = set_table_service(connection_string, table_name_represa)
df_represas = get_dataframe_from_table_storage_table(table_name_represa, filterQuery)
print(f"Shape: {df_represas.shape}")

Shape: (48, 14)


In [26]:
conteo_por_departamento = df_represas.groupby('Departamento').size().reset_index(name='Cantidad_Represas')
conteo_por_departamento = conteo_por_departamento.sort_values(by='Cantidad_Represas', ascending=False)

indice_a_cambiar = 0
nuevo_nombre = 'Cundinamarca'
indice_a_cambiar1 = 1
nuevo_nombre1 = 'Santander'

# Utiliza el método at para cambiar el nombre del departamento
conteo_por_departamento.at[indice_a_cambiar, 'Departamento'] = nuevo_nombre
conteo_por_departamento.at[indice_a_cambiar1, 'Departamento'] = nuevo_nombre1

# Imprime el DataFrame resultante
print(conteo_por_departamento)


      Departamento  Cantidad_Represas
2        Antioquia                 24
9  Valle del Cauca                 11
7            Huila                  4
1        Santander                  2
8           Tolima                  2
0     Cundinamarca                  1
3           Boyaca                  1
4           Caldas                  1
5            Cauca                  1
6          Cordoba                  1


#### Agrupación de precipitación por departamento donde hay represas y por todos los departamentos

In [27]:
# Convierte la columna 'date' a tipo datetime si aún no está en ese formato
df_precipitacion_sin_duplicados_fecha['date'] = pd.to_datetime(df_precipitacion_sin_duplicados_fecha['date'])

# Agrupa por 'departamento' y 'date', y suma la precipitación total
precipitacion_agrupada = df_precipitacion_sin_duplicados_fecha.groupby(['departamento', 'date'])['precipitacion_total'].sum().reset_index()

# Imprime el DataFrame resultante
precipitacion_agrupada


Unnamed: 0,departamento,date,precipitacion_total
0,AMAZONAS,2021-01-01,49.11
1,AMAZONAS,2021-01-02,23.12
2,AMAZONAS,2021-01-03,0.22
3,AMAZONAS,2021-01-04,0.01
4,AMAZONAS,2021-01-09,6.81
...,...,...,...
32569,VICHADA,2023-09-26,0.00
32570,VICHADA,2023-09-27,0.00
32571,VICHADA,2023-09-28,5.40
32572,VICHADA,2023-09-29,0.00


In [28]:
departamentos_seleccionados = ['ANTIOQUIA', 'VALLE DEL CAUCA', 'HUILA', 'SANTANDER', 'TOLIMA', 'CUNDINAMARCA', 'BOYACÁ', 'CALDAS', 'CAUCA', 'CORDOBA']

# Filtrar el DataFrame por los departamentos seleccionados
df_filtrado = precipitacion_agrupada[precipitacion_agrupada['departamento'].isin(departamentos_seleccionados)]

# Imprimir el DataFrame resultante
df_filtrado

Unnamed: 0,departamento,date,precipitacion_total
555,ANTIOQUIA,2021-01-01,227.376
556,ANTIOQUIA,2021-01-02,247.065
557,ANTIOQUIA,2021-01-03,346.492
558,ANTIOQUIA,2021-01-04,78.964
559,ANTIOQUIA,2021-01-09,306.810
...,...,...,...
31758,VALLE DEL CAUCA,2023-09-21,0.000
31759,VALLE DEL CAUCA,2023-09-22,15.500
31760,VALLE DEL CAUCA,2023-09-23,0.000
31761,VALLE DEL CAUCA,2023-09-27,0.000


In [29]:
df_pivot = df_filtrado.pivot(index='date', columns='departamento', values='precipitacion_total')

# Resetear el índice si lo deseas
df_pivot.reset_index(inplace=True)

df_pivot

departamento,date,ANTIOQUIA,BOYACÁ,CALDAS,CAUCA,CORDOBA,CUNDINAMARCA,HUILA,SANTANDER,TOLIMA,VALLE DEL CAUCA
0,2021-01-01,227.376,57.70,28.3,42.0,0.0,546.579,62.4,19.20,107.1,18.200
1,2021-01-02,247.065,740.20,264.4,54.7,0.3,1688.323,13.6,71.22,43.3,13.713
2,2021-01-03,346.492,233.60,95.1,126.9,0.0,124.707,22.8,9.46,37.3,141.056
3,2021-01-04,78.964,23.20,53.8,86.9,0.0,70.600,11.9,14.71,21.4,41.300
4,2021-01-09,306.810,369.40,18.9,23.1,0.6,1629.911,14.6,90.44,9.4,68.349
...,...,...,...,...,...,...,...,...,...,...,...
948,2023-09-26,15.100,12.00,,55.3,,69.900,0.2,60.12,0.0,
949,2023-09-27,1.100,252.50,,0.9,,18.800,4.4,11.30,23.2,0.000
950,2023-09-28,10.800,174.60,,54.1,,42.000,33.5,102.30,24.3,11.066
951,2023-09-29,22.300,12.70,,0.5,,1.100,1.5,25.30,0.8,


In [30]:
from azure.storage.blob import BlobServiceClient
import io

In [31]:
def obtain_content_of_blob(connection_string, container_name, file_name):
    """
    Esta función crea una conexión a un archivo específico almacenado en blob Storage y retorna su contenido
    """
    blob_service_client = BlobServiceClient.from_connection_string(connection_string)
    container_client = blob_service_client.get_container_client(container_name)
    blob_client = container_client.get_blob_client(file_name)
    blob_stream = blob_client.download_blob().readall()
    blob_file = io.BytesIO(blob_stream)
    return blob_file

In [32]:
connection_string = os.getenv("AZ_CONNECTION_STRING")
container_name = "raw/EnergySuiteData/precios/aire/"

In [33]:
df_precio = pd.read_csv(
    obtain_content_of_blob(connection_string, container_name, "AireSup-Prices.csv"),
    delimiter=";",
    # type={"Id":str,"Date": str,"Period":int,"Version":int,"ProductType":str,
    #   "ConceptId":str,"ElementId":str,"Value":float,"SourceType":str,"NatureType":str},
    #   nrows=100000) #Loading only first 100000 rows
)
print(df_precio.shape)
print(df_precio.dtypes)
df_precio.head()

(21264, 9)
Id              object
Date            object
Period           int64
Version          int64
MarketType      object
ProductType     object
ConceptId       object
Value          float64
UpLoadId       float64
dtype: object


Unnamed: 0,Id,Date,Period,Version,MarketType,ProductType,ConceptId,Value,UpLoadId
0,7459BD0B-E48E-487A-B7D9-01E11783FAF0,2023-09-30 00:00:00.000,20,4,TMerEner,TPEner,PBNA,1062.184761,
1,25E1062D-603C-432C-BAA5-07C4013A78FE,2023-09-30 00:00:00.000,19,4,TMerEner,TPEner,PBNA,1062.185761,
2,BABD0C9A-111E-4D7A-A4B7-0AB310F6D4A5,2023-09-30 00:00:00.000,8,4,TMerEner,TPEner,PBNA,1008.944761,
3,5109AA0B-B546-408C-B9A5-1C0FF634D91A,2023-09-30 00:00:00.000,2,4,TMerEner,TPEner,PBNA,1008.945761,
4,F7CCDBA3-1710-4BF7-8BEE-28826207E2FA,2023-09-30 00:00:00.000,11,4,TMerEner,TPEner,PBNA,1062.085761,


In [34]:
df_precio_sin_duplicados = df_precio.drop_duplicates(subset=['Date','Period','ProductType','Value'])

# Imprime el DataFrame resultante
print(f"Shape: {df_precio_sin_duplicados.shape}")

Shape: (18862, 9)


In [35]:
df_precio_sin_duplicados_fecha = df_precio_sin_duplicados.copy()

df_precio_sin_duplicados_fecha['Date'] = pd.to_datetime(df_precio_sin_duplicados_fecha['Date'], errors='coerce')
df_precio_sin_duplicados_fecha['Date'] = df_precio_sin_duplicados_fecha['Date'].dt.strftime('%Y-%m-%d')

print(f"Shape: {df_brent_sin_duplicados_fecha.shape}")
df_precio_sin_duplicados_fecha.head(2)

Shape: (532, 9)


Unnamed: 0,Id,Date,Period,Version,MarketType,ProductType,ConceptId,Value,UpLoadId
0,7459BD0B-E48E-487A-B7D9-01E11783FAF0,2023-09-30,20,4,TMerEner,TPEner,PBNA,1062.184761,
1,25E1062D-603C-432C-BAA5-07C4013A78FE,2023-09-30,19,4,TMerEner,TPEner,PBNA,1062.185761,


se debe validar si es correcto eliminar las observaciones duplicadas para la data precio

In [36]:
#df_precio['Date'] = df_precio['Date'].astype(str)
#df_pivot['date'] = df_pivot['date'].astype(str)
df_precio['Date'] = pd.to_datetime(df_precio['Date'])
df_pivot['date'] = pd.to_datetime(df_pivot['date'])
# Unir los DataFrames
df_union = pd.merge(df_precio, df_pivot, left_on=['Date'], right_on=['date'], how='left')

print(f"Shape: {df_union.shape}")
df_union


Shape: (21264, 20)


Unnamed: 0,Id,Date,Period,Version,MarketType,ProductType,ConceptId,Value,UpLoadId,date,ANTIOQUIA,BOYACÁ,CALDAS,CAUCA,CORDOBA,CUNDINAMARCA,HUILA,SANTANDER,TOLIMA,VALLE DEL CAUCA
0,7459BD0B-E48E-487A-B7D9-01E11783FAF0,2023-09-30,20,4,TMerEner,TPEner,PBNA,1062.184761,,2023-09-30,19.500,23.93,,44.9,,3.700,0.3,6.8,0.9,
1,25E1062D-603C-432C-BAA5-07C4013A78FE,2023-09-30,19,4,TMerEner,TPEner,PBNA,1062.185761,,2023-09-30,19.500,23.93,,44.9,,3.700,0.3,6.8,0.9,
2,BABD0C9A-111E-4D7A-A4B7-0AB310F6D4A5,2023-09-30,8,4,TMerEner,TPEner,PBNA,1008.944761,,2023-09-30,19.500,23.93,,44.9,,3.700,0.3,6.8,0.9,
3,5109AA0B-B546-408C-B9A5-1C0FF634D91A,2023-09-30,2,4,TMerEner,TPEner,PBNA,1008.945761,,2023-09-30,19.500,23.93,,44.9,,3.700,0.3,6.8,0.9,
4,F7CCDBA3-1710-4BF7-8BEE-28826207E2FA,2023-09-30,11,4,TMerEner,TPEner,PBNA,1062.085761,,2023-09-30,19.500,23.93,,44.9,,3.700,0.3,6.8,0.9,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21259,24DC2378-565F-4606-9BC7-C4A3C2EAFFF8,2021-09-01,0,4,TMerEner,TPEner,MC,254.265318,,2021-09-01,64.224,99.40,26.1,108.4,43.2,47.811,58.3,72.7,86.1,91.5
21260,606943F5-C16E-4C36-B9B7-D09FF393F031,2021-01-01,0,4,TMerEner,TPEner,INDIPP,122.340000,,2021-01-01,227.376,57.70,28.3,42.0,0.0,546.579,62.4,19.2,107.1,18.2
21261,2664F52D-1E6D-4FCE-8A18-8A9E354491AE,2021-01-01,0,4,TMerEner,TPEner,INDIPC,104.240000,,2021-01-01,227.376,57.70,28.3,42.0,0.0,546.579,62.4,19.2,107.1,18.2
21262,8B9F1056-D761-4F09-844D-86D17D1C093B,2021-01-01,0,4,TMerEner,TPEner,MC,218.909315,,2021-01-01,227.376,57.70,28.3,42.0,0.0,546.579,62.4,19.2,107.1,18.2
