# Exploratory Data Analysis para datos de Consumo


## Importando Librerías


In [19]:
import pandas as pd
from data_service import obtain_content_of_blob
import os
from dotenv import load_dotenv

In [20]:
# Allow up to 100 columns to be shown when a DataFrame is displayed.
pd.set_option("display.max_columns", 100)

In [21]:
# Load environment variables with python-dotenv (presumably from a local .env file — confirm).
# The blob connection string is read from the environment with os.getenv later in the notebook.
load_dotenv()

True

## Cargando Entidades de consumo del cliente Aire


In [22]:
# Connection string taken from the environment; it is None when the variable is not set.
connection_string = os.environ.get("AZ_CONNECTION_STRING")
# Location passed to obtain_content_of_blob for every file loaded in this notebook.
container_name = "raw/EnergySuiteData/consumos/aire/"

### 1. Datos de consumo


In [23]:
# Read the fifteen-minute consumption file (semicolon separated) with explicit
# column types so that identifiers stay strings.
df_consumo = pd.read_csv(
    obtain_content_of_blob(
        connection_string,
        container_name,
        "AireSup-fifteen-minutes.csv",
    ),
    sep=";",
    dtype=dict(
        Id=str,
        Date=str,
        Period=int,
        Version=int,
        ProductType=str,
        ConceptId=str,
        ElementId=str,
        Value=float,
        SourceType=str,
        NatureType=str,
    ),
    nrows=100000,  # only the first 100000 rows are loaded
)
print(df_consumo.shape, df_consumo.dtypes, sep="\n")
df_consumo.head()

(100000, 10)
Id              object
Date            object
Period           int64
Version          int64
ProductType     object
ConceptId       object
ElementId       object
Value          float64
SourceType      object
NatureType      object
dtype: object


Unnamed: 0,Id,Date,Period,Version,ProductType,ConceptId,ElementId,Value,SourceType,NatureType
0,5A016C3E-BB9E-45C1-89CB-3B46661E93E9,2024-07-01 00:00:00.000,1,0,TPEner,GENERACION,84340133,0.0,FTELE,
1,465EA69C-9994-40D7-AE09-27B753112BAD,2024-07-01 00:00:00.000,1,0,TPEner,ENEREACAP,84340133,3.0,FTELE,
2,8808200D-B1A2-4287-A06C-A2917EF4D9F6,2024-07-01 00:00:00.000,1,0,TPEner,ENERREAC,84340133,0.0,FTELE,
3,5B40179A-707D-44D5-8906-5F9395CFAB5A,2024-07-01 00:00:00.000,1,0,TPEner,ENERACTI,84340133,16.7,FTELE,
4,98FDD3AE-1C55-4E53-9CA9-0F06FD9FB8F8,2023-09-07 00:00:00.000,26,0,TPEner,ENERACTI,2827835,50.73,FTele,


In [24]:
# Columns of the raw consumption frame that are kept for the curated frame.
variables_relevantes_consumo = [
    "Date",
    "Period",
    "ConceptId",
    "ElementId",
    "Value",
]


def period_to_time(period):
    """Convert a fifteen-minute period index into an ``HH:MM`` string.

    The result is ``period * 15`` minutes after midnight, with hours and
    minutes zero-padded to two digits. Period 96 is special-cased to
    ``"00:00"`` instead of ``"24:00"``.

    Args:
        period: Integer period index; 1 maps to ``"00:15"``.

    Returns:
        The formatted ``HH:MM`` string.
    """
    fixed_labels = {1: "00:15", 96: "00:00"}
    if period in fixed_labels:
        return fixed_labels[period]
    hours, minutes = divmod(period * 15, 60)
    return f"{hours:02d}:{minutes:02d}"


# Keep only the relevant columns; .copy() leaves the raw frame untouched.
df_consumo_curated = df_consumo[variables_relevantes_consumo].copy()
# "2024-07-01 00:00:00.000" -> "2024-07-01": only the calendar day of Date is kept.
df_consumo_curated["Date"] = df_consumo_curated["Date"].str.split(" ").str[0]
# Human-readable HH:MM label of the period (period 96 is labelled "00:00").
df_consumo_curated["Hour"] = df_consumo_curated["Period"].apply(period_to_time)
# Timestamp = Date + Period * 15 minutes. Doing the arithmetic on real datetimes
# (instead of concatenating Date with the Hour label) makes period 96 roll over to
# 00:00 of the NEXT day; pairing its "00:00" label with the same Date would stamp
# the last interval of the day 24 hours too early.
df_consumo_curated["Datetime"] = pd.to_datetime(
    df_consumo_curated["Date"]
) + pd.to_timedelta(df_consumo_curated["Period"] * 15, unit="min")
print(df_consumo_curated.dtypes)
df_consumo_curated.head()

Date                 object
Period                int64
ConceptId            object
ElementId            object
Value               float64
Hour                 object
Datetime     datetime64[ns]
dtype: object


Unnamed: 0,Date,Period,ConceptId,ElementId,Value,Hour,Datetime
0,2024-07-01,1,GENERACION,84340133,0.0,00:15,2024-07-01 00:15:00
1,2024-07-01,1,ENEREACAP,84340133,3.0,00:15,2024-07-01 00:15:00
2,2024-07-01,1,ENERREAC,84340133,0.0,00:15,2024-07-01 00:15:00
3,2024-07-01,1,ENERACTI,84340133,16.7,00:15,2024-07-01 00:15:00
4,2023-09-07,26,ENERACTI,2827835,50.73,06:30,2023-09-07 06:30:00


### 2. Datos de Centros de Carga


In [25]:
# Load the meter -> load-center mapping (semicolon separated).
df_load_center = pd.read_csv(
    obtain_content_of_blob(
        connection_string, container_name, "aire_mainActiveMeterId_loadCenter.csv"
    ),
    delimiter=";",
    # Pin the join keys to strings instead of relying on type inference: meter ids
    # can carry leading zeros, and they are later merged against the consumption
    # ElementId column, which is loaded as str. An all-numeric file would otherwise
    # be parsed as integers and the merge keys would no longer match.
    dtype={"MainActiveMeterId": str, "LoadCenterId": str},
)

print(df_load_center.shape)
print(df_load_center.dtypes)
df_load_center.head()

(743, 14)
Id                          object
InstallationCode            object
InstallationName            object
MainActiveMeterId           object
MainReactiveMeterId         object
SupportingActiveMeterId    float64
LoadCenterId                object
DistributorAgentId          object
StartDate                   object
EndDate                     object
MeasureOwner                object
TeamOwner                   object
ExternIdA                   object
ExternIdR                   object
dtype: object


Unnamed: 0,Id,InstallationCode,InstallationName,MainActiveMeterId,MainReactiveMeterId,SupportingActiveMeterId,LoadCenterId,DistributorAgentId,StartDate,EndDate,MeasureOwner,TeamOwner,ExternIdA,ExternIdR
0,7D583BC6-BAEC-4832-9D89-002FD770050C,Frt14425-84321848,OLIMPICA S.A. STO 114,84321848,84321848,,Frt14425,MARM,2023-01-01 00:00:00.000,,,Operador de Red,84321848,84321848
1,0DBAD0E7-402A-4193-8FFF-00D49145155C,Frt02080-03584985,OLIMPICA S.A STO 569,3584985,3584985,,Frt02080,CLOM,2023-01-01 00:00:00.000,,,Operador de Red,54151430,54151430
2,338EB2F0-77DC-490E-BDEF-011D9F40D72D,Frt31834-56807541,CAJA DE COMPENSACION FAMILIAR DE LA GUAJIRA (S...,56807541,56807541,,Frt31834,SOLM,2019-01-01 05:00:00.000,,,Usuario,36054039-MC931,36054039-MC931
3,149090C1-C758-4B7E-BD0D-012AE5FB4503,Frt02396-84291915,AGROBANACARIBE S.A.S (FINCA BUENAVISTA),84291915,84291915,,Frt02396,SOLM,2019-11-22 05:00:00.000,,,Operador de Red,84291915-MC983,84291915-MC983
4,4523169E-E357-4BA0-A114-01318501F2A1,Frt24204-84330465,OLIMPICA S.A. STO 434,84330465,84330465,,Frt24204,METM,2023-01-01 00:00:00.000,,,Operador de Red,84330465,84330465


In [26]:
# Columns of the load-center frame needed to link meters to load centers.
variables_relevantes_load_center = [
    "MainActiveMeterId",
    "LoadCenterId",
    "StartDate",
]

# Select the columns, keep only the day part of StartDate (text before the first
# space) and order the rows by meter id with a fresh 0..n-1 index.
df_load_center_curated = (
    df_load_center[variables_relevantes_load_center]
    .assign(
        StartDate=lambda frame: frame["StartDate"].map(
            lambda stamp: stamp.split(" ")[0]
        )
    )
    .sort_values("MainActiveMeterId", ascending=True)
    .reset_index(drop=True)
)

print(df_load_center_curated.dtypes, df_load_center_curated.shape, sep="\n")
df_load_center_curated.head(5)

MainActiveMeterId    object
LoadCenterId         object
StartDate            object
dtype: object
(743, 3)


Unnamed: 0,MainActiveMeterId,LoadCenterId,StartDate
0,2827835,Frt02827835,2023-08-01
1,2881106,Frt42423,2019-01-01
2,2890695,Frt37699,2019-01-01
3,2890736,Frt38336,2019-01-01
4,2891020,Frt05018,2021-05-12


In [27]:
# Check whether the primary key (MainActiveMeterId) is repeated: any count above 1 is a duplicate.
df_load_center_curated["MainActiveMeterId"].value_counts()

MainActiveMeterId
Frt46423        2
I2H73001        2
I2H71001        2
I2H6Y001        2
I2H6X001        2
               ..
63087902        1
63088158        1
63108190        1
63108477        1
MW2010A28102    1
Name: count, Length: 714, dtype: int64

Dado que la llave principal (`MainActiveMeterId`) se repite, y según la regla de negocio definida por el cliente, se debe conservar únicamente el registro con la fecha de inicio (`StartDate`) más reciente.


In [28]:
# Put the newest StartDate first, then keep the first row found for each
# MainActiveMeterId, i.e. the most recent record per meter; finally rebuild the index.
df_load_center_curated = (
    df_load_center_curated.sort_values("StartDate", ascending=False)
    .drop_duplicates(subset=["MainActiveMeterId"], keep="first")
    .reset_index(drop=True)
)

print(df_load_center_curated.shape, df_load_center_curated.dtypes, sep="\n")
df_load_center_curated.head()

(714, 3)
MainActiveMeterId    object
LoadCenterId         object
StartDate            object
dtype: object


Unnamed: 0,MainActiveMeterId,LoadCenterId,StartDate
0,2827835,Frt02827835,2023-08-01
1,21112244,Frt21112244,2023-08-01
2,63366476,Frt63366476,2023-08-01
3,59006609,Frt59006609,2023-08-01
4,59005701,Frt59005701,2023-08-01


In [29]:
# Confirm the key is no longer repeated after de-duplication (every count should be 1).
df_load_center_curated["MainActiveMeterId"].value_counts()

MainActiveMeterId
02827835    1
84339654    1
84339412    1
84339414    1
84339410    1
           ..
54151620    1
03584999    1
03583231    1
84440242    1
46018369    1
Name: count, Length: 714, dtype: int64

### 3. Datos de Estaciones de Medición


In [30]:
# Load the element (load-center) attributes file; it is semicolon separated and
# column types are left to pandas' inference.
df_element_id = pd.read_csv(
    obtain_content_of_blob(
        connection_string,
        container_name,
        "element_id_aire.csv",
    ),
    sep=";",
)

print(df_element_id.shape, df_element_id.dtypes, sep="\n")
df_element_id.head()

(752, 27)
LoadCenterId              object
LoadCenterName            object
CompanyId                 object
NodeId                     int64
ActivityType              object
ModeledType               object
RZId                      object
UnitId                   float64
Region                    object
OrderAccount             float64
ServiceSupplier          float64
RMU                       object
ConsumptionRange          object
LocationType              object
KindOfPoint               object
TensionLevel              object
MinimumValue             float64
RPU                       object
City                      object
State                     object
Country                   object
ZipCode                  float64
Address                   object
LocationLevel1            object
LocationLevel2           float64
IndustrialSectorType      object
ExceedNominationValue      int64
dtype: object


Unnamed: 0,LoadCenterId,LoadCenterName,CompanyId,NodeId,ActivityType,ModeledType,RZId,UnitId,Region,OrderAccount,ServiceSupplier,RMU,ConsumptionRange,LocationType,KindOfPoint,TensionLevel,MinimumValue,RPU,City,State,Country,ZipCode,Address,LocationLevel1,LocationLevel2,IndustrialSectorType,ExceedNominationValue
0,Frt00001,CORPORACION DE ACERO CORPACERO S.A. - I,5913547,1,CARG,DIREMODE,SIN,,RGNorte,,,62083378,,,,N3,0.0,6089029,BARRANQUILLA,Atlántico,COLOMBIA,,cra 34,,,SC,0
1,Frt00002,MONOMEROS S.A,6514257,1,CARG,DIREMODE,SIN,,RGNorte,,,1,,,,N3,0.0,6539021,BARRANQUILLA,Atlántico,COLOMBIA,,Cra 57 No 99 A-65 Torres del Atlántico,,,SC,0
2,Frt00003,SMURFIT CARTON DE COLOMBIA,6516818,1,CARG,DIREMODE,SIN,,RGNorte,,,36097935,,,,N3,0.0,6547985,BARRANQUILLA,Atlántico,COLOMBIA,,Cra 57 No 99 A-65 Torres del Atlántico,,,SC,0
3,Frt00004,UNIBOL S.A.,6544969,1,CARG,DIREMODE,SIN,,RGNorte,,,0,,,,N3,0.0,6628912,SOLEDAD,Atlántico,COLOMBIA,,Cra 57 No 99 A-65 Torres del Atlántico,,,SC,0
4,Frt00005,TRANSELCA S.A,6809086,1,CARG,DIREMODE,SIN,,RGNorte,,,46000209,,,,N3,0.0,7200636,BARRANQUILLA,Atlántico,COLOMBIA,,Cra 57 No 99 A-65 Torres del Atlántico,,,SC,0


In [31]:
# Key plus the location attributes kept from the element frame.
variables_relevantes_element = [
    "LoadCenterId",
    "City",
    "State",
    "Country",
    "Address",
]

# Normalise the casing of the place names: str.capitalize upper-cases the first
# character of each value and lower-cases the rest.
df_element_id_curated = df_element_id[variables_relevantes_element].assign(
    **{
        place: df_element_id[place].str.capitalize()
        for place in ("City", "State", "Country")
    }
)

print(df_element_id_curated.shape, df_element_id_curated.dtypes, sep="\n")
df_element_id_curated.head()

(752, 5)
LoadCenterId    object
City            object
State           object
Country         object
Address         object
dtype: object


Unnamed: 0,LoadCenterId,City,State,Country,Address
0,Frt00001,Barranquilla,Atlántico,Colombia,cra 34
1,Frt00002,Barranquilla,Atlántico,Colombia,Cra 57 No 99 A-65 Torres del Atlántico
2,Frt00003,Barranquilla,Atlántico,Colombia,Cra 57 No 99 A-65 Torres del Atlántico
3,Frt00004,Soledad,Atlántico,Colombia,Cra 57 No 99 A-65 Torres del Atlántico
4,Frt00005,Barranquilla,Atlántico,Colombia,Cra 57 No 99 A-65 Torres del Atlántico


In [32]:
# Check that no LoadCenterId key is repeated (every count should be 1).
df_element_id_curated["LoadCenterId"].value_counts()

LoadCenterId
Frt00001           1
Frt28792           1
Frt28304           1
Frt28306           1
Frt28310           1
                  ..
Frt06349           1
Frt06351           1
Frt06359           1
Frt06378           1
FrtMW2010A28102    1
Name: count, Length: 752, dtype: int64

## Uniendo las 3 entidades

El objetivo es agregar a los datos de consumo la ubicación de la estación de medición. Para ello se sigue el siguiente diagrama:

![Diagram Image](diagramas/ModeloDeDatosConsumo.png)


In [33]:
# Left join on LoadCenterId: every meter row is kept and gains the location
# columns of its load center (missing where no load center matches).
df_ubication = pd.merge(
    df_load_center_curated,
    df_element_id_curated,
    how="left",
    on="LoadCenterId",
)
print(df_ubication.shape)
df_ubication.head()

(714, 7)


Unnamed: 0,MainActiveMeterId,LoadCenterId,StartDate,City,State,Country,Address
0,2827835,Frt02827835,2023-08-01,Medellín,Antioquia,Colombia,2827835
1,21112244,Frt21112244,2023-08-01,Medellín,Antioquia,Colombia,21112244
2,63366476,Frt63366476,2023-08-01,Medellín,Antioquia,Colombia,63366476
3,59006609,Frt59006609,2023-08-01,Medellín,Antioquia,Colombia,59006609
4,59005701,Frt59005701,2023-08-01,Medellín,Antioquia,Colombia,59005701


In [34]:
# Attach the location of each meter to its consumption rows. The left join keeps
# every consumption row; rows whose ElementId has no matching meter get missing
# location values.
df_consumo_refined = df_consumo_curated.merge(
    df_ubication,
    left_on="ElementId",
    right_on="MainActiveMeterId",
    how="left",
    # Enforce that MainActiveMeterId is unique on the right side: a duplicated
    # meter would silently multiply consumption rows. pandas raises MergeError
    # instead of returning an inflated frame.
    validate="many_to_one",
)
print(df_consumo_refined.shape)
df_consumo_refined.head()

(100000, 14)


Unnamed: 0,Date,Period,ConceptId,ElementId,Value,Hour,Datetime,MainActiveMeterId,LoadCenterId,StartDate,City,State,Country,Address
0,2024-07-01,1,GENERACION,84340133,0.0,00:15,2024-07-01 00:15:00,84340133,Frt07221,2023-01-01,Barranquilla,Atlántico,Colombia,CARRERA 7H # 35A-60
1,2024-07-01,1,ENEREACAP,84340133,3.0,00:15,2024-07-01 00:15:00,84340133,Frt07221,2023-01-01,Barranquilla,Atlántico,Colombia,CARRERA 7H # 35A-60
2,2024-07-01,1,ENERREAC,84340133,0.0,00:15,2024-07-01 00:15:00,84340133,Frt07221,2023-01-01,Barranquilla,Atlántico,Colombia,CARRERA 7H # 35A-60
3,2024-07-01,1,ENERACTI,84340133,16.7,00:15,2024-07-01 00:15:00,84340133,Frt07221,2023-01-01,Barranquilla,Atlántico,Colombia,CARRERA 7H # 35A-60
4,2023-09-07,26,ENERACTI,2827835,50.73,06:30,2023-09-07 06:30:00,2827835,Frt02827835,2023-08-01,Medellín,Antioquia,Colombia,02827835


### Seleccionando Columnas y Formateando Tipo de Datos


In [35]:
# Keep the consumption fields and the location attributes; the join helper
# columns (MainActiveMeterId, LoadCenterId, StartDate) are left out.
df_consumo_refined = df_consumo_refined.loc[
    :,
    [
        "Date",
        "Period",
        "ConceptId",
        "ElementId",
        "Value",
        "Hour",
        "Datetime",
        "City",
        "State",
        "Country",
        "Address",
    ],
]
print(df_consumo_refined.dtypes, df_consumo_refined.shape, sep="\n")
df_consumo_refined.head()

Date                 object
Period                int64
ConceptId            object
ElementId            object
Value               float64
Hour                 object
Datetime     datetime64[ns]
City                 object
State                object
Country              object
Address              object
dtype: object
(100000, 11)


Unnamed: 0,Date,Period,ConceptId,ElementId,Value,Hour,Datetime,City,State,Country,Address
0,2024-07-01,1,GENERACION,84340133,0.0,00:15,2024-07-01 00:15:00,Barranquilla,Atlántico,Colombia,CARRERA 7H # 35A-60
1,2024-07-01,1,ENEREACAP,84340133,3.0,00:15,2024-07-01 00:15:00,Barranquilla,Atlántico,Colombia,CARRERA 7H # 35A-60
2,2024-07-01,1,ENERREAC,84340133,0.0,00:15,2024-07-01 00:15:00,Barranquilla,Atlántico,Colombia,CARRERA 7H # 35A-60
3,2024-07-01,1,ENERACTI,84340133,16.7,00:15,2024-07-01 00:15:00,Barranquilla,Atlántico,Colombia,CARRERA 7H # 35A-60
4,2023-09-07,26,ENERACTI,2827835,50.73,06:30,2023-09-07 06:30:00,Medellín,Antioquia,Colombia,02827835


In [36]:
# Earliest day in the sample; Date holds "YYYY-MM-DD" strings, so the smallest string is the earliest date.
print(f"Fecha mínima: {df_consumo_refined['Date'].min()}")

Fecha mínima: 2023-06-22


In [37]:
# Latest day in the sample; Date holds "YYYY-MM-DD" strings, so the largest string is the latest date.
print(f"Fecha máxima: {df_consumo_refined['Date'].max()}")

Fecha máxima: 2024-07-01
