# Modelo y preprocesamiento seleccionados

In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os

from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import IsolationForest

In [None]:
# The parent of the current working directory is taken as the project root;
# the input CSV is read from <project root>/data/datos_unidos.csv.
proyect_dir = os.path.dirname(os.path.abspath(os.getcwd()))
data_folder = "data"
file_name = "datos_unidos.csv"
read_dir = os.path.join(proyect_dir, data_folder, file_name)
print(read_dir)

# Load the readings and convert the "Fecha" column to datetimes.
df_total = pd.read_csv(read_dir).assign(
    Fecha=lambda frame: pd.to_datetime(frame["Fecha"])
)

c:\Users\Paul\Documents\Paul\Cursos\Uniandes\maestria_MIAD\202501\proyecto_final\data\datos_unidos.csv


In [None]:
# Build the hourly calendar that spans from the earliest to the latest
# timestamp found in the data.
fechas = df_total["Fecha"]
start_date = fechas.min()
end_date = fechas.max()

datetime_list = pd.date_range(start=start_date, end=end_date, freq='h').tolist()
df_datetime_list = pd.DataFrame({"Fecha": datetime_list})

In [None]:
# Put every client on the full hourly calendar: a left merge keeps one row
# per calendar hour and leaves NaN where the client has no reading.
arr_total_fecha = []
for clt in df_total["cliente"].unique():
    rows_clt = df_total.loc[df_total["cliente"] == clt]
    on_calendar = df_datetime_list.merge(rows_clt, on="Fecha", how="left")
    # Hours without a reading come back with a missing client id, so the
    # column is overwritten with the current client for every row.
    arr_total_fecha.append(on_calendar.assign(cliente=clt))

df_total_fecha = pd.concat(arr_total_fecha)

In [None]:
# --- Feature engineering -----------------------------------------------------
# Every NaN in the calendar-aligned frame is replaced by 0 before modelling.
df_total_fecha_isloation_temp_diff = df_total_fecha.fillna(0)

# Calendar attributes derived from the timestamp.
fecha_dt = df_total_fecha_isloation_temp_diff['Fecha'].dt
df_total_fecha_isloation_temp_diff['hora'] = fecha_dt.hour
df_total_fecha_isloation_temp_diff['dia_semana'] = fecha_dt.dayofweek  # 0 = Monday, 6 = Sunday
df_total_fecha_isloation_temp_diff['mes'] = fecha_dt.month
df_total_fecha_isloation_temp_diff['dia_mes'] = fecha_dt.day

# Order rows by client and time so that diff() compares consecutive hours.
df_total_fecha_isloation_temp_diff = df_total_fecha_isloation_temp_diff.sort_values(['cliente', 'Fecha'])

# First differences within each client (the first row of a client is NaN).
# NOTE(review): the delta_* columns and 'dia_mes' are not listed in
# `features` below, so the model never sees them - confirm this is intended.
columnas_delta = {
    'Volumen': 'delta_volumen',
    'Presion': 'delta_presion',
    'Temperatura': 'delta_temperatura',
}
for col_origen, col_delta in columnas_delta.items():
    df_total_fecha_isloation_temp_diff[col_delta] = (
        df_total_fecha_isloation_temp_diff.groupby('cliente')[col_origen].diff()
    )

# --- One Isolation Forest per client -----------------------------------------
# Model inputs; identical for every client.
features = ['Presion', 'Temperatura', 'Volumen', 'hora', 'dia_semana', 'mes']

# Collects one scored frame per client.
iforest_temp_results = []

for cliente in df_total_fecha_isloation_temp_diff['cliente'].unique():
    es_cliente = df_total_fecha_isloation_temp_diff['cliente'] == cliente
    df_cli = df_total_fecha_isloation_temp_diff.loc[es_cliente].copy()
    X = df_cli[features]

    # Standardise the features using this client's own statistics.
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)

    # Fit the forest and turn its +1 / -1 predictions into text labels.
    model = IsolationForest(n_estimators=100, contamination='auto', random_state=42)
    etiquetas = pd.Series(model.fit_predict(X_scaled), index=df_cli.index)
    df_cli['anomaly_label'] = etiquetas.map({1: 'normal', -1: 'anomalía'})
    df_cli['anomaly_score'] = model.decision_function(X_scaled)

    # Severity bands come from this client's own score distribution:
    # below the 1st percentile -> 'alta', below the 5th -> 'media',
    # everything else -> 'leve'. The band is assigned to every row,
    # including rows labelled 'normal'.
    puntajes = df_cli['anomaly_score']
    q01, q05 = puntajes.quantile([0.01, 0.05])
    df_cli['nivel_anomalia_iforest'] = np.select(
        [puntajes < q01, puntajes < q05],
        ['alta', 'media'],
        default='leve',
    )

    iforest_temp_results.append(df_cli)

# --- Combine and summarise ---------------------------------------------------
df_iforest_temporal_lags = pd.concat(iforest_temp_results, ignore_index=True)

# Row counts per client, label and severity band.
resumen_iforest_temporal = (
    df_iforest_temporal_lags
    .groupby(['cliente', 'anomaly_label', 'nivel_anomalia_iforest'])[["Fecha"]]
    .count()
)

resumen_iforest_temporal

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Fecha
cliente,anomaly_label,nivel_anomalia_iforest,Unnamed: 3_level_1
CLIENTE1,anomalía,alta,436
CLIENTE1,anomalía,leve,12674
CLIENTE1,anomalía,media,1740
CLIENTE1,normal,leve,28662
CLIENTE10,anomalía,alta,434
...,...,...,...
CLIENTE8,normal,leve,27571
CLIENTE9,anomalía,alta,436
CLIENTE9,anomalía,leve,3859
CLIENTE9,anomalía,media,1740
