# Modelos por empresa

## Objetivo

Para cada id_empresa en tu dataset:

- Filtrar los datos
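- Dividir cronológicamente (ordenando por fecha) en entrenamiento/test: 80/20 en el modelo base y 70/30 en la sección de optimización de hiperparámetros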
- Entrenar un TabularPredictor
- Evaluar accuracy, F1, precision, recall
- Guardar resultados por empresa

### Paso 1: Código completo para correr todo el pipeline

In [15]:
import pandas as pd

# Replace the file name with the correct path if the CSV is not in the same directory
analisis_df = pd.read_csv("data_set_integrado_modelo_final_futuro_binario.csv")

# Inspect the columns, non-null counts and dtypes.
# DataFrame.info() writes its report to stdout itself and returns None, so it
# must not be wrapped in print() (that would append a stray "None" line).
analisis_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2909 entries, 0 to 2908
Data columns (total 76 columns):
 #   Column                                     Non-Null Count  Dtype  
---  ------                                     --------------  -----  
 0   id_cotizacion                              2909 non-null   int64  
 1   id_empresa                                 2909 non-null   int64  
 2   fecha                                      2909 non-null   object 
 3   precio_apertura                            2909 non-null   float64
 4   precio_cierre                              2909 non-null   float64
 5   precio_max                                 2909 non-null   float64
 6   precio_min                                 2909 non-null   float64
 7   volumen_operado                            2909 non-null   float64
 8   variacion_porcentaje                       2862 non-null   float64
 9   nombre                                     2909 non-null   object 
 10  label                   

In [9]:
from autogluon.tabular import TabularPredictor
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import pandas as pd

# Minimum number of rows a company needs to be modelled, and the fraction of its
# chronologically ordered rows used for training (the rest is the test window).
BASELINE_MIN_ROWS = 50
BASELINE_TRAIN_FRACTION = 0.8

# Make sure fecha is a datetime so that sorting by it is chronological
analisis_df['fecha'] = pd.to_datetime(analisis_df['fecha'])

# Collects one metrics dict per evaluated company
resultados_empresas = []

# Distinct company ids
empresas = analisis_df['id_empresa'].unique()

# One independent model per company
for empresa_id in empresas:
    df_empresa = analisis_df[analisis_df['id_empresa'] == empresa_id].copy()
    df_empresa = df_empresa.sort_values('fecha').reset_index(drop=True)

    if df_empresa['label_t_plus_1'].nunique() < 2 or len(df_empresa) < BASELINE_MIN_ROWS:
        # Skip companies without enough data or without both classes
        continue

    # Chronological train/test split (80/20): oldest rows train, newest rows test
    split_idx = int(len(df_empresa) * BASELINE_TRAIN_FRACTION)
    train_df = df_empresa.iloc[:split_idx]
    test_df = df_empresa.iloc[split_idx:]

    if train_df['label_t_plus_1'].nunique() < 2:
        # The whole-frame check above does not guarantee that the training
        # window itself contains both classes. A binary classifier cannot be
        # fitted meaningfully on a single class, and a failure here would abort
        # the loop for every remaining company, so skip this one instead.
        continue

    # Everything except the two label columns and the name/date/id columns
    # listed here is used as a feature.
    excluded_cols = ['label', 'label_t_plus_1', 'nombre', 'fecha', 'fecha_publicacion', 'id_cotizacion']
    feature_cols = [col for col in train_df.columns if col not in excluded_cols]

    # Train the model on the features plus the target column
    predictor = TabularPredictor(label='label_t_plus_1', problem_type='binary', verbosity=0).fit(
        train_data=train_df[feature_cols + ['label_t_plus_1']],
        verbosity=0
    )

    # Evaluate on the held-out (most recent) rows
    y_true = test_df['label_t_plus_1']
    y_pred = predictor.predict(test_df[feature_cols])

    # Precision/recall/F1 are reported for the 'SUBE' class; zero_division=0
    # yields 0 instead of a warning when that class is never predicted/present.
    resultados_empresas.append({
        'id_empresa': empresa_id,
        'empresa': df_empresa['nombre'].iloc[0],
        'n_muestras': len(df_empresa),
        'accuracy': accuracy_score(y_true, y_pred),
        'precision_sube': precision_score(y_true, y_pred, pos_label='SUBE', zero_division=0),
        'recall_sube': recall_score(y_true, y_pred, pos_label='SUBE', zero_division=0),
        'f1_sube': f1_score(y_true, y_pred, pos_label='SUBE', zero_division=0),
    })


		ColumnTransformer.__init__() got an unexpected keyword argument 'force_int_remainder_cols'
		ColumnTransformer.__init__() got an unexpected keyword argument 'force_int_remainder_cols'
		ColumnTransformer.__init__() got an unexpected keyword argument 'force_int_remainder_cols'
		ColumnTransformer.__init__() got an unexpected keyword argument 'force_int_remainder_cols'
		ColumnTransformer.__init__() got an unexpected keyword argument 'force_int_remainder_cols'
		ColumnTransformer.__init__() got an unexpected keyword argument 'force_int_remainder_cols'
		ColumnTransformer.__init__() got an unexpected keyword argument 'force_int_remainder_cols'
		ColumnTransformer.__init__() got an unexpected keyword argument 'force_int_remainder_cols'
		ColumnTransformer.__init__() got an unexpected keyword argument 'force_int_remainder_cols'


### Paso 2 (final): Ver los resultados ordenados

In [10]:
# Build the per-company results table, best F1 for the SUBE class first
df_resultados = (
    pd.DataFrame(resultados_empresas)
    .sort_values(by='f1_sube', ascending=False)
)

# Show the ten best-scoring companies
print("📈 Empresas más predecibles (según F1 para SUBE):")
display(df_resultados.head(10))


📈 Empresas más predecibles (según F1 para SUBE):


Unnamed: 0,id_empresa,empresa,n_muestras,accuracy,precision_sube,recall_sube,f1_sube
8,18,XOMD,324,0.569231,0.435897,0.73913,0.548387
4,14,MSFTD,324,0.384615,0.37931,0.846154,0.52381
3,13,MELID,324,0.492308,0.457143,0.533333,0.492308
7,17,VISTD,324,0.461538,0.372093,0.666667,0.477612
5,15,NVDAD,324,0.492308,0.405405,0.576923,0.47619
6,16,TEND,320,0.53125,0.36,0.391304,0.375
1,11,DESPD,321,0.569231,0.4,0.333333,0.363636
0,10,AAPLD,324,0.446154,0.32,0.296296,0.307692
2,12,KOD,324,0.476923,0.333333,0.172414,0.227273


## Uso de optimización de hiperparámetros

In [13]:
from autogluon.tabular import TabularPredictor
from sklearn.metrics import f1_score, accuracy_score, precision_score, recall_score
import pandas as pd

# Settings for the hyperparameter-tuning run.
# NOTE(review): the baseline loop earlier in this notebook splits 80/20 with a
# 50-row minimum; this cell uses 70/30 and 100 rows, so the two result tables
# are scored on different test windows — confirm this is intended.
HPO_MIN_ROWS = 100
HPO_TRAIN_FRACTION = 0.7
HPO_TIME_LIMIT_S = 1800  # 30 minutes per company (adjustable)

empresas = analisis_df['id_empresa'].unique()
resultados_empresas = []

for empresa_id in empresas:
    df_empresa = analisis_df[analisis_df['id_empresa'] == empresa_id].copy()
    # Parse the dates on the local copy so the chronological sort below does not
    # depend on another cell having already converted the shared frame's
    # `fecha` column (re-running the load cell resets it to strings).
    df_empresa['fecha'] = pd.to_datetime(df_empresa['fecha'])
    df_empresa = df_empresa.sort_values('fecha').reset_index(drop=True)

    # Skip companies without both classes or with too few rows
    if df_empresa['label_t_plus_1'].nunique() < 2 or len(df_empresa) < HPO_MIN_ROWS:
        continue

    # Chronological train/test split: oldest rows train, newest rows test
    split_idx = int(len(df_empresa) * HPO_TRAIN_FRACTION)
    train_df = df_empresa.iloc[:split_idx]
    test_df = df_empresa.iloc[split_idx:]

    # Everything except the two label columns and the name/date/id columns
    # listed here is used as a feature.
    excluded_cols = ['label', 'label_t_plus_1', 'nombre', 'fecha', 'fecha_publicacion', 'id_cotizacion']
    feature_cols = [col for col in train_df.columns if col not in excluded_cols]

    try:
        # AutoML with hyperparameter tuning
        predictor = TabularPredictor(label='label_t_plus_1', problem_type='binary').fit(
            train_data=train_df[feature_cols + ['label_t_plus_1']],
            hyperparameter_tune_kwargs='auto',
            time_limit=HPO_TIME_LIMIT_S,
            verbosity=0
        )

        # Evaluate on the held-out (most recent) rows
        y_true = test_df['label_t_plus_1']
        y_pred = predictor.predict(test_df[feature_cols])

        # Precision/recall/F1 are reported for the 'SUBE' class; zero_division=0
        # yields 0 instead of a warning when that class is never predicted/present.
        resultados_empresas.append({
            'id_empresa': empresa_id,
            'empresa': df_empresa['nombre'].iloc[0],
            'n_muestras': len(df_empresa),
            'accuracy': accuracy_score(y_true, y_pred),
            'precision_sube': precision_score(y_true, y_pred, pos_label='SUBE', zero_division=0),
            'recall_sube': recall_score(y_true, y_pred, pos_label='SUBE', zero_division=0),
            'f1_sube': f1_score(y_true, y_pred, pos_label='SUBE', zero_division=0)
        })

    except Exception as e:
        # Best effort: a failure for one company is reported and the loop moves
        # on, so that company simply has no row in the results.
        print(f"❌ Error con empresa {empresa_id}: {e}")


## Resultados

In [14]:
# Build the results table for the hyperparameter-tuning run.
df_resultados = pd.DataFrame(resultados_empresas)

if df_resultados.empty:
    # Every company may have been skipped or have failed in the training loop;
    # an empty frame has no 'f1_sube' column, so sorting would raise KeyError.
    print("⚠️ No hay resultados para mostrar (ninguna empresa fue evaluada).")
else:
    # Best F1 for the SUBE class first; show the top ten
    df_resultados = df_resultados.sort_values(by='f1_sube', ascending=False)
    display(df_resultados.head(10))

Unnamed: 0,id_empresa,empresa,n_muestras,accuracy,precision_sube,recall_sube,f1_sube
8,18,XOMD,324,0.581633,0.476923,0.815789,0.601942
0,10,AAPLD,324,0.428571,0.419355,0.95122,0.58209
1,11,DESPD,321,0.556701,0.465517,0.692308,0.556701
2,12,KOD,324,0.520408,0.45,0.658537,0.534653
3,13,MELID,324,0.479592,0.446154,0.659091,0.53211
7,17,VISTD,324,0.459184,0.397059,0.692308,0.504673
6,16,TEND,320,0.4375,0.333333,0.459459,0.386364
4,14,MSFTD,324,0.663265,0.769231,0.25,0.377358
5,15,NVDAD,324,0.571429,0.0,0.0,0.0
