In [8]:
import os
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression, ElasticNet
from sklearn.preprocessing import PolynomialFeatures
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.svm import SVR
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import VotingRegressor
import joblib

# Output folders: one for the serialized models, one for the Excel reports.
modelos_directory = 'modelos_voting'
resultados_directory = 'Resultados'

# exist_ok=True turns the call into a no-op when the folder is already there.
for _output_dir in (modelos_directory, resultados_directory):
    os.makedirs(_output_dir, exist_ok=True)

# Load the raw spreadsheets: subprocess costs plus the four indicator families.
costos_df = pd.read_excel('Costos_Subprocesos.xlsx')
indicadores_minas_df = pd.read_excel('Indicadores_Minas.xlsx')
indicadores_carguio_df = pd.read_excel('Indicadores_Carguio.xlsx')
indicadores_perforacion_df = pd.read_excel('Indicadores_Perforación.xlsx')
indicadores_transporte_df = pd.read_excel('Indicadores_Transporte.xlsx')

# Parse the date columns in place so that every join below compares datetimes.
# In the cost sheet the date lives in the 'Subproceso_Costo' column.
costos_df['Subproceso_Costo'] = pd.to_datetime(costos_df['Subproceso_Costo'])
for df in (
    indicadores_minas_df,
    indicadores_carguio_df,
    indicadores_perforacion_df,
    indicadores_transporte_df,
):
    df['Fecha'] = pd.to_datetime(df['Fecha'])

# Fold the four indicator frames into one, inner-joining on 'Fecha'.
# The suffixes only take effect when a column name collides with one that is
# already present in the accumulated frame.
indicadores_df = indicadores_minas_df.copy()
for _extra_df, _suffixes in (
    (indicadores_carguio_df, ('_minas', '_carguio')),
    (indicadores_perforacion_df, ('', '_perforacion')),
    (indicadores_transporte_df, ('', '_transporte')),
):
    indicadores_df = indicadores_df.merge(
        _extra_df, on='Fecha', how='inner', suffixes=_suffixes
    )

# Rows dated before the cutoff are used for training; rows on or after it are
# the period to forecast.
_cutoff = '2023-01-01'
indicadores_train = indicadores_df.loc[indicadores_df['Fecha'] < _cutoff]
indicadores_test = indicadores_df.loc[indicadores_df['Fecha'] >= _cutoff]

# Attach the costs to the training indicators by date. 'Fecha' duplicates
# 'Subproceso_Costo' after the join, so it is dropped.
df_unificado = costos_df.merge(
    indicadores_train, left_on='Subproceso_Costo', right_on='Fecha', how='inner'
).drop(columns=['Fecha'])

# Spearman correlation between every cost subprocess and every indicator.
# The first column of each source frame is skipped (assumed to be the date
# column, as in the original layout).
subprocesos_cols = costos_df.columns[1:]
indicadores_cols = indicadores_df.columns[1:]
correlation_matrix = df_unificado.corr(method='spearman')
correlation_filtered = correlation_matrix.loc[subprocesos_cols, indicadores_cols]


def _three_largest(row):
    """Return the three largest entries of one subprocess row."""
    return row.nlargest(3)


# One row per subprocess; only its three selected indicators hold a value,
# every other indicator column is NaN for that row.
# NOTE(review): the ranking uses the signed coefficient, so strongly negative
# correlations are never selected — confirm this is intended.
top_correlations = correlation_filtered.apply(_three_largest, axis=1)

# Base estimators combined by the voting ensemble. The tree-based models share
# the same size and seed, so those settings are written once.
_tree_kwargs = {'n_estimators': 100, 'random_state': 42}
modelos_base = [
    ('lr', LinearRegression()),
    ('rf', RandomForestRegressor(**_tree_kwargs)),
    ('gb', GradientBoostingRegressor(**_tree_kwargs)),
    ('xgb', XGBRegressor(**_tree_kwargs)),
    ('lgbm', LGBMRegressor(**_tree_kwargs)),
    ('mlp', MLPRegressor(hidden_layer_sizes=(64, 32), max_iter=500, random_state=42)),
    ('svm', SVR()),
]

# Single ensemble instance; it is re-fitted for each subprocess further down.
voting_regressor = VotingRegressor(estimators=modelos_base)

# Per-subprocess result containers, keyed by subprocess column name.
resultados_voting, indicadores_utilizados, predicciones_por_subproceso = {}, {}, {}

# Preprocessing steps: NaNs are replaced by the column mean, then the features
# are standardized.
imputer = SimpleImputer(strategy='mean')
scaler = StandardScaler()

# Train one voting ensemble per cost subprocess and forecast the held-out rows.
for subproceso in subprocesos_cols:
    # Indicators selected for this subprocess: the non-NaN cells of its row in
    # the top-correlation table.
    indicadores_seleccionados = top_correlations.loc[subproceso].dropna().index.tolist()
    if not indicadores_seleccionados:
        continue

    # Record which indicators feed this subprocess.
    indicadores_utilizados[subproceso] = indicadores_seleccionados

    # Rows without a cost value cannot be used for fitting (the estimators
    # reject NaN targets), so they are removed before anything else.
    y_train = df_unificado[subproceso]
    filas_validas = y_train.notna()
    y_train = y_train[filas_validas]

    if y_train.empty:
        print(f"Advertencia: El subproceso {subproceso} no tiene valores válidos en el conjunto de entrenamiento.")
        continue  # Nothing to learn from

    # A constant target carries no signal; check it before spending time on
    # imputation and scaling.
    if y_train.nunique() == 1:
        print(f"Advertencia: El subproceso {subproceso} tiene un objetivo constante en el conjunto de entrenamiento.")
        continue  # Skip this subprocess

    X_train = df_unificado.loc[filas_validas, indicadores_seleccionados]
    X_test = indicadores_test[indicadores_seleccionados]

    # Fill missing indicator values; the statistics come from the training rows only.
    X_train = imputer.fit_transform(X_train)
    X_test = imputer.transform(X_test)

    # Standardize with the training statistics.
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    # Fit the ensemble on the preprocessed features.
    voting_regressor.fit(X_train_scaled, y_train)

    # Persist the fitted model for later use.
    joblib.dump(voting_regressor, os.path.join(modelos_directory, f'{subproceso}_voting_model.pkl'))

    # The model was trained on imputed + scaled inputs, so the fitted
    # preprocessing steps and the feature order are stored next to it;
    # without them the saved model cannot be applied to raw indicators.
    joblib.dump(
        {
            'indicadores': indicadores_seleccionados,
            'imputer': imputer,
            'scaler': scaler,
        },
        os.path.join(modelos_directory, f'{subproceso}_preprocessing.pkl'),
    )

    # Forecast the held-out period (one value per row of indicadores_test).
    y_pred_2023 = voting_regressor.predict(X_test_scaled)

    # Keep the forecast for the results workbook.
    predicciones_por_subproceso[subproceso] = y_pred_2023

# Build the predictions table: one column per subprocess, one row per forecast
# row. The labels are taken from the dates of the rows that were actually
# predicted, so the table stays aligned with the data even when the forecast
# period is not exactly twelve months in calendar order.
etiquetas_meses = indicadores_test['Fecha'].dt.strftime("%B-%Y").tolist()
predicciones_df = pd.DataFrame(predicciones_por_subproceso, index=etiquetas_meses)

# Write the forecasts to an Excel workbook.
costos_output_path = os.path.join(resultados_directory, 'costos_output_2023.xlsx')
with pd.ExcelWriter(costos_output_path) as writer:
    predicciones_df.to_excel(writer, sheet_name='Predicciones 2023')

# Build the top-3 report: one row per subprocess holding the names of its
# selected indicators (Top1..Top3, strongest first) and their Spearman
# coefficients (Rho1..Rho3).
# top_correlations is a wide table (subprocess x indicator) whose column labels
# are indicator names, so the ranking has to be extracted row by row; asking
# for columns named 'Top1'..'Top3' directly yields an empty frame.
_columnas_top = ['Top1', 'Top2', 'Top3']
_columnas_rho = ['Rho1', 'Rho2', 'Rho3']
_registros = []
for _, _fila in top_correlations.iterrows():
    _mejores = _fila.dropna().sort_values(ascending=False, kind='stable')
    # Pad so subprocesses with fewer than three usable indicators still fill
    # every column.
    _nombres = (list(_mejores.index) + [None] * 3)[:3]
    _valores = (list(_mejores.to_numpy()) + [np.nan] * 3)[:3]
    _registros.append(dict(zip(_columnas_top + _columnas_rho, _nombres + _valores)))

correlaciones_df = pd.DataFrame(
    _registros,
    index=top_correlations.index,
    columns=_columnas_top + _columnas_rho,
)

# Write the report to an Excel workbook.
correlaciones_output_path = os.path.join(resultados_directory, 'correlaciones_subprocesos.xlsx')
with pd.ExcelWriter(correlaciones_output_path) as writer:
    correlaciones_df.to_excel(writer, sheet_name='Top Correlaciones')

print(f"Predicciones guardadas en: {costos_output_path}")
print(f"Correlaciones guardadas en: {correlaciones_output_path}")


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000030 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 32
[LightGBM] [Info] Number of data points in the train set: 83, number of used features: 3
[LightGBM] [Info] Start training from score 1998231.633283




[LightGBM] [Info] Total Bins 0
[LightGBM] [Info] Number of data points in the train set: 83, number of used features: 0
[LightGBM] [Info] Start training from score 33665.406360




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000016 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 70
[LightGBM] [Info] Number of data points in the train set: 83, number of used features: 3
[LightGBM] [Info] Start training from score 87764.844716




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000017 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 86
[LightGBM] [Info] Number of data points in the train set: 83, number of used features: 3
[LightGBM] [Info] Start training from score 580903.853143




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000018 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 89
[LightGBM] [Info] Number of data points in the train set: 83, number of used features: 3
[LightGBM] [Info] Start training from score 188781.297193




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000019 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 44
[LightGBM] [Info] Number of data points in the train set: 83, number of used features: 3
[LightGBM] [Info] Start training from score 53907.620200




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000016 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 89
[LightGBM] [Info] Number of data points in the train set: 83, number of used features: 3
[LightGBM] [Info] Start training from score 12928.879971




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000016 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 30
[LightGBM] [Info] Number of data points in the train set: 83, number of used features: 1
[LightGBM] [Info] Start training from score 14025.993336




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000017 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 68
[LightGBM] [Info] Number of data points in the train set: 83, number of used features: 3
[LightGBM] [Info] Start training from score -1136742.861446




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000017 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 68
[LightGBM] [Info] Number of data points in the train set: 83, number of used features: 3
[LightGBM] [Info] Start training from score 23484.313025




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000017 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 51
[LightGBM] [Info] Number of data points in the train set: 83, number of used features: 3
[LightGBM] [Info] Start training from score 3076089.258660




[LightGBM] [Info] Total Bins 0
[LightGBM] [Info] Number of data points in the train set: 83, number of used features: 0
[LightGBM] [Info] Start training from score 102579.310359




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000018 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 66
[LightGBM] [Info] Number of data points in the train set: 83, number of used features: 3
[LightGBM] [Info] Start training from score 368250.853110




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000015 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 88
[LightGBM] [Info] Number of data points in the train set: 83, number of used features: 3
[LightGBM] [Info] Start training from score 704192.339926




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000017 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 41
[LightGBM] [Info] Number of data points in the train set: 83, number of used features: 3
[LightGBM] [Info] Start training from score 180458.878112




[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000057 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 57
[LightGBM] [Info] Number of data points in the train set: 83, number of used features: 3
[LightGBM] [Info] Start training from score -440581.854198




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000017 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 64
[LightGBM] [Info] Number of data points in the train set: 83, number of used features: 3
[LightGBM] [Info] Start training from score -29595.461043




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000016 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 72
[LightGBM] [Info] Number of data points in the train set: 83, number of used features: 3
[LightGBM] [Info] Start training from score 1169711.581325




[LightGBM] [Info] Total Bins 0
[LightGBM] [Info] Number of data points in the train set: 83, number of used features: 0




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000018 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 54
[LightGBM] [Info] Number of data points in the train set: 83, number of used features: 3
[LightGBM] [Info] Start training from score -450438.945595




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000024 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 38
[LightGBM] [Info] Number of data points in the train set: 83, number of used features: 2
[LightGBM] [Info] Start training from score 20721.672863




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000018 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 42
[LightGBM] [Info] Number of data points in the train set: 83, number of used features: 3
[LightGBM] [Info] Start training from score 1645774.379142




[LightGBM] [Info] Total Bins 0
[LightGBM] [Info] Number of data points in the train set: 83, number of used features: 0
[LightGBM] [Info] Start training from score 67201.799746




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000016 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 77
[LightGBM] [Info] Number of data points in the train set: 83, number of used features: 3
[LightGBM] [Info] Start training from score 102790.597821




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000017 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 85
[LightGBM] [Info] Number of data points in the train set: 83, number of used features: 3
[LightGBM] [Info] Start training from score 166036.517035




[LightGBM] [Info] Total Bins 0
[LightGBM] [Info] Number of data points in the train set: 83, number of used features: 0
[LightGBM] [Info] Start training from score 6.172276




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000018 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 40
[LightGBM] [Info] Number of data points in the train set: 83, number of used features: 3
[LightGBM] [Info] Start training from score 49971.224239




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000017 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 37
[LightGBM] [Info] Number of data points in the train set: 83, number of used features: 3
[LightGBM] [Info] Start training from score -276912.251859




[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000029 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 32
[LightGBM] [Info] Number of data points in the train set: 83, number of used features: 3
[LightGBM] [Info] Start training from score 10719.254932




[LightGBM] [Info] Total Bins 0
[LightGBM] [Info] Number of data points in the train set: 83, number of used features: 0
[LightGBM] [Info] Start training from score 1604729.312381




[LightGBM] [Info] Total Bins 0
[LightGBM] [Info] Number of data points in the train set: 83, number of used features: 0
[LightGBM] [Info] Start training from score 1247.291792




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000017 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 37
[LightGBM] [Info] Number of data points in the train set: 83, number of used features: 3
[LightGBM] [Info] Start training from score 192.465515




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000018 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 76
[LightGBM] [Info] Number of data points in the train set: 83, number of used features: 3
[LightGBM] [Info] Start training from score 6168689.096386




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000020 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 11
[LightGBM] [Info] Number of data points in the train set: 83, number of used features: 1
[LightGBM] [Info] Start training from score 2230.652575




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000021 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 42
[LightGBM] [Info] Number of data points in the train set: 83, number of used features: 3
[LightGBM] [Info] Start training from score 6309.674950




[LightGBM] [Info] Total Bins 0
[LightGBM] [Info] Number of data points in the train set: 83, number of used features: 0
[LightGBM] [Info] Start training from score 133116.949689




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000017 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 77
[LightGBM] [Info] Number of data points in the train set: 83, number of used features: 3
[LightGBM] [Info] Start training from score 3326189.775465




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000023 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 66
[LightGBM] [Info] Number of data points in the train set: 83, number of used features: 3
[LightGBM] [Info] Start training from score 308853.036898




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000021 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 39
[LightGBM] [Info] Number of data points in the train set: 83, number of used features: 3
[LightGBM] [Info] Start training from score 157755.056382




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000015 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 88
[LightGBM] [Info] Number of data points in the train set: 83, number of used features: 3
[LightGBM] [Info] Start training from score 602588.743270




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000017 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 57
[LightGBM] [Info] Number of data points in the train set: 83, number of used features: 3
[LightGBM] [Info] Start training from score -2158775.199548




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000019 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 39
[LightGBM] [Info] Number of data points in the train set: 83, number of used features: 2
[LightGBM] [Info] Start training from score 53802.577207




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000016 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 77
[LightGBM] [Info] Number of data points in the train set: 83, number of used features: 3
[LightGBM] [Info] Start training from score 10724524.409639




[LightGBM] [Info] Total Bins 0
[LightGBM] [Info] Number of data points in the train set: 83, number of used features: 0
[LightGBM] [Info] Start training from score 188252.929205




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000016 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 66
[LightGBM] [Info] Number of data points in the train set: 83, number of used features: 3
[LightGBM] [Info] Start training from score 668510.882718




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000016 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 84
[LightGBM] [Info] Number of data points in the train set: 83, number of used features: 3
[LightGBM] [Info] Start training from score 1139563.361822




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000018 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 40
[LightGBM] [Info] Number of data points in the train set: 83, number of used features: 3
[LightGBM] [Info] Start training from score 328962.216632




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000016 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 77
[LightGBM] [Info] Number of data points in the train set: 83, number of used features: 3
[LightGBM] [Info] Start training from score 601377.650320




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000016 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 79
[LightGBM] [Info] Number of data points in the train set: 83, number of used features: 3
[LightGBM] [Info] Start training from score 2023471.567583




[LightGBM] [Info] Total Bins 0
[LightGBM] [Info] Number of data points in the train set: 83, number of used features: 0
[LightGBM] [Info] Start training from score 252109.030026




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000016 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 66
[LightGBM] [Info] Number of data points in the train set: 83, number of used features: 3
[LightGBM] [Info] Start training from score 158498.811392




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000017 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 88
[LightGBM] [Info] Number of data points in the train set: 83, number of used features: 3
[LightGBM] [Info] Start training from score 254198.500812




[LightGBM] [Info] Total Bins 0
[LightGBM] [Info] Number of data points in the train set: 83, number of used features: 0
[LightGBM] [Info] Start training from score 80.056420




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000019 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 40
[LightGBM] [Info] Number of data points in the train set: 83, number of used features: 3
[LightGBM] [Info] Start training from score 60165.457850
Predicciones guardadas en: Resultados\costos_output_2023.xlsx
Correlaciones guardadas en: Resultados\correlaciones_subprocesos.xlsx


