In [1]:
import os
import sys

# Put the project's base directory on the import path so project-local packages
# can be imported. Assumes the notebook is run from the project's `notebooks/` directory.
project_dir = os.path.abspath(os.pardir)
already_importable = project_dir in sys.path
if not already_importable:
    sys.path.append(project_dir)

In [2]:
# Absolute path of the project root: the parent of the current working directory.
working_dir = os.getcwd()
project_root = os.path.abspath(os.path.join(working_dir, os.pardir))

In [3]:
# training.py
import os
import pandas as pd
from sklearn.model_selection import train_test_split
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error
from scripts.preprocessing_pipeline import run_data_pipeline
import logging
from sklearn.preprocessing import StandardScaler
import mlflow
import mlflow.xgboost
import sys

In [4]:
# Root logger: INFO and above, each record prefixed with timestamp and level.
_LOG_FORMAT = '%(asctime)s - %(levelname)s - %(message)s'
logging.basicConfig(format=_LOG_FORMAT, level=logging.INFO)

# Input dataset location (paths are relative to the working directory).
DATA_FOLDER = "../data/"
DATA_COMPANY = "historical_info_ISA_Interconnection_Electric.csv"
FINAL_FILES_PATH = os.path.join(DATA_FOLDER, DATA_COMPANY)

# Output directory for profiling reports.
PROFILING_REPORTS_PATH = "../profiling_reports/"

In [5]:
# Make sure the profiling-report directory exists. `exist_ok=True` makes the call
# idempotent and removes the check-then-create race of a separate `os.path.exists` test.
os.makedirs(PROFILING_REPORTS_PATH, exist_ok=True)

In [6]:
# Point MLflow at a SQLite tracking store (`mlflow.db`, given as a relative path).
# Plain string literal: the original f-string had no placeholders.
mlflow.set_tracking_uri("sqlite:///mlflow.db")


In [7]:
# Select the MLflow experiment that subsequent runs are recorded under.
# Kept as the last expression so the cell displays the returned Experiment.
mlflow.set_experiment(experiment_name="mlops-final-project-michaelmora-case-1")

2024/07/12 16:36:33 INFO mlflow.store.db.utils: Creating initial MLflow database tables...
2024/07/12 16:36:33 INFO mlflow.store.db.utils: Updating database tables
INFO  [alembic.runtime.migration] Context impl SQLiteImpl.
INFO  [alembic.runtime.migration] Will assume non-transactional DDL.
INFO  [alembic.runtime.migration] Running upgrade  -> 451aebb31d03, add metric step
INFO  [alembic.runtime.migration] Running upgrade 451aebb31d03 -> 90e64c465722, migrate user column to tags
INFO  [alembic.runtime.migration] Running upgrade 90e64c465722 -> 181f10493468, allow nulls for metric values
INFO  [alembic.runtime.migration] Running upgrade 181f10493468 -> df50e92ffc5e, Add Experiment Tags Table
INFO  [alembic.runtime.migration] Running upgrade df50e92ffc5e -> 7ac759974ad8, Update run tags with larger limit
INFO  [alembic.runtime.migration] Running upgrade 7ac759974ad8 -> 89d4b8295536, create latest metrics table
INFO  [89d4b8295536_create_latest_metrics_table_py] Migration complete!
INFO  

<Experiment: artifact_location='/home/mlops_zoomcamp/mlops-zoomcamp-project/notebooks/mlruns/1', creation_time=1720802193984, experiment_id='1', last_update_time=1720802193984, lifecycle_stage='active', name='mlops-final-project-michaelmora-case-1', tags={}>

In [8]:
# Sanity check: show the dataset path that will be handed to the pipeline.
print(FINAL_FILES_PATH)

../data/historical_info_ISA_Interconnection_Electric.csv


In [9]:
def load_and_preprocess_data(path_file: str) -> pd.DataFrame:
    """
    Load the dataset at ``path_file`` and run it through the preprocessing pipeline.

    Parameters:
    path_file (str): Path to the dataset file.

    Returns:
    pd.DataFrame: The result of ``run_data_pipeline(path_file)``.
    """
    # Log the file that is actually being read. The previous version always logged
    # the module-level DATA_COMPANY constant, whatever path was passed in. For the
    # notebook's own call the message is unchanged (same file name).
    logging.info(f'Leyendo archivo: {os.path.basename(path_file)}')

    return run_data_pipeline(path_file)



In [10]:
def split_data(dataset: pd.DataFrame, target_column: str):
    """
    Split the dataset into standardized training, validation, and test sets.

    Rows are split 60/20/20 (train/validation/test) with a fixed seed. The
    StandardScaler is fitted on the training rows only and then applied to the
    validation and test rows, so no statistics from held-out data leak into
    training. (Fitting on the full dataset before splitting, as was done
    previously, makes validation/test scores optimistic.)

    Parameters:
    dataset (pd.DataFrame): The preprocessed dataset. Must contain the feature
        columns 'Apertura', 'Máximo', 'Mínimo', 'Vol.', '% var.' and
        ``target_column``.
    target_column (str): The name of the target column.

    Returns:
    tuple: (X_train, X_val, X_test, y_train, y_val, y_test). The X_* entries are
        the scaler's outputs; the y_* entries are the matching slices of
        ``dataset[target_column]``.
    """
    logging.info("Training, testing and validation sets preparing!")

    X = dataset[['Apertura', 'Máximo', 'Mínimo', 'Vol.', '% var.']]
    y = dataset[target_column]

    # 60% train; the remaining 40% is halved into validation and test.
    # Same seeds and sizes as before, so row membership of each split is unchanged.
    X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.4, random_state=42)
    X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

    # Learn mean/variance from the training rows only, then reuse them for the
    # held-out rows.
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_val = scaler.transform(X_val)
    X_test = scaler.transform(X_test)

    logging.info("Training, testing and validation sets partition has finished!")

    return X_train, X_val, X_test, y_train, y_val, y_test



In [11]:
def train_xgboost_model(X_train, y_train, X_val, y_val):
    """
    Train an XGBoost regressor inside an MLflow run and log it.

    The model is fitted on the training split with early stopping monitored on
    the validation split. The fitted model, its parameters and the validation
    MSE (metric name "mse") are logged to the active MLflow run.

    Parameters:
    X_train: Training features (array-like accepted by ``XGBRegressor.fit``).
    y_train: Training targets.
    X_val: Validation features, used for early stopping and for the logged MSE.
    y_val: Validation targets.

    Returns:
    XGBRegressor: The trained XGBoost model.
    """
    logging.info("Training XGBoost model has started!")

    with mlflow.start_run():
        # Define the model. `early_stopping_rounds` is an estimator parameter:
        # passing it to `fit()` has been deprecated since XGBoost 1.6 and is
        # rejected by later releases. Setting it here also makes it part of
        # `get_params()`, so it is recorded with the other logged parameters.
        model = XGBRegressor(
            objective='reg:squarederror',
            n_estimators=100,
            max_depth=3,
            learning_rate=0.1,
            subsample=0.8,
            colsample_bytree=0.8,
            reg_alpha=0.01,
            reg_lambda=1,
            early_stopping_rounds=10,
        )

        model.fit(X_train, y_train, eval_set=[(X_val, y_val)], verbose=False)

        # Log the fitted model and its parameters to MLflow.
        mlflow.xgboost.log_model(model, "model")
        mlflow.log_params(model.get_params())

        # Evaluate on the validation split (the test split is scored separately).
        y_pred = model.predict(X_val)
        mse = mean_squared_error(y_val, y_pred)
        mlflow.log_metric("mse", mse)

        logging.info(f"Mean Squared Error: {mse}")

    return model
    

In [12]:
def evaluate_model(model, X_test, y_test):
    """
    Score a fitted model on the held-out test split.

    Parameters:
    model: Fitted estimator exposing ``predict`` (an XGBRegressor in this notebook).
    X_test: Test features, passed to ``model.predict``.
    y_test: True target values for the test rows.

    Returns:
    The value of ``mean_squared_error(y_test, predictions)``.
    """
    logging.info("Training XGBoost Evaluation model has started!")

    predictions = model.predict(X_test)
    return mean_squared_error(y_test, predictions)



In [13]:
def main():
    """
    Run the training pipeline end to end.

    Loads and preprocesses the dataset at FINAL_FILES_PATH, splits it into
    train/validation/test sets with 'Último' as the target, trains the XGBoost
    model, prints the test-set MSE, and saves the model under ``../models/``.
    """
    dataset = load_and_preprocess_data(FINAL_FILES_PATH)
    target_column = 'Último'

    X_train, X_val, X_test, y_train, y_val, y_test = split_data(dataset, target_column)

    model = train_xgboost_model(X_train, y_train, X_val, y_val)

    mse = evaluate_model(model, X_test, y_test)
    print(f"Mean Squared Error on the test set: {mse}")

    # Save the model. Create the target directory first so saving does not fail
    # on a fresh checkout where `../models/` does not exist yet.
    # NOTE(review): `save_model` writes XGBoost's own serialization format, not a
    # pickle, so the `.pkl` extension is misleading. The file name is kept unchanged
    # for compatibility with whatever loads it — consider `.json`/`.ubj`.
    model_path = '../models/xgboost_model_exp_1.pkl'
    os.makedirs(os.path.dirname(model_path), exist_ok=True)
    model.save_model(model_path)


In [14]:
# Execute the training pipeline (load, split, train, evaluate, save).
main()

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]



Mean Squared Error on the test set: 60526.19722094978




In [15]:
# mlflow.end_run()  # Not needed: the `with mlflow.start_run():` block in train_xgboost_model ends the run on exit.