In [7]:
import pandas as pd
import xgboost as xgb
import numpy as np
import yaml
import sys
import os
import joblib
from sklearn.model_selection import train_test_split
import logging




"""
XGBoost Model Module

This module handles the training, evaluation, and configuration loading for XGBoost models
on the California housing dataset. It includes data loading, model training with hyperparameter
tuning, and performance metrics calculation.
"""



def load_config() -> dict:
    """
    Load configuration settings from the YAML config file.

    The file is opened via the relative path ``config/config.yaml``, so the
    lookup depends on the current working directory of the process.

    Returns:
        dict: Configuration mapping parsed from config.yaml.

    Raises:
        SystemExit: If the config file cannot be read or parsed, or if its
            top-level content is not a mapping (for example, an empty file).
    """
    config_path = "config/config.yaml"
    try:
        with open(config_path, 'r', encoding='utf-8') as file:
            config = yaml.safe_load(file)
    except Exception as e:
        # Boundary handler: any read/parse failure ends the run with status 1.
        logger.error(f"Failed to load config: {e}")
        sys.exit(1)

    # yaml.safe_load returns None for an empty document and may return a list
    # or scalar for non-mapping YAML. Reject those here so callers indexing
    # the result by key do not fail later with an unrelated TypeError.
    if not isinstance(config, dict):
        logger.error(
            f"Failed to load config: expected a mapping at the top level of "
            f"{config_path}, got {type(config).__name__}"
        )
        sys.exit(1)

    logger.info(f"Config loaded from {config_path}")
    return config

def train_model() -> None:
    """
    Load the dataset named in the configuration and split it into features
    and target in preparation for XGBoost model training.

    Steps performed:
        1. Load the configuration via ``load_config``.
        2. Pass the raw and processed data paths to ``validate_input_file``.
        3. Read the raw CSV into a DataFrame.
        4. Separate the target column (``config['model']['target']``) from
           the remaining feature columns.

    Raises:
        SystemExit: If the dataset cannot be read, or if the configured
            target column is not present in the dataset.
    """
    logger.info("Starting model training...")

    config = load_config()

    # NOTE(review): validate_input_file is defined outside this section;
    # presumably it aborts when the path is unusable -- confirm.
    raw_data_path = config['data']['raw_path']
    validate_input_file(raw_data_path)
    logger.info(f"Validated input file at {raw_data_path}")

    processed_data_path = config['data']['processed_path']
    validate_input_file(processed_data_path)
    logger.info(f"Validated input file at {processed_data_path}")

    logger.info("Loading dataset...")

    # Keep the try body to the single call that can fail on I/O or parsing.
    try:
        df = pd.read_csv(raw_data_path, encoding='utf-8')
    except Exception as e:
        logger.error(f"Failed to load dataset: {e}")
        sys.exit(1)

    logger.info("Dataset loaded successfully.")
    logger.info(f"Dataset shape: {df.shape}")
    logger.info(f"Dataset preview:\n{df.head()}")

    # Look the target name up once and verify it exists, so a config/data
    # mismatch is reported the same way as the other failures in this
    # function (log + exit) instead of a raw KeyError traceback.
    target = config['model']['target']
    if target not in df.columns:
        logger.error(
            f"Target column '{target}' not found in dataset columns: "
            f"{list(df.columns)}"
        )
        sys.exit(1)

    logger.info("Dropping the target column from features")
    x = df.drop(columns=[target])
    y = df[target]
    logger.info("Features and target variable separated.")
    logger.info(f"Features shape: {x.shape}, Target shape: {y.shape}")
    logger.info(f"Features preview:\n{x.head()}")

In [None]:
df[]