In [53]:
import logging
import os

import numpy as np
import pandas as pd
import pytest
import yaml

# Data processing
from pipeline.data import import_data, process_data

# Model training and evaluation
from pipeline.model import train_model, train
from pipeline.run_evaluation import run_evaluate_model

# Machine learning
from sklearn.model_selection import train_test_split, GridSearchCV
# Alias kept alongside the plain import: a pytest fixture further down is also
# named `train_test_split` and rebinds that name once its cell has run.
from sklearn.model_selection import train_test_split as sklearn_train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline, make_pipeline

# XGBoost
from xgboost import XGBClassifier

In [14]:
def check_file_exists(file_path, file_name):
    """Tell whether ``file_name`` is present inside the directory ``file_path``.

    Args:
        file_path: Directory to look in; resolved to an absolute path first.
        file_name: Name of the entry expected inside that directory.

    Returns:
        True when the joined path exists on disk, False otherwise.
    """
    directory = os.path.abspath(file_path)
    candidate = os.path.join(directory, file_name)
    return os.path.exists(candidate)

In [16]:
# Read the settings used by every cell below; safe_load is used so the YAML
# cannot construct arbitrary Python objects.
with open("config.yaml") as config_file:
    config = yaml.safe_load(config_file)

In [50]:
def get_model_from_config(model_config: dict):
    """Given a model configuration dict, return the corresponding model instance.

    The dict is keyed by estimator name and maps to that estimator's keyword
    arguments. ``"XGBClassifier"`` is checked first, then
    ``"LogisticRegression"``.

    Args:
        model_config: Mapping such as ``{"XGBClassifier": {...}}``. A value of
            ``None`` (what YAML yields for a key with no body) is treated as
            "no parameters" so the estimator is built with its defaults.

    Returns:
        A freshly constructed ``XGBClassifier`` or ``LogisticRegression``.

    Raises:
        ValueError: If neither supported estimator name is present.
    """
    # Tolerate a missing/empty section instead of failing on `in None`.
    model_config = model_config or {}

    if "XGBClassifier" in model_config:
        # `or {}` guards against `**None` when the YAML entry has no body.
        return XGBClassifier(**(model_config["XGBClassifier"] or {}))

    if "LogisticRegression" in model_config:
        return LogisticRegression(**(model_config["LogisticRegression"] or {}))

    raise ValueError(
        "Unsupported model configuration. "
        "Expected 'XGBClassifier' or 'LogisticRegression', "
        f"got keys: {sorted(map(str, model_config))}"
    )

### Prepare test data and the train/test split

In [12]:
@pytest.fixture(scope="session")
def load_data():
    """
    Session-wide fixture providing the processed dataset for the tests.

    Reads the file at ``config["main"]["data"]["pth"]`` with ``import_data``
    and passes the result through ``process_data`` together with the
    configured categorical features and label. The test run is failed
    immediately when the configured path does not exist.

    Returns:
       The ``(X, y)`` pair produced by ``process_data``
    """
    data_cfg = config["main"]["data"]
    data_path = data_cfg["pth"]

    if not os.path.exists(data_path):
        pytest.fail(f"Data not found at path: {data_path}")

    raw_frame = import_data(data_path)
    features, target = process_data(
        raw_frame,
        data_cfg["categorical_features"],
        data_cfg["label"],
    )
    return features, target

In [13]:
@pytest.fixture(scope="session")
def train_test_split(load_data):
    """
    Session-wide fixture providing a train/validation split of the test data.

    Args:
        load_data: The ``(X, y)`` pair supplied by the ``load_data`` fixture.
            It is reused here rather than importing and processing the file a
            second time; ``load_data`` already fails the run when the data
            file is missing.

    Returns:
        ``(X_train, X_val, y_train, y_val)`` split according to
        ``config["main"]["train_test_split"]`` (``test_size`` and
        ``random_state``).
    """
    X, y = load_data
    split_cfg = config["main"]["train_test_split"]

    # This fixture shares its name with sklearn's function, so inside this
    # module the bare name `train_test_split` refers to the fixture itself.
    # Calling it would make pytest raise "Fixture called directly"; use the
    # aliased sklearn import instead.
    X_train, X_val, y_train, y_val = sklearn_train_test_split(
        X,
        y,
        test_size=split_cfg["test_size"],
        random_state=split_cfg["random_state"],
    )

    return X_train, X_val, y_train, y_val

### Tests

In [14]:
def test_columns_exist(load_data):
    """
    Test if all expected columns exist in the DataFrame.

    Args:
      load_data: Data to be tested (pandas DataFrame)
    """
    # Assume load_data is a DataFrame or similar structure
    X, y = load_data  # load_data is passed automatically by pytest as a fixture

    # List of expected columns
    expected_columns = [
        "age",
        "workclass",
        "fnlgt",
        "education",
        "education_num",
        "marital_status",
        "occupation",
        "relationship",
        "race",
        "sex",
        "capital-gain",
        "capital-loss",
        "hours-per_week",
        "native-country",
        "salary",
    ]

    # Check if each expected column is in the DataFrame
    for column in expected_columns:
        assert column in X.columns, f"Column '{column}' is missing in the DataFrame."

    # If necessary, you can also print out the columns that were found for verification
    print("Found columns in DataFrame:", X.columns)

In [15]:
!pytest --maxfail=1 --disable-warnings -q

[32m.[0m[32m.[0m[32m.[0m[32m                                                                      [100%][0m
[32m[32m[1m3 passed[0m[32m in 0.16s[0m[0m


### Test the training and evaluation step

In [59]:
# Build the train/validation split at notebook level for the direct (non-pytest)
# call of test_train_and_evaluate_model further down.
df = import_data(pth=config["main"]["data"]["pth"])
X, y = process_data(
    df,
    config["main"]["data"]["categorical_features"],
    config["main"]["data"]["label"],
)
# Use the aliased sklearn import: once the fixture cell has run, the bare name
# `train_test_split` is the pytest fixture, so this cell would otherwise only
# work when the import cell happened to be re-executed afterwards.
X_train, X_val, y_train, y_val = sklearn_train_test_split(
    X,
    y,
    test_size=config["main"]["train_test_split"]["test_size"],
    random_state=config["main"]["train_test_split"]["random_state"],
)

2024-12-30 11:48:24,748 Downloading data set
2024-12-30 11:48:24,824 Handle categorical features
2024-12-30 11:48:24,888 Encode response variable


In [57]:
def test_train_and_evaluate_model(X_train, X_test, y_train, y_test, config):
    try:
        model = get_model_from_config(config["main"]["modeling"]["MODEL"])
        best_model, best_params = train(
            model,
            X_train,
            y_train,
            config["main"]["modeling"]["param_grid"],
            config["main"]["modeling"]["FEATURES"],
        )

        run_evaluate_model(
            y_test,
            best_model,
            X_test,
            output_dir=os.getcwd() + config["main"]["modeling"]["output_dir"],
            model_dir=os.getcwd() + config["main"]["modeling"]["model_dir"],
            slice_evaluation_by_feature=config["main"]["modeling"]["slice_output"][
                "slice_evaluation_by_feature"
            ],
            categorical_features=config["main"]["modeling"]["slice_output"][
                "categorical_features"
            ],
        )

        logging.info("SUCCESS: Training and evaluating model")

    except BaseException as e:
        logging.error("ERROR: Model training/evaluating failed. %s", str(e))

    model_name = "best_model.pkl"
    try:
        model_path = os.getcwd() + config["main"]["modeling"]["model_dir"]
        assert check_file_exists(model_path, model_name)
        logging.info("SUCCESS: Best model %s is saved.", model_name)
    except AssertionError:
        logging.error(
            "ERROR: Best model %s not found in path %s.",
            model_name,
            model_path,
        )

    # Check if ROC image exists
    roc_image_name = "roc_curve.png"
    try:
        image_path = os.getcwd() + config["main"]["modeling"]["output_dir"]
        assert check_file_exists(image_path, roc_image_name)
        logging.info("SUCCESS: ROC image %s saved.", roc_image_name)
    except AssertionError:
        logging.error(
            "ERROR: ROC image %s not found in path %s",
            roc_image_name,
            image_path,
        )

    # Check if Feature Importance image exists
    feature_importance_image_name = "feature_importance.png"
    try:
        image_path = os.getcwd() + config["main"]["modeling"]["output_dir"]
        assert check_file_exists(image_path, feature_importance_image_name)
        logging.info(
            "SUCCESS: Feature importance image %s saved.",
            feature_importance_image_name,
        )
    except AssertionError:
        logging.error(
            "ERROR: Feature importance image %s not found in path %s",
            feature_importance_image_name,
            image_path,
        )

    # Check if model evaluation file exists
    model_eval_file_name = "slice_output.txt"
    try:
        image_path = os.getcwd() + config["main"]["modeling"]["output_dir"]
        assert check_file_exists(image_path, model_eval_file_name)
        logging.info(
            "SUCCESS: Model evaluation metrics %s saved.", model_eval_file_name
        )
    except AssertionError:
        logging.error(
            "ERROR: Model evaluation metrics %s not found in path %s",
            model_eval_file_name,
            image_path,
        )

    # Check if overall evaluation file exists
    overall_eval_file_name = "model_metrics.csv"
    try:
        image_path = os.getcwd() + config["main"]["modeling"]["output_dir"]
        assert check_file_exists(image_path, overall_eval_file_name)
        logging.info(
            "SUCCESS: Overall model evaluation metrics %s saved.",
            overall_eval_file_name,
        )
    except AssertionError:
        logging.error(
            "ERROR: Overall model evaluation metrics %s not found in path %s",
            overall_eval_file_name,
            image_path,
        )

In [61]:
# Direct notebook invocation (outside pytest): the validation split built above
# is passed in as the function's X_test / y_test arguments.
test_train_and_evaluate_model(X_train, X_val, y_train, y_val, config)

2024-12-30 11:48:44,116 Creating model pipeline
2024-12-30 11:48:44,119 Training XGBClassifier model


Fitting 5 folds for each of 108 candidates, totalling 540 fits


2024-12-30 11:50:23,284 ERROR: Model training/evaluating failed. run_evaluate_model() missing 2 required positional arguments: 'best_model' and 'X'
2024-12-30 11:50:23,285 SUCCESS: Best model best_model.pkl is saved.
2024-12-30 11:50:23,286 SUCCESS: ROC image roc_curve.png saved.
2024-12-30 11:50:23,286 SUCCESS: Feature importance image feature_importance.png saved.
2024-12-30 11:50:23,287 SUCCESS: Model evaluation metrics slice_output.txt saved.
2024-12-30 11:50:23,287 SUCCESS: Overall model evaluation metrics model_metrics.csv saved.
