In [1]:
!pip install wandb




[notice] A new release of pip is available: 23.1.2 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [2]:
!pip install pytest pytest-sugar




[notice] A new release of pip is available: 23.1.2 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [3]:
import wandb

In [4]:
# Log in to Weights & Biases before any run is created further down.
wandb.login()

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33mmlcolongmay[0m ([33mmlcolongmay-neu[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


True

## 1.2 Pytest

### 1.2.3 Create and run a test file

In [5]:
import wandb
import pytest
import pandas as pd
import os

# Start the W&B run (kept in the module-level name `run`) that the `data`
# fixture uses to resolve its artifact. A failure is reported on stdout and
# then re-raised unchanged.
print("Initializing W&B...")
_run_settings = {
    "job_type": "data_checks",
    "project": "diabetes",
    "entity": "ngocnhi-p4work-national-economics-university",
}
try:
    run = wandb.init(**_run_settings)
    print("W&B initialized successfully")
except Exception as init_error:
    print(f"Error initializing W&B: {init_error}")
    raise

@pytest.fixture(scope="session")
def data():
    """
    Session-scoped fixture that loads the preprocessed diabetes dataset.

    Resolves the ``preprocessed_data.csv:latest`` artifact (type
    ``clean_data``) through the module-level W&B ``run``, downloads it and
    reads ``preprocessed_data.csv`` from the download directory.

    Returns:
        pandas.DataFrame: the parsed CSV contents.

    Raises:
        Exception: any error raised while fetching the artifact or reading
            the CSV is printed and then re-raised unchanged.
    """
    print("Downloading artifact...")
    try:
        dataset_artifact = run.use_artifact("preprocessed_data.csv:latest", type="clean_data")
        csv_path = os.path.join(dataset_artifact.download(), "preprocessed_data.csv")
        print(f"Artifact downloaded at {csv_path}")
        frame = pd.read_csv(csv_path)
        print(f"Data loaded successfully: {frame.shape}")
        print("Columns:", frame.columns.tolist())
    except Exception as load_error:
        print(f"Error downloading artifact or reading data: {load_error}")
        raise
    return frame

def test_no_missing_values(data):
    assert data.isnull().sum().sum() == 0, "Data contains missing values"
    
def test_class_balance(data, threshold=0.9):
    class_counts = data['OUTCOME'].value_counts(normalize=True)
    max_class_ratio = class_counts.max()
    assert max_class_ratio < threshold, f"Data imbalance: {class_counts.to_dict()}"

def test_duplicate_rows(data):
    duplicate_count = data.duplicated().sum()
    assert duplicate_count == 0, f"Data contains {duplicate_count} duplicate rows"

def test_data_length(data):
    """
    Ensure dataset has a sufficient number of records.
    """
    print("Running test_data_length...")
    assert len(data) > 500, f"Dataset has only {len(data)} records; requires > 500"

def test_number_of_columns(data):
    """
    Ensure dataset has at least the expected number of columns.
    """
    print("Running test_number_of_columns...")
    expected_min_columns = 9
    assert data.shape[1] >= expected_min_columns, f"Dataset has {data.shape[1]} columns; requires >= {expected_min_columns}"

def test_column_presence_and_type(data):
    """
    Validate that OUTCOME is integer and all other features are numeric.
    """
    print("Running test_column_presence_and_type...")

    assert "OUTCOME" in data.columns, "Missing 'OUTCOME' column"
    assert pd.api.types.is_integer_dtype(data["OUTCOME"]), "'OUTCOME' column is not integer type"

    feature_cols = [col for col in data.columns if col != "OUTCOME"]
    for col in feature_cols:
        assert pd.api.types.is_numeric_dtype(data[col]), f"Column '{col}' is not numeric"

def test_class_names(data):
    """
    Ensure OUTCOME column contains only valid classes (0 or 1).
    """
    print("Running test_class_names...")
    known_classes = [0, 1]
    assert data["OUTCOME"].isin(known_classes).all(), \
        f"'OUTCOME' column contains invalid values: {data['OUTCOME'].unique()}"

def test_column_ranges(data):
    """
    Ensure numerical columns contain reasonable values (non-negative and not extreme).
    """
    print("Running test_column_ranges...")
    feature_cols = [col for col in data.columns if col != "OUTCOME" and pd.api.types.is_numeric_dtype(data[col])]

    for col in feature_cols:
        min_val, max_val = data[col].min(), data[col].max()
        assert min_val >= 0, f"Column '{col}' contains negative values (min={min_val})"
        assert max_val < 1e6, f"Column '{col}' has extremely high values (max={max_val})"



Initializing W&B...


W&B initialized successfully


In [6]:
# Run the data checks in test_data.py with verbose (-vvv) output.
# NOTE(review): no visible cell writes test_data.py; presumably it is a saved
# copy of the test code from the cell above. Confirm the two stay in sync.
!pytest test_data.py -vvv

platform win32 -- Python 3.11.4, pytest-8.3.5, pluggy-1.5.0 -- C:\Users\x-hp\OneDrive - National Economics University\Desktop\ML Ops\.venv\Scripts\python.exe
cachedir: .pytest_cache
rootdir: c:\Users\x-hp\OneDrive - National Economics University\Desktop\ML Ops\FINAL\diabetes-prediction-main\diabetes-prediction-main\notebooks
plugins: sugar-1.0.0, typeguard-4.4.2
[1mcollecting ... [0mcollected 8 items

test_data.py::test_no_missing_values [32mPASSED[0m[32m                              [ 12%][0m
test_data.py::test_class_balance [32mPASSED[0m[32m                                  [ 25%][0m
test_data.py::test_duplicate_rows [32mPASSED[0m[32m                                 [ 37%][0m
test_data.py::test_data_length [32mPASSED[0m[32m                                    [ 50%][0m
test_data.py::test_number_of_columns [32mPASSED[0m[32m                              [ 62%][0m
test_data.py::test_column_presence_and_type [32mPASSED[0m[32m                       [ 75%][0m
test_

In [7]:
# Finish the W&B run opened for the data checks so it is not left active in this kernel.
import wandb
wandb.finish()