##1.2 Install and load libraries

In [None]:
# Install the Weights & Biases client library
!pip install wandb

In [None]:
# Install pytest and the pytest-sugar package
!pip install pytest pytest-sugar

In [None]:
import wandb

In [None]:
# Login to Weights & Biases
# NOTE(review): --relogin presumably prompts for credentials again even when
# a login is already stored; confirm against the wandb CLI help
!wandb login --relogin

##1.3 Pytest


###1.3.1 How pytest discovers tests

pytest uses the following conventions to automatically discover tests:

1- files with tests should be called test_*.py or *_test.py

2- test function names should start with test_

###1.3.2 Fixture

An important aspect of using pytest is understanding how a fixture's scope works.

The scope of a fixture can take a few legal values, which are described in the pytest documentation. We are going to consider only **session** and **function**: with the former, the fixture is executed only once in a pytest session and the value it returns is used for all the tests that need it; with the latter, every test function gets a fresh copy of the data. This is useful, for example, if the tests modify the input in a way that makes the other tests fail.

###1.3.3 Create and run a test file

In [None]:
%%file test_data.py
import pytest
import wandb
import pandas as pd

# Created once at module level so that all tests are collected under the same run
run = wandb.init(
    project="diabetes_nn",
    job_type="data_checks",
)

@pytest.fixture(scope="session")
def data():
    """
    Session-scoped fixture: fetch the latest preprocessed CSV artifact through
    the module-level W&B run and return it as a pandas DataFrame
    """
    artifact = run.use_artifact("diabetes_nn/preprocessed_data.csv:latest")
    csv_path = artifact.file()

    return pd.read_csv(csv_path)

def test_data_length(data):
    """
    We test that we have enough data to continue
    """
    assert len(data) > 500


def test_number_of_columns(data):
    """
    We test that we have enough data to continue
    """
    assert data.shape[1] == 9

def test_column_presence_and_type(data):

    required_columns = {
        "Pregnancies": pd.api.types.is_int64_dtype,
        "Glucose": pd.api.types.is_int64_dtype,
        "BloodPressure": pd.api.types.is_int64_dtype,
        "SkinThickness": pd.api.types.is_int64_dtype,
        "Insulin": pd.api.types.is_int64_dtype,
        "BMI": pd.api.types.is_float_dtype,
        "DiabetesPedigreeFunction": pd.api.types.is_float_dtype,
        "Age": pd.api.types.is_int64_dtype,
        "Outcome": pd.api.types.is_int64_dtype
    }

    # Check column presence
    assert set(data.columns.values).issuperset(set(required_columns.keys()))

    for col_name, format_verification_funct in required_columns.items():

        assert format_verification_funct(data[col_name]), f"Column {col_name} failed test {format_verification_funct}"


def test_class_names(data):

    # Check that only the known classes are present
    known_classes = [0,1]

    assert data["Outcome"].isin(known_classes).all()


def test_column_ranges(data):

    ranges = {
        "Age": (18, 90)
    }

    for col_name, (minimum, maximum) in ranges.items():

        assert data[col_name].dropna().between(minimum, maximum).all(), (
            f"Column {col_name} failed the test. Should be between {minimum} and {maximum}, "
            f"instead min={data[col_name].min()} and max={data[col_name].max()}"
        )

Run the test file

In [None]:
# Run pytest on the current directory with increased verbosity (-vv)
!pytest . -vv