In [1]:
import pytest
import wandb
import pandas as pd

In [2]:
# Start a wandb run in project "exercise_7" with job type "data_tests".
# The `data` fixture below fetches its input artifact through this `run` object,
# so this cell has to be executed before the tests can load any data.
run = wandb.init(project="exercise_7", job_type="data_tests")

[34m[1mwandb[0m: Currently logged in as: [33mlurui0421[0m ([33mruilu[0m). Use [1m`wandb login --relogin`[0m to force relogin


In [19]:
@pytest.fixture(scope="session")
def data():
    """Load the preprocessed dataset used by every test in this session.

    The artifact "exercise_5/preprocessed_data.csv:latest" is requested
    through the module-level ``run`` object, and the file it resolves to is
    parsed with pandas.

    Returns
    -------
    pandas.DataFrame
        The contents of the CSV file.
    """
    artifact = run.use_artifact("exercise_5/preprocessed_data.csv:latest")

    # low_memory=False is passed through to pandas unchanged; see the
    # read_csv documentation for its effect on dtype inference.
    return pd.read_csv(artifact.file(), low_memory=False)

def test_column_presence_and_type(data):

    required_columns = {
        "time_signature": pd.api.types.is_integer_dtype,
        "key": pd.api.types.is_integer_dtype,
        "danceability": pd.api.types.is_float_dtype,
        "energy": pd.api.types.is_float_dtype,
        "loudness": pd.api.types.is_float_dtype,
        "speechiness": pd.api.types.is_float_dtype,
        "acousticness": pd.api.types.is_float_dtype,
        "instrumentalness": pd.api.types.is_float_dtype,
        "liveness": pd.api.types.is_float_dtype,
        "valence": pd.api.types.is_float_dtype,
        "tempo": pd.api.types.is_float_dtype,
        "duration_ms": pd.api.types.is_integer_dtype,  # This is integer, not float as one might expect
        "text_feature": pd.api.types.is_string_dtype,
        "genre": pd.api.types.is_string_dtype
    }

    # Check column presence
    assert set(data.columns.values).issuperset(set(required_columns.keys()))

    for col_name, format_verification_funct in required_columns.items():

        assert format_verification_funct(data[col_name]), f"Column {col_name} failed test {format_verification_funct}"


In [18]:
def test_class_names(data):

    # Check that only the known classes are present
    known_classes = [
        "Dark Trap",
        "Underground Rap",
        "Trap Metal",
        "Emo",
        "Rap",
        "RnB",
        "Pop",
        "Hiphop",
        "techhouse",
        "techno",
        "trance",
        "psytrance",
        "trap",
        "dnb",
        "hardstyle",
    ]

    assert data["genre"].isin(known_classes).all()


def test_column_ranges(data):

    ranges = {
        "time_signature": (1, 5),
        "key": (0, 11),
        "danceability": (0, 1),
        "energy": (0, 1),
        "loudness": (-35, 5),
        "speechiness": (0, 1),
        "acousticness": (0, 1),
        "instrumentalness": (0, 1),
        "liveness": (0, 1),
        "valence": (0, 1),
        "tempo": (50, 250),
        "duration_ms": (20000, 1000000),
    }

    for col_name, (minimum, maximum) in ranges.items():

        assert data[col_name].dropna().between(minimum, maximum).all(), (
            f"Column {col_name} failed the test. Should be between {minimum} and {maximum}, "
            f"instead min={data[col_name].min()} and max={data[col_name].max()}"
        )


In [37]:
# Run pytest in verbose mode against the current directory.
# NOTE(review): the recorded output shows the three tests being collected from
# test_data.py, not from the cells above; presumably that file holds the same
# definitions. Confirm the two are kept in sync.
!pytest . -vv 

platform darwin -- Python 3.11.5, pytest-7.4.0, pluggy-1.0.0 -- /Users/ruilu/anaconda3/bin/python
cachedir: .pytest_cache
rootdir: /Users/ruilu/udacity_repo/exercise_7
plugins: anyio-3.5.0, typeguard-4.1.5
collected 3 items                                                              [0m[1m

test_data.py::test_column_presence_and_type [32mPASSED[0m[33m                       [ 33%][0m
test_data.py::test_class_names [32mPASSED[0m[33m                                    [ 66%][0m
test_data.py::test_column_ranges [32mPASSED[0m[33m                                  [100%][0m

../../anaconda3/lib/python3.11/site-packages/wandb/sdk/launch/builder/build.py:11
    import pkg_resources

../../anaconda3/lib/python3.11/site-packages/pkg_resources/__init__.py:2871
  Implementing implicit namespace packages (as specified in PEP 420) is preferred to `pkg_resources.declare_namespace`. See https://setuptools.pypa.io/en/latest/references/keywords.html#keyword-namespace-packages
    declare_n