Skip to content

Commit

Permalink
Adds regression tests for #2007 (#2018)
Browse files Browse the repository at this point in the history
* first commit

* added regression test

* cleanup

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* add ray tests

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

Co-authored-by: Geoffrey Angus <geoffrey@predibase.com>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
  • Loading branch information
3 people committed May 13, 2022
1 parent 2af21e4 commit 7e2b464
Show file tree
Hide file tree
Showing 2 changed files with 71 additions and 13 deletions.
20 changes: 17 additions & 3 deletions tests/integration_tests/test_ray.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@
from ludwig.api import LudwigModel
from ludwig.backend import create_ray_backend, LOCAL_BACKEND
from ludwig.backend.ray import get_trainer_kwargs, RayBackend
from ludwig.constants import BALANCE_PERCENTAGE_TOLERANCE, NAME, TRAINER
from ludwig.constants import BACKFILL, BALANCE_PERCENTAGE_TOLERANCE, NAME, TRAINER
from ludwig.data.dataframe.dask import DaskEngine
from ludwig.data.preprocessing import balance_data
from ludwig.utils.data_utils import read_parquet
Expand Down Expand Up @@ -260,11 +260,25 @@ def test_ray_sequence():
run_test_with_features(input_features, output_features)


# Regression test for audio input features on the distributed test path, run once per
# audio feature type listed in the parametrization below.
# NOTE(review): this span is a flattened diff view. Indentation was lost, and the
# pre-change and post-change versions of edited lines appear one after the other.
@pytest.mark.parametrize("feature_type", ["raw", "stft", "stft_phase", "group_delay", "fbank"])
@pytest.mark.distributed
# Presumably the pre-change signature (removed by the commit): takes no parameters.
def test_ray_audio():
# Presumably the post-change signature (added): receives the parametrized feature type.
def test_ray_audio(feature_type):
with tempfile.TemporaryDirectory() as tmpdir:
# Preprocessing settings handed to the generated audio feature; only the nested
# "type" entry varies between parametrized runs.
preprocessing_params = {
"audio_file_length_limit_in_s": 3.0,
"missing_value_strategy": BACKFILL,
"in_memory": True,
"padding_value": 0,
"norm": "per_file",
"audio_feature": {
"type": feature_type,
"window_length_in_s": 0.04,
"window_shift_in_s": 0.02,
"num_filter_bands": 80,
},
}
# Folder inside the temporary directory that is passed to audio_feature().
audio_dest_folder = os.path.join(tmpdir, "generated_audio")
# Presumably the pre-change line (removed): feature built without explicit preprocessing.
input_features = [audio_feature(folder=audio_dest_folder)]
# Presumably the post-change line (added): feature built with the settings defined above.
input_features = [audio_feature(folder=audio_dest_folder, preprocessing=preprocessing_params)]
output_features = [binary_feature()]
run_test_with_features(input_features, output_features)

Expand Down
64 changes: 54 additions & 10 deletions tests/ludwig/features/test_audio_feature.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,15 @@
import os
from random import choice
from string import ascii_lowercase, ascii_uppercase, digits
from typing import Dict

import pandas as pd
import pytest
import torch

from ludwig.features.audio_feature import AudioInputFeature
from ludwig.backend import LOCAL_BACKEND
from ludwig.constants import BACKFILL, PROC_COLUMN
from ludwig.features.audio_feature import AudioFeatureMixin, AudioInputFeature
from tests.integration_tests.utils import audio_feature, category_feature, generate_data

BATCH_SIZE = 2
SEQ_SIZE = 20
Expand All @@ -16,9 +20,9 @@
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"


@pytest.fixture(scope="module")
def audio_config():
return {
@pytest.mark.parametrize("encoder", ["rnn", "stacked_cnn", "parallel_cnn", "stacked_parallel_cnn", "rnn", "cnnrnn"])
def test_audio_input_feature(encoder: str) -> None:
audio_config = {
"name": "audio_feature",
"type": "audio",
"preprocessing": {
Expand All @@ -34,13 +38,53 @@ def audio_config():
"vocab": VOCAB,
"max_sequence_length": SEQ_SIZE,
"embedding_size": AUDIO_W_SIZE,
"encoder": encoder,
}


@pytest.mark.parametrize("encoder", ["rnn", "stacked_cnn", "parallel_cnn", "stacked_parallel_cnn", "rnn", "cnnrnn"])
def test_audio_input_feature(audio_config: Dict, encoder: str) -> None:
audio_config.update({"encoder": encoder})
audio_input_feature = AudioInputFeature(audio_config).to(DEVICE)
audio_tensor = torch.randn([BATCH_SIZE, SEQ_SIZE, AUDIO_W_SIZE], dtype=torch.float32).to(DEVICE)
encoder_output = audio_input_feature(audio_tensor)
assert encoder_output["encoder_output"].shape[1:] == audio_input_feature.output_shape


@pytest.mark.parametrize("feature_type", ["raw", "stft", "stft_phase", "group_delay", "fbank"])
def test_add_feature_data(feature_type, tmpdir):
    """Check that ``AudioFeatureMixin.add_feature_data`` fills one entry per generated row.

    A small synthetic dataset with a single audio input column is written under
    ``tmpdir``, feature metadata is computed for that column, and the column is
    then processed with the local backend.

    Args:
        feature_type: Value placed under ``audio_feature.type`` in the
            preprocessing parameters (supplied by the parametrization).
        tmpdir: pytest temporary-directory fixture that holds the generated
            audio folder and the CSV file.
    """
    expected_rows = 10

    # Settings for the nested "audio_feature" section of the preprocessing parameters.
    audio_feature_settings = {
        "type": feature_type,
        "window_length_in_s": 0.04,
        "window_shift_in_s": 0.02,
        "num_filter_bands": 80,
    }
    preprocessing_params = {
        "audio_file_length_limit_in_s": 3.0,
        "missing_value_strategy": BACKFILL,
        "in_memory": True,
        "padding_value": 0,
        "norm": "per_file",
        "audio_feature": audio_feature_settings,
    }

    # Build the feature configs and write the synthetic dataset to disk.
    audio_dest_folder = os.path.join(tmpdir, "generated_audio")
    audio_feature_config = audio_feature(audio_dest_folder, preprocessing=preprocessing_params)
    target_feature_config = category_feature(vocab_size=5, reduce_input="sum")
    csv_path = os.path.join(tmpdir, "data.csv")
    data_df_path = generate_data(
        [audio_feature_config],
        [target_feature_config],
        csv_path,
        num_examples=expected_rows,
    )
    data_df = pd.read_csv(data_df_path)

    # Metadata is keyed by the feature name, which is also the input column name.
    column_name = audio_feature_config["name"]
    column_meta = AudioFeatureMixin.get_feature_meta(data_df[column_name], preprocessing_params, LOCAL_BACKEND)
    metadata = {column_name: column_meta}

    # add_feature_data writes its result into the mapping passed as proc_df.
    proc_df = {}
    AudioFeatureMixin.add_feature_data(
        feature_config=audio_feature_config,
        input_df=data_df,
        proc_df=proc_df,
        metadata=metadata,
        preprocessing_parameters=preprocessing_params,
        backend=LOCAL_BACKEND,
        skip_save_processed_input=False,
    )

    processed_column = proc_df[audio_feature_config[PROC_COLUMN]]
    assert len(processed_column) == expected_rows

0 comments on commit 7e2b464

Please sign in to comment.