Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fixed street address transformer and POC for seeds #1339

Closed
wants to merge 3 commits into from
Closed
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
26 changes: 19 additions & 7 deletions worker/internal/benthos/transformers/generate_street_address.go
Original file line number Diff line number Diff line change
Expand Up @@ -20,16 +20,28 @@ type Address struct {
}

func init() {
spec := bloblang.NewPluginSpec().Param(bloblang.NewInt64Param("max_length"))
spec := bloblang.NewPluginSpec().Param(bloblang.NewInt64Param("max_length")).Param(bloblang.NewInt64Param("seed"))
nickzelei marked this conversation as resolved.
Show resolved Hide resolved

err := bloblang.RegisterFunctionV2("generate_street_address", spec, func(args *bloblang.ParsedParams) (bloblang.Function, error) {
maxLength, err := args.GetInt64("max_length")
if err != nil {
return nil, err
}

seed, err := args.GetOptionalInt64("seed")
if err != nil {
return nil, err
}

var randomizer *rand.Rand
if seed == nil {
randomizer = rand.New(rand.NewSource(int64(rand.Int())))
nickzelei marked this conversation as resolved.
Show resolved Hide resolved
} else {
randomizer = rand.New(rand.NewSource(*seed))
}

return func() (any, error) {
res, err := GenerateRandomStreetAddress(maxLength)
res, err := GenerateRandomStreetAddress(maxLength, randomizer)
if err != nil {
return nil, err
}
Expand All @@ -42,31 +54,31 @@ func init() {
}

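/* Generates a random street address in the United States in the format <house_number> <street name> <street ending>. When no dataset address fits within maxLength, a random alphanumeric string (prefixed by a random house number when maxLength > 3) is generated instead. */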
func GenerateRandomStreetAddress(maxLength int64) (string, error) {
func GenerateRandomStreetAddress(maxLength int64, randomizer *rand.Rand) (string, error) {
addresses := transformers_dataset.Addresses
var filteredAddresses []string

for _, address := range addresses {
if len(address.Address1) <= int(maxLength) {
filteredAddresses = append(filteredAddresses, address.City)
filteredAddresses = append(filteredAddresses, address.Address1)
}
}

if len(filteredAddresses) == 0 {
if maxLength > 3 {
hn, err := transformer_utils.GenerateRandomInt64InValueRange(1, 20)
hn, err := transformer_utils.GenerateRandomInt64InValueRange(1, 20, randomizer)
if err != nil {
return "", err
}

street, err := transformer_utils.GenerateRandomStringWithDefinedLength(maxLength - 3)
street, err := transformer_utils.GenerateRandomStringWithDefinedLength(maxLength-3, randomizer)
if err != nil {
return "", err
}

return fmt.Sprintf("%d %s", hn, street), nil
} else {
street, err := transformer_utils.GenerateRandomStringWithDefinedLength(maxLength)
street, err := transformer_utils.GenerateRandomStringWithDefinedLength(maxLength, randomizer)
if err != nil {
return "", err
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2,47 +2,64 @@ package transformers

import (
"fmt"
"math/rand"
"testing"

"github.com/benthosdev/benthos/v4/public/bloblang"
"github.com/stretchr/testify/assert"
)

var randomizer = rand.New(rand.NewSource(int64(20)))

// Test_GenerateStreetAddress calls GenerateRandomStreetAddress with maxLength and
// checks that the call succeeds, that the result is a string, and that its byte
// length does not exceed maxLength.
//
// NOTE(review): the call and the final assertion each appear twice (without/with
// the randomizer argument, and with "city"/"street" wording) — presumably the
// removed and added lines of the diff rendered back to back; only one of each
// pair can exist in the real file.
// NOTE(review): maxLength is not declared in this view — presumably a
// package-level test value; confirm. randomizer is the package-level *rand.Rand
// seeded with 20 and is shared by every test that uses it.
func Test_GenerateStreetAddress(t *testing.T) {
res, err := GenerateRandomStreetAddress(maxLength)
res, err := GenerateRandomStreetAddress(maxLength, randomizer)
assert.NoError(t, err)

assert.IsType(t, "", res, "The returned street address should be a string")

assert.LessOrEqual(t, int64(len(res)), maxLength, fmt.Sprintf("The city should be less than or equal to the max length. This is the error street address:%s", res))
assert.LessOrEqual(t, int64(len(res)), maxLength, fmt.Sprintf("The street should be less than or equal to the max length. This is the error street address:%s", res))
}

// Test_GenerateStreetAddressShortMax calls GenerateRandomStreetAddress with a
// max length of 5 and checks that the call succeeds and returns a string.
//
// NOTE(review): the length assertion compares against maxLength, not the 5 that
// was passed to the generator, so it does not verify the short limit unless
// maxLength happens to be 5 — confirm which bound is intended.
// NOTE(review): the call and the final assertion each appear twice — presumably
// the removed and added lines of the diff rendered back to back.
func Test_GenerateStreetAddressShortMax(t *testing.T) {
res, err := GenerateRandomStreetAddress(int64(5))
res, err := GenerateRandomStreetAddress(int64(5), randomizer)
assert.NoError(t, err)

assert.IsType(t, "", res, "The returned street address should be a string")

assert.LessOrEqual(t, int64(len(res)), maxLength, fmt.Sprintf("The city should be less than or equal to the max length. This is the error street address:%s", res))
assert.LessOrEqual(t, int64(len(res)), maxLength, fmt.Sprintf("The street should be less than or equal to the max length. This is the error street address:%s", res))
}

// Test_GenerateStreetAddressSVeryhortMax calls GenerateRandomStreetAddress with
// a max length of 2 and checks that the call succeeds and returns a string.
//
// NOTE(review): the function name looks like a typo for "VeryShortMax".
// NOTE(review): the length assertion compares against maxLength, not the 2 that
// was passed to the generator, so the very short limit itself is not verified —
// confirm which bound is intended.
// NOTE(review): the call and the final assertion each appear twice — presumably
// the removed and added lines of the diff rendered back to back.
func Test_GenerateStreetAddressSVeryhortMax(t *testing.T) {
res, err := GenerateRandomStreetAddress(int64(2))
res, err := GenerateRandomStreetAddress(int64(2), randomizer)
assert.NoError(t, err)

assert.IsType(t, "", res, "The returned street address should be a string")

assert.LessOrEqual(t, int64(len(res)), maxLength, fmt.Sprintf("The city should be less than or equal to the max length. This is the error street address:%s", res))
assert.LessOrEqual(t, int64(len(res)), maxLength, fmt.Sprintf("The street should be less than or equal to the max length. This is the error street address:%s", res))
}

// Test_StreetAddressTransformer builds a bloblang mapping that invokes
// generate_street_address, parses it, runs the query with a nil input, and
// checks that the result is a string no longer than maxLength bytes.
//
// NOTE(review): the mapping line appears twice — presumably the removed and
// added lines of the diff rendered back to back.
// NOTE(review): in the seeded mapping the two named arguments are separated by
// ":" ("max_length:%d: seed:%d") rather than ","; this looks like a typo that
// would make the mapping fail to parse — confirm.
// NOTE(review): randomizer is a *rand.Rand, not an integer, so formatting it
// with %d will not produce a plain seed value; an int64 seed is presumably what
// was intended.
func Test_StreetAddressTransformer(t *testing.T) {
mapping := fmt.Sprintf(`root = generate_street_address(max_length:%d)`, maxLength)
mapping := fmt.Sprintf(`root = generate_street_address(max_length:%d: seed:%d)`, maxLength, randomizer)
ex, err := bloblang.Parse(mapping)
assert.NoError(t, err, "failed to parse the street address transformer")

res, err := ex.Query(nil)
assert.NoError(t, err)

assert.IsType(t, Address{}.Address1, res, "The returned street address should be a string")
assert.LessOrEqual(t, int64(len(res.(string))), maxLength, fmt.Sprintf("The street should be less than or equal to the max length. This is the error street address:%s", res))
}

// Test_StreetAddressTransformerNoSeed is meant to exercise the
// generate_street_address mapping when no seed is supplied: it builds the
// mapping, parses it, runs the query with a nil input, and checks that the
// result is a string no longer than maxLength bytes.
//
// NOTE(review): noSeed is a nil *int64 that is formatted with %d, so the mapping
// text will contain fmt's rendering of a nil pointer rather than omitting the
// seed argument; leaving "seed" out of the mapping is presumably what a no-seed
// test needs — confirm.
// NOTE(review): the two named arguments are separated by ":" rather than ",",
// which looks like a typo that would make the mapping fail to parse — confirm.
// NOTE(review): the two lines after the mapping are page chrome from the
// rendered review, not Go, and the final assertion appears twice — presumably
// the removed and added lines of the diff rendered back to back.
func Test_StreetAddressTransformerNoSeed(t *testing.T) {

var noSeed *int64
mapping := fmt.Sprintf(`root = generate_street_address(max_length:%d: seed:%d)`, maxLength, noSeed)
nickzelei marked this conversation as resolved.
Show resolved Hide resolved
ex, err := bloblang.Parse(mapping)
assert.NoError(t, err, "failed to parse the street address transformer")

res, err := ex.Query(nil)
assert.NoError(t, err)

assert.IsType(t, Address{}.Address1, res, "The returned street address should be a string")
assert.LessOrEqual(t, int64(len(res.(string))), maxLength, fmt.Sprintf("The city should be less than or equal to the max length. This is the error street address:%s", res))
assert.LessOrEqual(t, int64(len(res.(string))), maxLength, fmt.Sprintf("The street should be less than or equal to the max length. This is the error street address:%s", res))
}
8 changes: 4 additions & 4 deletions worker/internal/benthos/transformers/utils/integer_utils.go
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@ func GenerateRandomInt64FixedLength(l int64) (int64, error) {
/*
Generates a random int64 with length in the inclusive range of [min, max]. For example, given a length range of [4, 7], possible values will have a length ranging from 4 -> 7 digits.
*/
func GenerateRandomInt64InLengthRange(min, max int64) (int64, error) {
func GenerateRandomInt64InLengthRange(min, max int64, randomizer *rand.Rand) (int64, error) {
if min > max {
min, max = max, min
}
Expand All @@ -39,7 +39,7 @@ func GenerateRandomInt64InLengthRange(min, max int64) (int64, error) {
return 0, fmt.Errorf("length is too large")
}

val, err := GenerateRandomInt64InValueRange(min, max)
val, err := GenerateRandomInt64InValueRange(min, max, randomizer)
if err != nil {
return 0, fmt.Errorf("unable to generate a value in the range provided")
}
Expand All @@ -53,7 +53,7 @@ func GenerateRandomInt64InLengthRange(min, max int64) (int64, error) {
}

/* Generates a random int64 in the inclusive range of [min, max]. For example, given a range of [40, 50], possible values range from 40 -> 50, inclusive. */
func GenerateRandomInt64InValueRange(min, max int64) (int64, error) {
func GenerateRandomInt64InValueRange(min, max int64, randomizer *rand.Rand) (int64, error) {
if min > max {
min, max = max, min
}
Expand All @@ -65,7 +65,7 @@ func GenerateRandomInt64InValueRange(min, max int64) (int64, error) {
rangeVal := max - min + 1

//nolint:gosec
return min + rand.Int63n(rangeVal), nil
return min + randomizer.Int63n(rangeVal), nil
}

func FirstDigitIsNine(n int64) bool {
Expand Down
8 changes: 4 additions & 4 deletions worker/internal/benthos/transformers/utils/string_utils.go
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ func SliceString(s string, l int) string {
}

// Generate a random alphanumeric string of the given length
func GenerateRandomStringWithDefinedLength(length int64) (string, error) {
func GenerateRandomStringWithDefinedLength(length int64, randomizer *rand.Rand) (string, error) {
if length < 1 {
return "", fmt.Errorf("the length of the string can't be less than 1")
}
Expand All @@ -38,7 +38,7 @@ func GenerateRandomStringWithDefinedLength(length int64) (string, error) {
for i := int64(0); i < length; i++ {
// Generate a random index in the range [0, len(alphabet))
//nolint:all
index := rand.Intn(len(alphanumeric))
index := randomizer.Intn(len(alphanumeric))

// Get the character at the generated index and append it to the result
result[i] = alphanumeric[index]
Expand All @@ -48,7 +48,7 @@ func GenerateRandomStringWithDefinedLength(length int64) (string, error) {
}

// Generate a random alphanumeric string whose length is within the inclusive interval [min, max]
func GenerateRandomStringWithInclusiveBounds(min, max int64) (string, error) {
func GenerateRandomStringWithInclusiveBounds(min, max int64, randomizer *rand.Rand) (string, error) {
if min < 0 || max < 0 || min > max {
return "", fmt.Errorf("the min and max can't be less than 0 and the min can't be greater than the max")
}
Expand All @@ -58,7 +58,7 @@ func GenerateRandomStringWithInclusiveBounds(min, max int64) (string, error) {
if min == max {
length = min
} else {
randlength, err := GenerateRandomInt64InValueRange(min, max)
randlength, err := GenerateRandomInt64InValueRange(min, max, randomizer)
if err != nil {
return "", fmt.Errorf("unable to generate a random length for the string")
}
Expand Down
8 changes: 7 additions & 1 deletion worker/pkg/workflows/datasync/activities/activities.go
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ import (
"errors"
"fmt"
"log/slog"
"math/rand"
"net/http"
"os"
"strings"
Expand Down Expand Up @@ -727,6 +728,11 @@ func computeMutationFunction(col *mgmtv1alpha1.JobMapping, colInfo *dbschemas_ut
maxLen = *colInfo.CharacterMaximumLength
}

// Use the global RNG to seed the transformer RNGs
seed := rand.Int()

seedValue := rand.New(rand.NewSource(int64(seed)))
nickzelei marked this conversation as resolved.
Show resolved Hide resolved

switch col.Transformer.Source {
case "generate_categorical":
categories := col.Transformer.Config.GetGenerateCategoricalConfig().Categories
Expand Down Expand Up @@ -787,7 +793,7 @@ func computeMutationFunction(col *mgmtv1alpha1.JobMapping, colInfo *dbschemas_ut
case "generate_state":
return "generate_state()", nil
case "generate_street_address":
return fmt.Sprintf(`generate_street_address(max_length:%d)`, maxLen), nil
return fmt.Sprintf(`generate_street_address(max_length:%d, seed:%d)`, maxLen, seedValue), nil
case "generate_string_phone_number":
min := col.Transformer.Config.GetGenerateStringPhoneNumberConfig().Min
max := col.Transformer.Config.GetGenerateStringPhoneNumberConfig().Max
Expand Down
Loading